Diffstat (limited to 'kernel')
114 files changed, 5349 insertions, 3590 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 22bb4f24f071..8d528f9930da 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1883,6 +1883,23 @@ out_null: | |||
1883 | audit_log_format(ab, " exe=(null)"); | 1883 | audit_log_format(ab, " exe=(null)"); |
1884 | } | 1884 | } |
1885 | 1885 | ||
1886 | struct tty_struct *audit_get_tty(struct task_struct *tsk) | ||
1887 | { | ||
1888 | struct tty_struct *tty = NULL; | ||
1889 | unsigned long flags; | ||
1890 | |||
1891 | spin_lock_irqsave(&tsk->sighand->siglock, flags); | ||
1892 | if (tsk->signal) | ||
1893 | tty = tty_kref_get(tsk->signal->tty); | ||
1894 | spin_unlock_irqrestore(&tsk->sighand->siglock, flags); | ||
1895 | return tty; | ||
1896 | } | ||
1897 | |||
1898 | void audit_put_tty(struct tty_struct *tty) | ||
1899 | { | ||
1900 | tty_kref_put(tty); | ||
1901 | } | ||
1902 | |||
1886 | void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | 1903 | void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) |
1887 | { | 1904 | { |
1888 | const struct cred *cred; | 1905 | const struct cred *cred; |
diff --git a/kernel/audit.h b/kernel/audit.h
index cbbe6bb6496e..a492f4c4e710 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/audit.h> | 23 | #include <linux/audit.h> |
24 | #include <linux/skbuff.h> | 24 | #include <linux/skbuff.h> |
25 | #include <uapi/linux/mqueue.h> | 25 | #include <uapi/linux/mqueue.h> |
26 | #include <linux/tty.h> | ||
26 | 27 | ||
27 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context | 28 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context |
28 | * for saving names from getname(). If we get more names we will allocate | 29 | * for saving names from getname(). If we get more names we will allocate |
@@ -262,6 +263,9 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); | |||
262 | extern void audit_log_d_path_exe(struct audit_buffer *ab, | 263 | extern void audit_log_d_path_exe(struct audit_buffer *ab, |
263 | struct mm_struct *mm); | 264 | struct mm_struct *mm); |
264 | 265 | ||
266 | extern struct tty_struct *audit_get_tty(struct task_struct *tsk); | ||
267 | extern void audit_put_tty(struct tty_struct *tty); | ||
268 | |||
265 | /* audit watch functions */ | 269 | /* audit watch functions */ |
266 | #ifdef CONFIG_AUDIT_WATCH | 270 | #ifdef CONFIG_AUDIT_WATCH |
267 | extern void audit_put_watch(struct audit_watch *watch); | 271 | extern void audit_put_watch(struct audit_watch *watch); |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 62ab53d7619c..2672d105cffc 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -63,7 +63,6 @@ | |||
63 | #include <asm/unistd.h> | 63 | #include <asm/unistd.h> |
64 | #include <linux/security.h> | 64 | #include <linux/security.h> |
65 | #include <linux/list.h> | 65 | #include <linux/list.h> |
66 | #include <linux/tty.h> | ||
67 | #include <linux/binfmts.h> | 66 | #include <linux/binfmts.h> |
68 | #include <linux/highmem.h> | 67 | #include <linux/highmem.h> |
69 | #include <linux/syscalls.h> | 68 | #include <linux/syscalls.h> |
@@ -1985,14 +1984,15 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, | |||
1985 | if (!audit_enabled) | 1984 | if (!audit_enabled) |
1986 | return; | 1985 | return; |
1987 | 1986 | ||
1987 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); | ||
1988 | if (!ab) | ||
1989 | return; | ||
1990 | |||
1988 | uid = from_kuid(&init_user_ns, task_uid(current)); | 1991 | uid = from_kuid(&init_user_ns, task_uid(current)); |
1989 | oldloginuid = from_kuid(&init_user_ns, koldloginuid); | 1992 | oldloginuid = from_kuid(&init_user_ns, koldloginuid); |
1990 | loginuid = from_kuid(&init_user_ns, kloginuid), | 1993 | loginuid = from_kuid(&init_user_ns, kloginuid), |
1991 | tty = audit_get_tty(current); | 1994 | tty = audit_get_tty(current); |
1992 | 1995 | ||
1993 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); | ||
1994 | if (!ab) | ||
1995 | return; | ||
1996 | audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); | 1996 | audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); |
1997 | audit_log_task_context(ab); | 1997 | audit_log_task_context(ab); |
1998 | audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", | 1998 | audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", |
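The audit changes above factor the tty reference handling into audit_get_tty()/audit_put_tty(), so auditsc.c no longer reaches into signal->tty directly. Below is a minimal sketch of the calling pattern these helpers support; example_log_tty_name() is hypothetical and not part of the patch, only the two helpers and audit_log_format() come from the code above.

/* Sketch: take a stable tty reference, log its name, drop the kref.
 * audit_get_tty() holds sighand->siglock while taking the kref, so the
 * returned pointer stays valid even if the task's tty changes afterwards;
 * audit_put_tty(NULL) is a no-op, matching tty_kref_put().
 */
static void example_log_tty_name(struct audit_buffer *ab)
{
        struct tty_struct *tty = audit_get_tty(current);

        audit_log_format(ab, " tty=%s", tty ? tty_name(tty) : "(none)");
        audit_put_tty(tty);
}

This mirrors how audit_log_set_loginuid() above obtains the tty before formatting the AUDIT_LOGIN record.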
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 76d5a794e426..633a650d7aeb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) | |||
328 | } | 328 | } |
329 | 329 | ||
330 | /* only called from syscall */ | 330 | /* only called from syscall */ |
331 | static int fd_array_map_update_elem(struct bpf_map *map, void *key, | 331 | int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, |
332 | void *value, u64 map_flags) | 332 | void *key, void *value, u64 map_flags) |
333 | { | 333 | { |
334 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 334 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
335 | void *new_ptr, *old_ptr; | 335 | void *new_ptr, *old_ptr; |
@@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key, | |||
342 | return -E2BIG; | 342 | return -E2BIG; |
343 | 343 | ||
344 | ufd = *(u32 *)value; | 344 | ufd = *(u32 *)value; |
345 | new_ptr = map->ops->map_fd_get_ptr(map, ufd); | 345 | new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); |
346 | if (IS_ERR(new_ptr)) | 346 | if (IS_ERR(new_ptr)) |
347 | return PTR_ERR(new_ptr); | 347 | return PTR_ERR(new_ptr); |
348 | 348 | ||
@@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) | |||
371 | } | 371 | } |
372 | } | 372 | } |
373 | 373 | ||
374 | static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) | 374 | static void *prog_fd_array_get_ptr(struct bpf_map *map, |
375 | struct file *map_file, int fd) | ||
375 | { | 376 | { |
376 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 377 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
377 | struct bpf_prog *prog = bpf_prog_get(fd); | 378 | struct bpf_prog *prog = bpf_prog_get(fd); |
379 | |||
378 | if (IS_ERR(prog)) | 380 | if (IS_ERR(prog)) |
379 | return prog; | 381 | return prog; |
380 | 382 | ||
@@ -382,14 +384,13 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) | |||
382 | bpf_prog_put(prog); | 384 | bpf_prog_put(prog); |
383 | return ERR_PTR(-EINVAL); | 385 | return ERR_PTR(-EINVAL); |
384 | } | 386 | } |
387 | |||
385 | return prog; | 388 | return prog; |
386 | } | 389 | } |
387 | 390 | ||
388 | static void prog_fd_array_put_ptr(void *ptr) | 391 | static void prog_fd_array_put_ptr(void *ptr) |
389 | { | 392 | { |
390 | struct bpf_prog *prog = ptr; | 393 | bpf_prog_put(ptr); |
391 | |||
392 | bpf_prog_put_rcu(prog); | ||
393 | } | 394 | } |
394 | 395 | ||
395 | /* decrement refcnt of all bpf_progs that are stored in this map */ | 396 | /* decrement refcnt of all bpf_progs that are stored in this map */ |
@@ -407,7 +408,6 @@ static const struct bpf_map_ops prog_array_ops = { | |||
407 | .map_free = fd_array_map_free, | 408 | .map_free = fd_array_map_free, |
408 | .map_get_next_key = array_map_get_next_key, | 409 | .map_get_next_key = array_map_get_next_key, |
409 | .map_lookup_elem = fd_array_map_lookup_elem, | 410 | .map_lookup_elem = fd_array_map_lookup_elem, |
410 | .map_update_elem = fd_array_map_update_elem, | ||
411 | .map_delete_elem = fd_array_map_delete_elem, | 411 | .map_delete_elem = fd_array_map_delete_elem, |
412 | .map_fd_get_ptr = prog_fd_array_get_ptr, | 412 | .map_fd_get_ptr = prog_fd_array_get_ptr, |
413 | .map_fd_put_ptr = prog_fd_array_put_ptr, | 413 | .map_fd_put_ptr = prog_fd_array_put_ptr, |
@@ -425,59 +425,105 @@ static int __init register_prog_array_map(void) | |||
425 | } | 425 | } |
426 | late_initcall(register_prog_array_map); | 426 | late_initcall(register_prog_array_map); |
427 | 427 | ||
428 | static void perf_event_array_map_free(struct bpf_map *map) | 428 | static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, |
429 | struct file *map_file) | ||
429 | { | 430 | { |
430 | bpf_fd_array_map_clear(map); | 431 | struct bpf_event_entry *ee; |
431 | fd_array_map_free(map); | 432 | |
433 | ee = kzalloc(sizeof(*ee), GFP_ATOMIC); | ||
434 | if (ee) { | ||
435 | ee->event = perf_file->private_data; | ||
436 | ee->perf_file = perf_file; | ||
437 | ee->map_file = map_file; | ||
438 | } | ||
439 | |||
440 | return ee; | ||
432 | } | 441 | } |
433 | 442 | ||
434 | static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) | 443 | static void __bpf_event_entry_free(struct rcu_head *rcu) |
435 | { | 444 | { |
436 | struct perf_event *event; | 445 | struct bpf_event_entry *ee; |
437 | const struct perf_event_attr *attr; | ||
438 | struct file *file; | ||
439 | 446 | ||
440 | file = perf_event_get(fd); | 447 | ee = container_of(rcu, struct bpf_event_entry, rcu); |
441 | if (IS_ERR(file)) | 448 | fput(ee->perf_file); |
442 | return file; | 449 | kfree(ee); |
450 | } | ||
443 | 451 | ||
444 | event = file->private_data; | 452 | static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) |
453 | { | ||
454 | call_rcu(&ee->rcu, __bpf_event_entry_free); | ||
455 | } | ||
445 | 456 | ||
446 | attr = perf_event_attrs(event); | 457 | static void *perf_event_fd_array_get_ptr(struct bpf_map *map, |
447 | if (IS_ERR(attr)) | 458 | struct file *map_file, int fd) |
448 | goto err; | 459 | { |
460 | const struct perf_event_attr *attr; | ||
461 | struct bpf_event_entry *ee; | ||
462 | struct perf_event *event; | ||
463 | struct file *perf_file; | ||
449 | 464 | ||
450 | if (attr->inherit) | 465 | perf_file = perf_event_get(fd); |
451 | goto err; | 466 | if (IS_ERR(perf_file)) |
467 | return perf_file; | ||
452 | 468 | ||
453 | if (attr->type == PERF_TYPE_RAW) | 469 | event = perf_file->private_data; |
454 | return file; | 470 | ee = ERR_PTR(-EINVAL); |
455 | 471 | ||
456 | if (attr->type == PERF_TYPE_HARDWARE) | 472 | attr = perf_event_attrs(event); |
457 | return file; | 473 | if (IS_ERR(attr) || attr->inherit) |
474 | goto err_out; | ||
475 | |||
476 | switch (attr->type) { | ||
477 | case PERF_TYPE_SOFTWARE: | ||
478 | if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) | ||
479 | goto err_out; | ||
480 | /* fall-through */ | ||
481 | case PERF_TYPE_RAW: | ||
482 | case PERF_TYPE_HARDWARE: | ||
483 | ee = bpf_event_entry_gen(perf_file, map_file); | ||
484 | if (ee) | ||
485 | return ee; | ||
486 | ee = ERR_PTR(-ENOMEM); | ||
487 | /* fall-through */ | ||
488 | default: | ||
489 | break; | ||
490 | } | ||
458 | 491 | ||
459 | if (attr->type == PERF_TYPE_SOFTWARE && | 492 | err_out: |
460 | attr->config == PERF_COUNT_SW_BPF_OUTPUT) | 493 | fput(perf_file); |
461 | return file; | 494 | return ee; |
462 | err: | ||
463 | fput(file); | ||
464 | return ERR_PTR(-EINVAL); | ||
465 | } | 495 | } |
466 | 496 | ||
467 | static void perf_event_fd_array_put_ptr(void *ptr) | 497 | static void perf_event_fd_array_put_ptr(void *ptr) |
468 | { | 498 | { |
469 | fput((struct file *)ptr); | 499 | bpf_event_entry_free_rcu(ptr); |
500 | } | ||
501 | |||
502 | static void perf_event_fd_array_release(struct bpf_map *map, | ||
503 | struct file *map_file) | ||
504 | { | ||
505 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
506 | struct bpf_event_entry *ee; | ||
507 | int i; | ||
508 | |||
509 | rcu_read_lock(); | ||
510 | for (i = 0; i < array->map.max_entries; i++) { | ||
511 | ee = READ_ONCE(array->ptrs[i]); | ||
512 | if (ee && ee->map_file == map_file) | ||
513 | fd_array_map_delete_elem(map, &i); | ||
514 | } | ||
515 | rcu_read_unlock(); | ||
470 | } | 516 | } |
471 | 517 | ||
472 | static const struct bpf_map_ops perf_event_array_ops = { | 518 | static const struct bpf_map_ops perf_event_array_ops = { |
473 | .map_alloc = fd_array_map_alloc, | 519 | .map_alloc = fd_array_map_alloc, |
474 | .map_free = perf_event_array_map_free, | 520 | .map_free = fd_array_map_free, |
475 | .map_get_next_key = array_map_get_next_key, | 521 | .map_get_next_key = array_map_get_next_key, |
476 | .map_lookup_elem = fd_array_map_lookup_elem, | 522 | .map_lookup_elem = fd_array_map_lookup_elem, |
477 | .map_update_elem = fd_array_map_update_elem, | ||
478 | .map_delete_elem = fd_array_map_delete_elem, | 523 | .map_delete_elem = fd_array_map_delete_elem, |
479 | .map_fd_get_ptr = perf_event_fd_array_get_ptr, | 524 | .map_fd_get_ptr = perf_event_fd_array_get_ptr, |
480 | .map_fd_put_ptr = perf_event_fd_array_put_ptr, | 525 | .map_fd_put_ptr = perf_event_fd_array_put_ptr, |
526 | .map_release = perf_event_fd_array_release, | ||
481 | }; | 527 | }; |
482 | 528 | ||
483 | static struct bpf_map_type_list perf_event_array_type __read_mostly = { | 529 | static struct bpf_map_type_list perf_event_array_type __read_mostly = { |
@@ -491,3 +537,46 @@ static int __init register_perf_event_array_map(void) | |||
491 | return 0; | 537 | return 0; |
492 | } | 538 | } |
493 | late_initcall(register_perf_event_array_map); | 539 | late_initcall(register_perf_event_array_map); |
540 | |||
541 | #ifdef CONFIG_SOCK_CGROUP_DATA | ||
542 | static void *cgroup_fd_array_get_ptr(struct bpf_map *map, | ||
543 | struct file *map_file /* not used */, | ||
544 | int fd) | ||
545 | { | ||
546 | return cgroup_get_from_fd(fd); | ||
547 | } | ||
548 | |||
549 | static void cgroup_fd_array_put_ptr(void *ptr) | ||
550 | { | ||
551 | /* cgroup_put free cgrp after a rcu grace period */ | ||
552 | cgroup_put(ptr); | ||
553 | } | ||
554 | |||
555 | static void cgroup_fd_array_free(struct bpf_map *map) | ||
556 | { | ||
557 | bpf_fd_array_map_clear(map); | ||
558 | fd_array_map_free(map); | ||
559 | } | ||
560 | |||
561 | static const struct bpf_map_ops cgroup_array_ops = { | ||
562 | .map_alloc = fd_array_map_alloc, | ||
563 | .map_free = cgroup_fd_array_free, | ||
564 | .map_get_next_key = array_map_get_next_key, | ||
565 | .map_lookup_elem = fd_array_map_lookup_elem, | ||
566 | .map_delete_elem = fd_array_map_delete_elem, | ||
567 | .map_fd_get_ptr = cgroup_fd_array_get_ptr, | ||
568 | .map_fd_put_ptr = cgroup_fd_array_put_ptr, | ||
569 | }; | ||
570 | |||
571 | static struct bpf_map_type_list cgroup_array_type __read_mostly = { | ||
572 | .ops = &cgroup_array_ops, | ||
573 | .type = BPF_MAP_TYPE_CGROUP_ARRAY, | ||
574 | }; | ||
575 | |||
576 | static int __init register_cgroup_array_map(void) | ||
577 | { | ||
578 | bpf_register_map_type(&cgroup_array_type); | ||
579 | return 0; | ||
580 | } | ||
581 | late_initcall(register_cgroup_array_map); | ||
582 | #endif | ||
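With this change the perf event array no longer stores bare struct file pointers; each slot holds a small bpf_event_entry allocated by bpf_event_entry_gen() and freed after an RCU grace period. The structure's declaration is not part of this hunk; the sketch below is inferred from the field accesses above, with the authoritative declaration living in the bpf headers (include/linux/bpf.h in mainline).

/* Sketch inferred from bpf_event_entry_gen() / __bpf_event_entry_free(). */
struct bpf_event_entry {
        struct perf_event *event;       /* perf_file->private_data */
        struct file *perf_file;         /* reference dropped via fput() after RCU */
        struct file *map_file;          /* map fd through which the entry was installed */
        struct rcu_head rcu;
};

Recording map_file per entry is what lets perf_event_fd_array_release() purge only the entries that were installed through the map fd being closed.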
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b94a36550591..03fd23d4d587 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -719,14 +719,13 @@ select_insn: | |||
719 | 719 | ||
720 | if (unlikely(index >= array->map.max_entries)) | 720 | if (unlikely(index >= array->map.max_entries)) |
721 | goto out; | 721 | goto out; |
722 | |||
723 | if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) | 722 | if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) |
724 | goto out; | 723 | goto out; |
725 | 724 | ||
726 | tail_call_cnt++; | 725 | tail_call_cnt++; |
727 | 726 | ||
728 | prog = READ_ONCE(array->ptrs[index]); | 727 | prog = READ_ONCE(array->ptrs[index]); |
729 | if (unlikely(!prog)) | 728 | if (!prog) |
730 | goto out; | 729 | goto out; |
731 | 730 | ||
732 | /* ARG1 at this point is guaranteed to point to CTX from | 731 | /* ARG1 at this point is guaranteed to point to CTX from |
@@ -1055,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) | |||
1055 | return NULL; | 1054 | return NULL; |
1056 | } | 1055 | } |
1057 | 1056 | ||
1058 | const struct bpf_func_proto * __weak bpf_get_event_output_proto(void) | 1057 | u64 __weak |
1058 | bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, | ||
1059 | void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) | ||
1059 | { | 1060 | { |
1060 | return NULL; | 1061 | return -ENOTSUPP; |
1061 | } | 1062 | } |
1062 | 1063 | ||
1063 | /* Always built-in helper functions. */ | 1064 | /* Always built-in helper functions. */ |
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ad7a0573f71b..1ea3afba1a4f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { | |||
101 | 101 | ||
102 | static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 102 | static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) |
103 | { | 103 | { |
104 | return raw_smp_processor_id(); | 104 | return smp_processor_id(); |
105 | } | 105 | } |
106 | 106 | ||
107 | const struct bpf_func_proto bpf_get_smp_processor_id_proto = { | 107 | const struct bpf_func_proto bpf_get_smp_processor_id_proto = { |
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 318858edb1cd..5967b870a895 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -11,7 +11,7 @@ | |||
11 | * version 2 as published by the Free Software Foundation. | 11 | * version 2 as published by the Free Software Foundation. |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/module.h> | 14 | #include <linux/init.h> |
15 | #include <linux/magic.h> | 15 | #include <linux/magic.h> |
16 | #include <linux/major.h> | 16 | #include <linux/major.h> |
17 | #include <linux/mount.h> | 17 | #include <linux/mount.h> |
@@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = { | |||
367 | .kill_sb = kill_litter_super, | 367 | .kill_sb = kill_litter_super, |
368 | }; | 368 | }; |
369 | 369 | ||
370 | MODULE_ALIAS_FS("bpf"); | ||
371 | |||
372 | static int __init bpf_init(void) | 370 | static int __init bpf_init(void) |
373 | { | 371 | { |
374 | int ret; | 372 | int ret; |
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 080a2dfb5800..bf4495fcd25d 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) | |||
99 | if (err) | 99 | if (err) |
100 | goto free_smap; | 100 | goto free_smap; |
101 | 101 | ||
102 | err = get_callchain_buffers(); | 102 | err = get_callchain_buffers(sysctl_perf_event_max_stack); |
103 | if (err) | 103 | if (err) |
104 | goto free_smap; | 104 | goto free_smap; |
105 | 105 | ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 46ecce4b79ed..228f962447a5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map) | |||
124 | 124 | ||
125 | static int bpf_map_release(struct inode *inode, struct file *filp) | 125 | static int bpf_map_release(struct inode *inode, struct file *filp) |
126 | { | 126 | { |
127 | bpf_map_put_with_uref(filp->private_data); | 127 | struct bpf_map *map = filp->private_data; |
128 | |||
129 | if (map->ops->map_release) | ||
130 | map->ops->map_release(map, filp); | ||
131 | |||
132 | bpf_map_put_with_uref(map); | ||
128 | return 0; | 133 | return 0; |
129 | } | 134 | } |
130 | 135 | ||
@@ -387,6 +392,13 @@ static int map_update_elem(union bpf_attr *attr) | |||
387 | err = bpf_percpu_hash_update(map, key, value, attr->flags); | 392 | err = bpf_percpu_hash_update(map, key, value, attr->flags); |
388 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { | 393 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { |
389 | err = bpf_percpu_array_update(map, key, value, attr->flags); | 394 | err = bpf_percpu_array_update(map, key, value, attr->flags); |
395 | } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || | ||
396 | map->map_type == BPF_MAP_TYPE_PROG_ARRAY || | ||
397 | map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { | ||
398 | rcu_read_lock(); | ||
399 | err = bpf_fd_array_map_update_elem(map, f.file, key, value, | ||
400 | attr->flags); | ||
401 | rcu_read_unlock(); | ||
390 | } else { | 402 | } else { |
391 | rcu_read_lock(); | 403 | rcu_read_lock(); |
392 | err = map->ops->map_update_elem(map, key, value, attr->flags); | 404 | err = map->ops->map_update_elem(map, key, value, attr->flags); |
@@ -612,7 +624,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) | |||
612 | free_uid(user); | 624 | free_uid(user); |
613 | } | 625 | } |
614 | 626 | ||
615 | static void __prog_put_common(struct rcu_head *rcu) | 627 | static void __bpf_prog_put_rcu(struct rcu_head *rcu) |
616 | { | 628 | { |
617 | struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); | 629 | struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); |
618 | 630 | ||
@@ -621,17 +633,10 @@ static void __prog_put_common(struct rcu_head *rcu) | |||
621 | bpf_prog_free(aux->prog); | 633 | bpf_prog_free(aux->prog); |
622 | } | 634 | } |
623 | 635 | ||
624 | /* version of bpf_prog_put() that is called after a grace period */ | ||
625 | void bpf_prog_put_rcu(struct bpf_prog *prog) | ||
626 | { | ||
627 | if (atomic_dec_and_test(&prog->aux->refcnt)) | ||
628 | call_rcu(&prog->aux->rcu, __prog_put_common); | ||
629 | } | ||
630 | |||
631 | void bpf_prog_put(struct bpf_prog *prog) | 636 | void bpf_prog_put(struct bpf_prog *prog) |
632 | { | 637 | { |
633 | if (atomic_dec_and_test(&prog->aux->refcnt)) | 638 | if (atomic_dec_and_test(&prog->aux->refcnt)) |
634 | __prog_put_common(&prog->aux->rcu); | 639 | call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); |
635 | } | 640 | } |
636 | EXPORT_SYMBOL_GPL(bpf_prog_put); | 641 | EXPORT_SYMBOL_GPL(bpf_prog_put); |
637 | 642 | ||
@@ -639,7 +644,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) | |||
639 | { | 644 | { |
640 | struct bpf_prog *prog = filp->private_data; | 645 | struct bpf_prog *prog = filp->private_data; |
641 | 646 | ||
642 | bpf_prog_put_rcu(prog); | 647 | bpf_prog_put(prog); |
643 | return 0; | 648 | return 0; |
644 | } | 649 | } |
645 | 650 | ||
@@ -653,7 +658,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog) | |||
653 | O_RDWR | O_CLOEXEC); | 658 | O_RDWR | O_CLOEXEC); |
654 | } | 659 | } |
655 | 660 | ||
656 | static struct bpf_prog *__bpf_prog_get(struct fd f) | 661 | static struct bpf_prog *____bpf_prog_get(struct fd f) |
657 | { | 662 | { |
658 | if (!f.file) | 663 | if (!f.file) |
659 | return ERR_PTR(-EBADF); | 664 | return ERR_PTR(-EBADF); |
@@ -665,33 +670,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f) | |||
665 | return f.file->private_data; | 670 | return f.file->private_data; |
666 | } | 671 | } |
667 | 672 | ||
668 | struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) | 673 | struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) |
669 | { | 674 | { |
670 | if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { | 675 | if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { |
671 | atomic_dec(&prog->aux->refcnt); | 676 | atomic_sub(i, &prog->aux->refcnt); |
672 | return ERR_PTR(-EBUSY); | 677 | return ERR_PTR(-EBUSY); |
673 | } | 678 | } |
674 | return prog; | 679 | return prog; |
675 | } | 680 | } |
681 | EXPORT_SYMBOL_GPL(bpf_prog_add); | ||
676 | 682 | ||
677 | /* called by sockets/tracing/seccomp before attaching program to an event | 683 | struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) |
678 | * pairs with bpf_prog_put() | 684 | { |
679 | */ | 685 | return bpf_prog_add(prog, 1); |
680 | struct bpf_prog *bpf_prog_get(u32 ufd) | 686 | } |
687 | |||
688 | static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) | ||
681 | { | 689 | { |
682 | struct fd f = fdget(ufd); | 690 | struct fd f = fdget(ufd); |
683 | struct bpf_prog *prog; | 691 | struct bpf_prog *prog; |
684 | 692 | ||
685 | prog = __bpf_prog_get(f); | 693 | prog = ____bpf_prog_get(f); |
686 | if (IS_ERR(prog)) | 694 | if (IS_ERR(prog)) |
687 | return prog; | 695 | return prog; |
696 | if (type && prog->type != *type) { | ||
697 | prog = ERR_PTR(-EINVAL); | ||
698 | goto out; | ||
699 | } | ||
688 | 700 | ||
689 | prog = bpf_prog_inc(prog); | 701 | prog = bpf_prog_inc(prog); |
702 | out: | ||
690 | fdput(f); | 703 | fdput(f); |
691 | |||
692 | return prog; | 704 | return prog; |
693 | } | 705 | } |
694 | EXPORT_SYMBOL_GPL(bpf_prog_get); | 706 | |
707 | struct bpf_prog *bpf_prog_get(u32 ufd) | ||
708 | { | ||
709 | return __bpf_prog_get(ufd, NULL); | ||
710 | } | ||
711 | |||
712 | struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) | ||
713 | { | ||
714 | return __bpf_prog_get(ufd, &type); | ||
715 | } | ||
716 | EXPORT_SYMBOL_GPL(bpf_prog_get_type); | ||
695 | 717 | ||
696 | /* last field in 'union bpf_attr' used by this command */ | 718 | /* last field in 'union bpf_attr' used by this command */ |
697 | #define BPF_PROG_LOAD_LAST_FIELD kern_version | 719 | #define BPF_PROG_LOAD_LAST_FIELD kern_version |
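The new bpf_prog_get_type() lets callers fetch a program from an fd and reject a mismatched program type in one step, instead of open-coding bpf_prog_get() plus a manual prog->type check. A hedged usage sketch follows; example_check_prog_type() is hypothetical, only bpf_prog_get_type() and bpf_prog_put() come from the patch.

/* Sketch: type-checked program lookup by fd. On a type mismatch the helper
 * returns ERR_PTR(-EINVAL) without taking a program reference, and -EBADF
 * is returned for an fd that is not a bpf program, so no cleanup is needed
 * on the error path.
 */
static int example_check_prog_type(int prog_fd)
{
        struct bpf_prog *prog;

        prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_SOCKET_FILTER);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        bpf_prog_put(prog);     /* drop the reference taken by the lookup */
        return 0;
}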
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 668e07903c8f..f72f23b8fdab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -126,31 +126,6 @@ | |||
126 | * are set to NOT_INIT to indicate that they are no longer readable. | 126 | * are set to NOT_INIT to indicate that they are no longer readable. |
127 | */ | 127 | */ |
128 | 128 | ||
129 | /* types of values stored in eBPF registers */ | ||
130 | enum bpf_reg_type { | ||
131 | NOT_INIT = 0, /* nothing was written into register */ | ||
132 | UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ | ||
133 | PTR_TO_CTX, /* reg points to bpf_context */ | ||
134 | CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ | ||
135 | PTR_TO_MAP_VALUE, /* reg points to map element value */ | ||
136 | PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ | ||
137 | FRAME_PTR, /* reg == frame_pointer */ | ||
138 | PTR_TO_STACK, /* reg == frame_pointer + imm */ | ||
139 | CONST_IMM, /* constant integer value */ | ||
140 | |||
141 | /* PTR_TO_PACKET represents: | ||
142 | * skb->data | ||
143 | * skb->data + imm | ||
144 | * skb->data + (u16) var | ||
145 | * skb->data + (u16) var + imm | ||
146 | * if (range > 0) then [ptr, ptr + range - off) is safe to access | ||
147 | * if (id > 0) means that some 'var' was added | ||
148 | * if (off > 0) menas that 'imm' was added | ||
149 | */ | ||
150 | PTR_TO_PACKET, | ||
151 | PTR_TO_PACKET_END, /* skb->data + headlen */ | ||
152 | }; | ||
153 | |||
154 | struct reg_state { | 129 | struct reg_state { |
155 | enum bpf_reg_type type; | 130 | enum bpf_reg_type type; |
156 | union { | 131 | union { |
@@ -678,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, | |||
678 | 653 | ||
679 | #define MAX_PACKET_OFF 0xffff | 654 | #define MAX_PACKET_OFF 0xffff |
680 | 655 | ||
656 | static bool may_write_pkt_data(enum bpf_prog_type type) | ||
657 | { | ||
658 | switch (type) { | ||
659 | case BPF_PROG_TYPE_XDP: | ||
660 | return true; | ||
661 | default: | ||
662 | return false; | ||
663 | } | ||
664 | } | ||
665 | |||
681 | static int check_packet_access(struct verifier_env *env, u32 regno, int off, | 666 | static int check_packet_access(struct verifier_env *env, u32 regno, int off, |
682 | int size) | 667 | int size) |
683 | { | 668 | { |
@@ -695,10 +680,10 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, | |||
695 | 680 | ||
696 | /* check access to 'struct bpf_context' fields */ | 681 | /* check access to 'struct bpf_context' fields */ |
697 | static int check_ctx_access(struct verifier_env *env, int off, int size, | 682 | static int check_ctx_access(struct verifier_env *env, int off, int size, |
698 | enum bpf_access_type t) | 683 | enum bpf_access_type t, enum bpf_reg_type *reg_type) |
699 | { | 684 | { |
700 | if (env->prog->aux->ops->is_valid_access && | 685 | if (env->prog->aux->ops->is_valid_access && |
701 | env->prog->aux->ops->is_valid_access(off, size, t)) { | 686 | env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { |
702 | /* remember the offset of last byte accessed in ctx */ | 687 | /* remember the offset of last byte accessed in ctx */ |
703 | if (env->prog->aux->max_ctx_offset < off + size) | 688 | if (env->prog->aux->max_ctx_offset < off + size) |
704 | env->prog->aux->max_ctx_offset = off + size; | 689 | env->prog->aux->max_ctx_offset = off + size; |
@@ -738,6 +723,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, | |||
738 | switch (env->prog->type) { | 723 | switch (env->prog->type) { |
739 | case BPF_PROG_TYPE_SCHED_CLS: | 724 | case BPF_PROG_TYPE_SCHED_CLS: |
740 | case BPF_PROG_TYPE_SCHED_ACT: | 725 | case BPF_PROG_TYPE_SCHED_ACT: |
726 | case BPF_PROG_TYPE_XDP: | ||
741 | break; | 727 | break; |
742 | default: | 728 | default: |
743 | verbose("verifier is misconfigured\n"); | 729 | verbose("verifier is misconfigured\n"); |
@@ -798,21 +784,19 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
798 | mark_reg_unknown_value(state->regs, value_regno); | 784 | mark_reg_unknown_value(state->regs, value_regno); |
799 | 785 | ||
800 | } else if (reg->type == PTR_TO_CTX) { | 786 | } else if (reg->type == PTR_TO_CTX) { |
787 | enum bpf_reg_type reg_type = UNKNOWN_VALUE; | ||
788 | |||
801 | if (t == BPF_WRITE && value_regno >= 0 && | 789 | if (t == BPF_WRITE && value_regno >= 0 && |
802 | is_pointer_value(env, value_regno)) { | 790 | is_pointer_value(env, value_regno)) { |
803 | verbose("R%d leaks addr into ctx\n", value_regno); | 791 | verbose("R%d leaks addr into ctx\n", value_regno); |
804 | return -EACCES; | 792 | return -EACCES; |
805 | } | 793 | } |
806 | err = check_ctx_access(env, off, size, t); | 794 | err = check_ctx_access(env, off, size, t, ®_type); |
807 | if (!err && t == BPF_READ && value_regno >= 0) { | 795 | if (!err && t == BPF_READ && value_regno >= 0) { |
808 | mark_reg_unknown_value(state->regs, value_regno); | 796 | mark_reg_unknown_value(state->regs, value_regno); |
809 | if (off == offsetof(struct __sk_buff, data) && | 797 | if (env->allow_ptr_leaks) |
810 | env->allow_ptr_leaks) | ||
811 | /* note that reg.[id|off|range] == 0 */ | 798 | /* note that reg.[id|off|range] == 0 */ |
812 | state->regs[value_regno].type = PTR_TO_PACKET; | 799 | state->regs[value_regno].type = reg_type; |
813 | else if (off == offsetof(struct __sk_buff, data_end) && | ||
814 | env->allow_ptr_leaks) | ||
815 | state->regs[value_regno].type = PTR_TO_PACKET_END; | ||
816 | } | 800 | } |
817 | 801 | ||
818 | } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { | 802 | } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { |
@@ -832,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
832 | err = check_stack_read(state, off, size, value_regno); | 816 | err = check_stack_read(state, off, size, value_regno); |
833 | } | 817 | } |
834 | } else if (state->regs[regno].type == PTR_TO_PACKET) { | 818 | } else if (state->regs[regno].type == PTR_TO_PACKET) { |
835 | if (t == BPF_WRITE) { | 819 | if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { |
836 | verbose("cannot write into packet\n"); | 820 | verbose("cannot write into packet\n"); |
837 | return -EACCES; | 821 | return -EACCES; |
838 | } | 822 | } |
823 | if (t == BPF_WRITE && value_regno >= 0 && | ||
824 | is_pointer_value(env, value_regno)) { | ||
825 | verbose("R%d leaks addr into packet\n", value_regno); | ||
826 | return -EACCES; | ||
827 | } | ||
839 | err = check_packet_access(env, regno, off, size); | 828 | err = check_packet_access(env, regno, off, size); |
840 | if (!err && t == BPF_READ && value_regno >= 0) | 829 | if (!err && t == BPF_READ && value_regno >= 0) |
841 | mark_reg_unknown_value(state->regs, value_regno); | 830 | mark_reg_unknown_value(state->regs, value_regno); |
@@ -1062,6 +1051,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) | |||
1062 | if (func_id != BPF_FUNC_get_stackid) | 1051 | if (func_id != BPF_FUNC_get_stackid) |
1063 | goto error; | 1052 | goto error; |
1064 | break; | 1053 | break; |
1054 | case BPF_MAP_TYPE_CGROUP_ARRAY: | ||
1055 | if (func_id != BPF_FUNC_skb_in_cgroup) | ||
1056 | goto error; | ||
1057 | break; | ||
1065 | default: | 1058 | default: |
1066 | break; | 1059 | break; |
1067 | } | 1060 | } |
@@ -1081,6 +1074,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) | |||
1081 | if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) | 1074 | if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) |
1082 | goto error; | 1075 | goto error; |
1083 | break; | 1076 | break; |
1077 | case BPF_FUNC_skb_in_cgroup: | ||
1078 | if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) | ||
1079 | goto error; | ||
1080 | break; | ||
1084 | default: | 1081 | default: |
1085 | break; | 1082 | break; |
1086 | } | 1083 | } |
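check_ctx_access() now forwards a reg_type out-parameter to the program type's is_valid_access() callback, so the callback, rather than the verifier core, decides when a context load yields a packet pointer; the hard-coded __sk_buff data/data_end case is removed above, and the bpf_reg_type enum is dropped from verifier.c, presumably so callbacks can name these types from a shared header. The sketch below shows what such a callback could look like under the new signature; struct example_ctx and example_is_valid_access() are placeholders, not kernel code.

/* Placeholder context; real program types describe their own layout. */
struct example_ctx {
        __u32 data;
        __u32 data_end;
};

/* Sketch of an is_valid_access() callback under the new signature:
 * reject writes, allow in-bounds 32-bit reads, and tell the verifier
 * which loads produce packet pointers via *reg_type.
 */
static bool example_is_valid_access(int off, int size,
                                    enum bpf_access_type type,
                                    enum bpf_reg_type *reg_type)
{
        if (type == BPF_WRITE)
                return false;
        if (off < 0 || off + size > sizeof(struct example_ctx) ||
            size != sizeof(__u32))
                return false;

        switch (off) {
        case offsetof(struct example_ctx, data):
                *reg_type = PTR_TO_PACKET;
                break;
        case offsetof(struct example_ctx, data_end):
                *reg_type = PTR_TO_PACKET_END;
                break;
        }
        return true;
}

The verifier then assigns *reg_type to the destination register in check_mem_access() (state->regs[value_regno].type = reg_type above), with UNKNOWN_VALUE as the default when the callback leaves it untouched.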
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 86cb5c6e8932..d1c51b7f5221 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -61,7 +61,7 @@ | |||
61 | #include <linux/cpuset.h> | 61 | #include <linux/cpuset.h> |
62 | #include <linux/proc_ns.h> | 62 | #include <linux/proc_ns.h> |
63 | #include <linux/nsproxy.h> | 63 | #include <linux/nsproxy.h> |
64 | #include <linux/proc_ns.h> | 64 | #include <linux/file.h> |
65 | #include <net/sock.h> | 65 | #include <net/sock.h> |
66 | 66 | ||
67 | /* | 67 | /* |
@@ -837,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset) | |||
837 | 837 | ||
838 | static void put_css_set(struct css_set *cset) | 838 | static void put_css_set(struct css_set *cset) |
839 | { | 839 | { |
840 | unsigned long flags; | ||
841 | |||
840 | /* | 842 | /* |
841 | * Ensure that the refcount doesn't hit zero while any readers | 843 | * Ensure that the refcount doesn't hit zero while any readers |
842 | * can see it. Similar to atomic_dec_and_lock(), but for an | 844 | * can see it. Similar to atomic_dec_and_lock(), but for an |
@@ -845,9 +847,9 @@ static void put_css_set(struct css_set *cset) | |||
845 | if (atomic_add_unless(&cset->refcount, -1, 1)) | 847 | if (atomic_add_unless(&cset->refcount, -1, 1)) |
846 | return; | 848 | return; |
847 | 849 | ||
848 | spin_lock_bh(&css_set_lock); | 850 | spin_lock_irqsave(&css_set_lock, flags); |
849 | put_css_set_locked(cset); | 851 | put_css_set_locked(cset); |
850 | spin_unlock_bh(&css_set_lock); | 852 | spin_unlock_irqrestore(&css_set_lock, flags); |
851 | } | 853 | } |
852 | 854 | ||
853 | /* | 855 | /* |
@@ -1070,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
1070 | 1072 | ||
1071 | /* First see if we already have a cgroup group that matches | 1073 | /* First see if we already have a cgroup group that matches |
1072 | * the desired set */ | 1074 | * the desired set */ |
1073 | spin_lock_bh(&css_set_lock); | 1075 | spin_lock_irq(&css_set_lock); |
1074 | cset = find_existing_css_set(old_cset, cgrp, template); | 1076 | cset = find_existing_css_set(old_cset, cgrp, template); |
1075 | if (cset) | 1077 | if (cset) |
1076 | get_css_set(cset); | 1078 | get_css_set(cset); |
1077 | spin_unlock_bh(&css_set_lock); | 1079 | spin_unlock_irq(&css_set_lock); |
1078 | 1080 | ||
1079 | if (cset) | 1081 | if (cset) |
1080 | return cset; | 1082 | return cset; |
@@ -1102,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
1102 | * find_existing_css_set() */ | 1104 | * find_existing_css_set() */ |
1103 | memcpy(cset->subsys, template, sizeof(cset->subsys)); | 1105 | memcpy(cset->subsys, template, sizeof(cset->subsys)); |
1104 | 1106 | ||
1105 | spin_lock_bh(&css_set_lock); | 1107 | spin_lock_irq(&css_set_lock); |
1106 | /* Add reference counts and links from the new css_set. */ | 1108 | /* Add reference counts and links from the new css_set. */ |
1107 | list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { | 1109 | list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { |
1108 | struct cgroup *c = link->cgrp; | 1110 | struct cgroup *c = link->cgrp; |
@@ -1128,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
1128 | css_get(css); | 1130 | css_get(css); |
1129 | } | 1131 | } |
1130 | 1132 | ||
1131 | spin_unlock_bh(&css_set_lock); | 1133 | spin_unlock_irq(&css_set_lock); |
1132 | 1134 | ||
1133 | return cset; | 1135 | return cset; |
1134 | } | 1136 | } |
@@ -1158,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root) | |||
1158 | { | 1160 | { |
1159 | lockdep_assert_held(&cgroup_mutex); | 1161 | lockdep_assert_held(&cgroup_mutex); |
1160 | 1162 | ||
1161 | if (root->hierarchy_id) { | 1163 | idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); |
1162 | idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); | ||
1163 | root->hierarchy_id = 0; | ||
1164 | } | ||
1165 | } | 1164 | } |
1166 | 1165 | ||
1167 | static void cgroup_free_root(struct cgroup_root *root) | 1166 | static void cgroup_free_root(struct cgroup_root *root) |
1168 | { | 1167 | { |
1169 | if (root) { | 1168 | if (root) { |
1170 | /* hierarchy ID should already have been released */ | ||
1171 | WARN_ON_ONCE(root->hierarchy_id); | ||
1172 | |||
1173 | idr_destroy(&root->cgroup_idr); | 1169 | idr_destroy(&root->cgroup_idr); |
1174 | kfree(root); | 1170 | kfree(root); |
1175 | } | 1171 | } |
@@ -1192,7 +1188,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) | |||
1192 | * Release all the links from cset_links to this hierarchy's | 1188 | * Release all the links from cset_links to this hierarchy's |
1193 | * root cgroup | 1189 | * root cgroup |
1194 | */ | 1190 | */ |
1195 | spin_lock_bh(&css_set_lock); | 1191 | spin_lock_irq(&css_set_lock); |
1196 | 1192 | ||
1197 | list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { | 1193 | list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { |
1198 | list_del(&link->cset_link); | 1194 | list_del(&link->cset_link); |
@@ -1200,7 +1196,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) | |||
1200 | kfree(link); | 1196 | kfree(link); |
1201 | } | 1197 | } |
1202 | 1198 | ||
1203 | spin_unlock_bh(&css_set_lock); | 1199 | spin_unlock_irq(&css_set_lock); |
1204 | 1200 | ||
1205 | if (!list_empty(&root->root_list)) { | 1201 | if (!list_empty(&root->root_list)) { |
1206 | list_del(&root->root_list); | 1202 | list_del(&root->root_list); |
@@ -1600,11 +1596,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) | |||
1600 | ss->root = dst_root; | 1596 | ss->root = dst_root; |
1601 | css->cgroup = dcgrp; | 1597 | css->cgroup = dcgrp; |
1602 | 1598 | ||
1603 | spin_lock_bh(&css_set_lock); | 1599 | spin_lock_irq(&css_set_lock); |
1604 | hash_for_each(css_set_table, i, cset, hlist) | 1600 | hash_for_each(css_set_table, i, cset, hlist) |
1605 | list_move_tail(&cset->e_cset_node[ss->id], | 1601 | list_move_tail(&cset->e_cset_node[ss->id], |
1606 | &dcgrp->e_csets[ss->id]); | 1602 | &dcgrp->e_csets[ss->id]); |
1607 | spin_unlock_bh(&css_set_lock); | 1603 | spin_unlock_irq(&css_set_lock); |
1608 | 1604 | ||
1609 | /* default hierarchy doesn't enable controllers by default */ | 1605 | /* default hierarchy doesn't enable controllers by default */ |
1610 | dst_root->subsys_mask |= 1 << ssid; | 1606 | dst_root->subsys_mask |= 1 << ssid; |
@@ -1640,10 +1636,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, | |||
1640 | if (!buf) | 1636 | if (!buf) |
1641 | return -ENOMEM; | 1637 | return -ENOMEM; |
1642 | 1638 | ||
1643 | spin_lock_bh(&css_set_lock); | 1639 | spin_lock_irq(&css_set_lock); |
1644 | ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); | 1640 | ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); |
1645 | len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); | 1641 | len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); |
1646 | spin_unlock_bh(&css_set_lock); | 1642 | spin_unlock_irq(&css_set_lock); |
1647 | 1643 | ||
1648 | if (len >= PATH_MAX) | 1644 | if (len >= PATH_MAX) |
1649 | len = -ERANGE; | 1645 | len = -ERANGE; |
@@ -1897,7 +1893,7 @@ static void cgroup_enable_task_cg_lists(void) | |||
1897 | { | 1893 | { |
1898 | struct task_struct *p, *g; | 1894 | struct task_struct *p, *g; |
1899 | 1895 | ||
1900 | spin_lock_bh(&css_set_lock); | 1896 | spin_lock_irq(&css_set_lock); |
1901 | 1897 | ||
1902 | if (use_task_css_set_links) | 1898 | if (use_task_css_set_links) |
1903 | goto out_unlock; | 1899 | goto out_unlock; |
@@ -1922,8 +1918,12 @@ static void cgroup_enable_task_cg_lists(void) | |||
1922 | * entry won't be deleted though the process has exited. | 1918 | * entry won't be deleted though the process has exited. |
1923 | * Do it while holding siglock so that we don't end up | 1919 | * Do it while holding siglock so that we don't end up |
1924 | * racing against cgroup_exit(). | 1920 | * racing against cgroup_exit(). |
1921 | * | ||
1922 | * Interrupts were already disabled while acquiring | ||
1923 | * the css_set_lock, so we do not need to disable it | ||
1924 | * again when acquiring the sighand->siglock here. | ||
1925 | */ | 1925 | */ |
1926 | spin_lock_irq(&p->sighand->siglock); | 1926 | spin_lock(&p->sighand->siglock); |
1927 | if (!(p->flags & PF_EXITING)) { | 1927 | if (!(p->flags & PF_EXITING)) { |
1928 | struct css_set *cset = task_css_set(p); | 1928 | struct css_set *cset = task_css_set(p); |
1929 | 1929 | ||
@@ -1932,11 +1932,11 @@ static void cgroup_enable_task_cg_lists(void) | |||
1932 | list_add_tail(&p->cg_list, &cset->tasks); | 1932 | list_add_tail(&p->cg_list, &cset->tasks); |
1933 | get_css_set(cset); | 1933 | get_css_set(cset); |
1934 | } | 1934 | } |
1935 | spin_unlock_irq(&p->sighand->siglock); | 1935 | spin_unlock(&p->sighand->siglock); |
1936 | } while_each_thread(g, p); | 1936 | } while_each_thread(g, p); |
1937 | read_unlock(&tasklist_lock); | 1937 | read_unlock(&tasklist_lock); |
1938 | out_unlock: | 1938 | out_unlock: |
1939 | spin_unlock_bh(&css_set_lock); | 1939 | spin_unlock_irq(&css_set_lock); |
1940 | } | 1940 | } |
1941 | 1941 | ||
1942 | static void init_cgroup_housekeeping(struct cgroup *cgrp) | 1942 | static void init_cgroup_housekeeping(struct cgroup *cgrp) |
@@ -2043,13 +2043,13 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) | |||
2043 | * Link the root cgroup in this hierarchy into all the css_set | 2043 | * Link the root cgroup in this hierarchy into all the css_set |
2044 | * objects. | 2044 | * objects. |
2045 | */ | 2045 | */ |
2046 | spin_lock_bh(&css_set_lock); | 2046 | spin_lock_irq(&css_set_lock); |
2047 | hash_for_each(css_set_table, i, cset, hlist) { | 2047 | hash_for_each(css_set_table, i, cset, hlist) { |
2048 | link_css_set(&tmp_links, cset, root_cgrp); | 2048 | link_css_set(&tmp_links, cset, root_cgrp); |
2049 | if (css_set_populated(cset)) | 2049 | if (css_set_populated(cset)) |
2050 | cgroup_update_populated(root_cgrp, true); | 2050 | cgroup_update_populated(root_cgrp, true); |
2051 | } | 2051 | } |
2052 | spin_unlock_bh(&css_set_lock); | 2052 | spin_unlock_irq(&css_set_lock); |
2053 | 2053 | ||
2054 | BUG_ON(!list_empty(&root_cgrp->self.children)); | 2054 | BUG_ON(!list_empty(&root_cgrp->self.children)); |
2055 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); | 2055 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); |
@@ -2209,12 +2209,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
2209 | goto out_unlock; | 2209 | goto out_unlock; |
2210 | } | 2210 | } |
2211 | 2211 | ||
2212 | /* | 2212 | /* Hierarchies may only be created in the initial cgroup namespace. */ |
2213 | * We know this subsystem has not yet been bound. Users in a non-init | 2213 | if (ns != &init_cgroup_ns) { |
2214 | * user namespace may only mount hierarchies with no bound subsystems, | ||
2215 | * i.e. 'none,name=user1' | ||
2216 | */ | ||
2217 | if (!opts.none && !capable(CAP_SYS_ADMIN)) { | ||
2218 | ret = -EPERM; | 2214 | ret = -EPERM; |
2219 | goto out_unlock; | 2215 | goto out_unlock; |
2220 | } | 2216 | } |
@@ -2256,11 +2252,11 @@ out_mount: | |||
2256 | struct cgroup *cgrp; | 2252 | struct cgroup *cgrp; |
2257 | 2253 | ||
2258 | mutex_lock(&cgroup_mutex); | 2254 | mutex_lock(&cgroup_mutex); |
2259 | spin_lock_bh(&css_set_lock); | 2255 | spin_lock_irq(&css_set_lock); |
2260 | 2256 | ||
2261 | cgrp = cset_cgroup_from_root(ns->root_cset, root); | 2257 | cgrp = cset_cgroup_from_root(ns->root_cset, root); |
2262 | 2258 | ||
2263 | spin_unlock_bh(&css_set_lock); | 2259 | spin_unlock_irq(&css_set_lock); |
2264 | mutex_unlock(&cgroup_mutex); | 2260 | mutex_unlock(&cgroup_mutex); |
2265 | 2261 | ||
2266 | nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); | 2262 | nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); |
@@ -2337,11 +2333,11 @@ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, | |||
2337 | char *ret; | 2333 | char *ret; |
2338 | 2334 | ||
2339 | mutex_lock(&cgroup_mutex); | 2335 | mutex_lock(&cgroup_mutex); |
2340 | spin_lock_bh(&css_set_lock); | 2336 | spin_lock_irq(&css_set_lock); |
2341 | 2337 | ||
2342 | ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); | 2338 | ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); |
2343 | 2339 | ||
2344 | spin_unlock_bh(&css_set_lock); | 2340 | spin_unlock_irq(&css_set_lock); |
2345 | mutex_unlock(&cgroup_mutex); | 2341 | mutex_unlock(&cgroup_mutex); |
2346 | 2342 | ||
2347 | return ret; | 2343 | return ret; |
@@ -2369,7 +2365,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | |||
2369 | char *path = NULL; | 2365 | char *path = NULL; |
2370 | 2366 | ||
2371 | mutex_lock(&cgroup_mutex); | 2367 | mutex_lock(&cgroup_mutex); |
2372 | spin_lock_bh(&css_set_lock); | 2368 | spin_lock_irq(&css_set_lock); |
2373 | 2369 | ||
2374 | root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); | 2370 | root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); |
2375 | 2371 | ||
@@ -2382,7 +2378,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | |||
2382 | path = buf; | 2378 | path = buf; |
2383 | } | 2379 | } |
2384 | 2380 | ||
2385 | spin_unlock_bh(&css_set_lock); | 2381 | spin_unlock_irq(&css_set_lock); |
2386 | mutex_unlock(&cgroup_mutex); | 2382 | mutex_unlock(&cgroup_mutex); |
2387 | return path; | 2383 | return path; |
2388 | } | 2384 | } |
@@ -2557,7 +2553,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, | |||
2557 | * the new cgroup. There are no failure cases after here, so this | 2553 | * the new cgroup. There are no failure cases after here, so this |
2558 | * is the commit point. | 2554 | * is the commit point. |
2559 | */ | 2555 | */ |
2560 | spin_lock_bh(&css_set_lock); | 2556 | spin_lock_irq(&css_set_lock); |
2561 | list_for_each_entry(cset, &tset->src_csets, mg_node) { | 2557 | list_for_each_entry(cset, &tset->src_csets, mg_node) { |
2562 | list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { | 2558 | list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { |
2563 | struct css_set *from_cset = task_css_set(task); | 2559 | struct css_set *from_cset = task_css_set(task); |
@@ -2568,7 +2564,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, | |||
2568 | put_css_set_locked(from_cset); | 2564 | put_css_set_locked(from_cset); |
2569 | } | 2565 | } |
2570 | } | 2566 | } |
2571 | spin_unlock_bh(&css_set_lock); | 2567 | spin_unlock_irq(&css_set_lock); |
2572 | 2568 | ||
2573 | /* | 2569 | /* |
2574 | * Migration is committed, all target tasks are now on dst_csets. | 2570 | * Migration is committed, all target tasks are now on dst_csets. |
@@ -2597,13 +2593,13 @@ out_cancel_attach: | |||
2597 | } | 2593 | } |
2598 | } while_each_subsys_mask(); | 2594 | } while_each_subsys_mask(); |
2599 | out_release_tset: | 2595 | out_release_tset: |
2600 | spin_lock_bh(&css_set_lock); | 2596 | spin_lock_irq(&css_set_lock); |
2601 | list_splice_init(&tset->dst_csets, &tset->src_csets); | 2597 | list_splice_init(&tset->dst_csets, &tset->src_csets); |
2602 | list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { | 2598 | list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { |
2603 | list_splice_tail_init(&cset->mg_tasks, &cset->tasks); | 2599 | list_splice_tail_init(&cset->mg_tasks, &cset->tasks); |
2604 | list_del_init(&cset->mg_node); | 2600 | list_del_init(&cset->mg_node); |
2605 | } | 2601 | } |
2606 | spin_unlock_bh(&css_set_lock); | 2602 | spin_unlock_irq(&css_set_lock); |
2607 | return ret; | 2603 | return ret; |
2608 | } | 2604 | } |
2609 | 2605 | ||
@@ -2634,7 +2630,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) | |||
2634 | 2630 | ||
2635 | lockdep_assert_held(&cgroup_mutex); | 2631 | lockdep_assert_held(&cgroup_mutex); |
2636 | 2632 | ||
2637 | spin_lock_bh(&css_set_lock); | 2633 | spin_lock_irq(&css_set_lock); |
2638 | list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { | 2634 | list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { |
2639 | cset->mg_src_cgrp = NULL; | 2635 | cset->mg_src_cgrp = NULL; |
2640 | cset->mg_dst_cgrp = NULL; | 2636 | cset->mg_dst_cgrp = NULL; |
@@ -2642,7 +2638,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) | |||
2642 | list_del_init(&cset->mg_preload_node); | 2638 | list_del_init(&cset->mg_preload_node); |
2643 | put_css_set_locked(cset); | 2639 | put_css_set_locked(cset); |
2644 | } | 2640 | } |
2645 | spin_unlock_bh(&css_set_lock); | 2641 | spin_unlock_irq(&css_set_lock); |
2646 | } | 2642 | } |
2647 | 2643 | ||
2648 | /** | 2644 | /** |
@@ -2783,7 +2779,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, | |||
2783 | * already PF_EXITING could be freed from underneath us unless we | 2779 | * already PF_EXITING could be freed from underneath us unless we |
2784 | * take an rcu_read_lock. | 2780 | * take an rcu_read_lock. |
2785 | */ | 2781 | */ |
2786 | spin_lock_bh(&css_set_lock); | 2782 | spin_lock_irq(&css_set_lock); |
2787 | rcu_read_lock(); | 2783 | rcu_read_lock(); |
2788 | task = leader; | 2784 | task = leader; |
2789 | do { | 2785 | do { |
@@ -2792,7 +2788,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, | |||
2792 | break; | 2788 | break; |
2793 | } while_each_thread(leader, task); | 2789 | } while_each_thread(leader, task); |
2794 | rcu_read_unlock(); | 2790 | rcu_read_unlock(); |
2795 | spin_unlock_bh(&css_set_lock); | 2791 | spin_unlock_irq(&css_set_lock); |
2796 | 2792 | ||
2797 | return cgroup_taskset_migrate(&tset, root); | 2793 | return cgroup_taskset_migrate(&tset, root); |
2798 | } | 2794 | } |
@@ -2816,7 +2812,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
2816 | return -EBUSY; | 2812 | return -EBUSY; |
2817 | 2813 | ||
2818 | /* look up all src csets */ | 2814 | /* look up all src csets */ |
2819 | spin_lock_bh(&css_set_lock); | 2815 | spin_lock_irq(&css_set_lock); |
2820 | rcu_read_lock(); | 2816 | rcu_read_lock(); |
2821 | task = leader; | 2817 | task = leader; |
2822 | do { | 2818 | do { |
@@ -2826,7 +2822,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
2826 | break; | 2822 | break; |
2827 | } while_each_thread(leader, task); | 2823 | } while_each_thread(leader, task); |
2828 | rcu_read_unlock(); | 2824 | rcu_read_unlock(); |
2829 | spin_unlock_bh(&css_set_lock); | 2825 | spin_unlock_irq(&css_set_lock); |
2830 | 2826 | ||
2831 | /* prepare dst csets and commit */ | 2827 | /* prepare dst csets and commit */ |
2832 | ret = cgroup_migrate_prepare_dst(&preloaded_csets); | 2828 | ret = cgroup_migrate_prepare_dst(&preloaded_csets); |
@@ -2859,9 +2855,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, | |||
2859 | struct cgroup *cgrp; | 2855 | struct cgroup *cgrp; |
2860 | struct inode *inode; | 2856 | struct inode *inode; |
2861 | 2857 | ||
2862 | spin_lock_bh(&css_set_lock); | 2858 | spin_lock_irq(&css_set_lock); |
2863 | cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); | 2859 | cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); |
2864 | spin_unlock_bh(&css_set_lock); | 2860 | spin_unlock_irq(&css_set_lock); |
2865 | 2861 | ||
2866 | while (!cgroup_is_descendant(dst_cgrp, cgrp)) | 2862 | while (!cgroup_is_descendant(dst_cgrp, cgrp)) |
2867 | cgrp = cgroup_parent(cgrp); | 2863 | cgrp = cgroup_parent(cgrp); |
@@ -2956,20 +2952,22 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
2956 | int retval = 0; | 2952 | int retval = 0; |
2957 | 2953 | ||
2958 | mutex_lock(&cgroup_mutex); | 2954 | mutex_lock(&cgroup_mutex); |
2955 | percpu_down_write(&cgroup_threadgroup_rwsem); | ||
2959 | for_each_root(root) { | 2956 | for_each_root(root) { |
2960 | struct cgroup *from_cgrp; | 2957 | struct cgroup *from_cgrp; |
2961 | 2958 | ||
2962 | if (root == &cgrp_dfl_root) | 2959 | if (root == &cgrp_dfl_root) |
2963 | continue; | 2960 | continue; |
2964 | 2961 | ||
2965 | spin_lock_bh(&css_set_lock); | 2962 | spin_lock_irq(&css_set_lock); |
2966 | from_cgrp = task_cgroup_from_root(from, root); | 2963 | from_cgrp = task_cgroup_from_root(from, root); |
2967 | spin_unlock_bh(&css_set_lock); | 2964 | spin_unlock_irq(&css_set_lock); |
2968 | 2965 | ||
2969 | retval = cgroup_attach_task(from_cgrp, tsk, false); | 2966 | retval = cgroup_attach_task(from_cgrp, tsk, false); |
2970 | if (retval) | 2967 | if (retval) |
2971 | break; | 2968 | break; |
2972 | } | 2969 | } |
2970 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
2973 | mutex_unlock(&cgroup_mutex); | 2971 | mutex_unlock(&cgroup_mutex); |
2974 | 2972 | ||
2975 | return retval; | 2973 | return retval; |
@@ -3080,7 +3078,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) | |||
3080 | percpu_down_write(&cgroup_threadgroup_rwsem); | 3078 | percpu_down_write(&cgroup_threadgroup_rwsem); |
3081 | 3079 | ||
3082 | /* look up all csses currently attached to @cgrp's subtree */ | 3080 | /* look up all csses currently attached to @cgrp's subtree */ |
3083 | spin_lock_bh(&css_set_lock); | 3081 | spin_lock_irq(&css_set_lock); |
3084 | cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { | 3082 | cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { |
3085 | struct cgrp_cset_link *link; | 3083 | struct cgrp_cset_link *link; |
3086 | 3084 | ||
@@ -3088,14 +3086,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) | |||
3088 | cgroup_migrate_add_src(link->cset, dsct, | 3086 | cgroup_migrate_add_src(link->cset, dsct, |
3089 | &preloaded_csets); | 3087 | &preloaded_csets); |
3090 | } | 3088 | } |
3091 | spin_unlock_bh(&css_set_lock); | 3089 | spin_unlock_irq(&css_set_lock); |
3092 | 3090 | ||
3093 | /* NULL dst indicates self on default hierarchy */ | 3091 | /* NULL dst indicates self on default hierarchy */ |
3094 | ret = cgroup_migrate_prepare_dst(&preloaded_csets); | 3092 | ret = cgroup_migrate_prepare_dst(&preloaded_csets); |
3095 | if (ret) | 3093 | if (ret) |
3096 | goto out_finish; | 3094 | goto out_finish; |
3097 | 3095 | ||
3098 | spin_lock_bh(&css_set_lock); | 3096 | spin_lock_irq(&css_set_lock); |
3099 | list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { | 3097 | list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { |
3100 | struct task_struct *task, *ntask; | 3098 | struct task_struct *task, *ntask; |
3101 | 3099 | ||
@@ -3107,7 +3105,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) | |||
3107 | list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) | 3105 | list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) |
3108 | cgroup_taskset_add(task, &tset); | 3106 | cgroup_taskset_add(task, &tset); |
3109 | } | 3107 | } |
3110 | spin_unlock_bh(&css_set_lock); | 3108 | spin_unlock_irq(&css_set_lock); |
3111 | 3109 | ||
3112 | ret = cgroup_taskset_migrate(&tset, cgrp->root); | 3110 | ret = cgroup_taskset_migrate(&tset, cgrp->root); |
3113 | out_finish: | 3111 | out_finish: |
@@ -3908,10 +3906,10 @@ static int cgroup_task_count(const struct cgroup *cgrp) | |||
3908 | int count = 0; | 3906 | int count = 0; |
3909 | struct cgrp_cset_link *link; | 3907 | struct cgrp_cset_link *link; |
3910 | 3908 | ||
3911 | spin_lock_bh(&css_set_lock); | 3909 | spin_lock_irq(&css_set_lock); |
3912 | list_for_each_entry(link, &cgrp->cset_links, cset_link) | 3910 | list_for_each_entry(link, &cgrp->cset_links, cset_link) |
3913 | count += atomic_read(&link->cset->refcount); | 3911 | count += atomic_read(&link->cset->refcount); |
3914 | spin_unlock_bh(&css_set_lock); | 3912 | spin_unlock_irq(&css_set_lock); |
3915 | return count; | 3913 | return count; |
3916 | } | 3914 | } |
3917 | 3915 | ||
@@ -4249,7 +4247,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
4249 | 4247 | ||
4250 | memset(it, 0, sizeof(*it)); | 4248 | memset(it, 0, sizeof(*it)); |
4251 | 4249 | ||
4252 | spin_lock_bh(&css_set_lock); | 4250 | spin_lock_irq(&css_set_lock); |
4253 | 4251 | ||
4254 | it->ss = css->ss; | 4252 | it->ss = css->ss; |
4255 | 4253 | ||
@@ -4262,7 +4260,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
4262 | 4260 | ||
4263 | css_task_iter_advance_css_set(it); | 4261 | css_task_iter_advance_css_set(it); |
4264 | 4262 | ||
4265 | spin_unlock_bh(&css_set_lock); | 4263 | spin_unlock_irq(&css_set_lock); |
4266 | } | 4264 | } |
4267 | 4265 | ||
4268 | /** | 4266 | /** |
@@ -4280,7 +4278,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) | |||
4280 | it->cur_task = NULL; | 4278 | it->cur_task = NULL; |
4281 | } | 4279 | } |
4282 | 4280 | ||
4283 | spin_lock_bh(&css_set_lock); | 4281 | spin_lock_irq(&css_set_lock); |
4284 | 4282 | ||
4285 | if (it->task_pos) { | 4283 | if (it->task_pos) { |
4286 | it->cur_task = list_entry(it->task_pos, struct task_struct, | 4284 | it->cur_task = list_entry(it->task_pos, struct task_struct, |
@@ -4289,7 +4287,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) | |||
4289 | css_task_iter_advance(it); | 4287 | css_task_iter_advance(it); |
4290 | } | 4288 | } |
4291 | 4289 | ||
4292 | spin_unlock_bh(&css_set_lock); | 4290 | spin_unlock_irq(&css_set_lock); |
4293 | 4291 | ||
4294 | return it->cur_task; | 4292 | return it->cur_task; |
4295 | } | 4293 | } |
@@ -4303,10 +4301,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) | |||
4303 | void css_task_iter_end(struct css_task_iter *it) | 4301 | void css_task_iter_end(struct css_task_iter *it) |
4304 | { | 4302 | { |
4305 | if (it->cur_cset) { | 4303 | if (it->cur_cset) { |
4306 | spin_lock_bh(&css_set_lock); | 4304 | spin_lock_irq(&css_set_lock); |
4307 | list_del(&it->iters_node); | 4305 | list_del(&it->iters_node); |
4308 | put_css_set_locked(it->cur_cset); | 4306 | put_css_set_locked(it->cur_cset); |
4309 | spin_unlock_bh(&css_set_lock); | 4307 | spin_unlock_irq(&css_set_lock); |
4310 | } | 4308 | } |
4311 | 4309 | ||
4312 | if (it->cur_task) | 4310 | if (it->cur_task) |
@@ -4337,11 +4335,13 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
4337 | 4335 | ||
4338 | mutex_lock(&cgroup_mutex); | 4336 | mutex_lock(&cgroup_mutex); |
4339 | 4337 | ||
4338 | percpu_down_write(&cgroup_threadgroup_rwsem); | ||
4339 | |||
4340 | /* all tasks in @from are being moved, all csets are source */ | 4340 | /* all tasks in @from are being moved, all csets are source */ |
4341 | spin_lock_bh(&css_set_lock); | 4341 | spin_lock_irq(&css_set_lock); |
4342 | list_for_each_entry(link, &from->cset_links, cset_link) | 4342 | list_for_each_entry(link, &from->cset_links, cset_link) |
4343 | cgroup_migrate_add_src(link->cset, to, &preloaded_csets); | 4343 | cgroup_migrate_add_src(link->cset, to, &preloaded_csets); |
4344 | spin_unlock_bh(&css_set_lock); | 4344 | spin_unlock_irq(&css_set_lock); |
4345 | 4345 | ||
4346 | ret = cgroup_migrate_prepare_dst(&preloaded_csets); | 4346 | ret = cgroup_migrate_prepare_dst(&preloaded_csets); |
4347 | if (ret) | 4347 | if (ret) |
@@ -4365,6 +4365,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
4365 | } while (task && !ret); | 4365 | } while (task && !ret); |
4366 | out_err: | 4366 | out_err: |
4367 | cgroup_migrate_finish(&preloaded_csets); | 4367 | cgroup_migrate_finish(&preloaded_csets); |
4368 | percpu_up_write(&cgroup_threadgroup_rwsem); | ||
4368 | mutex_unlock(&cgroup_mutex); | 4369 | mutex_unlock(&cgroup_mutex); |
4369 | return ret; | 4370 | return ret; |
4370 | } | 4371 | } |
@@ -5063,6 +5064,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, | |||
5063 | memset(css, 0, sizeof(*css)); | 5064 | memset(css, 0, sizeof(*css)); |
5064 | css->cgroup = cgrp; | 5065 | css->cgroup = cgrp; |
5065 | css->ss = ss; | 5066 | css->ss = ss; |
5067 | css->id = -1; | ||
5066 | INIT_LIST_HEAD(&css->sibling); | 5068 | INIT_LIST_HEAD(&css->sibling); |
5067 | INIT_LIST_HEAD(&css->children); | 5069 | INIT_LIST_HEAD(&css->children); |
5068 | css->serial_nr = css_serial_nr_next++; | 5070 | css->serial_nr = css_serial_nr_next++; |
@@ -5139,6 +5141,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, | |||
5139 | lockdep_assert_held(&cgroup_mutex); | 5141 | lockdep_assert_held(&cgroup_mutex); |
5140 | 5142 | ||
5141 | css = ss->css_alloc(parent_css); | 5143 | css = ss->css_alloc(parent_css); |
5144 | if (!css) | ||
5145 | css = ERR_PTR(-ENOMEM); | ||
5142 | if (IS_ERR(css)) | 5146 | if (IS_ERR(css)) |
5143 | return css; | 5147 | return css; |
5144 | 5148 | ||
@@ -5150,7 +5154,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, | |||
5150 | 5154 | ||
5151 | err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); | 5155 | err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); |
5152 | if (err < 0) | 5156 | if (err < 0) |
5153 | goto err_free_percpu_ref; | 5157 | goto err_free_css; |
5154 | css->id = err; | 5158 | css->id = err; |
5155 | 5159 | ||
5156 | /* @css is ready to be brought online now, make it visible */ | 5160 | /* @css is ready to be brought online now, make it visible */ |
@@ -5174,9 +5178,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, | |||
5174 | 5178 | ||
5175 | err_list_del: | 5179 | err_list_del: |
5176 | list_del_rcu(&css->sibling); | 5180 | list_del_rcu(&css->sibling); |
5177 | cgroup_idr_remove(&ss->css_idr, css->id); | ||
5178 | err_free_percpu_ref: | ||
5179 | percpu_ref_exit(&css->refcnt); | ||
5180 | err_free_css: | 5181 | err_free_css: |
5181 | call_rcu(&css->rcu_head, css_free_rcu_fn); | 5182 | call_rcu(&css->rcu_head, css_free_rcu_fn); |
5182 | return ERR_PTR(err); | 5183 | return ERR_PTR(err); |
@@ -5451,10 +5452,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
5451 | */ | 5452 | */ |
5452 | cgrp->self.flags &= ~CSS_ONLINE; | 5453 | cgrp->self.flags &= ~CSS_ONLINE; |
5453 | 5454 | ||
5454 | spin_lock_bh(&css_set_lock); | 5455 | spin_lock_irq(&css_set_lock); |
5455 | list_for_each_entry(link, &cgrp->cset_links, cset_link) | 5456 | list_for_each_entry(link, &cgrp->cset_links, cset_link) |
5456 | link->cset->dead = true; | 5457 | link->cset->dead = true; |
5457 | spin_unlock_bh(&css_set_lock); | 5458 | spin_unlock_irq(&css_set_lock); |
5458 | 5459 | ||
5459 | /* initiate massacre of all css's */ | 5460 | /* initiate massacre of all css's */ |
5460 | for_each_css(css, ssid, cgrp) | 5461 | for_each_css(css, ssid, cgrp) |
@@ -5725,7 +5726,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
5725 | goto out; | 5726 | goto out; |
5726 | 5727 | ||
5727 | mutex_lock(&cgroup_mutex); | 5728 | mutex_lock(&cgroup_mutex); |
5728 | spin_lock_bh(&css_set_lock); | 5729 | spin_lock_irq(&css_set_lock); |
5729 | 5730 | ||
5730 | for_each_root(root) { | 5731 | for_each_root(root) { |
5731 | struct cgroup_subsys *ss; | 5732 | struct cgroup_subsys *ss; |
@@ -5778,7 +5779,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
5778 | 5779 | ||
5779 | retval = 0; | 5780 | retval = 0; |
5780 | out_unlock: | 5781 | out_unlock: |
5781 | spin_unlock_bh(&css_set_lock); | 5782 | spin_unlock_irq(&css_set_lock); |
5782 | mutex_unlock(&cgroup_mutex); | 5783 | mutex_unlock(&cgroup_mutex); |
5783 | kfree(buf); | 5784 | kfree(buf); |
5784 | out: | 5785 | out: |
@@ -5923,13 +5924,13 @@ void cgroup_post_fork(struct task_struct *child) | |||
5923 | if (use_task_css_set_links) { | 5924 | if (use_task_css_set_links) { |
5924 | struct css_set *cset; | 5925 | struct css_set *cset; |
5925 | 5926 | ||
5926 | spin_lock_bh(&css_set_lock); | 5927 | spin_lock_irq(&css_set_lock); |
5927 | cset = task_css_set(current); | 5928 | cset = task_css_set(current); |
5928 | if (list_empty(&child->cg_list)) { | 5929 | if (list_empty(&child->cg_list)) { |
5929 | get_css_set(cset); | 5930 | get_css_set(cset); |
5930 | css_set_move_task(child, NULL, cset, false); | 5931 | css_set_move_task(child, NULL, cset, false); |
5931 | } | 5932 | } |
5932 | spin_unlock_bh(&css_set_lock); | 5933 | spin_unlock_irq(&css_set_lock); |
5933 | } | 5934 | } |
5934 | 5935 | ||
5935 | /* | 5936 | /* |
@@ -5974,9 +5975,9 @@ void cgroup_exit(struct task_struct *tsk) | |||
5974 | cset = task_css_set(tsk); | 5975 | cset = task_css_set(tsk); |
5975 | 5976 | ||
5976 | if (!list_empty(&tsk->cg_list)) { | 5977 | if (!list_empty(&tsk->cg_list)) { |
5977 | spin_lock_bh(&css_set_lock); | 5978 | spin_lock_irq(&css_set_lock); |
5978 | css_set_move_task(tsk, cset, NULL, false); | 5979 | css_set_move_task(tsk, cset, NULL, false); |
5979 | spin_unlock_bh(&css_set_lock); | 5980 | spin_unlock_irq(&css_set_lock); |
5980 | } else { | 5981 | } else { |
5981 | get_css_set(cset); | 5982 | get_css_set(cset); |
5982 | } | 5983 | } |
@@ -6044,9 +6045,9 @@ static void cgroup_release_agent(struct work_struct *work) | |||
6044 | if (!pathbuf || !agentbuf) | 6045 | if (!pathbuf || !agentbuf) |
6045 | goto out; | 6046 | goto out; |
6046 | 6047 | ||
6047 | spin_lock_bh(&css_set_lock); | 6048 | spin_lock_irq(&css_set_lock); |
6048 | path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); | 6049 | path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); |
6049 | spin_unlock_bh(&css_set_lock); | 6050 | spin_unlock_irq(&css_set_lock); |
6050 | if (!path) | 6051 | if (!path) |
6051 | goto out; | 6052 | goto out; |
6052 | 6053 | ||
@@ -6168,7 +6169,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, | |||
6168 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | 6169 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) |
6169 | { | 6170 | { |
6170 | WARN_ON_ONCE(!rcu_read_lock_held()); | 6171 | WARN_ON_ONCE(!rcu_read_lock_held()); |
6171 | return id > 0 ? idr_find(&ss->css_idr, id) : NULL; | 6172 | return idr_find(&ss->css_idr, id); |
6172 | } | 6173 | } |
6173 | 6174 | ||
6174 | /** | 6175 | /** |
@@ -6205,6 +6206,40 @@ struct cgroup *cgroup_get_from_path(const char *path) | |||
6205 | } | 6206 | } |
6206 | EXPORT_SYMBOL_GPL(cgroup_get_from_path); | 6207 | EXPORT_SYMBOL_GPL(cgroup_get_from_path); |
6207 | 6208 | ||
6209 | /** | ||
6210 | * cgroup_get_from_fd - get a cgroup pointer from a fd | ||
6211 | * @fd: fd obtained by open(cgroup2_dir) | ||
6212 | * | ||
6213 | * Find the cgroup from a fd which should be obtained | ||
6214 | * by opening a cgroup directory. Returns a pointer to the | ||
6215 | * cgroup on success. ERR_PTR is returned if the cgroup | ||
6216 | * cannot be found. | ||
6217 | */ | ||
6218 | struct cgroup *cgroup_get_from_fd(int fd) | ||
6219 | { | ||
6220 | struct cgroup_subsys_state *css; | ||
6221 | struct cgroup *cgrp; | ||
6222 | struct file *f; | ||
6223 | |||
6224 | f = fget_raw(fd); | ||
6225 | if (!f) | ||
6226 | return ERR_PTR(-EBADF); | ||
6227 | |||
6228 | css = css_tryget_online_from_dir(f->f_path.dentry, NULL); | ||
6229 | fput(f); | ||
6230 | if (IS_ERR(css)) | ||
6231 | return ERR_CAST(css); | ||
6232 | |||
6233 | cgrp = css->cgroup; | ||
6234 | if (!cgroup_on_dfl(cgrp)) { | ||
6235 | cgroup_put(cgrp); | ||
6236 | return ERR_PTR(-EBADF); | ||
6237 | } | ||
6238 | |||
6239 | return cgrp; | ||
6240 | } | ||
6241 | EXPORT_SYMBOL_GPL(cgroup_get_from_fd); | ||
6242 | |||
6208 | /* | 6243 | /* |
6209 | * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data | 6244 | * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data |
6210 | * definition in cgroup-defs.h. | 6245 | * definition in cgroup-defs.h. |
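The new cgroup_get_from_fd() above resolves a file descriptor for an open cgroup2 directory into a referenced struct cgroup, returning -EBADF for a bad fd or for a cgroup that is not on the default hierarchy. A hedged sketch of a hypothetical in-kernel caller follows; only cgroup_get_from_fd() and cgroup_put() come from the kernel, everything else is illustrative:

    #include <linux/cgroup.h>
    #include <linux/err.h>

    /* Hypothetical caller: look up the cgroup behind a userspace fd. */
    static int example_attach_to_cgroup(int fd)
    {
            struct cgroup *cgrp;

            cgrp = cgroup_get_from_fd(fd);	/* takes a reference on success */
            if (IS_ERR(cgrp))
                    return PTR_ERR(cgrp);	/* -EBADF: bad fd or v1 hierarchy */

            /* ... record or use cgrp while holding the reference ... */

            cgroup_put(cgrp);			/* drop it when done */
            return 0;
    }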
@@ -6305,14 +6340,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, | |||
6305 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | 6340 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) |
6306 | return ERR_PTR(-EPERM); | 6341 | return ERR_PTR(-EPERM); |
6307 | 6342 | ||
6308 | mutex_lock(&cgroup_mutex); | 6343 | /* It is not safe to take cgroup_mutex here */ |
6309 | spin_lock_bh(&css_set_lock); | 6344 | spin_lock_irq(&css_set_lock); |
6310 | |||
6311 | cset = task_css_set(current); | 6345 | cset = task_css_set(current); |
6312 | get_css_set(cset); | 6346 | get_css_set(cset); |
6313 | 6347 | spin_unlock_irq(&css_set_lock); | |
6314 | spin_unlock_bh(&css_set_lock); | ||
6315 | mutex_unlock(&cgroup_mutex); | ||
6316 | 6348 | ||
6317 | new_ns = alloc_cgroup_ns(); | 6349 | new_ns = alloc_cgroup_ns(); |
6318 | if (IS_ERR(new_ns)) { | 6350 | if (IS_ERR(new_ns)) { |
@@ -6435,7 +6467,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | |||
6435 | if (!name_buf) | 6467 | if (!name_buf) |
6436 | return -ENOMEM; | 6468 | return -ENOMEM; |
6437 | 6469 | ||
6438 | spin_lock_bh(&css_set_lock); | 6470 | spin_lock_irq(&css_set_lock); |
6439 | rcu_read_lock(); | 6471 | rcu_read_lock(); |
6440 | cset = rcu_dereference(current->cgroups); | 6472 | cset = rcu_dereference(current->cgroups); |
6441 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { | 6473 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { |
@@ -6446,7 +6478,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | |||
6446 | c->root->hierarchy_id, name_buf); | 6478 | c->root->hierarchy_id, name_buf); |
6447 | } | 6479 | } |
6448 | rcu_read_unlock(); | 6480 | rcu_read_unlock(); |
6449 | spin_unlock_bh(&css_set_lock); | 6481 | spin_unlock_irq(&css_set_lock); |
6450 | kfree(name_buf); | 6482 | kfree(name_buf); |
6451 | return 0; | 6483 | return 0; |
6452 | } | 6484 | } |
@@ -6457,7 +6489,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) | |||
6457 | struct cgroup_subsys_state *css = seq_css(seq); | 6489 | struct cgroup_subsys_state *css = seq_css(seq); |
6458 | struct cgrp_cset_link *link; | 6490 | struct cgrp_cset_link *link; |
6459 | 6491 | ||
6460 | spin_lock_bh(&css_set_lock); | 6492 | spin_lock_irq(&css_set_lock); |
6461 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { | 6493 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { |
6462 | struct css_set *cset = link->cset; | 6494 | struct css_set *cset = link->cset; |
6463 | struct task_struct *task; | 6495 | struct task_struct *task; |
@@ -6480,7 +6512,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) | |||
6480 | overflow: | 6512 | overflow: |
6481 | seq_puts(seq, " ...\n"); | 6513 | seq_puts(seq, " ...\n"); |
6482 | } | 6514 | } |
6483 | spin_unlock_bh(&css_set_lock); | 6515 | spin_unlock_irq(&css_set_lock); |
6484 | return 0; | 6516 | return 0; |
6485 | } | 6517 | } |
6486 | 6518 | ||
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 303097b37429..2bd673783f1a 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c | |||
@@ -49,6 +49,12 @@ struct pids_cgroup { | |||
49 | */ | 49 | */ |
50 | atomic64_t counter; | 50 | atomic64_t counter; |
51 | int64_t limit; | 51 | int64_t limit; |
52 | |||
53 | /* Handle for "pids.events" */ | ||
54 | struct cgroup_file events_file; | ||
55 | |||
56 | /* Number of times fork failed because limit was hit. */ | ||
57 | atomic64_t events_limit; | ||
52 | }; | 58 | }; |
53 | 59 | ||
54 | static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) | 60 | static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) |
@@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent) | |||
72 | 78 | ||
73 | pids->limit = PIDS_MAX; | 79 | pids->limit = PIDS_MAX; |
74 | atomic64_set(&pids->counter, 0); | 80 | atomic64_set(&pids->counter, 0); |
81 | atomic64_set(&pids->events_limit, 0); | ||
75 | return &pids->css; | 82 | return &pids->css; |
76 | } | 83 | } |
77 | 84 | ||
@@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task) | |||
213 | { | 220 | { |
214 | struct cgroup_subsys_state *css; | 221 | struct cgroup_subsys_state *css; |
215 | struct pids_cgroup *pids; | 222 | struct pids_cgroup *pids; |
223 | int err; | ||
216 | 224 | ||
217 | css = task_css_check(current, pids_cgrp_id, true); | 225 | css = task_css_check(current, pids_cgrp_id, true); |
218 | pids = css_pids(css); | 226 | pids = css_pids(css); |
219 | return pids_try_charge(pids, 1); | 227 | err = pids_try_charge(pids, 1); |
228 | if (err) { | ||
229 | /* Only log the first time events_limit is incremented. */ | ||
230 | if (atomic64_inc_return(&pids->events_limit) == 1) { | ||
231 | pr_info("cgroup: fork rejected by pids controller in "); | ||
232 | pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); | ||
233 | pr_cont("\n"); | ||
234 | } | ||
235 | cgroup_file_notify(&pids->events_file); | ||
236 | } | ||
237 | return err; | ||
220 | } | 238 | } |
221 | 239 | ||
222 | static void pids_cancel_fork(struct task_struct *task) | 240 | static void pids_cancel_fork(struct task_struct *task) |
@@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css, | |||
288 | return atomic64_read(&pids->counter); | 306 | return atomic64_read(&pids->counter); |
289 | } | 307 | } |
290 | 308 | ||
309 | static int pids_events_show(struct seq_file *sf, void *v) | ||
310 | { | ||
311 | struct pids_cgroup *pids = css_pids(seq_css(sf)); | ||
312 | |||
313 | seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); | ||
314 | return 0; | ||
315 | } | ||
316 | |||
291 | static struct cftype pids_files[] = { | 317 | static struct cftype pids_files[] = { |
292 | { | 318 | { |
293 | .name = "max", | 319 | .name = "max", |
@@ -300,6 +326,12 @@ static struct cftype pids_files[] = { | |||
300 | .read_s64 = pids_current_read, | 326 | .read_s64 = pids_current_read, |
301 | .flags = CFTYPE_NOT_ON_ROOT, | 327 | .flags = CFTYPE_NOT_ON_ROOT, |
302 | }, | 328 | }, |
329 | { | ||
330 | .name = "events", | ||
331 | .seq_show = pids_events_show, | ||
332 | .file_offset = offsetof(struct pids_cgroup, events_file), | ||
333 | .flags = CFTYPE_NOT_ON_ROOT, | ||
334 | }, | ||
303 | { } /* terminate */ | 335 | { } /* terminate */ |
304 | }; | 336 | }; |
305 | 337 | ||
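The cgroup_pids.c changes add a pids.events file whose single "max <count>" line counts forks rejected because pids.max was reached, and cgroup_file_notify() kicks watchers each time a fork is rejected. A userspace sketch of reading it; the cgroup path is an assumption and depends on where the cgroup2 hierarchy is mounted:

    #include <stdio.h>

    int main(void)
    {
            long long limit_hits = 0;
            /* Path is illustrative; adjust to your cgroup2 mount point. */
            FILE *f = fopen("/sys/fs/cgroup/mygroup/pids.events", "r");

            if (!f)
                    return 1;
            /* Format written by pids_events_show(): "max <count>\n" */
            if (fscanf(f, "max %lld", &limit_hits) == 1)
                    printf("forks rejected by pids.max: %lld\n", limit_hits);
            fclose(f);
            return 0;
    }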
diff --git a/kernel/cpu.c b/kernel/cpu.c index d948e44c471e..341bf80f80bd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, | |||
517 | if (!cpu_online(cpu)) | 517 | if (!cpu_online(cpu)) |
518 | return 0; | 518 | return 0; |
519 | 519 | ||
520 | /* | ||
521 | * If we are up and running, use the hotplug thread. For early calls | ||
522 | * we invoke the thread function directly. | ||
523 | */ | ||
524 | if (!st->thread) | ||
525 | return cpuhp_invoke_callback(cpu, state, cb); | ||
526 | |||
520 | st->cb_state = state; | 527 | st->cb_state = state; |
521 | st->cb = cb; | 528 | st->cb = cb; |
522 | /* | 529 | /* |
@@ -1173,6 +1180,31 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
1173 | .teardown = NULL, | 1180 | .teardown = NULL, |
1174 | .cant_stop = true, | 1181 | .cant_stop = true, |
1175 | }, | 1182 | }, |
1183 | [CPUHP_PERF_PREPARE] = { | ||
1184 | .name = "perf prepare", | ||
1185 | .startup = perf_event_init_cpu, | ||
1186 | .teardown = perf_event_exit_cpu, | ||
1187 | }, | ||
1188 | [CPUHP_WORKQUEUE_PREP] = { | ||
1189 | .name = "workqueue prepare", | ||
1190 | .startup = workqueue_prepare_cpu, | ||
1191 | .teardown = NULL, | ||
1192 | }, | ||
1193 | [CPUHP_HRTIMERS_PREPARE] = { | ||
1194 | .name = "hrtimers prepare", | ||
1195 | .startup = hrtimers_prepare_cpu, | ||
1196 | .teardown = hrtimers_dead_cpu, | ||
1197 | }, | ||
1198 | [CPUHP_SMPCFD_PREPARE] = { | ||
1199 | .name = "SMPCFD prepare", | ||
1200 | .startup = smpcfd_prepare_cpu, | ||
1201 | .teardown = smpcfd_dead_cpu, | ||
1202 | }, | ||
1203 | [CPUHP_RCUTREE_PREP] = { | ||
1204 | .name = "RCU-tree prepare", | ||
1205 | .startup = rcutree_prepare_cpu, | ||
1206 | .teardown = rcutree_dead_cpu, | ||
1207 | }, | ||
1176 | /* | 1208 | /* |
1177 | * Preparatory and dead notifiers. Will be replaced once the notifiers | 1209 | * Preparatory and dead notifiers. Will be replaced once the notifiers |
1178 | * are converted to states. | 1210 | * are converted to states. |
@@ -1184,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
1184 | .skip_onerr = true, | 1216 | .skip_onerr = true, |
1185 | .cant_stop = true, | 1217 | .cant_stop = true, |
1186 | }, | 1218 | }, |
1219 | /* | ||
1220 | * On the tear-down path, timers_dead_cpu() must be invoked | ||
1221 | * before blk_mq_queue_reinit_notify() from notify_dead(), | ||
1222 | * otherwise an RCU stall occurs. | ||
1223 | */ | ||
1224 | [CPUHP_TIMERS_DEAD] = { | ||
1225 | .name = "timers dead", | ||
1226 | .startup = NULL, | ||
1227 | .teardown = timers_dead_cpu, | ||
1228 | }, | ||
1187 | /* Kicks the plugged cpu into life */ | 1229 | /* Kicks the plugged cpu into life */ |
1188 | [CPUHP_BRINGUP_CPU] = { | 1230 | [CPUHP_BRINGUP_CPU] = { |
1189 | .name = "cpu:bringup", | 1231 | .name = "cpu:bringup", |
@@ -1191,6 +1233,10 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
1191 | .teardown = NULL, | 1233 | .teardown = NULL, |
1192 | .cant_stop = true, | 1234 | .cant_stop = true, |
1193 | }, | 1235 | }, |
1236 | [CPUHP_AP_SMPCFD_DYING] = { | ||
1237 | .startup = NULL, | ||
1238 | .teardown = smpcfd_dying_cpu, | ||
1239 | }, | ||
1194 | /* | 1240 | /* |
1195 | * Handled on control processor until the plugged processor manages | 1241 | * Handled on control processor until the plugged processor manages |
1196 | * this itself. | 1242 | * this itself. |
@@ -1201,6 +1247,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
1201 | .teardown = takedown_cpu, | 1247 | .teardown = takedown_cpu, |
1202 | .cant_stop = true, | 1248 | .cant_stop = true, |
1203 | }, | 1249 | }, |
1250 | #else | ||
1251 | [CPUHP_BRINGUP_CPU] = { }, | ||
1204 | #endif | 1252 | #endif |
1205 | }; | 1253 | }; |
1206 | 1254 | ||
@@ -1225,6 +1273,10 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
1225 | .startup = sched_cpu_starting, | 1273 | .startup = sched_cpu_starting, |
1226 | .teardown = sched_cpu_dying, | 1274 | .teardown = sched_cpu_dying, |
1227 | }, | 1275 | }, |
1276 | [CPUHP_AP_RCUTREE_DYING] = { | ||
1277 | .startup = NULL, | ||
1278 | .teardown = rcutree_dying_cpu, | ||
1279 | }, | ||
1228 | /* | 1280 | /* |
1229 | * Low level startup/teardown notifiers. Run with interrupts | 1281 | * Low level startup/teardown notifiers. Run with interrupts |
1230 | * disabled. Will be removed once the notifiers are converted to | 1282 | * disabled. Will be removed once the notifiers are converted to |
@@ -1248,6 +1300,22 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
1248 | .startup = smpboot_unpark_threads, | 1300 | .startup = smpboot_unpark_threads, |
1249 | .teardown = NULL, | 1301 | .teardown = NULL, |
1250 | }, | 1302 | }, |
1303 | [CPUHP_AP_PERF_ONLINE] = { | ||
1304 | .name = "perf online", | ||
1305 | .startup = perf_event_init_cpu, | ||
1306 | .teardown = perf_event_exit_cpu, | ||
1307 | }, | ||
1308 | [CPUHP_AP_WORKQUEUE_ONLINE] = { | ||
1309 | .name = "workqueue online", | ||
1310 | .startup = workqueue_online_cpu, | ||
1311 | .teardown = workqueue_offline_cpu, | ||
1312 | }, | ||
1313 | [CPUHP_AP_RCUTREE_ONLINE] = { | ||
1314 | .name = "RCU-tree online", | ||
1315 | .startup = rcutree_online_cpu, | ||
1316 | .teardown = rcutree_offline_cpu, | ||
1317 | }, | ||
1318 | |||
1251 | /* | 1319 | /* |
1252 | * Online/down_prepare notifiers. Will be removed once the notifiers | 1320 | * Online/down_prepare notifiers. Will be removed once the notifiers |
1253 | * are converted to states. | 1321 | * are converted to states. |
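The cpu.c hunks convert several subsystems (perf, workqueue, hrtimers, SMP call-function data, RCU, timers) from raw CPU notifiers to entries in the cpuhp state tables, where each state supplies startup/teardown callbacks of the form int fn(unsigned int cpu) that return 0 on success. A hedged sketch of what such a prepare/dead pair looks like for a made-up subsystem; all names here are illustrative:

    #include <linux/percpu.h>
    #include <linux/types.h>

    struct foo_data { bool ready; };		/* illustrative per-cpu state */
    static DEFINE_PER_CPU(struct foo_data, foo_cpu_data);

    static int foo_prepare_cpu(unsigned int cpu)
    {
            /* Runs on the control CPU before @cpu is brought up. */
            per_cpu(foo_cpu_data, cpu).ready = true;
            return 0;
    }

    static int foo_dead_cpu(unsigned int cpu)
    {
            /* Runs after @cpu has gone down; tear down per-cpu state. */
            per_cpu(foo_cpu_data, cpu).ready = false;
            return 0;
    }

Such a pair would then be wired into the table exactly like the entries above, e.g. .startup = foo_prepare_cpu, .teardown = foo_dead_cpu.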
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 73e93e53884d..c7fd2778ed50 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
1034 | { | 1034 | { |
1035 | bool need_loop; | 1035 | bool need_loop; |
1036 | 1036 | ||
1037 | /* | ||
1038 | * Allow tasks that have access to memory reserves because they have | ||
1039 | * been OOM killed to get memory anywhere. | ||
1040 | */ | ||
1041 | if (unlikely(test_thread_flag(TIF_MEMDIE))) | ||
1042 | return; | ||
1043 | if (current->flags & PF_EXITING) /* Let dying task have memory */ | ||
1044 | return; | ||
1045 | |||
1046 | task_lock(tsk); | 1037 | task_lock(tsk); |
1047 | /* | 1038 | /* |
1048 | * Determine if a loop is necessary if another thread is doing | 1039 | * Determine if a loop is necessary if another thread is doing |
diff --git a/kernel/cred.c b/kernel/cred.c index 0c0cd8a62285..5f264fb5737d 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx); | |||
689 | */ | 689 | */ |
690 | int set_create_files_as(struct cred *new, struct inode *inode) | 690 | int set_create_files_as(struct cred *new, struct inode *inode) |
691 | { | 691 | { |
692 | if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) | ||
693 | return -EINVAL; | ||
692 | new->fsuid = inode->i_uid; | 694 | new->fsuid = inode->i_uid; |
693 | new->fsgid = inode->i_gid; | 695 | new->fsgid = inode->i_gid; |
694 | return security_kernel_create_files_as(new, inode); | 696 | return security_kernel_create_files_as(new, inode); |
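set_create_files_as() now rejects inodes whose i_uid/i_gid do not map to valid kuids/kgids, so callers have to treat it as fallible. A minimal sketch of a hypothetical caller handling the new -EINVAL; only set_create_files_as(), prepare_creds(), abort_creds() and commit_creds() are existing kernel APIs here:

    #include <linux/cred.h>
    #include <linux/errno.h>
    #include <linux/fs.h>

    /* Hypothetical helper: switch file-creation credentials to the
     * owner of @inode, bailing out on the new validity check. */
    static int example_create_as_owner(struct inode *inode)
    {
            struct cred *new = prepare_creds();
            int err;

            if (!new)
                    return -ENOMEM;

            err = set_create_files_as(new, inode);
            if (err) {		/* e.g. -EINVAL for invalid i_uid/i_gid */
                    abort_creds(new);
                    return err;
            }
            commit_creds(new);
            return 0;
    }

Depending on the situation, override_creds()/revert_creds() around the operation would be an alternative to commit_creds().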
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 179ef4640964..e9fdb5203de5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c | |||
@@ -104,7 +104,7 @@ fail: | |||
104 | return -ENOMEM; | 104 | return -ENOMEM; |
105 | } | 105 | } |
106 | 106 | ||
107 | int get_callchain_buffers(void) | 107 | int get_callchain_buffers(int event_max_stack) |
108 | { | 108 | { |
109 | int err = 0; | 109 | int err = 0; |
110 | int count; | 110 | int count; |
@@ -121,6 +121,15 @@ int get_callchain_buffers(void) | |||
121 | /* If the allocation failed, give up */ | 121 | /* If the allocation failed, give up */ |
122 | if (!callchain_cpus_entries) | 122 | if (!callchain_cpus_entries) |
123 | err = -ENOMEM; | 123 | err = -ENOMEM; |
124 | /* | ||
125 | * If requesting per event more than the global cap, | ||
126 | * return a different error to help userspace figure | ||
127 | * this out. | ||
128 | * | ||
129 | * And also do it here so that we have &callchain_mutex held. | ||
130 | */ | ||
131 | if (event_max_stack > sysctl_perf_event_max_stack) | ||
132 | err = -EOVERFLOW; | ||
124 | goto exit; | 133 | goto exit; |
125 | } | 134 | } |
126 | 135 | ||
@@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) | |||
174 | bool user = !event->attr.exclude_callchain_user; | 183 | bool user = !event->attr.exclude_callchain_user; |
175 | /* Disallow cross-task user callchains. */ | 184 | /* Disallow cross-task user callchains. */ |
176 | bool crosstask = event->ctx->task && event->ctx->task != current; | 185 | bool crosstask = event->ctx->task && event->ctx->task != current; |
186 | const u32 max_stack = event->attr.sample_max_stack; | ||
177 | 187 | ||
178 | if (!kernel && !user) | 188 | if (!kernel && !user) |
179 | return NULL; | 189 | return NULL; |
180 | 190 | ||
181 | return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); | 191 | return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); |
182 | } | 192 | } |
183 | 193 | ||
184 | struct perf_callchain_entry * | 194 | struct perf_callchain_entry * |
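get_callchain_buffers() now takes the per-event callchain depth and, with callchain_mutex held, fails with -EOVERFLOW when that depth exceeds the kernel.perf_event_max_stack sysctl, so userspace can tell "my request is over the cap" apart from other errors. A hedged userspace sketch; the syscall wrapper is illustrative, and attr.sample_max_stack requires uapi headers new enough to define it:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <linux/perf_event.h>

    static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                               int cpu, int group_fd, unsigned long flags)
    {
            return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
            struct perf_event_attr attr;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_CPU_CLOCK;
            attr.sample_period = 100000;
            attr.sample_type = PERF_SAMPLE_CALLCHAIN;
            attr.sample_max_stack = 512;	/* per-event depth request */

            fd = perf_event_open(&attr, 0, -1, -1, 0);
            if (fd < 0 && errno == EOVERFLOW)
                    fprintf(stderr, "sample_max_stack exceeds "
                            "/proc/sys/kernel/perf_event_max_stack\n");
            else if (fd >= 0)
                    close(fd);
            return 0;
    }

Leaving sample_max_stack at zero keeps the old behaviour: perf_event_open() (in the core.c hunks below) fills it in from the same sysctl.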
diff --git a/kernel/events/core.c b/kernel/events/core.c index 274450efea90..356a6c7cb52a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -335,6 +335,7 @@ static atomic_t perf_sched_count; | |||
335 | 335 | ||
336 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 336 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); |
337 | static DEFINE_PER_CPU(int, perf_sched_cb_usages); | 337 | static DEFINE_PER_CPU(int, perf_sched_cb_usages); |
338 | static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); | ||
338 | 339 | ||
339 | static atomic_t nr_mmap_events __read_mostly; | 340 | static atomic_t nr_mmap_events __read_mostly; |
340 | static atomic_t nr_comm_events __read_mostly; | 341 | static atomic_t nr_comm_events __read_mostly; |
@@ -396,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write, | |||
396 | if (ret || !write) | 397 | if (ret || !write) |
397 | return ret; | 398 | return ret; |
398 | 399 | ||
400 | /* | ||
401 | * If throttling is disabled don't allow the write: | ||
402 | */ | ||
403 | if (sysctl_perf_cpu_time_max_percent == 100 || | ||
404 | sysctl_perf_cpu_time_max_percent == 0) | ||
405 | return -EINVAL; | ||
406 | |||
399 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); | 407 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); |
400 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | 408 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; |
401 | update_perf_cpu_limits(); | 409 | update_perf_cpu_limits(); |
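perf_proc_update_handler() now refuses to change kernel.perf_event_max_sample_rate while CPU-time-based throttling is effectively disabled, i.e. while perf_cpu_time_max_percent is 0 or 100. A short userspace sketch of detecting that rejection (writing the sysctl needs privileges; the paths are the standard procfs locations):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *val = "50000\n";
            int fd = open("/proc/sys/kernel/perf_event_max_sample_rate", O_WRONLY);

            if (fd < 0)
                    return 1;
            if (write(fd, val, strlen(val)) < 0 && errno == EINVAL)
                    fprintf(stderr, "rejected: kernel.perf_cpu_time_max_percent "
                            "is 0 or 100, so throttling is disabled\n");
            close(fd);
            return 0;
    }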
@@ -1678,12 +1686,33 @@ static bool is_orphaned_event(struct perf_event *event) | |||
1678 | return event->state == PERF_EVENT_STATE_DEAD; | 1686 | return event->state == PERF_EVENT_STATE_DEAD; |
1679 | } | 1687 | } |
1680 | 1688 | ||
1681 | static inline int pmu_filter_match(struct perf_event *event) | 1689 | static inline int __pmu_filter_match(struct perf_event *event) |
1682 | { | 1690 | { |
1683 | struct pmu *pmu = event->pmu; | 1691 | struct pmu *pmu = event->pmu; |
1684 | return pmu->filter_match ? pmu->filter_match(event) : 1; | 1692 | return pmu->filter_match ? pmu->filter_match(event) : 1; |
1685 | } | 1693 | } |
1686 | 1694 | ||
1695 | /* | ||
1696 | * Check whether we should attempt to schedule an event group based on | ||
1697 | * PMU-specific filtering. An event group can consist of HW and SW events, | ||
1698 | * potentially with a SW leader, so we must check all the filters, to | ||
1699 | * determine whether a group is schedulable: | ||
1700 | */ | ||
1701 | static inline int pmu_filter_match(struct perf_event *event) | ||
1702 | { | ||
1703 | struct perf_event *child; | ||
1704 | |||
1705 | if (!__pmu_filter_match(event)) | ||
1706 | return 0; | ||
1707 | |||
1708 | list_for_each_entry(child, &event->sibling_list, group_entry) { | ||
1709 | if (!__pmu_filter_match(child)) | ||
1710 | return 0; | ||
1711 | } | ||
1712 | |||
1713 | return 1; | ||
1714 | } | ||
1715 | |||
1687 | static inline int | 1716 | static inline int |
1688 | event_filter_match(struct perf_event *event) | 1717 | event_filter_match(struct perf_event *event) |
1689 | { | 1718 | { |
@@ -3665,6 +3694,39 @@ static void free_event_rcu(struct rcu_head *head) | |||
3665 | static void ring_buffer_attach(struct perf_event *event, | 3694 | static void ring_buffer_attach(struct perf_event *event, |
3666 | struct ring_buffer *rb); | 3695 | struct ring_buffer *rb); |
3667 | 3696 | ||
3697 | static void detach_sb_event(struct perf_event *event) | ||
3698 | { | ||
3699 | struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); | ||
3700 | |||
3701 | raw_spin_lock(&pel->lock); | ||
3702 | list_del_rcu(&event->sb_list); | ||
3703 | raw_spin_unlock(&pel->lock); | ||
3704 | } | ||
3705 | |||
3706 | static bool is_sb_event(struct perf_event *event) | ||
3707 | { | ||
3708 | struct perf_event_attr *attr = &event->attr; | ||
3709 | |||
3710 | if (event->parent) | ||
3711 | return false; | ||
3712 | |||
3713 | if (event->attach_state & PERF_ATTACH_TASK) | ||
3714 | return false; | ||
3715 | |||
3716 | if (attr->mmap || attr->mmap_data || attr->mmap2 || | ||
3717 | attr->comm || attr->comm_exec || | ||
3718 | attr->task || | ||
3719 | attr->context_switch) | ||
3720 | return true; | ||
3721 | return false; | ||
3722 | } | ||
3723 | |||
3724 | static void unaccount_pmu_sb_event(struct perf_event *event) | ||
3725 | { | ||
3726 | if (is_sb_event(event)) | ||
3727 | detach_sb_event(event); | ||
3728 | } | ||
3729 | |||
3668 | static void unaccount_event_cpu(struct perf_event *event, int cpu) | 3730 | static void unaccount_event_cpu(struct perf_event *event, int cpu) |
3669 | { | 3731 | { |
3670 | if (event->parent) | 3732 | if (event->parent) |
@@ -3728,6 +3790,8 @@ static void unaccount_event(struct perf_event *event) | |||
3728 | } | 3790 | } |
3729 | 3791 | ||
3730 | unaccount_event_cpu(event, event->cpu); | 3792 | unaccount_event_cpu(event, event->cpu); |
3793 | |||
3794 | unaccount_pmu_sb_event(event); | ||
3731 | } | 3795 | } |
3732 | 3796 | ||
3733 | static void perf_sched_delayed(struct work_struct *work) | 3797 | static void perf_sched_delayed(struct work_struct *work) |
@@ -3862,10 +3926,8 @@ static void _free_event(struct perf_event *event) | |||
3862 | if (event->ctx) | 3926 | if (event->ctx) |
3863 | put_ctx(event->ctx); | 3927 | put_ctx(event->ctx); |
3864 | 3928 | ||
3865 | if (event->pmu) { | 3929 | exclusive_event_destroy(event); |
3866 | exclusive_event_destroy(event); | 3930 | module_put(event->pmu->module); |
3867 | module_put(event->pmu->module); | ||
3868 | } | ||
3869 | 3931 | ||
3870 | call_rcu(&event->rcu_head, free_event_rcu); | 3932 | call_rcu(&event->rcu_head, free_event_rcu); |
3871 | } | 3933 | } |
@@ -5555,16 +5617,26 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
5555 | } | 5617 | } |
5556 | 5618 | ||
5557 | if (sample_type & PERF_SAMPLE_RAW) { | 5619 | if (sample_type & PERF_SAMPLE_RAW) { |
5558 | if (data->raw) { | 5620 | struct perf_raw_record *raw = data->raw; |
5559 | u32 raw_size = data->raw->size; | 5621 | |
5560 | u32 real_size = round_up(raw_size + sizeof(u32), | 5622 | if (raw) { |
5561 | sizeof(u64)) - sizeof(u32); | 5623 | struct perf_raw_frag *frag = &raw->frag; |
5562 | u64 zero = 0; | 5624 | |
5563 | 5625 | perf_output_put(handle, raw->size); | |
5564 | perf_output_put(handle, real_size); | 5626 | do { |
5565 | __output_copy(handle, data->raw->data, raw_size); | 5627 | if (frag->copy) { |
5566 | if (real_size - raw_size) | 5628 | __output_custom(handle, frag->copy, |
5567 | __output_copy(handle, &zero, real_size - raw_size); | 5629 | frag->data, frag->size); |
5630 | } else { | ||
5631 | __output_copy(handle, frag->data, | ||
5632 | frag->size); | ||
5633 | } | ||
5634 | if (perf_raw_frag_last(frag)) | ||
5635 | break; | ||
5636 | frag = frag->next; | ||
5637 | } while (1); | ||
5638 | if (frag->pad) | ||
5639 | __output_skip(handle, NULL, frag->pad); | ||
5568 | } else { | 5640 | } else { |
5569 | struct { | 5641 | struct { |
5570 | u32 size; | 5642 | u32 size; |
@@ -5689,14 +5761,28 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
5689 | } | 5761 | } |
5690 | 5762 | ||
5691 | if (sample_type & PERF_SAMPLE_RAW) { | 5763 | if (sample_type & PERF_SAMPLE_RAW) { |
5692 | int size = sizeof(u32); | 5764 | struct perf_raw_record *raw = data->raw; |
5693 | 5765 | int size; | |
5694 | if (data->raw) | 5766 | |
5695 | size += data->raw->size; | 5767 | if (raw) { |
5696 | else | 5768 | struct perf_raw_frag *frag = &raw->frag; |
5697 | size += sizeof(u32); | 5769 | u32 sum = 0; |
5770 | |||
5771 | do { | ||
5772 | sum += frag->size; | ||
5773 | if (perf_raw_frag_last(frag)) | ||
5774 | break; | ||
5775 | frag = frag->next; | ||
5776 | } while (1); | ||
5777 | |||
5778 | size = round_up(sum + sizeof(u32), sizeof(u64)); | ||
5779 | raw->size = size - sizeof(u32); | ||
5780 | frag->pad = raw->size - sum; | ||
5781 | } else { | ||
5782 | size = sizeof(u64); | ||
5783 | } | ||
5698 | 5784 | ||
5699 | header->size += round_up(size, sizeof(u64)); | 5785 | header->size += size; |
5700 | } | 5786 | } |
5701 | 5787 | ||
5702 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { | 5788 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { |
@@ -5856,11 +5942,11 @@ perf_event_read_event(struct perf_event *event, | |||
5856 | perf_output_end(&handle); | 5942 | perf_output_end(&handle); |
5857 | } | 5943 | } |
5858 | 5944 | ||
5859 | typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); | 5945 | typedef void (perf_iterate_f)(struct perf_event *event, void *data); |
5860 | 5946 | ||
5861 | static void | 5947 | static void |
5862 | perf_event_aux_ctx(struct perf_event_context *ctx, | 5948 | perf_iterate_ctx(struct perf_event_context *ctx, |
5863 | perf_event_aux_output_cb output, | 5949 | perf_iterate_f output, |
5864 | void *data, bool all) | 5950 | void *data, bool all) |
5865 | { | 5951 | { |
5866 | struct perf_event *event; | 5952 | struct perf_event *event; |
@@ -5877,52 +5963,55 @@ perf_event_aux_ctx(struct perf_event_context *ctx, | |||
5877 | } | 5963 | } |
5878 | } | 5964 | } |
5879 | 5965 | ||
5880 | static void | 5966 | static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) |
5881 | perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, | ||
5882 | struct perf_event_context *task_ctx) | ||
5883 | { | 5967 | { |
5884 | rcu_read_lock(); | 5968 | struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); |
5885 | preempt_disable(); | 5969 | struct perf_event *event; |
5886 | perf_event_aux_ctx(task_ctx, output, data, false); | 5970 | |
5887 | preempt_enable(); | 5971 | list_for_each_entry_rcu(event, &pel->list, sb_list) { |
5888 | rcu_read_unlock(); | 5972 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
5973 | continue; | ||
5974 | if (!event_filter_match(event)) | ||
5975 | continue; | ||
5976 | output(event, data); | ||
5977 | } | ||
5889 | } | 5978 | } |
5890 | 5979 | ||
5980 | /* | ||
5981 | * Iterate all events that need to receive side-band events. | ||
5982 | * | ||
5983 | * For new callers, ensure that account_pmu_sb_event() includes | ||
5984 | * your event, otherwise it might not get delivered. | ||
5985 | */ | ||
5891 | static void | 5986 | static void |
5892 | perf_event_aux(perf_event_aux_output_cb output, void *data, | 5987 | perf_iterate_sb(perf_iterate_f output, void *data, |
5893 | struct perf_event_context *task_ctx) | 5988 | struct perf_event_context *task_ctx) |
5894 | { | 5989 | { |
5895 | struct perf_cpu_context *cpuctx; | ||
5896 | struct perf_event_context *ctx; | 5990 | struct perf_event_context *ctx; |
5897 | struct pmu *pmu; | ||
5898 | int ctxn; | 5991 | int ctxn; |
5899 | 5992 | ||
5993 | rcu_read_lock(); | ||
5994 | preempt_disable(); | ||
5995 | |||
5900 | /* | 5996 | /* |
5901 | * If we have task_ctx != NULL we only notify | 5997 | * If we have task_ctx != NULL we only notify the task context itself. |
5902 | * the task context itself. The task_ctx is set | 5998 | * The task_ctx is set only for EXIT events before releasing task |
5903 | * only for EXIT events before releasing task | ||
5904 | * context. | 5999 | * context. |
5905 | */ | 6000 | */ |
5906 | if (task_ctx) { | 6001 | if (task_ctx) { |
5907 | perf_event_aux_task_ctx(output, data, task_ctx); | 6002 | perf_iterate_ctx(task_ctx, output, data, false); |
5908 | return; | 6003 | goto done; |
5909 | } | 6004 | } |
5910 | 6005 | ||
5911 | rcu_read_lock(); | 6006 | perf_iterate_sb_cpu(output, data); |
5912 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 6007 | |
5913 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 6008 | for_each_task_context_nr(ctxn) { |
5914 | if (cpuctx->unique_pmu != pmu) | ||
5915 | goto next; | ||
5916 | perf_event_aux_ctx(&cpuctx->ctx, output, data, false); | ||
5917 | ctxn = pmu->task_ctx_nr; | ||
5918 | if (ctxn < 0) | ||
5919 | goto next; | ||
5920 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | 6009 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); |
5921 | if (ctx) | 6010 | if (ctx) |
5922 | perf_event_aux_ctx(ctx, output, data, false); | 6011 | perf_iterate_ctx(ctx, output, data, false); |
5923 | next: | ||
5924 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
5925 | } | 6012 | } |
6013 | done: | ||
6014 | preempt_enable(); | ||
5926 | rcu_read_unlock(); | 6015 | rcu_read_unlock(); |
5927 | } | 6016 | } |
5928 | 6017 | ||
@@ -5971,7 +6060,7 @@ void perf_event_exec(void) | |||
5971 | 6060 | ||
5972 | perf_event_enable_on_exec(ctxn); | 6061 | perf_event_enable_on_exec(ctxn); |
5973 | 6062 | ||
5974 | perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, | 6063 | perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, |
5975 | true); | 6064 | true); |
5976 | } | 6065 | } |
5977 | rcu_read_unlock(); | 6066 | rcu_read_unlock(); |
@@ -6015,9 +6104,9 @@ static int __perf_pmu_output_stop(void *info) | |||
6015 | }; | 6104 | }; |
6016 | 6105 | ||
6017 | rcu_read_lock(); | 6106 | rcu_read_lock(); |
6018 | perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); | 6107 | perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); |
6019 | if (cpuctx->task_ctx) | 6108 | if (cpuctx->task_ctx) |
6020 | perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, | 6109 | perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, |
6021 | &ro, false); | 6110 | &ro, false); |
6022 | rcu_read_unlock(); | 6111 | rcu_read_unlock(); |
6023 | 6112 | ||
@@ -6146,7 +6235,7 @@ static void perf_event_task(struct task_struct *task, | |||
6146 | }, | 6235 | }, |
6147 | }; | 6236 | }; |
6148 | 6237 | ||
6149 | perf_event_aux(perf_event_task_output, | 6238 | perf_iterate_sb(perf_event_task_output, |
6150 | &task_event, | 6239 | &task_event, |
6151 | task_ctx); | 6240 | task_ctx); |
6152 | } | 6241 | } |
@@ -6225,7 +6314,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
6225 | 6314 | ||
6226 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 6315 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
6227 | 6316 | ||
6228 | perf_event_aux(perf_event_comm_output, | 6317 | perf_iterate_sb(perf_event_comm_output, |
6229 | comm_event, | 6318 | comm_event, |
6230 | NULL); | 6319 | NULL); |
6231 | } | 6320 | } |
@@ -6456,7 +6545,7 @@ got_name: | |||
6456 | 6545 | ||
6457 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 6546 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
6458 | 6547 | ||
6459 | perf_event_aux(perf_event_mmap_output, | 6548 | perf_iterate_sb(perf_event_mmap_output, |
6460 | mmap_event, | 6549 | mmap_event, |
6461 | NULL); | 6550 | NULL); |
6462 | 6551 | ||
@@ -6539,7 +6628,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) | |||
6539 | if (!ctx) | 6628 | if (!ctx) |
6540 | continue; | 6629 | continue; |
6541 | 6630 | ||
6542 | perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); | 6631 | perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); |
6543 | } | 6632 | } |
6544 | rcu_read_unlock(); | 6633 | rcu_read_unlock(); |
6545 | } | 6634 | } |
@@ -6726,7 +6815,7 @@ static void perf_event_switch(struct task_struct *task, | |||
6726 | }, | 6815 | }, |
6727 | }; | 6816 | }; |
6728 | 6817 | ||
6729 | perf_event_aux(perf_event_switch_output, | 6818 | perf_iterate_sb(perf_event_switch_output, |
6730 | &switch_event, | 6819 | &switch_event, |
6731 | NULL); | 6820 | NULL); |
6732 | } | 6821 | } |
@@ -7333,7 +7422,7 @@ static struct pmu perf_swevent = { | |||
7333 | static int perf_tp_filter_match(struct perf_event *event, | 7422 | static int perf_tp_filter_match(struct perf_event *event, |
7334 | struct perf_sample_data *data) | 7423 | struct perf_sample_data *data) |
7335 | { | 7424 | { |
7336 | void *record = data->raw->data; | 7425 | void *record = data->raw->frag.data; |
7337 | 7426 | ||
7338 | /* only top level events have filters set */ | 7427 | /* only top level events have filters set */ |
7339 | if (event->parent) | 7428 | if (event->parent) |
@@ -7389,8 +7478,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, | |||
7389 | struct perf_event *event; | 7478 | struct perf_event *event; |
7390 | 7479 | ||
7391 | struct perf_raw_record raw = { | 7480 | struct perf_raw_record raw = { |
7392 | .size = entry_size, | 7481 | .frag = { |
7393 | .data = record, | 7482 | .size = entry_size, |
7483 | .data = record, | ||
7484 | }, | ||
7394 | }; | 7485 | }; |
7395 | 7486 | ||
7396 | perf_sample_data_init(&data, 0, 0); | 7487 | perf_sample_data_init(&data, 0, 0); |
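With this hunk a raw sample is described by a perf_raw_frag inside perf_raw_record rather than a flat size/data pair; the tracepoint path keeps using a single fragment, while chained producers link further fragments via frag.next and let perf_prepare_sample()/perf_output_sample() above compute the total size and trailing pad. A minimal single-fragment sketch with hypothetical payload arguments; perf_event_output() is assumed here as the delivery helper:

    #include <linux/perf_event.h>

    /* Hypothetical producer handing one raw payload to a perf event. */
    static void example_emit_raw(struct perf_event *event,
                                 void *payload, u32 payload_size,
                                 struct pt_regs *regs)
    {
            struct perf_sample_data data;
            struct perf_raw_record raw = {
                    .frag = {
                            .size = payload_size,
                            .data = payload,
                    },
            };

            perf_sample_data_init(&data, 0, 0);
            data.raw = &raw;
            perf_event_output(event, &data, regs);
    }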
@@ -8648,6 +8739,28 @@ unlock: | |||
8648 | return pmu; | 8739 | return pmu; |
8649 | } | 8740 | } |
8650 | 8741 | ||
8742 | static void attach_sb_event(struct perf_event *event) | ||
8743 | { | ||
8744 | struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); | ||
8745 | |||
8746 | raw_spin_lock(&pel->lock); | ||
8747 | list_add_rcu(&event->sb_list, &pel->list); | ||
8748 | raw_spin_unlock(&pel->lock); | ||
8749 | } | ||
8750 | |||
8751 | /* | ||
8752 | * We keep a list of all !task (and therefore per-cpu) events | ||
8753 | * that need to receive side-band records. | ||
8754 | * | ||
8755 | * This avoids having to scan all the various PMU per-cpu contexts | ||
8756 | * looking for them. | ||
8757 | */ | ||
8758 | static void account_pmu_sb_event(struct perf_event *event) | ||
8759 | { | ||
8760 | if (is_sb_event(event)) | ||
8761 | attach_sb_event(event); | ||
8762 | } | ||
8763 | |||
8651 | static void account_event_cpu(struct perf_event *event, int cpu) | 8764 | static void account_event_cpu(struct perf_event *event, int cpu) |
8652 | { | 8765 | { |
8653 | if (event->parent) | 8766 | if (event->parent) |
@@ -8728,6 +8841,8 @@ static void account_event(struct perf_event *event) | |||
8728 | enabled: | 8841 | enabled: |
8729 | 8842 | ||
8730 | account_event_cpu(event, event->cpu); | 8843 | account_event_cpu(event, event->cpu); |
8844 | |||
8845 | account_pmu_sb_event(event); | ||
8731 | } | 8846 | } |
8732 | 8847 | ||
8733 | /* | 8848 | /* |
@@ -8876,7 +8991,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
8876 | 8991 | ||
8877 | if (!event->parent) { | 8992 | if (!event->parent) { |
8878 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | 8993 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { |
8879 | err = get_callchain_buffers(); | 8994 | err = get_callchain_buffers(attr->sample_max_stack); |
8880 | if (err) | 8995 | if (err) |
8881 | goto err_addr_filters; | 8996 | goto err_addr_filters; |
8882 | } | 8997 | } |
@@ -9198,6 +9313,9 @@ SYSCALL_DEFINE5(perf_event_open, | |||
9198 | return -EINVAL; | 9313 | return -EINVAL; |
9199 | } | 9314 | } |
9200 | 9315 | ||
9316 | if (!attr.sample_max_stack) | ||
9317 | attr.sample_max_stack = sysctl_perf_event_max_stack; | ||
9318 | |||
9201 | /* | 9319 | /* |
9202 | * In cgroup mode, the pid argument is used to pass the fd | 9320 | * In cgroup mode, the pid argument is used to pass the fd |
9203 | * opened to the cgroup directory in cgroupfs. The cpu argument | 9321 | * opened to the cgroup directory in cgroupfs. The cpu argument |
@@ -9271,7 +9389,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
9271 | 9389 | ||
9272 | if (is_sampling_event(event)) { | 9390 | if (is_sampling_event(event)) { |
9273 | if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { | 9391 | if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { |
9274 | err = -ENOTSUPP; | 9392 | err = -EOPNOTSUPP; |
9275 | goto err_alloc; | 9393 | goto err_alloc; |
9276 | } | 9394 | } |
9277 | } | 9395 | } |
@@ -10233,10 +10351,13 @@ static void __init perf_event_init_all_cpus(void) | |||
10233 | swhash = &per_cpu(swevent_htable, cpu); | 10351 | swhash = &per_cpu(swevent_htable, cpu); |
10234 | mutex_init(&swhash->hlist_mutex); | 10352 | mutex_init(&swhash->hlist_mutex); |
10235 | INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); | 10353 | INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); |
10354 | |||
10355 | INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); | ||
10356 | raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); | ||
10236 | } | 10357 | } |
10237 | } | 10358 | } |
10238 | 10359 | ||
10239 | static void perf_event_init_cpu(int cpu) | 10360 | int perf_event_init_cpu(unsigned int cpu) |
10240 | { | 10361 | { |
10241 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); | 10362 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
10242 | 10363 | ||
@@ -10249,6 +10370,7 @@ static void perf_event_init_cpu(int cpu) | |||
10249 | rcu_assign_pointer(swhash->swevent_hlist, hlist); | 10370 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
10250 | } | 10371 | } |
10251 | mutex_unlock(&swhash->hlist_mutex); | 10372 | mutex_unlock(&swhash->hlist_mutex); |
10373 | return 0; | ||
10252 | } | 10374 | } |
10253 | 10375 | ||
10254 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE | 10376 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE |
@@ -10280,14 +10402,17 @@ static void perf_event_exit_cpu_context(int cpu) | |||
10280 | } | 10402 | } |
10281 | srcu_read_unlock(&pmus_srcu, idx); | 10403 | srcu_read_unlock(&pmus_srcu, idx); |
10282 | } | 10404 | } |
10405 | #else | ||
10406 | |||
10407 | static void perf_event_exit_cpu_context(int cpu) { } | ||
10283 | 10408 | ||
10284 | static void perf_event_exit_cpu(int cpu) | 10409 | #endif |
10410 | |||
10411 | int perf_event_exit_cpu(unsigned int cpu) | ||
10285 | { | 10412 | { |
10286 | perf_event_exit_cpu_context(cpu); | 10413 | perf_event_exit_cpu_context(cpu); |
10414 | return 0; | ||
10287 | } | 10415 | } |
10288 | #else | ||
10289 | static inline void perf_event_exit_cpu(int cpu) { } | ||
10290 | #endif | ||
10291 | 10416 | ||
10292 | static int | 10417 | static int |
10293 | perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) | 10418 | perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) |
@@ -10309,46 +10434,6 @@ static struct notifier_block perf_reboot_notifier = { | |||
10309 | .priority = INT_MIN, | 10434 | .priority = INT_MIN, |
10310 | }; | 10435 | }; |
10311 | 10436 | ||
10312 | static int | ||
10313 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | ||
10314 | { | ||
10315 | unsigned int cpu = (long)hcpu; | ||
10316 | |||
10317 | switch (action & ~CPU_TASKS_FROZEN) { | ||
10318 | |||
10319 | case CPU_UP_PREPARE: | ||
10320 | /* | ||
10321 | * This must be done before the CPU comes alive, because the | ||
10322 | * moment we can run tasks we can encounter (software) events. | ||
10323 | * | ||
10324 | * Specifically, someone can have inherited events on kthreadd | ||
10325 | * or a pre-existing worker thread that gets re-bound. | ||
10326 | */ | ||
10327 | perf_event_init_cpu(cpu); | ||
10328 | break; | ||
10329 | |||
10330 | case CPU_DOWN_PREPARE: | ||
10331 | /* | ||
10332 | * This must be done before the CPU dies because after that an | ||
10333 | * active event might want to IPI the CPU and that'll not work | ||
10334 | * so great for dead CPUs. | ||
10335 | * | ||
10336 | * XXX smp_call_function_single() return -ENXIO without a warn | ||
10337 | * so we could possibly deal with this. | ||
10338 | * | ||
10339 | * This is safe against new events arriving because | ||
10340 | * sys_perf_event_open() serializes against hotplug using | ||
10341 | * get_online_cpus(). | ||
10342 | */ | ||
10343 | perf_event_exit_cpu(cpu); | ||
10344 | break; | ||
10345 | default: | ||
10346 | break; | ||
10347 | } | ||
10348 | |||
10349 | return NOTIFY_OK; | ||
10350 | } | ||
10351 | |||
10352 | void __init perf_event_init(void) | 10437 | void __init perf_event_init(void) |
10353 | { | 10438 | { |
10354 | int ret; | 10439 | int ret; |
@@ -10361,7 +10446,7 @@ void __init perf_event_init(void) | |||
10361 | perf_pmu_register(&perf_cpu_clock, NULL, -1); | 10446 | perf_pmu_register(&perf_cpu_clock, NULL, -1); |
10362 | perf_pmu_register(&perf_task_clock, NULL, -1); | 10447 | perf_pmu_register(&perf_task_clock, NULL, -1); |
10363 | perf_tp_register(); | 10448 | perf_tp_register(); |
10364 | perf_cpu_notifier(perf_cpu_notify); | 10449 | perf_event_init_cpu(smp_processor_id()); |
10365 | register_reboot_notifier(&perf_reboot_notifier); | 10450 | register_reboot_notifier(&perf_reboot_notifier); |
10366 | 10451 | ||
10367 | ret = init_hw_breakpoint(); | 10452 | ret = init_hw_breakpoint(); |
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 05f9f6d626df..486fd78eb8d5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
@@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) | |||
123 | return rb->aux_nr_pages << PAGE_SHIFT; | 123 | return rb->aux_nr_pages << PAGE_SHIFT; |
124 | } | 124 | } |
125 | 125 | ||
126 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | 126 | #define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \ |
127 | static inline unsigned long \ | ||
128 | func_name(struct perf_output_handle *handle, \ | ||
129 | const void *buf, unsigned long len) \ | ||
130 | { \ | 127 | { \ |
131 | unsigned long size, written; \ | 128 | unsigned long size, written; \ |
132 | \ | 129 | \ |
133 | do { \ | 130 | do { \ |
134 | size = min(handle->size, len); \ | 131 | size = min(handle->size, len); \ |
135 | written = memcpy_func(handle->addr, buf, size); \ | 132 | written = memcpy_func(__VA_ARGS__); \ |
136 | written = size - written; \ | 133 | written = size - written; \ |
137 | \ | 134 | \ |
138 | len -= written; \ | 135 | len -= written; \ |
139 | handle->addr += written; \ | 136 | handle->addr += written; \ |
140 | buf += written; \ | 137 | if (advance_buf) \ |
138 | buf += written; \ | ||
141 | handle->size -= written; \ | 139 | handle->size -= written; \ |
142 | if (!handle->size) { \ | 140 | if (!handle->size) { \ |
143 | struct ring_buffer *rb = handle->rb; \ | 141 | struct ring_buffer *rb = handle->rb; \ |
@@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle, \ | |||
152 | return len; \ | 150 | return len; \ |
153 | } | 151 | } |
154 | 152 | ||
153 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | ||
154 | static inline unsigned long \ | ||
155 | func_name(struct perf_output_handle *handle, \ | ||
156 | const void *buf, unsigned long len) \ | ||
157 | __DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size) | ||
158 | |||
159 | static inline unsigned long | ||
160 | __output_custom(struct perf_output_handle *handle, perf_copy_f copy_func, | ||
161 | const void *buf, unsigned long len) | ||
162 | { | ||
163 | unsigned long orig_len = len; | ||
164 | __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf, | ||
165 | orig_len - len, size) | ||
166 | } | ||
167 | |||
155 | static inline unsigned long | 168 | static inline unsigned long |
156 | memcpy_common(void *dst, const void *src, unsigned long n) | 169 | memcpy_common(void *dst, const void *src, unsigned long n) |
157 | { | 170 | { |
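The DEFINE_OUTPUT_COPY body is split out so that __output_custom() can push data into the ring buffer through a caller-supplied perf_copy_f callback instead of a fixed memcpy variant; this is what the frag->copy path in perf_output_sample() relies on. A hedged sketch of such a callback, assuming perf_copy_f takes (dst, src, offset, len) and returns the number of bytes it could not copy, as the macro's call site suggests:

    #include <linux/string.h>

    /* Hypothetical perf_copy_f: copy @len bytes of a caller-owned source
     * buffer, starting at @off, into the ring-buffer chunk at @dst.
     * Returning 0 means "everything copied", mirroring memcpy_common(). */
    static unsigned long example_copy_cb(void *dst, const void *src,
                                         unsigned long off, unsigned long len)
    {
            memcpy(dst, (const char *)src + off, len);
            return 0;
    }

A producer whose payload cannot be exposed as one flat buffer (for instance one that must fetch it piecewise) would install something like this as frag->copy and let __output_custom() call it once per ring-buffer chunk.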
diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..84ae830234f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -211,6 +211,82 @@ repeat: | |||
211 | } | 211 | } |
212 | 212 | ||
213 | /* | 213 | /* |
214 | * Note that if this function returns a valid task_struct pointer (!NULL) | ||
215 | * task->usage must remain >0 for the duration of the RCU critical section. | ||
216 | */ | ||
217 | struct task_struct *task_rcu_dereference(struct task_struct **ptask) | ||
218 | { | ||
219 | struct sighand_struct *sighand; | ||
220 | struct task_struct *task; | ||
221 | |||
222 | /* | ||
223 | * We need to verify that release_task() was not called and thus | ||
224 | * delayed_put_task_struct() can't run and drop the last reference | ||
225 | * before rcu_read_unlock(). We check task->sighand != NULL, | ||
226 | * but we can read the already freed and reused memory. | ||
227 | */ | ||
228 | retry: | ||
229 | task = rcu_dereference(*ptask); | ||
230 | if (!task) | ||
231 | return NULL; | ||
232 | |||
233 | probe_kernel_address(&task->sighand, sighand); | ||
234 | |||
235 | /* | ||
236 | * Pairs with atomic_dec_and_test() in put_task_struct(). If this task | ||
237 | * was already freed we can not miss the preceding update of this | ||
238 | * pointer. | ||
239 | */ | ||
240 | smp_rmb(); | ||
241 | if (unlikely(task != READ_ONCE(*ptask))) | ||
242 | goto retry; | ||
243 | |||
244 | /* | ||
245 | * We've re-checked that "task == *ptask", now we have two different | ||
246 | * cases: | ||
247 | * | ||
248 | * 1. This is actually the same task/task_struct. In this case | ||
249 | * sighand != NULL tells us it is still alive. | ||
250 | * | ||
251 | * 2. This is another task which got the same memory for task_struct. | ||
252 | * We can't know this of course, and we can not trust | ||
253 | * sighand != NULL. | ||
254 | * | ||
255 | * In this case we actually return a random value, but this is | ||
256 | * correct. | ||
257 | * | ||
258 | * If we return NULL - we can pretend that we actually noticed that | ||
259 | * *ptask was updated when the previous task has exited. Or pretend | ||
260 | * that probe_slab_address(&sighand) reads NULL. | ||
261 | * | ||
262 | * If we return the new task (because sighand is not NULL for any | ||
263 | * reason) - this is fine too. This (new) task can't go away before | ||
264 | * another gp pass. | ||
265 | * | ||
266 | * And note: We could even eliminate the false positive if we re-read | ||
267 | * task->sighand once again to avoid the false NULL. But this case | ||
268 | * is very unlikely so we don't care. | ||
269 | */ | ||
270 | if (!sighand) | ||
271 | return NULL; | ||
272 | |||
273 | return task; | ||
274 | } | ||
275 | |||
276 | struct task_struct *try_get_task_struct(struct task_struct **ptask) | ||
277 | { | ||
278 | struct task_struct *task; | ||
279 | |||
280 | rcu_read_lock(); | ||
281 | task = task_rcu_dereference(ptask); | ||
282 | if (task) | ||
283 | get_task_struct(task); | ||
284 | rcu_read_unlock(); | ||
285 | |||
286 | return task; | ||
287 | } | ||
288 | |||
289 | /* | ||
214 | * Determine if a process group is "orphaned", according to the POSIX | 290 | * Determine if a process group is "orphaned", according to the POSIX |
215 | * definition in 2.2.2.52. Orphaned process groups are not to be affected | 291 | * definition in 2.2.2.52. Orphaned process groups are not to be affected |
216 | * by terminal-generated stop signals. Newly orphaned process groups are | 292 | * by terminal-generated stop signals. Newly orphaned process groups are |
@@ -700,10 +776,14 @@ void do_exit(long code) | |||
700 | 776 | ||
701 | exit_signals(tsk); /* sets PF_EXITING */ | 777 | exit_signals(tsk); /* sets PF_EXITING */ |
702 | /* | 778 | /* |
703 | * tsk->flags are checked in the futex code to protect against | 779 | * Ensure that all new tsk->pi_lock acquisitions must observe |
704 | * an exiting task cleaning up the robust pi futexes. | 780 | * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). |
705 | */ | 781 | */ |
706 | smp_mb(); | 782 | smp_mb(); |
783 | /* | ||
784 | * Ensure that we must observe the pi_state in exit_mm() -> | ||
785 | * mm_release() -> exit_pi_state_list(). | ||
786 | */ | ||
707 | raw_spin_unlock_wait(&tsk->pi_lock); | 787 | raw_spin_unlock_wait(&tsk->pi_lock); |
708 | 788 | ||
709 | if (unlikely(in_atomic())) { | 789 | if (unlikely(in_atomic())) { |
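A minimal usage sketch for the helper added above, not part of the patch: a reader takes a stable reference to a task published through a shared pointer. The pointer watched_task and the function name are hypothetical; only try_get_task_struct()/put_task_struct() come from the kernel, and the file is assumed to already include <linux/sched.h>.

static struct task_struct *watched_task;	/* hypothetical shared pointer */

static void inspect_watched_task(void)
{
	struct task_struct *p;

	/* Returns a referenced task, or NULL if the pointer was NULL or the task is gone. */
	p = try_get_task_struct(&watched_task);
	if (!p)
		return;

	pr_info("watched task: pid %d comm %s\n", task_pid_nr(p), p->comm);
	put_task_struct(p);	/* drop the reference taken above */
}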
diff --git a/kernel/fork.c b/kernel/fork.c index 5c2c355aa97f..52e725d4a866 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -148,57 +148,49 @@ static inline void free_task_struct(struct task_struct *tsk) | |||
148 | } | 148 | } |
149 | #endif | 149 | #endif |
150 | 150 | ||
151 | void __weak arch_release_thread_info(struct thread_info *ti) | 151 | void __weak arch_release_thread_stack(unsigned long *stack) |
152 | { | 152 | { |
153 | } | 153 | } |
154 | 154 | ||
155 | #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR | 155 | #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR |
156 | 156 | ||
157 | /* | 157 | /* |
158 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a | 158 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a |
159 | * kmemcache based allocator. | 159 | * kmemcache based allocator. |
160 | */ | 160 | */ |
161 | # if THREAD_SIZE >= PAGE_SIZE | 161 | # if THREAD_SIZE >= PAGE_SIZE |
162 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 162 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, |
163 | int node) | 163 | int node) |
164 | { | 164 | { |
165 | struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, | 165 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
166 | THREAD_SIZE_ORDER); | 166 | THREAD_SIZE_ORDER); |
167 | |||
168 | if (page) | ||
169 | memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, | ||
170 | 1 << THREAD_SIZE_ORDER); | ||
171 | 167 | ||
172 | return page ? page_address(page) : NULL; | 168 | return page ? page_address(page) : NULL; |
173 | } | 169 | } |
174 | 170 | ||
175 | static inline void free_thread_info(struct thread_info *ti) | 171 | static inline void free_thread_stack(unsigned long *stack) |
176 | { | 172 | { |
177 | struct page *page = virt_to_page(ti); | 173 | __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); |
178 | |||
179 | memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, | ||
180 | -(1 << THREAD_SIZE_ORDER)); | ||
181 | __free_kmem_pages(page, THREAD_SIZE_ORDER); | ||
182 | } | 174 | } |
183 | # else | 175 | # else |
184 | static struct kmem_cache *thread_info_cache; | 176 | static struct kmem_cache *thread_stack_cache; |
185 | 177 | ||
186 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 178 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, |
187 | int node) | 179 | int node) |
188 | { | 180 | { |
189 | return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); | 181 | return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); |
190 | } | 182 | } |
191 | 183 | ||
192 | static void free_thread_info(struct thread_info *ti) | 184 | static void free_thread_stack(unsigned long *stack) |
193 | { | 185 | { |
194 | kmem_cache_free(thread_info_cache, ti); | 186 | kmem_cache_free(thread_stack_cache, stack); |
195 | } | 187 | } |
196 | 188 | ||
197 | void thread_info_cache_init(void) | 189 | void thread_stack_cache_init(void) |
198 | { | 190 | { |
199 | thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, | 191 | thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, |
200 | THREAD_SIZE, 0, NULL); | 192 | THREAD_SIZE, 0, NULL); |
201 | BUG_ON(thread_info_cache == NULL); | 193 | BUG_ON(thread_stack_cache == NULL); |
202 | } | 194 | } |
203 | # endif | 195 | # endif |
204 | #endif | 196 | #endif |
@@ -221,18 +213,24 @@ struct kmem_cache *vm_area_cachep; | |||
221 | /* SLAB cache for mm_struct structures (tsk->mm) */ | 213 | /* SLAB cache for mm_struct structures (tsk->mm) */ |
222 | static struct kmem_cache *mm_cachep; | 214 | static struct kmem_cache *mm_cachep; |
223 | 215 | ||
224 | static void account_kernel_stack(struct thread_info *ti, int account) | 216 | static void account_kernel_stack(unsigned long *stack, int account) |
225 | { | 217 | { |
226 | struct zone *zone = page_zone(virt_to_page(ti)); | 218 | /* All stack pages are in the same zone and belong to the same memcg. */ |
219 | struct page *first_page = virt_to_page(stack); | ||
220 | |||
221 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, | ||
222 | THREAD_SIZE / 1024 * account); | ||
227 | 223 | ||
228 | mod_zone_page_state(zone, NR_KERNEL_STACK, account); | 224 | memcg_kmem_update_page_stat( |
225 | first_page, MEMCG_KERNEL_STACK_KB, | ||
226 | account * (THREAD_SIZE / 1024)); | ||
229 | } | 227 | } |
230 | 228 | ||
231 | void free_task(struct task_struct *tsk) | 229 | void free_task(struct task_struct *tsk) |
232 | { | 230 | { |
233 | account_kernel_stack(tsk->stack, -1); | 231 | account_kernel_stack(tsk->stack, -1); |
234 | arch_release_thread_info(tsk->stack); | 232 | arch_release_thread_stack(tsk->stack); |
235 | free_thread_info(tsk->stack); | 233 | free_thread_stack(tsk->stack); |
236 | rt_mutex_debug_task_free(tsk); | 234 | rt_mutex_debug_task_free(tsk); |
237 | ftrace_graph_exit_task(tsk); | 235 | ftrace_graph_exit_task(tsk); |
238 | put_seccomp_filter(tsk); | 236 | put_seccomp_filter(tsk); |
@@ -343,7 +341,7 @@ void set_task_stack_end_magic(struct task_struct *tsk) | |||
343 | static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | 341 | static struct task_struct *dup_task_struct(struct task_struct *orig, int node) |
344 | { | 342 | { |
345 | struct task_struct *tsk; | 343 | struct task_struct *tsk; |
346 | struct thread_info *ti; | 344 | unsigned long *stack; |
347 | int err; | 345 | int err; |
348 | 346 | ||
349 | if (node == NUMA_NO_NODE) | 347 | if (node == NUMA_NO_NODE) |
@@ -352,15 +350,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
352 | if (!tsk) | 350 | if (!tsk) |
353 | return NULL; | 351 | return NULL; |
354 | 352 | ||
355 | ti = alloc_thread_info_node(tsk, node); | 353 | stack = alloc_thread_stack_node(tsk, node); |
356 | if (!ti) | 354 | if (!stack) |
357 | goto free_tsk; | 355 | goto free_tsk; |
358 | 356 | ||
359 | err = arch_dup_task_struct(tsk, orig); | 357 | err = arch_dup_task_struct(tsk, orig); |
360 | if (err) | 358 | if (err) |
361 | goto free_ti; | 359 | goto free_stack; |
362 | 360 | ||
363 | tsk->stack = ti; | 361 | tsk->stack = stack; |
364 | #ifdef CONFIG_SECCOMP | 362 | #ifdef CONFIG_SECCOMP |
365 | /* | 363 | /* |
366 | * We must handle setting up seccomp filters once we're under | 364 | * We must handle setting up seccomp filters once we're under |
@@ -392,14 +390,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
392 | tsk->task_frag.page = NULL; | 390 | tsk->task_frag.page = NULL; |
393 | tsk->wake_q.next = NULL; | 391 | tsk->wake_q.next = NULL; |
394 | 392 | ||
395 | account_kernel_stack(ti, 1); | 393 | account_kernel_stack(stack, 1); |
396 | 394 | ||
397 | kcov_task_init(tsk); | 395 | kcov_task_init(tsk); |
398 | 396 | ||
399 | return tsk; | 397 | return tsk; |
400 | 398 | ||
401 | free_ti: | 399 | free_stack: |
402 | free_thread_info(ti); | 400 | free_thread_stack(stack); |
403 | free_tsk: | 401 | free_tsk: |
404 | free_task_struct(tsk); | 402 | free_task_struct(tsk); |
405 | return NULL; | 403 | return NULL; |
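For concreteness, a hedged worked example of the new stack accounting above, assuming a hypothetical configuration with 4 KiB pages and THREAD_SIZE_ORDER = 2:

/* THREAD_SIZE = PAGE_SIZE << THREAD_SIZE_ORDER = 4096 << 2 = 16384 bytes */

/*
 * dup_task_struct(): account_kernel_stack(stack, +1)
 *   mod_zone_page_state(zone, NR_KERNEL_STACK_KB, 16384 / 1024 * +1)  ->  +16 KB
 *   memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK_KB, +16)
 *
 * free_task(): account_kernel_stack(stack, -1) reverses both by 16 KB.
 */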
diff --git a/kernel/freezer.c b/kernel/freezer.c index a8900a3bc27a..6f56a9e219fa 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p) | |||
42 | if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) | 42 | if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) |
43 | return false; | 43 | return false; |
44 | 44 | ||
45 | if (test_thread_flag(TIF_MEMDIE)) | 45 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) |
46 | return false; | 46 | return false; |
47 | 47 | ||
48 | if (pm_nosig_freezing || cgroup_freezing(p)) | 48 | if (pm_nosig_freezing || cgroup_freezing(p)) |
diff --git a/kernel/futex.c b/kernel/futex.c index ee25f5ba4aca..33664f70e2d2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) | |||
469 | { | 469 | { |
470 | unsigned long address = (unsigned long)uaddr; | 470 | unsigned long address = (unsigned long)uaddr; |
471 | struct mm_struct *mm = current->mm; | 471 | struct mm_struct *mm = current->mm; |
472 | struct page *page; | 472 | struct page *page, *tail; |
473 | struct address_space *mapping; | 473 | struct address_space *mapping; |
474 | int err, ro = 0; | 474 | int err, ro = 0; |
475 | 475 | ||
@@ -530,7 +530,15 @@ again: | |||
530 | * considered here and page lock forces unnecessary serialization | 530 | * considered here and page lock forces unnecessary serialization |
531 | * From this point on, mapping will be re-verified if necessary and | 531 | * From this point on, mapping will be re-verified if necessary and |
532 | * page lock will be acquired only if it is unavoidable | 532 | * page lock will be acquired only if it is unavoidable |
533 | */ | 533 | * |
534 | * Mapping checks require the head page for any compound page so the | ||
535 | * head page and mapping are looked up now. For anonymous pages, it | ||
536 | * does not matter if the page splits in the future as the key is | ||
537 | * based on the address. For filesystem-backed pages, the tail is | ||
538 | * required as the index of the page determines the key. For | ||
539 | * base pages, there is no tail page and tail == page. | ||
540 | */ | ||
541 | tail = page; | ||
534 | page = compound_head(page); | 542 | page = compound_head(page); |
535 | mapping = READ_ONCE(page->mapping); | 543 | mapping = READ_ONCE(page->mapping); |
536 | 544 | ||
@@ -654,7 +662,7 @@ again: | |||
654 | 662 | ||
655 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ | 663 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ |
656 | key->shared.inode = inode; | 664 | key->shared.inode = inode; |
657 | key->shared.pgoff = basepage_index(page); | 665 | key->shared.pgoff = basepage_index(tail); |
658 | rcu_read_unlock(); | 666 | rcu_read_unlock(); |
659 | } | 667 | } |
660 | 668 | ||
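An illustrative walk-through, with assumed layout, of why the tail page is kept for the shared-key case above:

/*
 * Assumed example: a shared futex inside a file-backed compound page.
 *
 *   tail = page that actually covers uaddr
 *   head = compound_head(tail)
 *
 *   basepage_index(tail) -> index of the exact base page holding the
 *                           futex, which is what the key needs
 *   basepage_index(head) -> index of the first base page of the
 *                           compound page, so every futex inside it
 *                           would alias onto one key
 *
 * Anonymous keys are address-based, so the head/tail distinction does
 * not matter there.
 */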
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index e25e92fb44fa..6a5c239c7669 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c | |||
@@ -18,7 +18,7 @@ | |||
18 | #include <linux/vmalloc.h> | 18 | #include <linux/vmalloc.h> |
19 | #include "gcov.h" | 19 | #include "gcov.h" |
20 | 20 | ||
21 | #if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 | 21 | #if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) |
22 | #define GCOV_COUNTERS 10 | 22 | #define GCOV_COUNTERS 10 |
23 | #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 | 23 | #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 |
24 | #define GCOV_COUNTERS 9 | 24 | #define GCOV_COUNTERS 9 |
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2ee42e95a3ce..1d3ee3169202 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o | |||
9 | obj-$(CONFIG_PM_SLEEP) += pm.o | 9 | obj-$(CONFIG_PM_SLEEP) += pm.o |
10 | obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o | 10 | obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o |
11 | obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o | 11 | obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o |
12 | obj-$(CONFIG_SMP) += affinity.o | ||
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c new file mode 100644 index 000000000000..f68959341c0f --- /dev/null +++ b/kernel/irq/affinity.c | |||
@@ -0,0 +1,61 @@ | |||
1 | |||
2 | #include <linux/interrupt.h> | ||
3 | #include <linux/kernel.h> | ||
4 | #include <linux/slab.h> | ||
5 | #include <linux/cpu.h> | ||
6 | |||
7 | static int get_first_sibling(unsigned int cpu) | ||
8 | { | ||
9 | unsigned int ret; | ||
10 | |||
11 | ret = cpumask_first(topology_sibling_cpumask(cpu)); | ||
12 | if (ret < nr_cpu_ids) | ||
13 | return ret; | ||
14 | return cpu; | ||
15 | } | ||
16 | |||
17 | /* | ||
18 | * Take a map of online CPUs and the number of available interrupt vectors | ||
19 | * and generate an output cpumask suitable for spreading MSI/MSI-X vectors | ||
20 | * so that they are distributed as well as possible around the CPUs. If | ||
21 | * more vectors than CPUs are available we'll map one to each CPU, | ||
22 | * otherwise we map one to the first sibling of each socket. | ||
23 | * | ||
24 | * If there are more vectors than CPUs we will still only have one bit | ||
25 | * set per CPU, but interrupt code will keep on assigning the vectors from | ||
26 | * the start of the bitmap until we run out of vectors. | ||
27 | */ | ||
28 | struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) | ||
29 | { | ||
30 | struct cpumask *affinity_mask; | ||
31 | unsigned int max_vecs = *nr_vecs; | ||
32 | |||
33 | if (max_vecs == 1) | ||
34 | return NULL; | ||
35 | |||
36 | affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
37 | if (!affinity_mask) { | ||
38 | *nr_vecs = 1; | ||
39 | return NULL; | ||
40 | } | ||
41 | |||
42 | if (max_vecs >= num_online_cpus()) { | ||
43 | cpumask_copy(affinity_mask, cpu_online_mask); | ||
44 | *nr_vecs = num_online_cpus(); | ||
45 | } else { | ||
46 | unsigned int vecs = 0, cpu; | ||
47 | |||
48 | for_each_online_cpu(cpu) { | ||
49 | if (cpu == get_first_sibling(cpu)) { | ||
50 | cpumask_set_cpu(cpu, affinity_mask); | ||
51 | vecs++; | ||
52 | } | ||
53 | |||
54 | if (--max_vecs == 0) | ||
55 | break; | ||
56 | } | ||
57 | *nr_vecs = vecs; | ||
58 | } | ||
59 | |||
60 | return affinity_mask; | ||
61 | } | ||
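A hedged sketch, not from the patch, of how a multiqueue driver might consume the new helper; my_driver_alloc_vectors and nr_queues are hypothetical, and the assumption that the caller frees the returned mask follows from the kzalloc() above.

static struct cpumask *my_driver_alloc_vectors(unsigned int nr_queues,
					       unsigned int *nr_vecs)
{
	struct cpumask *mask;

	*nr_vecs = nr_queues;
	mask = irq_create_affinity_mask(nr_vecs);
	/*
	 * mask == NULL either when only one vector makes sense or when
	 * the allocation failed (*nr_vecs was forced to 1); *nr_vecs now
	 * holds the number of vectors worth requesting from the device.
	 */
	return mask;
}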
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2f9f2b0e79f2..b4c1bc7c9ca2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -426,6 +426,49 @@ out_unlock: | |||
426 | } | 426 | } |
427 | EXPORT_SYMBOL_GPL(handle_simple_irq); | 427 | EXPORT_SYMBOL_GPL(handle_simple_irq); |
428 | 428 | ||
429 | /** | ||
430 | * handle_untracked_irq - Simple and software-decoded IRQs. | ||
431 | * @desc: the interrupt description structure for this irq | ||
432 | * | ||
433 | * Untracked interrupts are sent from a demultiplexing interrupt | ||
434 | * handler when the demultiplexer does not know which device in its | ||
435 | * multiplexed irq domain generated the interrupt. IRQs handled | ||
436 | * through here are not subjected to stats tracking, randomness, or | ||
437 | * spurious interrupt detection. | ||
438 | * | ||
439 | * Note: Like handle_simple_irq, the caller is expected to handle | ||
440 | * the ack, clear, mask and unmask issues if necessary. | ||
441 | */ | ||
442 | void handle_untracked_irq(struct irq_desc *desc) | ||
443 | { | ||
444 | unsigned int flags = 0; | ||
445 | |||
446 | raw_spin_lock(&desc->lock); | ||
447 | |||
448 | if (!irq_may_run(desc)) | ||
449 | goto out_unlock; | ||
450 | |||
451 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | ||
452 | |||
453 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { | ||
454 | desc->istate |= IRQS_PENDING; | ||
455 | goto out_unlock; | ||
456 | } | ||
457 | |||
458 | desc->istate &= ~IRQS_PENDING; | ||
459 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); | ||
460 | raw_spin_unlock(&desc->lock); | ||
461 | |||
462 | __handle_irq_event_percpu(desc, &flags); | ||
463 | |||
464 | raw_spin_lock(&desc->lock); | ||
465 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); | ||
466 | |||
467 | out_unlock: | ||
468 | raw_spin_unlock(&desc->lock); | ||
469 | } | ||
470 | EXPORT_SYMBOL_GPL(handle_untracked_irq); | ||
471 | |||
429 | /* | 472 | /* |
430 | * Called unconditionally from handle_level_irq() and only for oneshot | 473 | * Called unconditionally from handle_level_irq() and only for oneshot |
431 | * interrupts from handle_fasteoi_irq() | 474 | * interrupts from handle_fasteoi_irq() |
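A hedged sketch of where the new flow handler would typically be installed: the domain map callback of a demultiplexing irqchip. demux_chip and demux_map are hypothetical names.

static struct irq_chip demux_chip;	/* hypothetical demux chip */

static int demux_map(struct irq_domain *d, unsigned int virq,
		     irq_hw_number_t hw)
{
	/* Children are decoded purely in software: skip stats/randomness. */
	irq_set_chip_and_handler(virq, &demux_chip, handle_untracked_irq);
	return 0;
}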
@@ -1093,3 +1136,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) | |||
1093 | 1136 | ||
1094 | return 0; | 1137 | return 0; |
1095 | } | 1138 | } |
1139 | |||
1140 | /** | ||
1141 | * irq_chip_pm_get - Enable power for an IRQ chip | ||
1142 | * @data: Pointer to interrupt specific data | ||
1143 | * | ||
1144 | * Enable the power to the IRQ chip referenced by the interrupt data | ||
1145 | * structure. | ||
1146 | */ | ||
1147 | int irq_chip_pm_get(struct irq_data *data) | ||
1148 | { | ||
1149 | int retval; | ||
1150 | |||
1151 | if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) { | ||
1152 | retval = pm_runtime_get_sync(data->chip->parent_device); | ||
1153 | if (retval < 0) { | ||
1154 | pm_runtime_put_noidle(data->chip->parent_device); | ||
1155 | return retval; | ||
1156 | } | ||
1157 | } | ||
1158 | |||
1159 | return 0; | ||
1160 | } | ||
1161 | |||
1162 | /** | ||
1163 | * irq_chip_pm_put - Disable power for an IRQ chip | ||
1164 | * @data: Pointer to interrupt specific data | ||
1165 | * | ||
1166 | * Disable the power to the IRQ chip referenced by the interrupt data | ||
1167 | * structure. Note that power will only be disabled once this | ||
1168 | * function has been called for all IRQs that have called irq_chip_pm_get(). | ||
1169 | */ | ||
1170 | int irq_chip_pm_put(struct irq_data *data) | ||
1171 | { | ||
1172 | int retval = 0; | ||
1173 | |||
1174 | if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) | ||
1175 | retval = pm_runtime_put(data->chip->parent_device); | ||
1176 | |||
1177 | return (retval < 0) ? retval : 0; | ||
1178 | } | ||
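A minimal sketch of the balanced pattern the two helpers expect, assuming d is an irq_data pointer obtained elsewhere (for example via irq_get_irq_data()):

	int ret;

	ret = irq_chip_pm_get(d);	/* powers up chip->parent_device if CONFIG_PM */
	if (ret < 0)
		return ret;

	/* ... program the chip while its parent device is powered ... */

	ret = irq_chip_pm_put(d);	/* power drops only after the last matching put */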
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a15b5485b446..d3f24905852c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) | |||
132 | wake_up_process(action->thread); | 132 | wake_up_process(action->thread); |
133 | } | 133 | } |
134 | 134 | ||
135 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) | 135 | irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags) |
136 | { | 136 | { |
137 | irqreturn_t retval = IRQ_NONE; | 137 | irqreturn_t retval = IRQ_NONE; |
138 | unsigned int flags = 0, irq = desc->irq_data.irq; | 138 | unsigned int irq = desc->irq_data.irq; |
139 | struct irqaction *action; | 139 | struct irqaction *action; |
140 | 140 | ||
141 | for_each_action_of_desc(desc, action) { | 141 | for_each_action_of_desc(desc, action) { |
@@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) | |||
164 | 164 | ||
165 | /* Fall through to add to randomness */ | 165 | /* Fall through to add to randomness */ |
166 | case IRQ_HANDLED: | 166 | case IRQ_HANDLED: |
167 | flags |= action->flags; | 167 | *flags |= action->flags; |
168 | break; | 168 | break; |
169 | 169 | ||
170 | default: | 170 | default: |
@@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) | |||
174 | retval |= res; | 174 | retval |= res; |
175 | } | 175 | } |
176 | 176 | ||
177 | add_interrupt_randomness(irq, flags); | 177 | return retval; |
178 | } | ||
179 | |||
180 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) | ||
181 | { | ||
182 | irqreturn_t retval; | ||
183 | unsigned int flags = 0; | ||
184 | |||
185 | retval = __handle_irq_event_percpu(desc, &flags); | ||
186 | |||
187 | add_interrupt_randomness(desc->irq_data.irq, flags); | ||
178 | 188 | ||
179 | if (!noirqdebug) | 189 | if (!noirqdebug) |
180 | note_interrupt(desc, retval); | 190 | note_interrupt(desc, retval); |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 09be2c903c6d..bc226e783bd2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -7,6 +7,7 @@ | |||
7 | */ | 7 | */ |
8 | #include <linux/irqdesc.h> | 8 | #include <linux/irqdesc.h> |
9 | #include <linux/kernel_stat.h> | 9 | #include <linux/kernel_stat.h> |
10 | #include <linux/pm_runtime.h> | ||
10 | 11 | ||
11 | #ifdef CONFIG_SPARSE_IRQ | 12 | #ifdef CONFIG_SPARSE_IRQ |
12 | # define IRQ_BITMAP_BITS (NR_IRQS + 8196) | 13 | # define IRQ_BITMAP_BITS (NR_IRQS + 8196) |
@@ -83,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq); | |||
83 | 84 | ||
84 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 85 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
85 | 86 | ||
87 | irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); | ||
86 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); | 88 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); |
87 | irqreturn_t handle_irq_event(struct irq_desc *desc); | 89 | irqreturn_t handle_irq_event(struct irq_desc *desc); |
88 | 90 | ||
@@ -105,6 +107,8 @@ static inline void unregister_handler_proc(unsigned int irq, | |||
105 | struct irqaction *action) { } | 107 | struct irqaction *action) { } |
106 | #endif | 108 | #endif |
107 | 109 | ||
110 | extern bool irq_can_set_affinity_usr(unsigned int irq); | ||
111 | |||
108 | extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); | 112 | extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); |
109 | 113 | ||
110 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 114 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 89b49f6773f0..1a9abc1c8ea0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c | |||
@@ -76,14 +76,14 @@ int irq_reserve_ipi(struct irq_domain *domain, | |||
76 | } | 76 | } |
77 | } | 77 | } |
78 | 78 | ||
79 | virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); | 79 | virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL); |
80 | if (virq <= 0) { | 80 | if (virq <= 0) { |
81 | pr_warn("Can't reserve IPI, failed to alloc descs\n"); | 81 | pr_warn("Can't reserve IPI, failed to alloc descs\n"); |
82 | return -ENOMEM; | 82 | return -ENOMEM; |
83 | } | 83 | } |
84 | 84 | ||
85 | virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, | 85 | virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, |
86 | (void *) dest, true); | 86 | (void *) dest, true, NULL); |
87 | 87 | ||
88 | if (virq <= 0) { | 88 | if (virq <= 0) { |
89 | pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); | 89 | pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8731e1c5d1e7..a623b44f2d4b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) | |||
68 | return 0; | 68 | return 0; |
69 | } | 69 | } |
70 | 70 | ||
71 | static void desc_smp_init(struct irq_desc *desc, int node) | 71 | static void desc_smp_init(struct irq_desc *desc, int node, |
72 | const struct cpumask *affinity) | ||
72 | { | 73 | { |
73 | cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); | 74 | if (!affinity) |
75 | affinity = irq_default_affinity; | ||
76 | cpumask_copy(desc->irq_common_data.affinity, affinity); | ||
77 | |||
74 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 78 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
75 | cpumask_clear(desc->pending_mask); | 79 | cpumask_clear(desc->pending_mask); |
76 | #endif | 80 | #endif |
@@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node) | |||
82 | #else | 86 | #else |
83 | static inline int | 87 | static inline int |
84 | alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } | 88 | alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } |
85 | static inline void desc_smp_init(struct irq_desc *desc, int node) { } | 89 | static inline void |
90 | desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } | ||
86 | #endif | 91 | #endif |
87 | 92 | ||
88 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, | 93 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, |
89 | struct module *owner) | 94 | const struct cpumask *affinity, struct module *owner) |
90 | { | 95 | { |
91 | int cpu; | 96 | int cpu; |
92 | 97 | ||
@@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, | |||
107 | desc->owner = owner; | 112 | desc->owner = owner; |
108 | for_each_possible_cpu(cpu) | 113 | for_each_possible_cpu(cpu) |
109 | *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; | 114 | *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; |
110 | desc_smp_init(desc, node); | 115 | desc_smp_init(desc, node, affinity); |
111 | } | 116 | } |
112 | 117 | ||
113 | int nr_irqs = NR_IRQS; | 118 | int nr_irqs = NR_IRQS; |
@@ -158,7 +163,9 @@ void irq_unlock_sparse(void) | |||
158 | mutex_unlock(&sparse_irq_lock); | 163 | mutex_unlock(&sparse_irq_lock); |
159 | } | 164 | } |
160 | 165 | ||
161 | static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) | 166 | static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, |
167 | const struct cpumask *affinity, | ||
168 | struct module *owner) | ||
162 | { | 169 | { |
163 | struct irq_desc *desc; | 170 | struct irq_desc *desc; |
164 | gfp_t gfp = GFP_KERNEL; | 171 | gfp_t gfp = GFP_KERNEL; |
@@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) | |||
178 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | 185 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); |
179 | init_rcu_head(&desc->rcu); | 186 | init_rcu_head(&desc->rcu); |
180 | 187 | ||
181 | desc_set_defaults(irq, desc, node, owner); | 188 | desc_set_defaults(irq, desc, node, affinity, owner); |
189 | irqd_set(&desc->irq_data, flags); | ||
182 | 190 | ||
183 | return desc; | 191 | return desc; |
184 | 192 | ||
@@ -223,13 +231,32 @@ static void free_desc(unsigned int irq) | |||
223 | } | 231 | } |
224 | 232 | ||
225 | static int alloc_descs(unsigned int start, unsigned int cnt, int node, | 233 | static int alloc_descs(unsigned int start, unsigned int cnt, int node, |
226 | struct module *owner) | 234 | const struct cpumask *affinity, struct module *owner) |
227 | { | 235 | { |
236 | const struct cpumask *mask = NULL; | ||
228 | struct irq_desc *desc; | 237 | struct irq_desc *desc; |
229 | int i; | 238 | unsigned int flags; |
239 | int i, cpu = -1; | ||
240 | |||
241 | if (affinity && cpumask_empty(affinity)) | ||
242 | return -EINVAL; | ||
243 | |||
244 | flags = affinity ? IRQD_AFFINITY_MANAGED : 0; | ||
230 | 245 | ||
231 | for (i = 0; i < cnt; i++) { | 246 | for (i = 0; i < cnt; i++) { |
232 | desc = alloc_desc(start + i, node, owner); | 247 | if (affinity) { |
248 | cpu = cpumask_next(cpu, affinity); | ||
249 | if (cpu >= nr_cpu_ids) | ||
250 | cpu = cpumask_first(affinity); | ||
251 | node = cpu_to_node(cpu); | ||
252 | |||
253 | /* | ||
254 | * For single allocations we use the caller provided | ||
255 | * mask otherwise we use the mask of the target cpu | ||
256 | */ | ||
257 | mask = cnt == 1 ? affinity : cpumask_of(cpu); | ||
258 | } | ||
259 | desc = alloc_desc(start + i, node, flags, mask, owner); | ||
233 | if (!desc) | 260 | if (!desc) |
234 | goto err; | 261 | goto err; |
235 | mutex_lock(&sparse_irq_lock); | 262 | mutex_lock(&sparse_irq_lock); |
@@ -277,7 +304,7 @@ int __init early_irq_init(void) | |||
277 | nr_irqs = initcnt; | 304 | nr_irqs = initcnt; |
278 | 305 | ||
279 | for (i = 0; i < initcnt; i++) { | 306 | for (i = 0; i < initcnt; i++) { |
280 | desc = alloc_desc(i, node, NULL); | 307 | desc = alloc_desc(i, node, 0, NULL, NULL); |
281 | set_bit(i, allocated_irqs); | 308 | set_bit(i, allocated_irqs); |
282 | irq_insert_desc(i, desc); | 309 | irq_insert_desc(i, desc); |
283 | } | 310 | } |
@@ -311,7 +338,7 @@ int __init early_irq_init(void) | |||
311 | alloc_masks(&desc[i], GFP_KERNEL, node); | 338 | alloc_masks(&desc[i], GFP_KERNEL, node); |
312 | raw_spin_lock_init(&desc[i].lock); | 339 | raw_spin_lock_init(&desc[i].lock); |
313 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 340 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
314 | desc_set_defaults(i, &desc[i], node, NULL); | 341 | desc_set_defaults(i, &desc[i], node, NULL, NULL); |
315 | } | 342 | } |
316 | return arch_early_irq_init(); | 343 | return arch_early_irq_init(); |
317 | } | 344 | } |
@@ -328,11 +355,12 @@ static void free_desc(unsigned int irq) | |||
328 | unsigned long flags; | 355 | unsigned long flags; |
329 | 356 | ||
330 | raw_spin_lock_irqsave(&desc->lock, flags); | 357 | raw_spin_lock_irqsave(&desc->lock, flags); |
331 | desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL); | 358 | desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); |
332 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 359 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
333 | } | 360 | } |
334 | 361 | ||
335 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, | 362 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, |
363 | const struct cpumask *affinity, | ||
336 | struct module *owner) | 364 | struct module *owner) |
337 | { | 365 | { |
338 | u32 i; | 366 | u32 i; |
@@ -453,12 +481,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs); | |||
453 | * @cnt: Number of consecutive irqs to allocate. | 481 | * @cnt: Number of consecutive irqs to allocate. |
454 | * @node: Preferred node on which the irq descriptor should be allocated | 482 | * @node: Preferred node on which the irq descriptor should be allocated |
455 | * @owner: Owning module (can be NULL) | 483 | * @owner: Owning module (can be NULL) |
484 | * @affinity: Optional pointer to an affinity mask which hints where the | ||
485 | * irq descriptors should be allocated and which default | ||
486 | * affinities to use | ||
456 | * | 487 | * |
457 | * Returns the first irq number or error code | 488 | * Returns the first irq number or error code |
458 | */ | 489 | */ |
459 | int __ref | 490 | int __ref |
460 | __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, | 491 | __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, |
461 | struct module *owner) | 492 | struct module *owner, const struct cpumask *affinity) |
462 | { | 493 | { |
463 | int start, ret; | 494 | int start, ret; |
464 | 495 | ||
@@ -494,7 +525,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, | |||
494 | 525 | ||
495 | bitmap_set(allocated_irqs, start, cnt); | 526 | bitmap_set(allocated_irqs, start, cnt); |
496 | mutex_unlock(&sparse_irq_lock); | 527 | mutex_unlock(&sparse_irq_lock); |
497 | return alloc_descs(start, cnt, node, owner); | 528 | return alloc_descs(start, cnt, node, affinity, owner); |
498 | 529 | ||
499 | err: | 530 | err: |
500 | mutex_unlock(&sparse_irq_lock); | 531 | mutex_unlock(&sparse_irq_lock); |
@@ -512,7 +543,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs); | |||
512 | */ | 543 | */ |
513 | unsigned int irq_alloc_hwirqs(int cnt, int node) | 544 | unsigned int irq_alloc_hwirqs(int cnt, int node) |
514 | { | 545 | { |
515 | int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); | 546 | int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL); |
516 | 547 | ||
517 | if (irq < 0) | 548 | if (irq < 0) |
518 | return 0; | 549 | return 0; |
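A hedged sketch of the extended allocator call, assuming a caller-built mask named spread_mask; with a non-NULL mask the descriptors above are placed on the nodes of the masked CPUs and flagged IRQD_AFFINITY_MANAGED.

	int first;

	/* Four descriptors; NUMA placement and default affinity follow the mask. */
	first = __irq_alloc_descs(-1, 0, 4, NUMA_NO_NODE, THIS_MODULE,
				  spread_mask);
	if (first < 0)
		return first;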
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8798b6c9e945..4752b43662e0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
481 | } | 481 | } |
482 | 482 | ||
483 | /* Allocate a virtual interrupt number */ | 483 | /* Allocate a virtual interrupt number */ |
484 | virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); | 484 | virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); |
485 | if (virq <= 0) { | 485 | if (virq <= 0) { |
486 | pr_debug("-> virq allocation failed\n"); | 486 | pr_debug("-> virq allocation failed\n"); |
487 | return 0; | 487 | return 0; |
@@ -567,6 +567,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, | |||
567 | unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) | 567 | unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) |
568 | { | 568 | { |
569 | struct irq_domain *domain; | 569 | struct irq_domain *domain; |
570 | struct irq_data *irq_data; | ||
570 | irq_hw_number_t hwirq; | 571 | irq_hw_number_t hwirq; |
571 | unsigned int type = IRQ_TYPE_NONE; | 572 | unsigned int type = IRQ_TYPE_NONE; |
572 | int virq; | 573 | int virq; |
@@ -588,15 +589,46 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) | |||
588 | if (irq_domain_translate(domain, fwspec, &hwirq, &type)) | 589 | if (irq_domain_translate(domain, fwspec, &hwirq, &type)) |
589 | return 0; | 590 | return 0; |
590 | 591 | ||
591 | if (irq_domain_is_hierarchy(domain)) { | 592 | /* |
593 | * WARN if the irqchip returns a type with bits | ||
594 | * outside the sense mask set and clear these bits. | ||
595 | */ | ||
596 | if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) | ||
597 | type &= IRQ_TYPE_SENSE_MASK; | ||
598 | |||
599 | /* | ||
600 | * If we've already configured this interrupt, | ||
601 | * don't do it again, or hell will break loose. | ||
602 | */ | ||
603 | virq = irq_find_mapping(domain, hwirq); | ||
604 | if (virq) { | ||
605 | /* | ||
606 | * If the trigger type is not specified or matches the | ||
607 | * current trigger type then we are done so return the | ||
608 | * interrupt number. | ||
609 | */ | ||
610 | if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) | ||
611 | return virq; | ||
612 | |||
592 | /* | 613 | /* |
593 | * If we've already configured this interrupt, | 614 | * If the trigger type has not been set yet, then set |
594 | * don't do it again, or hell will break loose. | 615 | * it now and return the interrupt number. |
595 | */ | 616 | */ |
596 | virq = irq_find_mapping(domain, hwirq); | 617 | if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { |
597 | if (virq) | 618 | irq_data = irq_get_irq_data(virq); |
619 | if (!irq_data) | ||
620 | return 0; | ||
621 | |||
622 | irqd_set_trigger_type(irq_data, type); | ||
598 | return virq; | 623 | return virq; |
624 | } | ||
599 | 625 | ||
626 | pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", | ||
627 | hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); | ||
628 | return 0; | ||
629 | } | ||
630 | |||
631 | if (irq_domain_is_hierarchy(domain)) { | ||
600 | virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); | 632 | virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); |
601 | if (virq <= 0) | 633 | if (virq <= 0) |
602 | return 0; | 634 | return 0; |
@@ -607,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) | |||
607 | return virq; | 639 | return virq; |
608 | } | 640 | } |
609 | 641 | ||
610 | /* Set type if specified and different than the current one */ | 642 | irq_data = irq_get_irq_data(virq); |
611 | if (type != IRQ_TYPE_NONE && | 643 | if (!irq_data) { |
612 | type != irq_get_trigger_type(virq)) | 644 | if (irq_domain_is_hierarchy(domain)) |
613 | irq_set_irq_type(virq, type); | 645 | irq_domain_free_irqs(virq, 1); |
646 | else | ||
647 | irq_dispose_mapping(virq); | ||
648 | return 0; | ||
649 | } | ||
650 | |||
651 | /* Store trigger type */ | ||
652 | irqd_set_trigger_type(irq_data, type); | ||
653 | |||
614 | return virq; | 654 | return virq; |
615 | } | 655 | } |
616 | EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); | 656 | EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); |
@@ -640,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq) | |||
640 | if (WARN_ON(domain == NULL)) | 680 | if (WARN_ON(domain == NULL)) |
641 | return; | 681 | return; |
642 | 682 | ||
643 | irq_domain_disassociate(domain, virq); | 683 | if (irq_domain_is_hierarchy(domain)) { |
644 | irq_free_desc(virq); | 684 | irq_domain_free_irqs(virq, 1); |
685 | } else { | ||
686 | irq_domain_disassociate(domain, virq); | ||
687 | irq_free_desc(virq); | ||
688 | } | ||
645 | } | 689 | } |
646 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); | 690 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); |
647 | 691 | ||
@@ -835,19 +879,23 @@ const struct irq_domain_ops irq_domain_simple_ops = { | |||
835 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | 879 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); |
836 | 880 | ||
837 | int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, | 881 | int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, |
838 | int node) | 882 | int node, const struct cpumask *affinity) |
839 | { | 883 | { |
840 | unsigned int hint; | 884 | unsigned int hint; |
841 | 885 | ||
842 | if (virq >= 0) { | 886 | if (virq >= 0) { |
843 | virq = irq_alloc_descs(virq, virq, cnt, node); | 887 | virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, |
888 | affinity); | ||
844 | } else { | 889 | } else { |
845 | hint = hwirq % nr_irqs; | 890 | hint = hwirq % nr_irqs; |
846 | if (hint == 0) | 891 | if (hint == 0) |
847 | hint++; | 892 | hint++; |
848 | virq = irq_alloc_descs_from(hint, cnt, node); | 893 | virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, |
849 | if (virq <= 0 && hint > 1) | 894 | affinity); |
850 | virq = irq_alloc_descs_from(1, cnt, node); | 895 | if (virq <= 0 && hint > 1) { |
896 | virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE, | ||
897 | affinity); | ||
898 | } | ||
851 | } | 899 | } |
852 | 900 | ||
853 | return virq; | 901 | return virq; |
@@ -1144,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, | |||
1144 | if (recursive) | 1192 | if (recursive) |
1145 | ret = irq_domain_alloc_irqs_recursive(parent, irq_base, | 1193 | ret = irq_domain_alloc_irqs_recursive(parent, irq_base, |
1146 | nr_irqs, arg); | 1194 | nr_irqs, arg); |
1147 | if (ret >= 0) | 1195 | if (ret < 0) |
1148 | ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); | 1196 | return ret; |
1197 | |||
1198 | ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); | ||
1149 | if (ret < 0 && recursive) | 1199 | if (ret < 0 && recursive) |
1150 | irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); | 1200 | irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); |
1151 | 1201 | ||
@@ -1160,6 +1210,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, | |||
1160 | * @node: NUMA node id for memory allocation | 1210 | * @node: NUMA node id for memory allocation |
1161 | * @arg: domain specific argument | 1211 | * @arg: domain specific argument |
1162 | * @realloc: IRQ descriptors have already been allocated if true | 1212 | * @realloc: IRQ descriptors have already been allocated if true |
1213 | * @affinity: Optional irq affinity mask for multiqueue devices | ||
1163 | * | 1214 | * |
1164 | * Allocate IRQ numbers and initialize all data structures to support | 1215 | * Allocate IRQ numbers and initialize all data structures to support |
1165 | * hierarchy IRQ domains. | 1216 | * hierarchy IRQ domains. |
@@ -1175,7 +1226,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, | |||
1175 | */ | 1226 | */ |
1176 | int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, | 1227 | int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, |
1177 | unsigned int nr_irqs, int node, void *arg, | 1228 | unsigned int nr_irqs, int node, void *arg, |
1178 | bool realloc) | 1229 | bool realloc, const struct cpumask *affinity) |
1179 | { | 1230 | { |
1180 | int i, ret, virq; | 1231 | int i, ret, virq; |
1181 | 1232 | ||
@@ -1193,7 +1244,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, | |||
1193 | if (realloc && irq_base >= 0) { | 1244 | if (realloc && irq_base >= 0) { |
1194 | virq = irq_base; | 1245 | virq = irq_base; |
1195 | } else { | 1246 | } else { |
1196 | virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); | 1247 | virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node, |
1248 | affinity); | ||
1197 | if (virq < 0) { | 1249 | if (virq < 0) { |
1198 | pr_debug("cannot allocate IRQ(base %d, count %d)\n", | 1250 | pr_debug("cannot allocate IRQ(base %d, count %d)\n", |
1199 | irq_base, nr_irqs); | 1251 | irq_base, nr_irqs); |
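An illustrative summary, with assumed trigger values, of what the reworked irq_create_fwspec_mapping() above now does when the same hwirq is described twice:

/*
 * 1st consumer: type = IRQ_TYPE_LEVEL_HIGH  -> new virq, type stored on
 *                                              the irq_data
 * 2nd consumer: type = IRQ_TYPE_NONE        -> existing virq returned
 *               type = IRQ_TYPE_LEVEL_HIGH  -> existing virq returned
 *               type = IRQ_TYPE_EDGE_RISING -> "type mismatch" warning,
 *                                              0 returned, no remap
 */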
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef0bc02c3a70..73a2b786b5e9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq); | |||
115 | #ifdef CONFIG_SMP | 115 | #ifdef CONFIG_SMP |
116 | cpumask_var_t irq_default_affinity; | 116 | cpumask_var_t irq_default_affinity; |
117 | 117 | ||
118 | static int __irq_can_set_affinity(struct irq_desc *desc) | 118 | static bool __irq_can_set_affinity(struct irq_desc *desc) |
119 | { | 119 | { |
120 | if (!desc || !irqd_can_balance(&desc->irq_data) || | 120 | if (!desc || !irqd_can_balance(&desc->irq_data) || |
121 | !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) | 121 | !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) |
122 | return 0; | 122 | return false; |
123 | return 1; | 123 | return true; |
124 | } | 124 | } |
125 | 125 | ||
126 | /** | 126 | /** |
@@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq) | |||
134 | } | 134 | } |
135 | 135 | ||
136 | /** | 136 | /** |
137 | * irq_can_set_affinity_usr - Check if affinity of an irq can be set from user space | ||
138 | * @irq: Interrupt to check | ||
139 | * | ||
140 | * Like irq_can_set_affinity() above, but additionally checks for the | ||
141 | * AFFINITY_MANAGED flag. | ||
142 | */ | ||
143 | bool irq_can_set_affinity_usr(unsigned int irq) | ||
144 | { | ||
145 | struct irq_desc *desc = irq_to_desc(irq); | ||
146 | |||
147 | return __irq_can_set_affinity(desc) && | ||
148 | !irqd_affinity_is_managed(&desc->irq_data); | ||
149 | } | ||
150 | |||
151 | /** | ||
137 | * irq_set_thread_affinity - Notify irq threads to adjust affinity | 152 | * irq_set_thread_affinity - Notify irq threads to adjust affinity |
138 | * @desc: irq descriptor which has affitnity changed | 153 | * @desc: irq descriptor which has affitnity changed |
139 | * | 154 | * |
@@ -338,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) | |||
338 | return 0; | 353 | return 0; |
339 | 354 | ||
340 | /* | 355 | /* |
341 | * Preserve an userspace affinity setup, but make sure that | 356 | * Preserve the managed affinity setting and a userspace affinity |
342 | * one of the targets is online. | 357 | * setup, but make sure that one of the targets is online. |
343 | */ | 358 | */ |
344 | if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { | 359 | if (irqd_affinity_is_managed(&desc->irq_data) || |
360 | irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { | ||
345 | if (cpumask_intersects(desc->irq_common_data.affinity, | 361 | if (cpumask_intersects(desc->irq_common_data.affinity, |
346 | cpu_online_mask)) | 362 | cpu_online_mask)) |
347 | set = desc->irq_common_data.affinity; | 363 | set = desc->irq_common_data.affinity; |
@@ -1117,6 +1133,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1117 | new->irq = irq; | 1133 | new->irq = irq; |
1118 | 1134 | ||
1119 | /* | 1135 | /* |
1136 | * If the trigger type is not specified by the caller, | ||
1137 | * then use the default for this interrupt. | ||
1138 | */ | ||
1139 | if (!(new->flags & IRQF_TRIGGER_MASK)) | ||
1140 | new->flags |= irqd_get_trigger_type(&desc->irq_data); | ||
1141 | |||
1142 | /* | ||
1120 | * Check whether the interrupt nests into another interrupt | 1143 | * Check whether the interrupt nests into another interrupt |
1121 | * thread. | 1144 | * thread. |
1122 | */ | 1145 | */ |
@@ -1409,10 +1432,18 @@ int setup_irq(unsigned int irq, struct irqaction *act) | |||
1409 | 1432 | ||
1410 | if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) | 1433 | if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) |
1411 | return -EINVAL; | 1434 | return -EINVAL; |
1435 | |||
1436 | retval = irq_chip_pm_get(&desc->irq_data); | ||
1437 | if (retval < 0) | ||
1438 | return retval; | ||
1439 | |||
1412 | chip_bus_lock(desc); | 1440 | chip_bus_lock(desc); |
1413 | retval = __setup_irq(irq, desc, act); | 1441 | retval = __setup_irq(irq, desc, act); |
1414 | chip_bus_sync_unlock(desc); | 1442 | chip_bus_sync_unlock(desc); |
1415 | 1443 | ||
1444 | if (retval) | ||
1445 | irq_chip_pm_put(&desc->irq_data); | ||
1446 | |||
1416 | return retval; | 1447 | return retval; |
1417 | } | 1448 | } |
1418 | EXPORT_SYMBOL_GPL(setup_irq); | 1449 | EXPORT_SYMBOL_GPL(setup_irq); |
@@ -1506,6 +1537,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
1506 | } | 1537 | } |
1507 | } | 1538 | } |
1508 | 1539 | ||
1540 | irq_chip_pm_put(&desc->irq_data); | ||
1509 | module_put(desc->owner); | 1541 | module_put(desc->owner); |
1510 | kfree(action->secondary); | 1542 | kfree(action->secondary); |
1511 | return action; | 1543 | return action; |
@@ -1648,11 +1680,16 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
1648 | action->name = devname; | 1680 | action->name = devname; |
1649 | action->dev_id = dev_id; | 1681 | action->dev_id = dev_id; |
1650 | 1682 | ||
1683 | retval = irq_chip_pm_get(&desc->irq_data); | ||
1684 | if (retval < 0) | ||
1685 | return retval; | ||
1686 | |||
1651 | chip_bus_lock(desc); | 1687 | chip_bus_lock(desc); |
1652 | retval = __setup_irq(irq, desc, action); | 1688 | retval = __setup_irq(irq, desc, action); |
1653 | chip_bus_sync_unlock(desc); | 1689 | chip_bus_sync_unlock(desc); |
1654 | 1690 | ||
1655 | if (retval) { | 1691 | if (retval) { |
1692 | irq_chip_pm_put(&desc->irq_data); | ||
1656 | kfree(action->secondary); | 1693 | kfree(action->secondary); |
1657 | kfree(action); | 1694 | kfree(action); |
1658 | } | 1695 | } |
@@ -1730,7 +1767,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) | |||
1730 | if (!desc) | 1767 | if (!desc) |
1731 | return; | 1768 | return; |
1732 | 1769 | ||
1770 | /* | ||
1771 | * If the trigger type is not specified by the caller, then | ||
1772 | * use the default for this interrupt. | ||
1773 | */ | ||
1733 | type &= IRQ_TYPE_SENSE_MASK; | 1774 | type &= IRQ_TYPE_SENSE_MASK; |
1775 | if (type == IRQ_TYPE_NONE) | ||
1776 | type = irqd_get_trigger_type(&desc->irq_data); | ||
1777 | |||
1734 | if (type != IRQ_TYPE_NONE) { | 1778 | if (type != IRQ_TYPE_NONE) { |
1735 | int ret; | 1779 | int ret; |
1736 | 1780 | ||
@@ -1822,6 +1866,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ | |||
1822 | 1866 | ||
1823 | unregister_handler_proc(irq, action); | 1867 | unregister_handler_proc(irq, action); |
1824 | 1868 | ||
1869 | irq_chip_pm_put(&desc->irq_data); | ||
1825 | module_put(desc->owner); | 1870 | module_put(desc->owner); |
1826 | return action; | 1871 | return action; |
1827 | 1872 | ||
@@ -1884,10 +1929,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) | |||
1884 | 1929 | ||
1885 | if (!desc || !irq_settings_is_per_cpu_devid(desc)) | 1930 | if (!desc || !irq_settings_is_per_cpu_devid(desc)) |
1886 | return -EINVAL; | 1931 | return -EINVAL; |
1932 | |||
1933 | retval = irq_chip_pm_get(&desc->irq_data); | ||
1934 | if (retval < 0) | ||
1935 | return retval; | ||
1936 | |||
1887 | chip_bus_lock(desc); | 1937 | chip_bus_lock(desc); |
1888 | retval = __setup_irq(irq, desc, act); | 1938 | retval = __setup_irq(irq, desc, act); |
1889 | chip_bus_sync_unlock(desc); | 1939 | chip_bus_sync_unlock(desc); |
1890 | 1940 | ||
1941 | if (retval) | ||
1942 | irq_chip_pm_put(&desc->irq_data); | ||
1943 | |||
1891 | return retval; | 1944 | return retval; |
1892 | } | 1945 | } |
1893 | 1946 | ||
@@ -1931,12 +1984,18 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
1931 | action->name = devname; | 1984 | action->name = devname; |
1932 | action->percpu_dev_id = dev_id; | 1985 | action->percpu_dev_id = dev_id; |
1933 | 1986 | ||
1987 | retval = irq_chip_pm_get(&desc->irq_data); | ||
1988 | if (retval < 0) | ||
1989 | return retval; | ||
1990 | |||
1934 | chip_bus_lock(desc); | 1991 | chip_bus_lock(desc); |
1935 | retval = __setup_irq(irq, desc, action); | 1992 | retval = __setup_irq(irq, desc, action); |
1936 | chip_bus_sync_unlock(desc); | 1993 | chip_bus_sync_unlock(desc); |
1937 | 1994 | ||
1938 | if (retval) | 1995 | if (retval) { |
1996 | irq_chip_pm_put(&desc->irq_data); | ||
1939 | kfree(action); | 1997 | kfree(action); |
1998 | } | ||
1940 | 1999 | ||
1941 | return retval; | 2000 | return retval; |
1942 | } | 2001 | } |
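A hedged sketch of what the trigger-type fallback above means for a driver: with the firmware mapping already recording the sense, the IRQF_TRIGGER_* flag can be omitted. my_handler and my_dev are hypothetical.

	/* Trigger type falls back to irqd_get_trigger_type() of the mapping. */
	ret = request_irq(irq, my_handler, 0, "my_dev", my_dev);
	if (ret)
		return ret;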
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 38e89ce7b071..54999350162c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
@@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, | |||
324 | struct msi_domain_ops *ops = info->ops; | 324 | struct msi_domain_ops *ops = info->ops; |
325 | msi_alloc_info_t arg; | 325 | msi_alloc_info_t arg; |
326 | struct msi_desc *desc; | 326 | struct msi_desc *desc; |
327 | int i, ret, virq = -1; | 327 | int i, ret, virq; |
328 | 328 | ||
329 | ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); | 329 | ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); |
330 | if (ret) | 330 | if (ret) |
@@ -332,13 +332,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, | |||
332 | 332 | ||
333 | for_each_msi_entry(desc, dev) { | 333 | for_each_msi_entry(desc, dev) { |
334 | ops->set_desc(&arg, desc); | 334 | ops->set_desc(&arg, desc); |
335 | if (info->flags & MSI_FLAG_IDENTITY_MAP) | ||
336 | virq = (int)ops->get_hwirq(info, &arg); | ||
337 | else | ||
338 | virq = -1; | ||
339 | 335 | ||
340 | virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, | 336 | virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, |
341 | dev_to_node(dev), &arg, false); | 337 | dev_to_node(dev), &arg, false, |
338 | desc->affinity); | ||
342 | if (virq < 0) { | 339 | if (virq < 0) { |
343 | ret = -ENOSPC; | 340 | ret = -ENOSPC; |
344 | if (ops->handle_error) | 341 | if (ops->handle_error) |
@@ -356,6 +353,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, | |||
356 | ops->msi_finish(&arg, 0); | 353 | ops->msi_finish(&arg, 0); |
357 | 354 | ||
358 | for_each_msi_entry(desc, dev) { | 355 | for_each_msi_entry(desc, dev) { |
356 | virq = desc->irq; | ||
359 | if (desc->nvec_used == 1) | 357 | if (desc->nvec_used == 1) |
360 | dev_dbg(dev, "irq %d for MSI\n", virq); | 358 | dev_dbg(dev, "irq %d for MSI\n", virq); |
361 | else | 359 | else |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4e1b94726818..feaa813b84a9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, | |||
96 | cpumask_var_t new_value; | 96 | cpumask_var_t new_value; |
97 | int err; | 97 | int err; |
98 | 98 | ||
99 | if (!irq_can_set_affinity(irq) || no_irq_affinity) | 99 | if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) |
100 | return -EIO; | 100 | return -EIO; |
101 | 101 | ||
102 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) | 102 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) |
@@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
311 | !name_unique(irq, action)) | 311 | !name_unique(irq, action)) |
312 | return; | 312 | return; |
313 | 313 | ||
314 | memset(name, 0, MAX_NAMELEN); | ||
315 | snprintf(name, MAX_NAMELEN, "%s", action->name); | 314 | snprintf(name, MAX_NAMELEN, "%s", action->name); |
316 | 315 | ||
317 | /* create /proc/irq/1234/handler/ */ | 316 | /* create /proc/irq/1234/handler/ */ |
@@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
340 | if (desc->dir) | 339 | if (desc->dir) |
341 | goto out_unlock; | 340 | goto out_unlock; |
342 | 341 | ||
343 | memset(name, 0, MAX_NAMELEN); | ||
344 | sprintf(name, "%d", irq); | 342 | sprintf(name, "%d", irq); |
345 | 343 | ||
346 | /* create /proc/irq/1234 */ | 344 | /* create /proc/irq/1234 */ |
@@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
386 | #endif | 384 | #endif |
387 | remove_proc_entry("spurious", desc->dir); | 385 | remove_proc_entry("spurious", desc->dir); |
388 | 386 | ||
389 | memset(name, 0, MAX_NAMELEN); | ||
390 | sprintf(name, "%u", irq); | 387 | sprintf(name, "%u", irq); |
391 | remove_proc_entry(name, root_irq_dir); | 388 | remove_proc_entry(name, root_irq_dir); |
392 | } | 389 | } |
@@ -421,12 +418,8 @@ void init_irq_proc(void) | |||
421 | /* | 418 | /* |
422 | * Create entries for all existing IRQs. | 419 | * Create entries for all existing IRQs. |
423 | */ | 420 | */ |
424 | for_each_irq_desc(irq, desc) { | 421 | for_each_irq_desc(irq, desc) |
425 | if (!desc) | ||
426 | continue; | ||
427 | |||
428 | register_irq_proc(irq, desc); | 422 | register_irq_proc(irq, desc); |
429 | } | ||
430 | } | 423 | } |
431 | 424 | ||
432 | #ifdef CONFIG_GENERIC_IRQ_SHOW | 425 | #ifdef CONFIG_GENERIC_IRQ_SHOW |
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..0dbea887d625 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key); | |||
58 | 58 | ||
59 | void static_key_slow_inc(struct static_key *key) | 59 | void static_key_slow_inc(struct static_key *key) |
60 | { | 60 | { |
61 | int v, v1; | ||
62 | |||
61 | STATIC_KEY_CHECK_USE(); | 63 | STATIC_KEY_CHECK_USE(); |
62 | if (atomic_inc_not_zero(&key->enabled)) | 64 | |
63 | return; | 65 | /* |
66 | * Careful if we get concurrent static_key_slow_inc() calls; | ||
67 | * later calls must wait for the first one to _finish_ the | ||
68 | * jump_label_update() process. At the same time, however, | ||
69 | * the jump_label_update() call below wants to see | ||
70 | * static_key_enabled(&key) for jumps to be updated properly. | ||
71 | * | ||
72 | * So give a special meaning to negative key->enabled: it sends | ||
73 | * static_key_slow_inc() down the slow path, and it is non-zero | ||
74 | * so it counts as "enabled" in jump_label_update(). Note that | ||
75 | * atomic_inc_unless_negative() checks >= 0, so roll our own. | ||
76 | */ | ||
77 | for (v = atomic_read(&key->enabled); v > 0; v = v1) { | ||
78 | v1 = atomic_cmpxchg(&key->enabled, v, v + 1); | ||
79 | if (likely(v1 == v)) | ||
80 | return; | ||
81 | } | ||
64 | 82 | ||
65 | jump_label_lock(); | 83 | jump_label_lock(); |
66 | if (atomic_inc_return(&key->enabled) == 1) | 84 | if (atomic_read(&key->enabled) == 0) { |
85 | atomic_set(&key->enabled, -1); | ||
67 | jump_label_update(key); | 86 | jump_label_update(key); |
87 | atomic_set(&key->enabled, 1); | ||
88 | } else { | ||
89 | atomic_inc(&key->enabled); | ||
90 | } | ||
68 | jump_label_unlock(); | 91 | jump_label_unlock(); |
69 | } | 92 | } |
70 | EXPORT_SYMBOL_GPL(static_key_slow_inc); | 93 | EXPORT_SYMBOL_GPL(static_key_slow_inc); |
@@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); | |||
72 | static void __static_key_slow_dec(struct static_key *key, | 95 | static void __static_key_slow_dec(struct static_key *key, |
73 | unsigned long rate_limit, struct delayed_work *work) | 96 | unsigned long rate_limit, struct delayed_work *work) |
74 | { | 97 | { |
98 | /* | ||
99 | * The negative count check is valid even when a negative | ||
100 | * key->enabled is in use by static_key_slow_inc(); a | ||
101 | * __static_key_slow_dec() before the first static_key_slow_inc() | ||
102 | * returns is unbalanced, because all other static_key_slow_inc() | ||
103 | * instances block while the update is in progress. | ||
104 | */ | ||
75 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { | 105 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { |
76 | WARN(atomic_read(&key->enabled) < 0, | 106 | WARN(atomic_read(&key->enabled) < 0, |
77 | "jump label: negative count!\n"); | 107 | "jump label: negative count!\n"); |
@@ -422,7 +452,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, | |||
422 | return notifier_from_errno(ret); | 452 | return notifier_from_errno(ret); |
423 | } | 453 | } |
424 | 454 | ||
425 | struct notifier_block jump_label_module_nb = { | 455 | static struct notifier_block jump_label_module_nb = { |
426 | .notifier_call = jump_label_module_notify, | 456 | .notifier_call = jump_label_module_notify, |
427 | .priority = 1, /* higher than tracepoints */ | 457 | .priority = 1, /* higher than tracepoints */ |
428 | }; | 458 | }; |
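The jump_label hunk above gives a negative key->enabled a special meaning: the first enabler parks the count at -1 while it patches the code, so concurrent fast-path callers fall through to the slow path and serialize behind jump_label_lock(). A minimal user-space sketch of the same fast/slow split, assuming C11 atomics and a pthread mutex stand in for the kernel's atomic_t and jump_label_mutex:

    /*
     * Sketch only: patch_code() is a placeholder for jump_label_update();
     * nothing here is the kernel implementation itself.
     */
    #include <stdatomic.h>
    #include <pthread.h>

    static atomic_int enabled;                  /* mirrors key->enabled */
    static pthread_mutex_t slow_lock = PTHREAD_MUTEX_INITIALIZER;

    static void patch_code(void) { /* jump_label_update() placeholder */ }

    static void slow_inc(void)
    {
            int v = atomic_load(&enabled);

            /* Fast path: bump the count only while it is strictly positive. */
            while (v > 0) {
                    /* a failed CAS reloads v, so simply retry */
                    if (atomic_compare_exchange_weak(&enabled, &v, v + 1))
                            return;
            }

            /* Slow path: the first enabler does the patching under the lock. */
            pthread_mutex_lock(&slow_lock);
            if (atomic_load(&enabled) == 0) {
                    atomic_store(&enabled, -1);  /* non-zero: blocks the fast path */
                    patch_code();
                    atomic_store(&enabled, 1);
            } else {
                    atomic_fetch_add(&enabled, 1);
            }
            pthread_mutex_unlock(&slow_lock);
    }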
diff --git a/kernel/kcov.c b/kernel/kcov.c index a02f2dddd1d7..8d44b3fea9d0 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c | |||
@@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = { | |||
264 | 264 | ||
265 | static int __init kcov_init(void) | 265 | static int __init kcov_init(void) |
266 | { | 266 | { |
267 | if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { | 267 | /* |
268 | * The kcov debugfs file won't ever get removed and thus, | ||
269 | * there is no need to protect it against removal races. The | ||
270 | * use of debugfs_create_file_unsafe() is actually safe here. | ||
271 | */ | ||
272 | if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) { | ||
268 | pr_err("failed to create kcov in debugfs\n"); | 273 | pr_err("failed to create kcov in debugfs\n"); |
269 | return -ENOMEM; | 274 | return -ENOMEM; |
270 | } | 275 | } |
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 81f1a7107c0e..589d763a49b3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include <linux/gfp.h> | 46 | #include <linux/gfp.h> |
47 | #include <linux/kmemcheck.h> | 47 | #include <linux/kmemcheck.h> |
48 | #include <linux/random.h> | 48 | #include <linux/random.h> |
49 | #include <linux/jhash.h> | ||
49 | 50 | ||
50 | #include <asm/sections.h> | 51 | #include <asm/sections.h> |
51 | 52 | ||
@@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE]; | |||
309 | * It's a 64-bit hash, because it's important for the keys to be | 310 | * It's a 64-bit hash, because it's important for the keys to be |
310 | * unique. | 311 | * unique. |
311 | */ | 312 | */ |
312 | #define iterate_chain_key(key1, key2) \ | 313 | static inline u64 iterate_chain_key(u64 key, u32 idx) |
313 | (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ | 314 | { |
314 | ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ | 315 | u32 k0 = key, k1 = key >> 32; |
315 | (key2)) | 316 | |
317 | __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */ | ||
318 | |||
319 | return k0 | (u64)k1 << 32; | ||
320 | } | ||
316 | 321 | ||
317 | void lockdep_off(void) | 322 | void lockdep_off(void) |
318 | { | 323 | { |
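For reference, the new iterate_chain_key() mixes the 64-bit chain key with the class index via jhash's mixing step; the local k0/k1 copies exist precisely because __jhash_mix() writes back into its arguments. A standalone sketch, with rol32() and __jhash_mix() copied in (assumed to match <linux/jhash.h>) so it compiles outside the kernel:

    #include <stdint.h>

    static inline uint32_t rol32(uint32_t w, unsigned s)
    {
            return (w << s) | (w >> (32 - s));
    }

    /* mix 3 32-bit values reversibly; note it modifies its arguments */
    #define __jhash_mix(a, b, c)                        \
    {                                                   \
            a -= c;  a ^= rol32(c, 4);  c += b;         \
            b -= a;  b ^= rol32(a, 6);  a += c;         \
            c -= b;  c ^= rol32(b, 8);  b += a;         \
            a -= c;  a ^= rol32(c, 16); c += b;         \
            b -= a;  b ^= rol32(a, 19); a += c;         \
            c -= b;  c ^= rol32(b, 4);  b += a;         \
    }

    static inline uint64_t iterate_chain_key(uint64_t key, uint32_t idx)
    {
            uint32_t k0 = key, k1 = key >> 32;

            /* the macro writes back into idx, k0 and k1 -- hence the copies */
            __jhash_mix(idx, k0, k1);

            return k0 | (uint64_t)k1 << 32;
    }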
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c | |||
@@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) | |||
49 | } | 49 | } |
50 | 50 | ||
51 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 51 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
52 | struct thread_info *ti) | 52 | struct task_struct *task) |
53 | { | 53 | { |
54 | SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); | 54 | SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); |
55 | 55 | ||
56 | /* Mark the current thread as blocked on the lock: */ | 56 | /* Mark the current thread as blocked on the lock: */ |
57 | ti->task->blocked_on = waiter; | 57 | task->blocked_on = waiter; |
58 | } | 58 | } |
59 | 59 | ||
60 | void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 60 | void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
61 | struct thread_info *ti) | 61 | struct task_struct *task) |
62 | { | 62 | { |
63 | DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); | 63 | DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); |
64 | DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); | 64 | DEBUG_LOCKS_WARN_ON(waiter->task != task); |
65 | DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); | 65 | DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); |
66 | ti->task->blocked_on = NULL; | 66 | task->blocked_on = NULL; |
67 | 67 | ||
68 | list_del_init(&waiter->list); | 68 | list_del_init(&waiter->list); |
69 | waiter->task = NULL; | 69 | waiter->task = NULL; |
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..57a871ae3c81 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h | |||
@@ -20,21 +20,21 @@ extern void debug_mutex_wake_waiter(struct mutex *lock, | |||
20 | extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); | 20 | extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); |
21 | extern void debug_mutex_add_waiter(struct mutex *lock, | 21 | extern void debug_mutex_add_waiter(struct mutex *lock, |
22 | struct mutex_waiter *waiter, | 22 | struct mutex_waiter *waiter, |
23 | struct thread_info *ti); | 23 | struct task_struct *task); |
24 | extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 24 | extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
25 | struct thread_info *ti); | 25 | struct task_struct *task); |
26 | extern void debug_mutex_unlock(struct mutex *lock); | 26 | extern void debug_mutex_unlock(struct mutex *lock); |
27 | extern void debug_mutex_init(struct mutex *lock, const char *name, | 27 | extern void debug_mutex_init(struct mutex *lock, const char *name, |
28 | struct lock_class_key *key); | 28 | struct lock_class_key *key); |
29 | 29 | ||
30 | static inline void mutex_set_owner(struct mutex *lock) | 30 | static inline void mutex_set_owner(struct mutex *lock) |
31 | { | 31 | { |
32 | lock->owner = current; | 32 | WRITE_ONCE(lock->owner, current); |
33 | } | 33 | } |
34 | 34 | ||
35 | static inline void mutex_clear_owner(struct mutex *lock) | 35 | static inline void mutex_clear_owner(struct mutex *lock) |
36 | { | 36 | { |
37 | lock->owner = NULL; | 37 | WRITE_ONCE(lock->owner, NULL); |
38 | } | 38 | } |
39 | 39 | ||
40 | #define spin_lock_mutex(lock, flags) \ | 40 | #define spin_lock_mutex(lock, flags) \ |
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index e364b424b019..a70b90db3909 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
@@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | |||
486 | if (!hold_ctx) | 486 | if (!hold_ctx) |
487 | return 0; | 487 | return 0; |
488 | 488 | ||
489 | if (unlikely(ctx == hold_ctx)) | ||
490 | return -EALREADY; | ||
491 | |||
492 | if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && | 489 | if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && |
493 | (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { | 490 | (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { |
494 | #ifdef CONFIG_DEBUG_MUTEXES | 491 | #ifdef CONFIG_DEBUG_MUTEXES |
@@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
514 | unsigned long flags; | 511 | unsigned long flags; |
515 | int ret; | 512 | int ret; |
516 | 513 | ||
514 | if (use_ww_ctx) { | ||
515 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | ||
516 | if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) | ||
517 | return -EALREADY; | ||
518 | } | ||
519 | |||
517 | preempt_disable(); | 520 | preempt_disable(); |
518 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); | 521 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); |
519 | 522 | ||
@@ -534,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
534 | goto skip_wait; | 537 | goto skip_wait; |
535 | 538 | ||
536 | debug_mutex_lock_common(lock, &waiter); | 539 | debug_mutex_lock_common(lock, &waiter); |
537 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); | 540 | debug_mutex_add_waiter(lock, &waiter, task); |
538 | 541 | ||
539 | /* add waiting tasks to the end of the waitqueue (FIFO): */ | 542 | /* add waiting tasks to the end of the waitqueue (FIFO): */ |
540 | list_add_tail(&waiter.list, &lock->wait_list); | 543 | list_add_tail(&waiter.list, &lock->wait_list); |
@@ -581,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
581 | } | 584 | } |
582 | __set_task_state(task, TASK_RUNNING); | 585 | __set_task_state(task, TASK_RUNNING); |
583 | 586 | ||
584 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | 587 | mutex_remove_waiter(lock, &waiter, task); |
585 | /* set it to 0 if there are no waiters left: */ | 588 | /* set it to 0 if there are no waiters left: */ |
586 | if (likely(list_empty(&lock->wait_list))) | 589 | if (likely(list_empty(&lock->wait_list))) |
587 | atomic_set(&lock->count, 0); | 590 | atomic_set(&lock->count, 0); |
@@ -602,7 +605,7 @@ skip_wait: | |||
602 | return 0; | 605 | return 0; |
603 | 606 | ||
604 | err: | 607 | err: |
605 | mutex_remove_waiter(lock, &waiter, task_thread_info(task)); | 608 | mutex_remove_waiter(lock, &waiter, task); |
606 | spin_unlock_mutex(&lock->wait_lock, flags); | 609 | spin_unlock_mutex(&lock->wait_lock, flags); |
607 | debug_mutex_free_waiter(&waiter); | 610 | debug_mutex_free_waiter(&waiter); |
608 | mutex_release(&lock->dep_map, 1, ip); | 611 | mutex_release(&lock->dep_map, 1, ip); |
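The -EALREADY detection moves from the stamp check into __mutex_lock_common(), so a same-context relock is caught before the slowpath takes wait_lock. In caller terms, this is the case it reports (sketch only; the ww_mutex calls are the real API, everything else is hypothetical):

    static DEFINE_WW_CLASS(demo_ww_class);      /* hypothetical class */
    static struct ww_mutex obj_lock;

    static void demo(void)
    {
            struct ww_acquire_ctx ctx;
            int err;

            ww_mutex_init(&obj_lock, &demo_ww_class);
            ww_acquire_init(&ctx, &demo_ww_class);

            ww_mutex_lock(&obj_lock, &ctx);         /* ctx now owns obj_lock */
            err = ww_mutex_lock(&obj_lock, &ctx);   /* same ctx again: -EALREADY,
                                                       now flagged before the
                                                       slowpath runs */
            ww_mutex_unlock(&obj_lock);
            ww_acquire_fini(&ctx);
    }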
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..6cd6b8e9efd7 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h | |||
@@ -13,18 +13,24 @@ | |||
13 | do { spin_lock(lock); (void)(flags); } while (0) | 13 | do { spin_lock(lock); (void)(flags); } while (0) |
14 | #define spin_unlock_mutex(lock, flags) \ | 14 | #define spin_unlock_mutex(lock, flags) \ |
15 | do { spin_unlock(lock); (void)(flags); } while (0) | 15 | do { spin_unlock(lock); (void)(flags); } while (0) |
16 | #define mutex_remove_waiter(lock, waiter, ti) \ | 16 | #define mutex_remove_waiter(lock, waiter, task) \ |
17 | __list_del((waiter)->list.prev, (waiter)->list.next) | 17 | __list_del((waiter)->list.prev, (waiter)->list.next) |
18 | 18 | ||
19 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 19 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
20 | /* | ||
21 | * The mutex owner is read and written locklessly. | ||
22 | * Use WRITE_ONCE() when writing the owner value to | ||
23 | * avoid store tearing; otherwise, a thread could potentially | ||
24 | * read a partially written, incomplete owner value. | ||
25 | */ | ||
20 | static inline void mutex_set_owner(struct mutex *lock) | 26 | static inline void mutex_set_owner(struct mutex *lock) |
21 | { | 27 | { |
22 | lock->owner = current; | 28 | WRITE_ONCE(lock->owner, current); |
23 | } | 29 | } |
24 | 30 | ||
25 | static inline void mutex_clear_owner(struct mutex *lock) | 31 | static inline void mutex_clear_owner(struct mutex *lock) |
26 | { | 32 | { |
27 | lock->owner = NULL; | 33 | WRITE_ONCE(lock->owner, NULL); |
28 | } | 34 | } |
29 | #else | 35 | #else |
30 | static inline void mutex_set_owner(struct mutex *lock) | 36 | static inline void mutex_set_owner(struct mutex *lock) |
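The WRITE_ONCE() conversion is about the lockless readers on the spinning side: a plain store may legally be split by the compiler, while a volatile access forces a single full-width store. Roughly, and simplified (the real kernel macros go through __read_once_size()/__write_once_size()):

    #define MY_READ_ONCE(x)        (*(const volatile typeof(x) *)&(x))
    #define MY_WRITE_ONCE(x, val)  (*(volatile typeof(x) *)&(x) = (val))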
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fec082338668..19248ddf37ce 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c | |||
@@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) | |||
93 | * that accesses can't leak upwards out of our subsequent critical | 93 | * that accesses can't leak upwards out of our subsequent critical |
94 | * section in the case that the lock is currently held for write. | 94 | * section in the case that the lock is currently held for write. |
95 | */ | 95 | */ |
96 | cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; | 96 | cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); |
97 | rspin_until_writer_unlock(lock, cnts); | 97 | rspin_until_writer_unlock(lock, cnts); |
98 | 98 | ||
99 | /* | 99 | /* |
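The qrwlock change is behaviour-preserving: atomic_fetch_add_acquire() returns the value the counter held before the addition, which is exactly what the old add-then-subtract expression computed. A tiny user-space illustration of the fetch-vs-return distinction (C11 atomics as a stand-in):

    #include <stdatomic.h>
    #include <assert.h>

    int main(void)
    {
            atomic_int cnts = 5;
            int old  = atomic_fetch_add(&cnts, 3);  /* fetch_add returns the old value */
            int newv = atomic_load(&cnts);

            assert(old == 5 && newv == 8);          /* old == new - bias */
            return 0;
    }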
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ce2f75e32ae1..b2caec7315af 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c | |||
@@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); | |||
90 | * therefore increment the cpu number by one. | 90 | * therefore increment the cpu number by one. |
91 | */ | 91 | */ |
92 | 92 | ||
93 | static inline u32 encode_tail(int cpu, int idx) | 93 | static inline __pure u32 encode_tail(int cpu, int idx) |
94 | { | 94 | { |
95 | u32 tail; | 95 | u32 tail; |
96 | 96 | ||
@@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx) | |||
103 | return tail; | 103 | return tail; |
104 | } | 104 | } |
105 | 105 | ||
106 | static inline struct mcs_spinlock *decode_tail(u32 tail) | 106 | static inline __pure struct mcs_spinlock *decode_tail(u32 tail) |
107 | { | 107 | { |
108 | int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; | 108 | int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; |
109 | int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; | 109 | int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; |
@@ -267,6 +267,123 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, | |||
267 | #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath | 267 | #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath |
268 | #endif | 268 | #endif |
269 | 269 | ||
270 | /* | ||
271 | * Various notes on spin_is_locked() and spin_unlock_wait(), which are | ||
272 | * 'interesting' functions: | ||
273 | * | ||
274 | * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE | ||
275 | * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, | ||
276 | * PPC). Also qspinlock has a similar issue per construction, the setting of | ||
277 | * the locked byte can be unordered acquiring the lock proper. | ||
278 | * | ||
279 | * This gets to be 'interesting' in the following cases, where the /should/s | ||
280 | * end up false because of this issue. | ||
281 | * | ||
282 | * | ||
283 | * CASE 1: | ||
284 | * | ||
285 | * So the spin_is_locked() correctness issue comes from something like: | ||
286 | * | ||
287 | * CPU0 CPU1 | ||
288 | * | ||
289 | * global_lock(); local_lock(i) | ||
290 | * spin_lock(&G) spin_lock(&L[i]) | ||
291 | * for (i) if (!spin_is_locked(&G)) { | ||
292 | * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); | ||
293 | * return; | ||
294 | * } | ||
295 | * // deal with fail | ||
296 | * | ||
297 | * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such | ||
298 | * that there is exclusion between the two critical sections. | ||
299 | * | ||
300 | * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from | ||
301 | * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) | ||
302 | * /should/ be constrained by the ACQUIRE from spin_lock(&G). | ||
303 | * | ||
304 | * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. | ||
305 | * | ||
306 | * | ||
307 | * CASE 2: | ||
308 | * | ||
309 | * For spin_unlock_wait() there is a second correctness issue, namely: | ||
310 | * | ||
311 | * CPU0 CPU1 | ||
312 | * | ||
313 | * flag = set; | ||
314 | * smp_mb(); spin_lock(&l) | ||
315 | * spin_unlock_wait(&l); if (!flag) | ||
316 | * // add to lockless list | ||
317 | * spin_unlock(&l); | ||
318 | * // iterate lockless list | ||
319 | * | ||
320 | * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 | ||
321 | * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE | ||
322 | * semantics etc..) | ||
323 | * | ||
324 | * Where flag /should/ be ordered against the locked store of l. | ||
325 | */ | ||
326 | |||
327 | /* | ||
328 | * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before | ||
329 | * issuing an _unordered_ store to set _Q_LOCKED_VAL. | ||
330 | * | ||
331 | * This means that the store can be delayed, but no later than the | ||
332 | * store-release from the unlock. This means that simply observing | ||
333 | * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. | ||
334 | * | ||
335 | * There are two paths that can issue the unordered store: | ||
336 | * | ||
337 | * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 | ||
338 | * | ||
339 | * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 | ||
340 | * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 | ||
341 | * | ||
342 | * However, in both cases we have other !0 state we've set before to queue | ||
343 | * ourselves: | ||
344 | * | ||
345 | * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our | ||
346 | * load is constrained by that ACQUIRE to not pass before that, and thus must | ||
347 | * observe the store. | ||
348 | * | ||
349 | * For (2) we have a more interesting scenario. We enqueue ourselves using | ||
350 | * xchg_tail(), which ends up being a RELEASE. This in itself is not | ||
351 | * sufficient, however that is followed by an smp_cond_acquire() on the same | ||
352 | * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and | ||
353 | * guarantees we must observe that store. | ||
354 | * | ||
355 | * Therefore both cases have other !0 state that is observable before the | ||
356 | * unordered locked byte store comes through. This means we can use that to | ||
357 | * wait for the lock store, and then wait for an unlock. | ||
358 | */ | ||
359 | #ifndef queued_spin_unlock_wait | ||
360 | void queued_spin_unlock_wait(struct qspinlock *lock) | ||
361 | { | ||
362 | u32 val; | ||
363 | |||
364 | for (;;) { | ||
365 | val = atomic_read(&lock->val); | ||
366 | |||
367 | if (!val) /* not locked, we're done */ | ||
368 | goto done; | ||
369 | |||
370 | if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ | ||
371 | break; | ||
372 | |||
373 | /* not locked, but pending, wait until we observe the lock */ | ||
374 | cpu_relax(); | ||
375 | } | ||
376 | |||
377 | /* any unlock is good */ | ||
378 | while (atomic_read(&lock->val) & _Q_LOCKED_MASK) | ||
379 | cpu_relax(); | ||
380 | |||
381 | done: | ||
382 | smp_acquire__after_ctrl_dep(); | ||
383 | } | ||
384 | EXPORT_SYMBOL(queued_spin_unlock_wait); | ||
385 | #endif | ||
386 | |||
270 | #endif /* _GEN_PV_LOCK_SLOWPATH */ | 387 | #endif /* _GEN_PV_LOCK_SLOWPATH */ |
271 | 388 | ||
272 | /** | 389 | /** |
@@ -358,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) | |||
358 | * sequentiality; this is because not all clear_pending_set_locked() | 475 | * sequentiality; this is because not all clear_pending_set_locked() |
359 | * implementations imply full barriers. | 476 | * implementations imply full barriers. |
360 | */ | 477 | */ |
361 | smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); | 478 | smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); |
362 | 479 | ||
363 | /* | 480 | /* |
364 | * take ownership and clear the pending bit. | 481 | * take ownership and clear the pending bit. |
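smp_cond_acquire(cond) is replaced by smp_cond_load_acquire(ptr, cond) throughout this file, which also hands the loaded value back to the caller. Its generic form is roughly the following busy-wait-then-acquire loop, with VAL bound to each freshly loaded *ptr (simplified sketch of the asm-generic version; architectures may override it):

    #define my_cond_load_acquire(ptr, cond_expr) ({     \
            typeof(ptr) __PTR = (ptr);                  \
            typeof(*ptr) VAL;                           \
            for (;;) {                                  \
                    VAL = READ_ONCE(*__PTR);            \
                    if (cond_expr)                      \
                            break;                      \
                    cpu_relax();                        \
            }                                           \
            smp_acquire__after_ctrl_dep();              \
            VAL;                                        \
    })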
@@ -395,6 +512,8 @@ queue: | |||
395 | * pending stuff. | 512 | * pending stuff. |
396 | * | 513 | * |
397 | * p,*,* -> n,*,* | 514 | * p,*,* -> n,*,* |
515 | * | ||
516 | * RELEASE, such that the stores to @node must be complete. | ||
398 | */ | 517 | */ |
399 | old = xchg_tail(lock, tail); | 518 | old = xchg_tail(lock, tail); |
400 | next = NULL; | 519 | next = NULL; |
@@ -405,6 +524,15 @@ queue: | |||
405 | */ | 524 | */ |
406 | if (old & _Q_TAIL_MASK) { | 525 | if (old & _Q_TAIL_MASK) { |
407 | prev = decode_tail(old); | 526 | prev = decode_tail(old); |
527 | /* | ||
528 | * The above xchg_tail() is also a load of @lock which generates, | ||
529 | * through decode_tail(), a pointer. | ||
530 | * | ||
531 | * The address dependency matches the RELEASE of xchg_tail() | ||
532 | * such that the access to @prev must happen after. | ||
533 | */ | ||
534 | smp_read_barrier_depends(); | ||
535 | |||
408 | WRITE_ONCE(prev->next, node); | 536 | WRITE_ONCE(prev->next, node); |
409 | 537 | ||
410 | pv_wait_node(node, prev); | 538 | pv_wait_node(node, prev); |
@@ -434,7 +562,7 @@ queue: | |||
434 | * | 562 | * |
435 | * The PV pv_wait_head_or_lock function, if active, will acquire | 563 | * The PV pv_wait_head_or_lock function, if active, will acquire |
436 | * the lock and return a non-zero value. So we have to skip the | 564 | * the lock and return a non-zero value. So we have to skip the |
437 | * smp_cond_acquire() call. As the next PV queue head hasn't been | 565 | * smp_cond_load_acquire() call. As the next PV queue head hasn't been |
438 | * designated yet, there is no way for the locked value to become | 566 | * designated yet, there is no way for the locked value to become |
439 | * _Q_SLOW_VAL. So both the set_locked() and the | 567 | * _Q_SLOW_VAL. So both the set_locked() and the |
440 | * atomic_cmpxchg_relaxed() calls will be safe. | 568 | * atomic_cmpxchg_relaxed() calls will be safe. |
@@ -445,7 +573,7 @@ queue: | |||
445 | if ((val = pv_wait_head_or_lock(lock, node))) | 573 | if ((val = pv_wait_head_or_lock(lock, node))) |
446 | goto locked; | 574 | goto locked; |
447 | 575 | ||
448 | smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); | 576 | val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); |
449 | 577 | ||
450 | locked: | 578 | locked: |
451 | /* | 579 | /* |
@@ -465,9 +593,9 @@ locked: | |||
465 | break; | 593 | break; |
466 | } | 594 | } |
467 | /* | 595 | /* |
468 | * The smp_cond_acquire() call above has provided the necessary | 596 | * The smp_cond_load_acquire() call above has provided the |
469 | * acquire semantics required for locking. At most two | 597 | * necessary acquire semantics required for locking. At most |
470 | * iterations of this loop may be ran. | 598 | * two iterations of this loop may be run. |
471 | */ | 599 | */ |
472 | old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); | 600 | old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); |
473 | if (old == val) | 601 | if (old == val) |
@@ -491,7 +619,7 @@ release: | |||
491 | /* | 619 | /* |
492 | * release the node | 620 | * release the node |
493 | */ | 621 | */ |
494 | this_cpu_dec(mcs_nodes[0].count); | 622 | __this_cpu_dec(mcs_nodes[0].count); |
495 | } | 623 | } |
496 | EXPORT_SYMBOL(queued_spin_lock_slowpath); | 624 | EXPORT_SYMBOL(queued_spin_lock_slowpath); |
497 | 625 | ||
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 21ede57f68b3..37649e69056c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h | |||
@@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) | |||
112 | #else /* _Q_PENDING_BITS == 8 */ | 112 | #else /* _Q_PENDING_BITS == 8 */ |
113 | static __always_inline void set_pending(struct qspinlock *lock) | 113 | static __always_inline void set_pending(struct qspinlock *lock) |
114 | { | 114 | { |
115 | atomic_set_mask(_Q_PENDING_VAL, &lock->val); | 115 | atomic_or(_Q_PENDING_VAL, &lock->val); |
116 | } | 116 | } |
117 | 117 | ||
118 | static __always_inline void clear_pending(struct qspinlock *lock) | 118 | static __always_inline void clear_pending(struct qspinlock *lock) |
119 | { | 119 | { |
120 | atomic_clear_mask(_Q_PENDING_VAL, &lock->val); | 120 | atomic_andnot(_Q_PENDING_VAL, &lock->val); |
121 | } | 121 | } |
122 | 122 | ||
123 | static __always_inline int trylock_clear_pending(struct qspinlock *lock) | 123 | static __always_inline int trylock_clear_pending(struct qspinlock *lock) |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3e746607abe5..1ec0f48962b3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); | |||
1478 | */ | 1478 | */ |
1479 | int __sched rt_mutex_trylock(struct rt_mutex *lock) | 1479 | int __sched rt_mutex_trylock(struct rt_mutex *lock) |
1480 | { | 1480 | { |
1481 | if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) | 1481 | if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) |
1482 | return 0; | 1482 | return 0; |
1483 | 1483 | ||
1484 | return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); | 1484 | return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09e30c6225e5..447e08de1fab 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
@@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, | |||
80 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); | 80 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); |
81 | lockdep_init_map(&sem->dep_map, name, key, 0); | 81 | lockdep_init_map(&sem->dep_map, name, key, 0); |
82 | #endif | 82 | #endif |
83 | sem->count = RWSEM_UNLOCKED_VALUE; | 83 | atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); |
84 | raw_spin_lock_init(&sem->wait_lock); | 84 | raw_spin_lock_init(&sem->wait_lock); |
85 | INIT_LIST_HEAD(&sem->wait_list); | 85 | INIT_LIST_HEAD(&sem->wait_list); |
86 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | 86 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
@@ -114,12 +114,16 @@ enum rwsem_wake_type { | |||
114 | * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) | 114 | * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) |
115 | * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) | 115 | * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) |
116 | * - there must be someone on the queue | 116 | * - there must be someone on the queue |
117 | * - the spinlock must be held by the caller | 117 | * - the wait_lock must be held by the caller |
118 | * - tasks are marked for wakeup, the caller must later invoke wake_up_q() | ||
119 | * to actually wakeup the blocked task(s) and drop the reference count, | ||
120 | * preferably when the wait_lock is released | ||
118 | * - woken process blocks are discarded from the list after having task zeroed | 121 | * - woken process blocks are discarded from the list after having task zeroed |
119 | * - writers are only woken if downgrading is false | 122 | * - writers are only marked woken if downgrading is false |
120 | */ | 123 | */ |
121 | static struct rw_semaphore * | 124 | static struct rw_semaphore * |
122 | __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | 125 | __rwsem_mark_wake(struct rw_semaphore *sem, |
126 | enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) | ||
123 | { | 127 | { |
124 | struct rwsem_waiter *waiter; | 128 | struct rwsem_waiter *waiter; |
125 | struct task_struct *tsk; | 129 | struct task_struct *tsk; |
@@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
128 | 132 | ||
129 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | 133 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); |
130 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { | 134 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { |
131 | if (wake_type == RWSEM_WAKE_ANY) | 135 | if (wake_type == RWSEM_WAKE_ANY) { |
132 | /* Wake writer at the front of the queue, but do not | 136 | /* |
133 | * grant it the lock yet as we want other writers | 137 | * Mark writer at the front of the queue for wakeup. |
134 | * to be able to steal it. Readers, on the other hand, | 138 | * Until the task is actually later awoken later by |
135 | * will block as they will notice the queued writer. | 139 | * the caller, other writers are able to steal it. |
140 | * Readers, on the other hand, will block as they | ||
141 | * will notice the queued writer. | ||
136 | */ | 142 | */ |
137 | wake_up_process(waiter->task); | 143 | wake_q_add(wake_q, waiter->task); |
144 | } | ||
138 | goto out; | 145 | goto out; |
139 | } | 146 | } |
140 | 147 | ||
@@ -146,15 +153,27 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
146 | if (wake_type != RWSEM_WAKE_READ_OWNED) { | 153 | if (wake_type != RWSEM_WAKE_READ_OWNED) { |
147 | adjustment = RWSEM_ACTIVE_READ_BIAS; | 154 | adjustment = RWSEM_ACTIVE_READ_BIAS; |
148 | try_reader_grant: | 155 | try_reader_grant: |
149 | oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; | 156 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); |
157 | |||
150 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { | 158 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { |
151 | /* A writer stole the lock. Undo our reader grant. */ | 159 | /* |
152 | if (rwsem_atomic_update(-adjustment, sem) & | 160 | * If the count is still less than RWSEM_WAITING_BIAS |
153 | RWSEM_ACTIVE_MASK) | 161 | * after removing the adjustment, it is assumed that |
162 | * a writer has stolen the lock. We have to undo our | ||
163 | * reader grant. | ||
164 | */ | ||
165 | if (atomic_long_add_return(-adjustment, &sem->count) < | ||
166 | RWSEM_WAITING_BIAS) | ||
154 | goto out; | 167 | goto out; |
155 | /* Last active locker left. Retry waking readers. */ | 168 | /* Last active locker left. Retry waking readers. */ |
156 | goto try_reader_grant; | 169 | goto try_reader_grant; |
157 | } | 170 | } |
171 | /* | ||
172 | * It is not really necessary to set it to reader-owned here, | ||
173 | * but it gives the spinners an early indication that the | ||
174 | * readers now have the lock. | ||
175 | */ | ||
176 | rwsem_set_reader_owned(sem); | ||
158 | } | 177 | } |
159 | 178 | ||
160 | /* Grant an infinite number of read locks to the readers at the front | 179 | /* Grant an infinite number of read locks to the readers at the front |
@@ -179,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
179 | adjustment -= RWSEM_WAITING_BIAS; | 198 | adjustment -= RWSEM_WAITING_BIAS; |
180 | 199 | ||
181 | if (adjustment) | 200 | if (adjustment) |
182 | rwsem_atomic_add(adjustment, sem); | 201 | atomic_long_add(adjustment, &sem->count); |
183 | 202 | ||
184 | next = sem->wait_list.next; | 203 | next = sem->wait_list.next; |
185 | loop = woken; | 204 | loop = woken; |
@@ -187,17 +206,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
187 | waiter = list_entry(next, struct rwsem_waiter, list); | 206 | waiter = list_entry(next, struct rwsem_waiter, list); |
188 | next = waiter->list.next; | 207 | next = waiter->list.next; |
189 | tsk = waiter->task; | 208 | tsk = waiter->task; |
209 | |||
210 | wake_q_add(wake_q, tsk); | ||
190 | /* | 211 | /* |
191 | * Make sure we do not wakeup the next reader before | 212 | * Ensure that the last operation is setting the reader |
192 | * setting the nil condition to grant the next reader; | 213 | * waiter to nil such that rwsem_down_read_failed() cannot |
193 | * otherwise we could miss the wakeup on the other | 214 | * race with do_exit() by always holding a reference count |
194 | * side and end up sleeping again. See the pairing | 215 | * to the task to wake up. |
195 | * in rwsem_down_read_failed(). | ||
196 | */ | 216 | */ |
197 | smp_mb(); | 217 | smp_store_release(&waiter->task, NULL); |
198 | waiter->task = NULL; | ||
199 | wake_up_process(tsk); | ||
200 | put_task_struct(tsk); | ||
201 | } while (--loop); | 218 | } while (--loop); |
202 | 219 | ||
203 | sem->wait_list.next = next; | 220 | sem->wait_list.next = next; |
@@ -216,11 +233,11 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
216 | long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; | 233 | long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; |
217 | struct rwsem_waiter waiter; | 234 | struct rwsem_waiter waiter; |
218 | struct task_struct *tsk = current; | 235 | struct task_struct *tsk = current; |
236 | WAKE_Q(wake_q); | ||
219 | 237 | ||
220 | /* set up my own style of waitqueue */ | 238 | /* set up my own style of waitqueue */ |
221 | waiter.task = tsk; | 239 | waiter.task = tsk; |
222 | waiter.type = RWSEM_WAITING_FOR_READ; | 240 | waiter.type = RWSEM_WAITING_FOR_READ; |
223 | get_task_struct(tsk); | ||
224 | 241 | ||
225 | raw_spin_lock_irq(&sem->wait_lock); | 242 | raw_spin_lock_irq(&sem->wait_lock); |
226 | if (list_empty(&sem->wait_list)) | 243 | if (list_empty(&sem->wait_list)) |
@@ -228,7 +245,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
228 | list_add_tail(&waiter.list, &sem->wait_list); | 245 | list_add_tail(&waiter.list, &sem->wait_list); |
229 | 246 | ||
230 | /* we're now waiting on the lock, but no longer actively locking */ | 247 | /* we're now waiting on the lock, but no longer actively locking */ |
231 | count = rwsem_atomic_update(adjustment, sem); | 248 | count = atomic_long_add_return(adjustment, &sem->count); |
232 | 249 | ||
233 | /* If there are no active locks, wake the front queued process(es). | 250 | /* If there are no active locks, wake the front queued process(es). |
234 | * | 251 | * |
@@ -238,9 +255,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
238 | if (count == RWSEM_WAITING_BIAS || | 255 | if (count == RWSEM_WAITING_BIAS || |
239 | (count > RWSEM_WAITING_BIAS && | 256 | (count > RWSEM_WAITING_BIAS && |
240 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) | 257 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) |
241 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); | 258 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
242 | 259 | ||
243 | raw_spin_unlock_irq(&sem->wait_lock); | 260 | raw_spin_unlock_irq(&sem->wait_lock); |
261 | wake_up_q(&wake_q); | ||
244 | 262 | ||
245 | /* wait to be given the lock */ | 263 | /* wait to be given the lock */ |
246 | while (true) { | 264 | while (true) { |
@@ -255,17 +273,29 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
255 | } | 273 | } |
256 | EXPORT_SYMBOL(rwsem_down_read_failed); | 274 | EXPORT_SYMBOL(rwsem_down_read_failed); |
257 | 275 | ||
276 | /* | ||
277 | * This function must be called with the sem->wait_lock held to prevent | ||
278 | * race conditions between checking the rwsem wait list and setting the | ||
279 | * sem->count accordingly. | ||
280 | */ | ||
258 | static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | 281 | static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) |
259 | { | 282 | { |
260 | /* | 283 | /* |
261 | * Try acquiring the write lock. Check count first in order | 284 | * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. |
262 | * to reduce unnecessary expensive cmpxchg() operations. | ||
263 | */ | 285 | */ |
264 | if (count == RWSEM_WAITING_BIAS && | 286 | if (count != RWSEM_WAITING_BIAS) |
265 | cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, | 287 | return false; |
266 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | 288 | |
267 | if (!list_is_singular(&sem->wait_list)) | 289 | /* |
268 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 290 | * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there |
291 | * are other tasks on the wait list, we need to add on WAITING_BIAS. | ||
292 | */ | ||
293 | count = list_is_singular(&sem->wait_list) ? | ||
294 | RWSEM_ACTIVE_WRITE_BIAS : | ||
295 | RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; | ||
296 | |||
297 | if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) | ||
298 | == RWSEM_WAITING_BIAS) { | ||
269 | rwsem_set_owner(sem); | 299 | rwsem_set_owner(sem); |
270 | return true; | 300 | return true; |
271 | } | 301 | } |
@@ -279,13 +309,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
279 | */ | 309 | */ |
280 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | 310 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) |
281 | { | 311 | { |
282 | long old, count = READ_ONCE(sem->count); | 312 | long old, count = atomic_long_read(&sem->count); |
283 | 313 | ||
284 | while (true) { | 314 | while (true) { |
285 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) | 315 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) |
286 | return false; | 316 | return false; |
287 | 317 | ||
288 | old = cmpxchg_acquire(&sem->count, count, | 318 | old = atomic_long_cmpxchg_acquire(&sem->count, count, |
289 | count + RWSEM_ACTIVE_WRITE_BIAS); | 319 | count + RWSEM_ACTIVE_WRITE_BIAS); |
290 | if (old == count) { | 320 | if (old == count) { |
291 | rwsem_set_owner(sem); | 321 | rwsem_set_owner(sem); |
@@ -306,16 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | |||
306 | 336 | ||
307 | rcu_read_lock(); | 337 | rcu_read_lock(); |
308 | owner = READ_ONCE(sem->owner); | 338 | owner = READ_ONCE(sem->owner); |
309 | if (!owner) { | 339 | if (!rwsem_owner_is_writer(owner)) { |
310 | long count = READ_ONCE(sem->count); | ||
311 | /* | 340 | /* |
312 | * If sem->owner is not set, yet we have just recently entered the | 341 | * Don't spin if the rwsem is readers owned. |
313 | * slowpath with the lock being active, then there is a possibility | ||
314 | * reader(s) may have the lock. To be safe, bail spinning in these | ||
315 | * situations. | ||
316 | */ | 342 | */ |
317 | if (count & RWSEM_ACTIVE_MASK) | 343 | ret = !rwsem_owner_is_reader(owner); |
318 | ret = false; | ||
319 | goto done; | 344 | goto done; |
320 | } | 345 | } |
321 | 346 | ||
@@ -325,10 +350,15 @@ done: | |||
325 | return ret; | 350 | return ret; |
326 | } | 351 | } |
327 | 352 | ||
328 | static noinline | 353 | /* |
329 | bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | 354 | * Return true only if we can still spin on the owner field of the rwsem. |
355 | */ | ||
356 | static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) | ||
330 | { | 357 | { |
331 | long count; | 358 | struct task_struct *owner = READ_ONCE(sem->owner); |
359 | |||
360 | if (!rwsem_owner_is_writer(owner)) | ||
361 | goto out; | ||
332 | 362 | ||
333 | rcu_read_lock(); | 363 | rcu_read_lock(); |
334 | while (sem->owner == owner) { | 364 | while (sem->owner == owner) { |
@@ -349,22 +379,16 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | |||
349 | cpu_relax_lowlatency(); | 379 | cpu_relax_lowlatency(); |
350 | } | 380 | } |
351 | rcu_read_unlock(); | 381 | rcu_read_unlock(); |
352 | 382 | out: | |
353 | if (READ_ONCE(sem->owner)) | ||
354 | return true; /* new owner, continue spinning */ | ||
355 | |||
356 | /* | 383 | /* |
357 | * When the owner is not set, the lock could be free or | 384 | * If there is a new owner or the owner is not set, we continue |
358 | * held by readers. Check the counter to verify the | 385 | * spinning. |
359 | * state. | ||
360 | */ | 386 | */ |
361 | count = READ_ONCE(sem->count); | 387 | return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); |
362 | return (count == 0 || count == RWSEM_WAITING_BIAS); | ||
363 | } | 388 | } |
364 | 389 | ||
365 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | 390 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) |
366 | { | 391 | { |
367 | struct task_struct *owner; | ||
368 | bool taken = false; | 392 | bool taken = false; |
369 | 393 | ||
370 | preempt_disable(); | 394 | preempt_disable(); |
@@ -376,12 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | |||
376 | if (!osq_lock(&sem->osq)) | 400 | if (!osq_lock(&sem->osq)) |
377 | goto done; | 401 | goto done; |
378 | 402 | ||
379 | while (true) { | 403 | /* |
380 | owner = READ_ONCE(sem->owner); | 404 | * Optimistically spin on the owner field and attempt to acquire the |
381 | if (owner && !rwsem_spin_on_owner(sem, owner)) | 405 | * lock whenever the owner changes. Spinning will be stopped when: |
382 | break; | 406 | * 1) the owning writer isn't running; or |
383 | 407 | * 2) readers own the lock as we can't determine if they are | |
384 | /* wait_lock will be acquired if write_lock is obtained */ | 408 | * actively running or not. |
409 | */ | ||
410 | while (rwsem_spin_on_owner(sem)) { | ||
411 | /* | ||
412 | * Try to acquire the lock | ||
413 | */ | ||
385 | if (rwsem_try_write_lock_unqueued(sem)) { | 414 | if (rwsem_try_write_lock_unqueued(sem)) { |
386 | taken = true; | 415 | taken = true; |
387 | break; | 416 | break; |
@@ -393,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | |||
393 | * we're an RT task that will live-lock because we won't let | 422 | * we're an RT task that will live-lock because we won't let |
394 | * the owner complete. | 423 | * the owner complete. |
395 | */ | 424 | */ |
396 | if (!owner && (need_resched() || rt_task(current))) | 425 | if (!sem->owner && (need_resched() || rt_task(current))) |
397 | break; | 426 | break; |
398 | 427 | ||
399 | /* | 428 | /* |
@@ -440,9 +469,10 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | |||
440 | bool waiting = true; /* any queued threads before us */ | 469 | bool waiting = true; /* any queued threads before us */ |
441 | struct rwsem_waiter waiter; | 470 | struct rwsem_waiter waiter; |
442 | struct rw_semaphore *ret = sem; | 471 | struct rw_semaphore *ret = sem; |
472 | WAKE_Q(wake_q); | ||
443 | 473 | ||
444 | /* undo write bias from down_write operation, stop active locking */ | 474 | /* undo write bias from down_write operation, stop active locking */ |
445 | count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); | 475 | count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); |
446 | 476 | ||
447 | /* do optimistic spinning and steal lock if possible */ | 477 | /* do optimistic spinning and steal lock if possible */ |
448 | if (rwsem_optimistic_spin(sem)) | 478 | if (rwsem_optimistic_spin(sem)) |
@@ -465,18 +495,29 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | |||
465 | 495 | ||
466 | /* we're now waiting on the lock, but no longer actively locking */ | 496 | /* we're now waiting on the lock, but no longer actively locking */ |
467 | if (waiting) { | 497 | if (waiting) { |
468 | count = READ_ONCE(sem->count); | 498 | count = atomic_long_read(&sem->count); |
469 | 499 | ||
470 | /* | 500 | /* |
471 | * If there were already threads queued before us and there are | 501 | * If there were already threads queued before us and there are |
472 | * no active writers, the lock must be read owned; so we try to | 502 | * no active writers, the lock must be read owned; so we try to |
473 | * wake any read locks that were queued ahead of us. | 503 | * wake any read locks that were queued ahead of us. |
474 | */ | 504 | */ |
475 | if (count > RWSEM_WAITING_BIAS) | 505 | if (count > RWSEM_WAITING_BIAS) { |
476 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); | 506 | WAKE_Q(wake_q); |
507 | |||
508 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); | ||
509 | /* | ||
510 | * The wakeup is normally called _after_ the wait_lock | ||
511 | * is released, but given that we are proactively waking | ||
512 | * readers we can deal with the wake_q overhead as it is | ||
513 | * similar to releasing and taking the wait_lock again | ||
514 | * for attempting rwsem_try_write_lock(). | ||
515 | */ | ||
516 | wake_up_q(&wake_q); | ||
517 | } | ||
477 | 518 | ||
478 | } else | 519 | } else |
479 | count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 520 | count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); |
480 | 521 | ||
481 | /* wait until we successfully acquire the lock */ | 522 | /* wait until we successfully acquire the lock */ |
482 | set_current_state(state); | 523 | set_current_state(state); |
@@ -492,7 +533,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | |||
492 | 533 | ||
493 | schedule(); | 534 | schedule(); |
494 | set_current_state(state); | 535 | set_current_state(state); |
495 | } while ((count = sem->count) & RWSEM_ACTIVE_MASK); | 536 | } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); |
496 | 537 | ||
497 | raw_spin_lock_irq(&sem->wait_lock); | 538 | raw_spin_lock_irq(&sem->wait_lock); |
498 | } | 539 | } |
@@ -507,10 +548,11 @@ out_nolock: | |||
507 | raw_spin_lock_irq(&sem->wait_lock); | 548 | raw_spin_lock_irq(&sem->wait_lock); |
508 | list_del(&waiter.list); | 549 | list_del(&waiter.list); |
509 | if (list_empty(&sem->wait_list)) | 550 | if (list_empty(&sem->wait_list)) |
510 | rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); | 551 | atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); |
511 | else | 552 | else |
512 | __rwsem_do_wake(sem, RWSEM_WAKE_ANY); | 553 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
513 | raw_spin_unlock_irq(&sem->wait_lock); | 554 | raw_spin_unlock_irq(&sem->wait_lock); |
555 | wake_up_q(&wake_q); | ||
514 | 556 | ||
515 | return ERR_PTR(-EINTR); | 557 | return ERR_PTR(-EINTR); |
516 | } | 558 | } |
@@ -537,6 +579,7 @@ __visible | |||
537 | struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | 579 | struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) |
538 | { | 580 | { |
539 | unsigned long flags; | 581 | unsigned long flags; |
582 | WAKE_Q(wake_q); | ||
540 | 583 | ||
541 | /* | 584 | /* |
542 | * If a spinner is present, it is not necessary to do the wakeup. | 585 | * If a spinner is present, it is not necessary to do the wakeup. |
@@ -573,9 +616,10 @@ locked: | |||
573 | 616 | ||
574 | /* do nothing if list empty */ | 617 | /* do nothing if list empty */ |
575 | if (!list_empty(&sem->wait_list)) | 618 | if (!list_empty(&sem->wait_list)) |
576 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); | 619 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
577 | 620 | ||
578 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 621 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
622 | wake_up_q(&wake_q); | ||
579 | 623 | ||
580 | return sem; | 624 | return sem; |
581 | } | 625 | } |
@@ -590,14 +634,16 @@ __visible | |||
590 | struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) | 634 | struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) |
591 | { | 635 | { |
592 | unsigned long flags; | 636 | unsigned long flags; |
637 | WAKE_Q(wake_q); | ||
593 | 638 | ||
594 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 639 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
595 | 640 | ||
596 | /* do nothing if list empty */ | 641 | /* do nothing if list empty */ |
597 | if (!list_empty(&sem->wait_list)) | 642 | if (!list_empty(&sem->wait_list)) |
598 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); | 643 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); |
599 | 644 | ||
600 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 645 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
646 | wake_up_q(&wake_q); | ||
601 | 647 | ||
602 | return sem; | 648 | return sem; |
603 | } | 649 | } |
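All of the rwsem wakeup paths above now follow the same deferred-wakeup shape: mark tasks in an on-stack wake_q while holding wait_lock, drop the lock, then issue the actual wakeups. The skeleton, reduced to its essentials (kernel-style sketch; everything except the wake_q/wake_up_q API and the raw spinlock calls is made up):

    static void example_wake_path(struct example_lock *l)
    {
            unsigned long flags;
            WAKE_Q(wake_q);                         /* on-stack wake queue */

            raw_spin_lock_irqsave(&l->wait_lock, flags);
            /* ... pick waiters, but only mark them for wakeup: */
            wake_q_add(&wake_q, first_waiter_task(l));   /* hypothetical helper */
            raw_spin_unlock_irqrestore(&l->wait_lock, flags);

            /*
             * Wakeups run after wait_lock is dropped, shortening the hold
             * time and keeping woken tasks from immediately contending on
             * it. wake_up_q() also drops the task reference wake_q_add()
             * took, which is why the waiter code no longer needs its own
             * get_task_struct()/put_task_struct() pair.
             */
            wake_up_q(&wake_q);
    }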
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2e853ad93a3a..45ba475d4be3 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
@@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem) | |||
22 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); | 22 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); |
23 | 23 | ||
24 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); | 24 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
25 | rwsem_set_reader_owned(sem); | ||
25 | } | 26 | } |
26 | 27 | ||
27 | EXPORT_SYMBOL(down_read); | 28 | EXPORT_SYMBOL(down_read); |
@@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem) | |||
33 | { | 34 | { |
34 | int ret = __down_read_trylock(sem); | 35 | int ret = __down_read_trylock(sem); |
35 | 36 | ||
36 | if (ret == 1) | 37 | if (ret == 1) { |
37 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); | 38 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); |
39 | rwsem_set_reader_owned(sem); | ||
40 | } | ||
38 | return ret; | 41 | return ret; |
39 | } | 42 | } |
40 | 43 | ||
@@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem) | |||
124 | * lockdep: a downgraded write will live on as a write | 127 | * lockdep: a downgraded write will live on as a write |
125 | * dependency. | 128 | * dependency. |
126 | */ | 129 | */ |
127 | rwsem_clear_owner(sem); | 130 | rwsem_set_reader_owned(sem); |
128 | __downgrade_write(sem); | 131 | __downgrade_write(sem); |
129 | } | 132 | } |
130 | 133 | ||
@@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) | |||
138 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); | 141 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); |
139 | 142 | ||
140 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); | 143 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
144 | rwsem_set_reader_owned(sem); | ||
141 | } | 145 | } |
142 | 146 | ||
143 | EXPORT_SYMBOL(down_read_nested); | 147 | EXPORT_SYMBOL(down_read_nested); |
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 870ed9a5b426..a699f4048ba1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h | |||
@@ -1,14 +1,58 @@ | |||
1 | /* | ||
2 | * The owner field of the rw_semaphore structure will be set to | ||
3 | * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear | ||
4 | * the owner field when it unlocks. A reader, on the other hand, will | ||
5 | * not touch the owner field when it unlocks. | ||
6 | * | ||
7 | * In essence, the owner field now has the following 3 states: | ||
8 | * 1) 0 | ||
9 | * - lock is free or the owner hasn't set the field yet | ||
10 | * 2) RWSEM_READER_OWNED | ||
11 | * - lock is currently or previously owned by readers (lock is free | ||
12 | * or not set by owner yet) | ||
13 | * 3) Other non-zero value | ||
14 | * - a writer owns the lock | ||
15 | */ | ||
16 | #define RWSEM_READER_OWNED ((struct task_struct *)1UL) | ||
17 | |||
1 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | 18 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
19 | /* | ||
20 | * All writes to owner are protected by WRITE_ONCE() to make sure that | ||
21 | * store tearing can't happen as optimistic spinners may read and use | ||
22 | * the owner value concurrently without lock. Read from owner, however, | ||
23 | * may not need READ_ONCE() as long as the pointer value is only used | ||
24 | * for comparison and isn't being dereferenced. | ||
25 | */ | ||
2 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | 26 | static inline void rwsem_set_owner(struct rw_semaphore *sem) |
3 | { | 27 | { |
4 | sem->owner = current; | 28 | WRITE_ONCE(sem->owner, current); |
5 | } | 29 | } |
6 | 30 | ||
7 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | 31 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) |
8 | { | 32 | { |
9 | sem->owner = NULL; | 33 | WRITE_ONCE(sem->owner, NULL); |
34 | } | ||
35 | |||
36 | static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) | ||
37 | { | ||
38 | /* | ||
39 | * We check the owner value first to make sure that we will only | ||
40 | * do a write to the rwsem cacheline when it is really necessary | ||
41 | * to minimize cacheline contention. | ||
42 | */ | ||
43 | if (sem->owner != RWSEM_READER_OWNED) | ||
44 | WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); | ||
45 | } | ||
46 | |||
47 | static inline bool rwsem_owner_is_writer(struct task_struct *owner) | ||
48 | { | ||
49 | return owner && owner != RWSEM_READER_OWNED; | ||
10 | } | 50 | } |
11 | 51 | ||
52 | static inline bool rwsem_owner_is_reader(struct task_struct *owner) | ||
53 | { | ||
54 | return owner == RWSEM_READER_OWNED; | ||
55 | } | ||
12 | #else | 56 | #else |
13 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | 57 | static inline void rwsem_set_owner(struct rw_semaphore *sem) |
14 | { | 58 | { |
@@ -17,4 +61,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem) | |||
17 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | 61 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) |
18 | { | 62 | { |
19 | } | 63 | } |
64 | |||
65 | static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) | ||
66 | { | ||
67 | } | ||
20 | #endif | 68 | #endif |
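Taken together, these helpers give the optimistic-spinning code a three-way decision. A simplified restatement of rwsem_can_spin_on_owner() from the rwsem-xadd.c hunk above (the real code reads owner->on_cpu under rcu_read_lock(); that detail is elided here):

    static bool can_spin_on_owner(struct rw_semaphore *sem)
    {
            struct task_struct *owner = READ_ONCE(sem->owner);

            if (rwsem_owner_is_reader(owner))
                    return false;   /* readers hold it; no single task to watch */
            if (!owner)
                    return true;    /* free, or the owner has not published itself yet */
            return owner->on_cpu;   /* writer: worth spinning only while it runs */
    }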
diff --git a/kernel/memremap.c b/kernel/memremap.c index 017532193fb1..251d16b4cb41 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
@@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr) | |||
169 | } | 169 | } |
170 | EXPORT_SYMBOL(devm_memunmap); | 170 | EXPORT_SYMBOL(devm_memunmap); |
171 | 171 | ||
172 | pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) | ||
173 | { | ||
174 | return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); | ||
175 | } | ||
176 | EXPORT_SYMBOL(phys_to_pfn_t); | ||
177 | |||
178 | #ifdef CONFIG_ZONE_DEVICE | 172 | #ifdef CONFIG_ZONE_DEVICE |
179 | static DEFINE_MUTEX(pgmap_lock); | 173 | static DEFINE_MUTEX(pgmap_lock); |
180 | static RADIX_TREE(pgmap_radix, GFP_KERNEL); | 174 | static RADIX_TREE(pgmap_radix, GFP_KERNEL); |
@@ -308,12 +302,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
308 | if (is_ram == REGION_INTERSECTS) | 302 | if (is_ram == REGION_INTERSECTS) |
309 | return __va(res->start); | 303 | return __va(res->start); |
310 | 304 | ||
311 | if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { | ||
312 | dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", | ||
313 | __func__); | ||
314 | return ERR_PTR(-ENXIO); | ||
315 | } | ||
316 | |||
317 | if (!ref) | 305 | if (!ref) |
318 | return ERR_PTR(-EINVAL); | 306 | return ERR_PTR(-EINVAL); |
319 | 307 | ||
@@ -401,7 +389,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) | |||
401 | altmap->alloc -= nr_pfns; | 389 | altmap->alloc -= nr_pfns; |
402 | } | 390 | } |
403 | 391 | ||
404 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
405 | struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) | 392 | struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) |
406 | { | 393 | { |
407 | /* | 394 | /* |
@@ -427,5 +414,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) | |||
427 | 414 | ||
428 | return pgmap ? pgmap->altmap : NULL; | 415 | return pgmap ? pgmap->altmap : NULL; |
429 | } | 416 | } |
430 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | ||
431 | #endif /* CONFIG_ZONE_DEVICE */ | 417 | #endif /* CONFIG_ZONE_DEVICE */ |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index cb880a14cc39..eb4f717705ba 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -1,6 +1,8 @@ | |||
1 | 1 | ||
2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG | 2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG |
3 | 3 | ||
4 | KASAN_SANITIZE_snapshot.o := n | ||
5 | |||
4 | obj-y += qos.o | 6 | obj-y += qos.o |
5 | obj-$(CONFIG_PM) += main.o | 7 | obj-$(CONFIG_PM) += main.o |
6 | obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o | 8 | obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o |
diff --git a/kernel/power/console.c b/kernel/power/console.c index aba9c545a0e3..0e781798b0b3 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c | |||
@@ -126,17 +126,17 @@ out: | |||
126 | return ret; | 126 | return ret; |
127 | } | 127 | } |
128 | 128 | ||
129 | int pm_prepare_console(void) | 129 | void pm_prepare_console(void) |
130 | { | 130 | { |
131 | if (!pm_vt_switch()) | 131 | if (!pm_vt_switch()) |
132 | return 0; | 132 | return; |
133 | 133 | ||
134 | orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); | 134 | orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); |
135 | if (orig_fgconsole < 0) | 135 | if (orig_fgconsole < 0) |
136 | return 1; | 136 | return; |
137 | 137 | ||
138 | orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); | 138 | orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); |
139 | return 0; | 139 | return; |
140 | } | 140 | } |
141 | 141 | ||
142 | void pm_restore_console(void) | 142 | void pm_restore_console(void) |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fca9254280ee..a881c6a7ba74 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -52,6 +52,7 @@ enum { | |||
52 | #ifdef CONFIG_SUSPEND | 52 | #ifdef CONFIG_SUSPEND |
53 | HIBERNATION_SUSPEND, | 53 | HIBERNATION_SUSPEND, |
54 | #endif | 54 | #endif |
55 | HIBERNATION_TEST_RESUME, | ||
55 | /* keep last */ | 56 | /* keep last */ |
56 | __HIBERNATION_AFTER_LAST | 57 | __HIBERNATION_AFTER_LAST |
57 | }; | 58 | }; |
@@ -409,6 +410,11 @@ int hibernation_snapshot(int platform_mode) | |||
409 | goto Close; | 410 | goto Close; |
410 | } | 411 | } |
411 | 412 | ||
413 | int __weak hibernate_resume_nonboot_cpu_disable(void) | ||
414 | { | ||
415 | return disable_nonboot_cpus(); | ||
416 | } | ||
417 | |||
412 | /** | 418 | /** |
413 | * resume_target_kernel - Restore system state from a hibernation image. | 419 | * resume_target_kernel - Restore system state from a hibernation image. |
414 | * @platform_mode: Whether or not to use the platform driver. | 420 | * @platform_mode: Whether or not to use the platform driver. |
@@ -433,7 +439,7 @@ static int resume_target_kernel(bool platform_mode) | |||
433 | if (error) | 439 | if (error) |
434 | goto Cleanup; | 440 | goto Cleanup; |
435 | 441 | ||
436 | error = disable_nonboot_cpus(); | 442 | error = hibernate_resume_nonboot_cpu_disable(); |
437 | if (error) | 443 | if (error) |
438 | goto Enable_cpus; | 444 | goto Enable_cpus; |
439 | 445 | ||
@@ -642,12 +648,39 @@ static void power_down(void) | |||
642 | cpu_relax(); | 648 | cpu_relax(); |
643 | } | 649 | } |
644 | 650 | ||
651 | static int load_image_and_restore(void) | ||
652 | { | ||
653 | int error; | ||
654 | unsigned int flags; | ||
655 | |||
656 | pr_debug("PM: Loading hibernation image.\n"); | ||
657 | |||
658 | lock_device_hotplug(); | ||
659 | error = create_basic_memory_bitmaps(); | ||
660 | if (error) | ||
661 | goto Unlock; | ||
662 | |||
663 | error = swsusp_read(&flags); | ||
664 | swsusp_close(FMODE_READ); | ||
665 | if (!error) | ||
666 | hibernation_restore(flags & SF_PLATFORM_MODE); | ||
667 | |||
668 | printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); | ||
669 | swsusp_free(); | ||
670 | free_basic_memory_bitmaps(); | ||
671 | Unlock: | ||
672 | unlock_device_hotplug(); | ||
673 | |||
674 | return error; | ||
675 | } | ||
676 | |||
645 | /** | 677 | /** |
646 | * hibernate - Carry out system hibernation, including saving the image. | 678 | * hibernate - Carry out system hibernation, including saving the image. |
647 | */ | 679 | */ |
648 | int hibernate(void) | 680 | int hibernate(void) |
649 | { | 681 | { |
650 | int error; | 682 | int error, nr_calls = 0; |
683 | bool snapshot_test = false; | ||
651 | 684 | ||
652 | if (!hibernation_available()) { | 685 | if (!hibernation_available()) { |
653 | pr_debug("PM: Hibernation not available.\n"); | 686 | pr_debug("PM: Hibernation not available.\n"); |
@@ -662,9 +695,11 @@ int hibernate(void) | |||
662 | } | 695 | } |
663 | 696 | ||
664 | pm_prepare_console(); | 697 | pm_prepare_console(); |
665 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); | 698 | error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); |
666 | if (error) | 699 | if (error) { |
700 | nr_calls--; | ||
667 | goto Exit; | 701 | goto Exit; |
702 | } | ||
668 | 703 | ||
669 | printk(KERN_INFO "PM: Syncing filesystems ... "); | 704 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
670 | sys_sync(); | 705 | sys_sync(); |
@@ -697,8 +732,12 @@ int hibernate(void) | |||
697 | pr_debug("PM: writing image.\n"); | 732 | pr_debug("PM: writing image.\n"); |
698 | error = swsusp_write(flags); | 733 | error = swsusp_write(flags); |
699 | swsusp_free(); | 734 | swsusp_free(); |
700 | if (!error) | 735 | if (!error) { |
701 | power_down(); | 736 | if (hibernation_mode == HIBERNATION_TEST_RESUME) |
737 | snapshot_test = true; | ||
738 | else | ||
739 | power_down(); | ||
740 | } | ||
702 | in_suspend = 0; | 741 | in_suspend = 0; |
703 | pm_restore_gfp_mask(); | 742 | pm_restore_gfp_mask(); |
704 | } else { | 743 | } else { |
@@ -709,12 +748,18 @@ int hibernate(void) | |||
709 | free_basic_memory_bitmaps(); | 748 | free_basic_memory_bitmaps(); |
710 | Thaw: | 749 | Thaw: |
711 | unlock_device_hotplug(); | 750 | unlock_device_hotplug(); |
751 | if (snapshot_test) { | ||
752 | pr_debug("PM: Checking hibernation image\n"); | ||
753 | error = swsusp_check(); | ||
754 | if (!error) | ||
755 | error = load_image_and_restore(); | ||
756 | } | ||
712 | thaw_processes(); | 757 | thaw_processes(); |
713 | 758 | ||
714 | /* Don't bother checking whether freezer_test_done is true */ | 759 | /* Don't bother checking whether freezer_test_done is true */ |
715 | freezer_test_done = false; | 760 | freezer_test_done = false; |
716 | Exit: | 761 | Exit: |
717 | pm_notifier_call_chain(PM_POST_HIBERNATION); | 762 | __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); |
718 | pm_restore_console(); | 763 | pm_restore_console(); |
719 | atomic_inc(&snapshot_device_available); | 764 | atomic_inc(&snapshot_device_available); |
720 | Unlock: | 765 | Unlock: |
@@ -740,8 +785,7 @@ int hibernate(void) | |||
740 | */ | 785 | */ |
741 | static int software_resume(void) | 786 | static int software_resume(void) |
742 | { | 787 | { |
743 | int error; | 788 | int error, nr_calls = 0; |
744 | unsigned int flags; | ||
745 | 789 | ||
746 | /* | 790 | /* |
747 | * If the user said "noresume".. bail out early. | 791 | * If the user said "noresume".. bail out early. |
@@ -827,35 +871,20 @@ static int software_resume(void) | |||
827 | } | 871 | } |
828 | 872 | ||
829 | pm_prepare_console(); | 873 | pm_prepare_console(); |
830 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); | 874 | error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); |
831 | if (error) | 875 | if (error) { |
876 | nr_calls--; | ||
832 | goto Close_Finish; | 877 | goto Close_Finish; |
878 | } | ||
833 | 879 | ||
834 | pr_debug("PM: Preparing processes for restore.\n"); | 880 | pr_debug("PM: Preparing processes for restore.\n"); |
835 | error = freeze_processes(); | 881 | error = freeze_processes(); |
836 | if (error) | 882 | if (error) |
837 | goto Close_Finish; | 883 | goto Close_Finish; |
838 | 884 | error = load_image_and_restore(); | |
839 | pr_debug("PM: Loading hibernation image.\n"); | ||
840 | |||
841 | lock_device_hotplug(); | ||
842 | error = create_basic_memory_bitmaps(); | ||
843 | if (error) | ||
844 | goto Thaw; | ||
845 | |||
846 | error = swsusp_read(&flags); | ||
847 | swsusp_close(FMODE_READ); | ||
848 | if (!error) | ||
849 | hibernation_restore(flags & SF_PLATFORM_MODE); | ||
850 | |||
851 | printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); | ||
852 | swsusp_free(); | ||
853 | free_basic_memory_bitmaps(); | ||
854 | Thaw: | ||
855 | unlock_device_hotplug(); | ||
856 | thaw_processes(); | 885 | thaw_processes(); |
857 | Finish: | 886 | Finish: |
858 | pm_notifier_call_chain(PM_POST_RESTORE); | 887 | __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); |
859 | pm_restore_console(); | 888 | pm_restore_console(); |
860 | atomic_inc(&snapshot_device_available); | 889 | atomic_inc(&snapshot_device_available); |
861 | /* For success case, the suspend path will release the lock */ | 890 | /* For success case, the suspend path will release the lock */ |
@@ -878,6 +907,7 @@ static const char * const hibernation_modes[] = { | |||
878 | #ifdef CONFIG_SUSPEND | 907 | #ifdef CONFIG_SUSPEND |
879 | [HIBERNATION_SUSPEND] = "suspend", | 908 | [HIBERNATION_SUSPEND] = "suspend", |
880 | #endif | 909 | #endif |
910 | [HIBERNATION_TEST_RESUME] = "test_resume", | ||
881 | }; | 911 | }; |
882 | 912 | ||
883 | /* | 913 | /* |
@@ -924,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
924 | #ifdef CONFIG_SUSPEND | 954 | #ifdef CONFIG_SUSPEND |
925 | case HIBERNATION_SUSPEND: | 955 | case HIBERNATION_SUSPEND: |
926 | #endif | 956 | #endif |
957 | case HIBERNATION_TEST_RESUME: | ||
927 | break; | 958 | break; |
928 | case HIBERNATION_PLATFORM: | 959 | case HIBERNATION_PLATFORM: |
929 | if (hibernation_ops) | 960 | if (hibernation_ops) |
@@ -970,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
970 | #ifdef CONFIG_SUSPEND | 1001 | #ifdef CONFIG_SUSPEND |
971 | case HIBERNATION_SUSPEND: | 1002 | case HIBERNATION_SUSPEND: |
972 | #endif | 1003 | #endif |
1004 | case HIBERNATION_TEST_RESUME: | ||
973 | hibernation_mode = mode; | 1005 | hibernation_mode = mode; |
974 | break; | 1006 | break; |
975 | case HIBERNATION_PLATFORM: | 1007 | case HIBERNATION_PLATFORM: |
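Since disk_store() above now accepts "test_resume", the mode is selected from user space like any other entry in /sys/power/disk; writing "disk" to /sys/power/state afterwards makes hibernate() create the image and immediately read it back instead of powering off. A small user-space sketch (error handling trimmed):

    #include <stdio.h>

    int main(void)
    {
            FILE *f;

            f = fopen("/sys/power/disk", "w");      /* pick the hibernation mode */
            if (!f)
                    return 1;
            fputs("test_resume", f);
            fclose(f);

            f = fopen("/sys/power/state", "w");     /* start hibernation */
            if (!f)
                    return 1;
            fputs("disk", f);
            fclose(f);
            return 0;
    }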
@@ -1115,13 +1147,16 @@ static int __init resume_offset_setup(char *str) | |||
1115 | 1147 | ||
1116 | static int __init hibernate_setup(char *str) | 1148 | static int __init hibernate_setup(char *str) |
1117 | { | 1149 | { |
1118 | if (!strncmp(str, "noresume", 8)) | 1150 | if (!strncmp(str, "noresume", 8)) { |
1119 | noresume = 1; | 1151 | noresume = 1; |
1120 | else if (!strncmp(str, "nocompress", 10)) | 1152 | } else if (!strncmp(str, "nocompress", 10)) { |
1121 | nocompress = 1; | 1153 | nocompress = 1; |
1122 | else if (!strncmp(str, "no", 2)) { | 1154 | } else if (!strncmp(str, "no", 2)) { |
1123 | noresume = 1; | 1155 | noresume = 1; |
1124 | nohibernate = 1; | 1156 | nohibernate = 1; |
1157 | } else if (IS_ENABLED(CONFIG_DEBUG_RODATA) | ||
1158 | && !strncmp(str, "protect_image", 13)) { | ||
1159 | enable_restore_image_protection(); | ||
1125 | } | 1160 | } |
1126 | return 1; | 1161 | return 1; |
1127 | } | 1162 | } |
@@ -1154,11 +1189,6 @@ static int __init nohibernate_setup(char *str) | |||
1154 | return 1; | 1189 | return 1; |
1155 | } | 1190 | } |
1156 | 1191 | ||
1157 | static int __init kaslr_nohibernate_setup(char *str) | ||
1158 | { | ||
1159 | return nohibernate_setup(str); | ||
1160 | } | ||
1161 | |||
1162 | static int __init page_poison_nohibernate_setup(char *str) | 1192 | static int __init page_poison_nohibernate_setup(char *str) |
1163 | { | 1193 | { |
1164 | #ifdef CONFIG_PAGE_POISONING_ZERO | 1194 | #ifdef CONFIG_PAGE_POISONING_ZERO |
@@ -1182,5 +1212,4 @@ __setup("hibernate=", hibernate_setup); | |||
1182 | __setup("resumewait", resumewait_setup); | 1212 | __setup("resumewait", resumewait_setup); |
1183 | __setup("resumedelay=", resumedelay_setup); | 1213 | __setup("resumedelay=", resumedelay_setup); |
1184 | __setup("nohibernate", nohibernate_setup); | 1214 | __setup("nohibernate", nohibernate_setup); |
1185 | __setup("kaslr", kaslr_nohibernate_setup); | ||
1186 | __setup("page_poison=", page_poison_nohibernate_setup); | 1215 | __setup("page_poison=", page_poison_nohibernate_setup); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 27946975eff0..5ea50b1b7595 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb) | |||
38 | } | 38 | } |
39 | EXPORT_SYMBOL_GPL(unregister_pm_notifier); | 39 | EXPORT_SYMBOL_GPL(unregister_pm_notifier); |
40 | 40 | ||
41 | int pm_notifier_call_chain(unsigned long val) | 41 | int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls) |
42 | { | 42 | { |
43 | int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); | 43 | int ret; |
44 | |||
45 | ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL, | ||
46 | nr_to_call, nr_calls); | ||
44 | 47 | ||
45 | return notifier_to_errno(ret); | 48 | return notifier_to_errno(ret); |
46 | } | 49 | } |
50 | int pm_notifier_call_chain(unsigned long val) | ||
51 | { | ||
52 | return __pm_notifier_call_chain(val, -1, NULL); | ||
53 | } | ||
47 | 54 | ||
48 | /* If set, devices may be suspended and resumed asynchronously. */ | 55 | /* If set, devices may be suspended and resumed asynchronously. */ |
49 | int pm_async_enabled = 1; | 56 | int pm_async_enabled = 1; |
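The point of the new __pm_notifier_call_chain() is the nr_to_call/nr_calls pair: a caller learns how many notifiers actually ran for a *_PREPARE event and can later deliver the matching POST event to exactly those notifiers, skipping the one that returned an error. The hibernate() and software_resume() hunks above follow this pattern; condensed:

    int nr_calls = 0;
    int error;

    error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
    if (error)
            nr_calls--;     /* do not send POST to the notifier that failed */
    /* ... hibernation work ... */
    __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);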
diff --git a/kernel/power/power.h b/kernel/power/power.h index efe1b3b17c88..242d8b827dd5 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) | |||
38 | } | 38 | } |
39 | #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ | 39 | #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ |
40 | 40 | ||
41 | extern int hibernate_resume_nonboot_cpu_disable(void); | ||
42 | |||
41 | /* | 43 | /* |
42 | * Keep some memory free so that I/O operations can succeed without paging | 44 | * Keep some memory free so that I/O operations can succeed without paging |
43 | * [Might this be more than 4 MB?] | 45 | * [Might this be more than 4 MB?] |
@@ -59,6 +61,13 @@ extern int hibernation_snapshot(int platform_mode); | |||
59 | extern int hibernation_restore(int platform_mode); | 61 | extern int hibernation_restore(int platform_mode); |
60 | extern int hibernation_platform_enter(void); | 62 | extern int hibernation_platform_enter(void); |
61 | 63 | ||
64 | #ifdef CONFIG_DEBUG_RODATA | ||
65 | /* kernel/power/snapshot.c */ | ||
66 | extern void enable_restore_image_protection(void); | ||
67 | #else | ||
68 | static inline void enable_restore_image_protection(void) {} | ||
69 | #endif /* CONFIG_DEBUG_RODATA */ | ||
70 | |||
62 | #else /* !CONFIG_HIBERNATION */ | 71 | #else /* !CONFIG_HIBERNATION */ |
63 | 72 | ||
64 | static inline void hibernate_reserved_size_init(void) {} | 73 | static inline void hibernate_reserved_size_init(void) {} |
@@ -200,6 +209,8 @@ static inline void suspend_test_finish(const char *label) {} | |||
200 | 209 | ||
201 | #ifdef CONFIG_PM_SLEEP | 210 | #ifdef CONFIG_PM_SLEEP |
202 | /* kernel/power/main.c */ | 211 | /* kernel/power/main.c */ |
212 | extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call, | ||
213 | int *nr_calls); | ||
203 | extern int pm_notifier_call_chain(unsigned long val); | 214 | extern int pm_notifier_call_chain(unsigned long val); |
204 | #endif | 215 | #endif |
205 | 216 | ||
diff --git a/kernel/power/process.c b/kernel/power/process.c index df058bed53ce..8f27d5a8adf6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only) | |||
89 | elapsed_msecs / 1000, elapsed_msecs % 1000, | 89 | elapsed_msecs / 1000, elapsed_msecs % 1000, |
90 | todo - wq_busy, wq_busy); | 90 | todo - wq_busy, wq_busy); |
91 | 91 | ||
92 | if (wq_busy) | ||
93 | show_workqueue_state(); | ||
94 | |||
92 | if (!wakeup) { | 95 | if (!wakeup) { |
93 | read_lock(&tasklist_lock); | 96 | read_lock(&tasklist_lock); |
94 | for_each_process_thread(g, p) { | 97 | for_each_process_thread(g, p) { |
@@ -146,6 +149,18 @@ int freeze_processes(void) | |||
146 | if (!error && !oom_killer_disable()) | 149 | if (!error && !oom_killer_disable()) |
147 | error = -EBUSY; | 150 | error = -EBUSY; |
148 | 151 | ||
152 | /* | ||
153 | * There is a hard to fix race between oom_reaper kernel thread | ||
154 | * and oom_killer_disable. oom_reaper calls exit_oom_victim | ||
155 | * before the victim reaches exit_mm so try to freeze all the tasks | ||
156 | * again and catch such a left over task. | ||
157 | */ | ||
158 | if (!error) { | ||
159 | pr_info("Double checking all user space processes after OOM killer disable... "); | ||
160 | error = try_to_freeze_tasks(true); | ||
161 | pr_cont("\n"); | ||
162 | } | ||
163 | |||
149 | if (error) | 164 | if (error) |
150 | thaw_processes(); | 165 | thaw_processes(); |
151 | return error; | 166 | return error; |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3a970604308f..9a0178c2ac1d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -38,6 +38,43 @@ | |||
38 | 38 | ||
39 | #include "power.h" | 39 | #include "power.h" |
40 | 40 | ||
41 | #ifdef CONFIG_DEBUG_RODATA | ||
42 | static bool hibernate_restore_protection; | ||
43 | static bool hibernate_restore_protection_active; | ||
44 | |||
45 | void enable_restore_image_protection(void) | ||
46 | { | ||
47 | hibernate_restore_protection = true; | ||
48 | } | ||
49 | |||
50 | static inline void hibernate_restore_protection_begin(void) | ||
51 | { | ||
52 | hibernate_restore_protection_active = hibernate_restore_protection; | ||
53 | } | ||
54 | |||
55 | static inline void hibernate_restore_protection_end(void) | ||
56 | { | ||
57 | hibernate_restore_protection_active = false; | ||
58 | } | ||
59 | |||
60 | static inline void hibernate_restore_protect_page(void *page_address) | ||
61 | { | ||
62 | if (hibernate_restore_protection_active) | ||
63 | set_memory_ro((unsigned long)page_address, 1); | ||
64 | } | ||
65 | |||
66 | static inline void hibernate_restore_unprotect_page(void *page_address) | ||
67 | { | ||
68 | if (hibernate_restore_protection_active) | ||
69 | set_memory_rw((unsigned long)page_address, 1); | ||
70 | } | ||
71 | #else | ||
72 | static inline void hibernate_restore_protection_begin(void) {} | ||
73 | static inline void hibernate_restore_protection_end(void) {} | ||
74 | static inline void hibernate_restore_protect_page(void *page_address) {} | ||
75 | static inline void hibernate_restore_unprotect_page(void *page_address) {} | ||
76 | #endif /* CONFIG_DEBUG_RODATA */ | ||
77 | |||
41 | static int swsusp_page_is_free(struct page *); | 78 | static int swsusp_page_is_free(struct page *); |
42 | static void swsusp_set_page_forbidden(struct page *); | 79 | static void swsusp_set_page_forbidden(struct page *); |
43 | static void swsusp_unset_page_forbidden(struct page *); | 80 | static void swsusp_unset_page_forbidden(struct page *); |
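These helpers stay no-ops unless the kernel is built with CONFIG_DEBUG_RODATA and booted with hibernate=protect_image (see the hibernate_setup() hunk above). The intended use is to make each page read-only as soon as it holds its final image contents, so stray writes later in the restore fault immediately; swsusp_free() undoes it via hibernate_restore_unprotect_page(). A hedged sketch of the pattern; restore_one_page() is illustrative, not a function from this patch:

    static void restore_one_page(void *dst, const void *src)
    {
            memcpy(dst, src, PAGE_SIZE);            /* final image contents */
            hibernate_restore_protect_page(dst);    /* RO until swsusp_free() */
    }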
@@ -67,25 +104,32 @@ void __init hibernate_image_size_init(void) | |||
67 | image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; | 104 | image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; |
68 | } | 105 | } |
69 | 106 | ||
70 | /* List of PBEs needed for restoring the pages that were allocated before | 107 | /* |
108 | * List of PBEs needed for restoring the pages that were allocated before | ||
71 | * the suspend and included in the suspend image, but have also been | 109 | * the suspend and included in the suspend image, but have also been |
72 | * allocated by the "resume" kernel, so their contents cannot be written | 110 | * allocated by the "resume" kernel, so their contents cannot be written |
73 | * directly to their "original" page frames. | 111 | * directly to their "original" page frames. |
74 | */ | 112 | */ |
75 | struct pbe *restore_pblist; | 113 | struct pbe *restore_pblist; |
76 | 114 | ||
77 | /* Pointer to an auxiliary buffer (1 page) */ | 115 | /* struct linked_page is used to build chains of pages */ |
78 | static void *buffer; | ||
79 | 116 | ||
80 | /** | 117 | #define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) |
81 | * @safe_needed - on resume, for storing the PBE list and the image, | 118 | |
82 | * we can only use memory pages that do not conflict with the pages | 119 | struct linked_page { |
83 | * used before suspend. The unsafe pages have PageNosaveFree set | 120 | struct linked_page *next; |
84 | * and we count them using unsafe_pages. | 121 | char data[LINKED_PAGE_DATA_SIZE]; |
85 | * | 122 | } __packed; |
86 | * Each allocated image page is marked as PageNosave and PageNosaveFree | 123 | |
87 | * so that swsusp_free() can release it. | 124 | /* |
125 | * List of "safe" pages (ie. pages that were not used by the image kernel | ||
126 | * before hibernation) that may be used as temporary storage for image kernel | ||
127 | * memory contents. | ||
88 | */ | 128 | */ |
129 | static struct linked_page *safe_pages_list; | ||
130 | |||
131 | /* Pointer to an auxiliary buffer (1 page) */ | ||
132 | static void *buffer; | ||
89 | 133 | ||
90 | #define PG_ANY 0 | 134 | #define PG_ANY 0 |
91 | #define PG_SAFE 1 | 135 | #define PG_SAFE 1 |
@@ -94,6 +138,19 @@ static void *buffer; | |||
94 | 138 | ||
95 | static unsigned int allocated_unsafe_pages; | 139 | static unsigned int allocated_unsafe_pages; |
96 | 140 | ||
141 | /** | ||
142 | * get_image_page - Allocate a page for a hibernation image. | ||
143 | * @gfp_mask: GFP mask for the allocation. | ||
144 | * @safe_needed: Get pages that were not used before hibernation (restore only) | ||
145 | * | ||
146 | * During image restoration, for storing the PBE list and the image data, we can | ||
147 | * only use memory pages that do not conflict with the pages used before | ||
148 | * hibernation. The "unsafe" pages have PageNosaveFree set and we count them | ||
149 | * using allocated_unsafe_pages. | ||
150 | * | ||
151 | * Each allocated image page is marked as PageNosave and PageNosaveFree so that | ||
152 | * swsusp_free() can release it. | ||
153 | */ | ||
97 | static void *get_image_page(gfp_t gfp_mask, int safe_needed) | 154 | static void *get_image_page(gfp_t gfp_mask, int safe_needed) |
98 | { | 155 | { |
99 | void *res; | 156 | void *res; |
@@ -113,9 +170,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) | |||
113 | return res; | 170 | return res; |
114 | } | 171 | } |
115 | 172 | ||
173 | static void *__get_safe_page(gfp_t gfp_mask) | ||
174 | { | ||
175 | if (safe_pages_list) { | ||
176 | void *ret = safe_pages_list; | ||
177 | |||
178 | safe_pages_list = safe_pages_list->next; | ||
179 | memset(ret, 0, PAGE_SIZE); | ||
180 | return ret; | ||
181 | } | ||
182 | return get_image_page(gfp_mask, PG_SAFE); | ||
183 | } | ||
184 | |||
116 | unsigned long get_safe_page(gfp_t gfp_mask) | 185 | unsigned long get_safe_page(gfp_t gfp_mask) |
117 | { | 186 | { |
118 | return (unsigned long)get_image_page(gfp_mask, PG_SAFE); | 187 | return (unsigned long)__get_safe_page(gfp_mask); |
119 | } | 188 | } |
120 | 189 | ||
121 | static struct page *alloc_image_page(gfp_t gfp_mask) | 190 | static struct page *alloc_image_page(gfp_t gfp_mask) |
@@ -130,11 +199,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask) | |||
130 | return page; | 199 | return page; |
131 | } | 200 | } |
132 | 201 | ||
202 | static void recycle_safe_page(void *page_address) | ||
203 | { | ||
204 | struct linked_page *lp = page_address; | ||
205 | |||
206 | lp->next = safe_pages_list; | ||
207 | safe_pages_list = lp; | ||
208 | } | ||
209 | |||
133 | /** | 210 | /** |
134 | * free_image_page - free page represented by @addr, allocated with | 211 | * free_image_page - Free a page allocated for hibernation image. |
135 | * get_image_page (page flags set by it must be cleared) | 212 | * @addr: Address of the page to free. |
213 | * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page. | ||
214 | * | ||
215 | * The page to free should have been allocated by get_image_page() (page flags | ||
216 | * set by it are affected). | ||
136 | */ | 217 | */ |
137 | |||
138 | static inline void free_image_page(void *addr, int clear_nosave_free) | 218 | static inline void free_image_page(void *addr, int clear_nosave_free) |
139 | { | 219 | { |
140 | struct page *page; | 220 | struct page *page; |
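__get_safe_page() and recycle_safe_page() turn safe_pages_list into a LIFO pool threaded through the free pages themselves: recycling pushes a page onto the list, allocation pops one and falls back to get_image_page(..., PG_SAFE) when the pool is empty. A standalone model of the same idea, reusing the struct linked_page layout from this file (pool_push()/pool_pop() are illustrative names):

    static struct linked_page *pool;        /* plays the role of safe_pages_list */

    static void pool_push(void *page_address)       /* cf. recycle_safe_page() */
    {
            struct linked_page *lp = page_address;

            lp->next = pool;
            pool = lp;
    }

    static void *pool_pop(void)                     /* cf. __get_safe_page() */
    {
            struct linked_page *lp = pool;

            if (lp)
                    pool = lp->next;
            return lp;      /* NULL means: fall back to a fresh allocation */
    }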
@@ -150,17 +230,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free) | |||
150 | __free_page(page); | 230 | __free_page(page); |
151 | } | 231 | } |
152 | 232 | ||
153 | /* struct linked_page is used to build chains of pages */ | 233 | static inline void free_list_of_pages(struct linked_page *list, |
154 | 234 | int clear_page_nosave) | |
155 | #define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) | ||
156 | |||
157 | struct linked_page { | ||
158 | struct linked_page *next; | ||
159 | char data[LINKED_PAGE_DATA_SIZE]; | ||
160 | } __packed; | ||
161 | |||
162 | static inline void | ||
163 | free_list_of_pages(struct linked_page *list, int clear_page_nosave) | ||
164 | { | 235 | { |
165 | while (list) { | 236 | while (list) { |
166 | struct linked_page *lp = list->next; | 237 | struct linked_page *lp = list->next; |
@@ -170,30 +241,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave) | |||
170 | } | 241 | } |
171 | } | 242 | } |
172 | 243 | ||
173 | /** | 244 | /* |
174 | * struct chain_allocator is used for allocating small objects out of | 245 | * struct chain_allocator is used for allocating small objects out of |
175 | * a linked list of pages called 'the chain'. | 246 | * a linked list of pages called 'the chain'. |
176 | * | 247 | * |
177 | * The chain grows each time when there is no room for a new object in | 248 | * The chain grows each time when there is no room for a new object in |
178 | * the current page. The allocated objects cannot be freed individually. | 249 | * the current page. The allocated objects cannot be freed individually. |
179 | * It is only possible to free them all at once, by freeing the entire | 250 | * It is only possible to free them all at once, by freeing the entire |
180 | * chain. | 251 | * chain. |
181 | * | 252 | * |
182 | * NOTE: The chain allocator may be inefficient if the allocated objects | 253 | * NOTE: The chain allocator may be inefficient if the allocated objects |
183 | * are not much smaller than PAGE_SIZE. | 254 | * are not much smaller than PAGE_SIZE. |
184 | */ | 255 | */ |
185 | |||
186 | struct chain_allocator { | 256 | struct chain_allocator { |
187 | struct linked_page *chain; /* the chain */ | 257 | struct linked_page *chain; /* the chain */ |
188 | unsigned int used_space; /* total size of objects allocated out | 258 | unsigned int used_space; /* total size of objects allocated out |
189 | * of the current page | 259 | of the current page */ |
190 | */ | ||
191 | gfp_t gfp_mask; /* mask for allocating pages */ | 260 | gfp_t gfp_mask; /* mask for allocating pages */ |
192 | int safe_needed; /* if set, only "safe" pages are allocated */ | 261 | int safe_needed; /* if set, only "safe" pages are allocated */ |
193 | }; | 262 | }; |
194 | 263 | ||
195 | static void | 264 | static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask, |
196 | chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) | 265 | int safe_needed) |
197 | { | 266 | { |
198 | ca->chain = NULL; | 267 | ca->chain = NULL; |
199 | ca->used_space = LINKED_PAGE_DATA_SIZE; | 268 | ca->used_space = LINKED_PAGE_DATA_SIZE; |
@@ -208,7 +277,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) | |||
208 | if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { | 277 | if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { |
209 | struct linked_page *lp; | 278 | struct linked_page *lp; |
210 | 279 | ||
211 | lp = get_image_page(ca->gfp_mask, ca->safe_needed); | 280 | lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) : |
281 | get_image_page(ca->gfp_mask, PG_ANY); | ||
212 | if (!lp) | 282 | if (!lp) |
213 | return NULL; | 283 | return NULL; |
214 | 284 | ||
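chain_alloc() is a bump allocator over a chain of linked pages: when the current page cannot fit the request, a new page is pushed onto ca->chain and the object is carved out of it. Rough usage sketch, as in memory_bm_create(); build_rtree_nodes() is a made-up wrapper and the real code releases the chain through memory_bm_free():

    static int build_rtree_nodes(void)
    {
            struct chain_allocator ca;
            struct rtree_node *node;

            chain_init(&ca, GFP_KERNEL, PG_ANY);
            node = chain_alloc(&ca, sizeof(struct rtree_node));
            if (!node)
                    return -ENOMEM;
            /* ... allocate further small objects from &ca ... */
            free_list_of_pages(ca.chain, 1);        /* everything freed at once */
            return 0;
    }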
@@ -222,44 +292,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) | |||
222 | } | 292 | } |
223 | 293 | ||
224 | /** | 294 | /** |
225 | * Data types related to memory bitmaps. | 295 | * Data types related to memory bitmaps. |
226 | * | 296 | * |
227 | * Memory bitmap is a structure consiting of many linked lists of | 297 | * Memory bitmap is a structure consiting of many linked lists of |
228 | * objects. The main list's elements are of type struct zone_bitmap | 298 | * objects. The main list's elements are of type struct zone_bitmap |
229 | * and each of them corresonds to one zone. For each zone bitmap | 299 | * and each of them corresonds to one zone. For each zone bitmap |
230 | * object there is a list of objects of type struct bm_block that | 300 | * object there is a list of objects of type struct bm_block that |
231 | * represent each blocks of bitmap in which information is stored. | 301 | * represent each blocks of bitmap in which information is stored. |
232 | * | 302 | * |
233 | * struct memory_bitmap contains a pointer to the main list of zone | 303 | * struct memory_bitmap contains a pointer to the main list of zone |
234 | * bitmap objects, a struct bm_position used for browsing the bitmap, | 304 | * bitmap objects, a struct bm_position used for browsing the bitmap, |
235 | * and a pointer to the list of pages used for allocating all of the | 305 | * and a pointer to the list of pages used for allocating all of the |
236 | * zone bitmap objects and bitmap block objects. | 306 | * zone bitmap objects and bitmap block objects. |
237 | * | 307 | * |
238 | * NOTE: It has to be possible to lay out the bitmap in memory | 308 | * NOTE: It has to be possible to lay out the bitmap in memory |
239 | * using only allocations of order 0. Additionally, the bitmap is | 309 | * using only allocations of order 0. Additionally, the bitmap is |
240 | * designed to work with arbitrary number of zones (this is over the | 310 | * designed to work with arbitrary number of zones (this is over the |
241 | * top for now, but let's avoid making unnecessary assumptions ;-). | 311 | * top for now, but let's avoid making unnecessary assumptions ;-). |
242 | * | 312 | * |
243 | * struct zone_bitmap contains a pointer to a list of bitmap block | 313 | * struct zone_bitmap contains a pointer to a list of bitmap block |
244 | * objects and a pointer to the bitmap block object that has been | 314 | * objects and a pointer to the bitmap block object that has been |
245 | * most recently used for setting bits. Additionally, it contains the | 315 | * most recently used for setting bits. Additionally, it contains the |
246 | * pfns that correspond to the start and end of the represented zone. | 316 | * PFNs that correspond to the start and end of the represented zone. |
247 | * | 317 | * |
248 | * struct bm_block contains a pointer to the memory page in which | 318 | * struct bm_block contains a pointer to the memory page in which |
249 | * information is stored (in the form of a block of bitmap) | 319 | * information is stored (in the form of a block of bitmap) |
250 | * It also contains the pfns that correspond to the start and end of | 320 | * It also contains the pfns that correspond to the start and end of |
251 | * the represented memory area. | 321 | * the represented memory area. |
252 | * | 322 | * |
253 | * The memory bitmap is organized as a radix tree to guarantee fast random | 323 | * The memory bitmap is organized as a radix tree to guarantee fast random |
254 | * access to the bits. There is one radix tree for each zone (as returned | 324 | * access to the bits. There is one radix tree for each zone (as returned |
255 | * from create_mem_extents). | 325 | * from create_mem_extents). |
256 | * | 326 | * |
257 | * One radix tree is represented by one struct mem_zone_bm_rtree. There are | 327 | * One radix tree is represented by one struct mem_zone_bm_rtree. There are |
258 | * two linked lists for the nodes of the tree, one for the inner nodes and | 328 | * two linked lists for the nodes of the tree, one for the inner nodes and |
259 | * one for the leave nodes. The linked leave nodes are used for fast linear | 329 | * one for the leave nodes. The linked leave nodes are used for fast linear |
260 | * access of the memory bitmap. | 330 | * access of the memory bitmap. |
261 | * | 331 | * |
262 | * The struct rtree_node represents one node of the radix tree. | 332 | * The struct rtree_node represents one node of the radix tree. |
263 | */ | 333 | */ |
264 | 334 | ||
265 | #define BM_END_OF_MAP (~0UL) | 335 | #define BM_END_OF_MAP (~0UL) |
@@ -305,9 +375,8 @@ struct bm_position { | |||
305 | struct memory_bitmap { | 375 | struct memory_bitmap { |
306 | struct list_head zones; | 376 | struct list_head zones; |
307 | struct linked_page *p_list; /* list of pages used to store zone | 377 | struct linked_page *p_list; /* list of pages used to store zone |
308 | * bitmap objects and bitmap block | 378 | bitmap objects and bitmap block |
309 | * objects | 379 | objects */ |
310 | */ | ||
311 | struct bm_position cur; /* most recently used bit position */ | 380 | struct bm_position cur; /* most recently used bit position */ |
312 | }; | 381 | }; |
313 | 382 | ||
@@ -321,12 +390,12 @@ struct memory_bitmap { | |||
321 | #endif | 390 | #endif |
322 | #define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) | 391 | #define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) |
323 | 392 | ||
324 | /* | 393 | /** |
325 | * alloc_rtree_node - Allocate a new node and add it to the radix tree. | 394 | * alloc_rtree_node - Allocate a new node and add it to the radix tree. |
326 | * | 395 | * |
327 | * This function is used to allocate inner nodes as well as the | 396 | * This function is used to allocate inner nodes as well as the |
328 | * leave nodes of the radix tree. It also adds the node to the | 397 | * leave nodes of the radix tree. It also adds the node to the |
329 | * corresponding linked list passed in by the *list parameter. | 398 | * corresponding linked list passed in by the *list parameter. |
330 | */ | 399 | */ |
331 | static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, | 400 | static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, |
332 | struct chain_allocator *ca, | 401 | struct chain_allocator *ca, |
@@ -347,12 +416,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, | |||
347 | return node; | 416 | return node; |
348 | } | 417 | } |
349 | 418 | ||
350 | /* | 419 | /** |
351 | * add_rtree_block - Add a new leave node to the radix tree | 420 | * add_rtree_block - Add a new leave node to the radix tree. |
352 | * | 421 | * |
353 | * The leave nodes need to be allocated in order to keep the leaves | 422 | * The leave nodes need to be allocated in order to keep the leaves |
354 | * linked list in order. This is guaranteed by the zone->blocks | 423 | * linked list in order. This is guaranteed by the zone->blocks |
355 | * counter. | 424 | * counter. |
356 | */ | 425 | */ |
357 | static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, | 426 | static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, |
358 | int safe_needed, struct chain_allocator *ca) | 427 | int safe_needed, struct chain_allocator *ca) |
@@ -417,17 +486,18 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, | |||
417 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, | 486 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, |
418 | int clear_nosave_free); | 487 | int clear_nosave_free); |
419 | 488 | ||
420 | /* | 489 | /** |
421 | * create_zone_bm_rtree - create a radix tree for one zone | 490 | * create_zone_bm_rtree - Create a radix tree for one zone. |
422 | * | 491 | * |
423 | * Allocated the mem_zone_bm_rtree structure and initializes it. | 492 | * Allocated the mem_zone_bm_rtree structure and initializes it. |
424 | * This function also allocated and builds the radix tree for the | 493 | * This function also allocated and builds the radix tree for the |
425 | * zone. | 494 | * zone. |
426 | */ | 495 | */ |
427 | static struct mem_zone_bm_rtree * | 496 | static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask, |
428 | create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, | 497 | int safe_needed, |
429 | struct chain_allocator *ca, | 498 | struct chain_allocator *ca, |
430 | unsigned long start, unsigned long end) | 499 | unsigned long start, |
500 | unsigned long end) | ||
431 | { | 501 | { |
432 | struct mem_zone_bm_rtree *zone; | 502 | struct mem_zone_bm_rtree *zone; |
433 | unsigned int i, nr_blocks; | 503 | unsigned int i, nr_blocks; |
@@ -454,12 +524,12 @@ create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, | |||
454 | return zone; | 524 | return zone; |
455 | } | 525 | } |
456 | 526 | ||
457 | /* | 527 | /** |
458 | * free_zone_bm_rtree - Free the memory of the radix tree | 528 | * free_zone_bm_rtree - Free the memory of the radix tree. |
459 | * | 529 | * |
460 | * Free all node pages of the radix tree. The mem_zone_bm_rtree | 530 | * Free all node pages of the radix tree. The mem_zone_bm_rtree |
461 | * structure itself is not freed here nor are the rtree_node | 531 | * structure itself is not freed here nor are the rtree_node |
462 | * structs. | 532 | * structs. |
463 | */ | 533 | */ |
464 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, | 534 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, |
465 | int clear_nosave_free) | 535 | int clear_nosave_free) |
@@ -492,8 +562,8 @@ struct mem_extent { | |||
492 | }; | 562 | }; |
493 | 563 | ||
494 | /** | 564 | /** |
495 | * free_mem_extents - free a list of memory extents | 565 | * free_mem_extents - Free a list of memory extents. |
496 | * @list - list of extents to empty | 566 | * @list: List of extents to free. |
497 | */ | 567 | */ |
498 | static void free_mem_extents(struct list_head *list) | 568 | static void free_mem_extents(struct list_head *list) |
499 | { | 569 | { |
@@ -506,10 +576,11 @@ static void free_mem_extents(struct list_head *list) | |||
506 | } | 576 | } |
507 | 577 | ||
508 | /** | 578 | /** |
509 | * create_mem_extents - create a list of memory extents representing | 579 | * create_mem_extents - Create a list of memory extents. |
510 | * contiguous ranges of PFNs | 580 | * @list: List to put the extents into. |
511 | * @list - list to put the extents into | 581 | * @gfp_mask: Mask to use for memory allocations. |
512 | * @gfp_mask - mask to use for memory allocations | 582 | * |
583 | * The extents represent contiguous ranges of PFNs. | ||
513 | */ | 584 | */ |
514 | static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) | 585 | static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) |
515 | { | 586 | { |
@@ -565,10 +636,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) | |||
565 | } | 636 | } |
566 | 637 | ||
567 | /** | 638 | /** |
568 | * memory_bm_create - allocate memory for a memory bitmap | 639 | * memory_bm_create - Allocate memory for a memory bitmap. |
569 | */ | 640 | */ |
570 | static int | 641 | static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, |
571 | memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | 642 | int safe_needed) |
572 | { | 643 | { |
573 | struct chain_allocator ca; | 644 | struct chain_allocator ca; |
574 | struct list_head mem_extents; | 645 | struct list_head mem_extents; |
@@ -607,8 +678,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
607 | } | 678 | } |
608 | 679 | ||
609 | /** | 680 | /** |
610 | * memory_bm_free - free memory occupied by the memory bitmap @bm | 681 | * memory_bm_free - Free memory occupied by the memory bitmap. |
611 | */ | 682 | * @bm: Memory bitmap. |
683 | */ | ||
612 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) | 684 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) |
613 | { | 685 | { |
614 | struct mem_zone_bm_rtree *zone; | 686 | struct mem_zone_bm_rtree *zone; |
@@ -622,14 +694,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) | |||
622 | } | 694 | } |
623 | 695 | ||
624 | /** | 696 | /** |
625 | * memory_bm_find_bit - Find the bit for pfn in the memory | 697 | * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap. |
626 | * bitmap | ||
627 | * | 698 | * |
628 | * Find the bit in the bitmap @bm that corresponds to given pfn. | 699 | * Find the bit in memory bitmap @bm that corresponds to the given PFN. |
629 | * The cur.zone, cur.block and cur.node_pfn member of @bm are | 700 | * The cur.zone, cur.block and cur.node_pfn members of @bm are updated. |
630 | * updated. | 701 | * |
631 | * It walks the radix tree to find the page which contains the bit for | 702 | * Walk the radix tree to find the page containing the bit that represents @pfn |
632 | * pfn and returns the bit position in **addr and *bit_nr. | 703 | * and return the position of the bit in @addr and @bit_nr. |
633 | */ | 704 | */ |
634 | static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, | 705 | static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, |
635 | void **addr, unsigned int *bit_nr) | 706 | void **addr, unsigned int *bit_nr) |
@@ -658,10 +729,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, | |||
658 | 729 | ||
659 | zone_found: | 730 | zone_found: |
660 | /* | 731 | /* |
661 | * We have a zone. Now walk the radix tree to find the leave | 732 | * We have found the zone. Now walk the radix tree to find the leaf node |
662 | * node for our pfn. | 733 | * for our PFN. |
663 | */ | 734 | */ |
664 | |||
665 | node = bm->cur.node; | 735 | node = bm->cur.node; |
666 | if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) | 736 | if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) |
667 | goto node_found; | 737 | goto node_found; |
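For reference, the walk that memory_bm_find_bit() performs once the zone is known: the block number derived from the PFN is split into BM_RTREE_LEVEL_SHIFT-sized indices, one per inner level, and the low BM_BLOCK_SHIFT bits select the bit inside the leaf page. Condensed from the existing function body (unchanged by this patch):

    block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
    node = zone->rtree;
    for (i = zone->levels; i > 0; i--) {
            int index;

            index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
            index &= BM_RTREE_LEVEL_MASK;
            node = (struct rtree_node *)node->data[index];
    }
    *addr = node->data;
    *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK;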
@@ -754,14 +824,14 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) | |||
754 | } | 824 | } |
755 | 825 | ||
756 | /* | 826 | /* |
757 | * rtree_next_node - Jumps to the next leave node | 827 | * rtree_next_node - Jump to the next leaf node. |
758 | * | 828 | * |
759 | * Sets the position to the beginning of the next node in the | 829 | * Set the position to the beginning of the next node in the |
760 | * memory bitmap. This is either the next node in the current | 830 | * memory bitmap. This is either the next node in the current |
761 | * zone's radix tree or the first node in the radix tree of the | 831 | * zone's radix tree or the first node in the radix tree of the |
762 | * next zone. | 832 | * next zone. |
763 | * | 833 | * |
764 | * Returns true if there is a next node, false otherwise. | 834 | * Return true if there is a next node, false otherwise. |
765 | */ | 835 | */ |
766 | static bool rtree_next_node(struct memory_bitmap *bm) | 836 | static bool rtree_next_node(struct memory_bitmap *bm) |
767 | { | 837 | { |
@@ -790,14 +860,15 @@ static bool rtree_next_node(struct memory_bitmap *bm) | |||
790 | } | 860 | } |
791 | 861 | ||
792 | /** | 862 | /** |
793 | * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm | 863 | * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap. |
864 | * @bm: Memory bitmap. | ||
794 | * | 865 | * |
795 | * Starting from the last returned position this function searches | 866 | * Starting from the last returned position this function searches for the next |
796 | * for the next set bit in the memory bitmap and returns its | 867 | * set bit in @bm and returns the PFN represented by it. If no more bits are |
797 | * number. If no more bit is set BM_END_OF_MAP is returned. | 868 | * set, BM_END_OF_MAP is returned. |
798 | * | 869 | * |
799 | * It is required to run memory_bm_position_reset() before the | 870 | * It is required to run memory_bm_position_reset() before the first call to |
800 | * first call to this function. | 871 | * this function for the given memory bitmap. |
801 | */ | 872 | */ |
802 | static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) | 873 | static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) |
803 | { | 874 | { |
@@ -819,11 +890,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) | |||
819 | return BM_END_OF_MAP; | 890 | return BM_END_OF_MAP; |
820 | } | 891 | } |
821 | 892 | ||
822 | /** | 893 | /* |
823 | * This structure represents a range of page frames the contents of which | 894 | * This structure represents a range of page frames the contents of which |
824 | * should not be saved during the suspend. | 895 | * should not be saved during hibernation. |
825 | */ | 896 | */ |
826 | |||
827 | struct nosave_region { | 897 | struct nosave_region { |
828 | struct list_head list; | 898 | struct list_head list; |
829 | unsigned long start_pfn; | 899 | unsigned long start_pfn; |
@@ -832,15 +902,42 @@ struct nosave_region { | |||
832 | 902 | ||
833 | static LIST_HEAD(nosave_regions); | 903 | static LIST_HEAD(nosave_regions); |
834 | 904 | ||
905 | static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone) | ||
906 | { | ||
907 | struct rtree_node *node; | ||
908 | |||
909 | list_for_each_entry(node, &zone->nodes, list) | ||
910 | recycle_safe_page(node->data); | ||
911 | |||
912 | list_for_each_entry(node, &zone->leaves, list) | ||
913 | recycle_safe_page(node->data); | ||
914 | } | ||
915 | |||
916 | static void memory_bm_recycle(struct memory_bitmap *bm) | ||
917 | { | ||
918 | struct mem_zone_bm_rtree *zone; | ||
919 | struct linked_page *p_list; | ||
920 | |||
921 | list_for_each_entry(zone, &bm->zones, list) | ||
922 | recycle_zone_bm_rtree(zone); | ||
923 | |||
924 | p_list = bm->p_list; | ||
925 | while (p_list) { | ||
926 | struct linked_page *lp = p_list; | ||
927 | |||
928 | p_list = lp->next; | ||
929 | recycle_safe_page(lp); | ||
930 | } | ||
931 | } | ||
932 | |||
835 | /** | 933 | /** |
836 | * register_nosave_region - register a range of page frames the contents | 934 | * register_nosave_region - Register a region of unsaveable memory. |
837 | * of which should not be saved during the suspend (to be used in the early | 935 | * |
838 | * initialization code) | 936 | * Register a range of page frames the contents of which should not be saved |
937 | * during hibernation (to be used in the early initialization code). | ||
839 | */ | 938 | */ |
840 | 939 | void __init __register_nosave_region(unsigned long start_pfn, | |
841 | void __init | 940 | unsigned long end_pfn, int use_kmalloc) |
842 | __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, | ||
843 | int use_kmalloc) | ||
844 | { | 941 | { |
845 | struct nosave_region *region; | 942 | struct nosave_region *region; |
846 | 943 | ||
@@ -857,12 +954,13 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, | |||
857 | } | 954 | } |
858 | } | 955 | } |
859 | if (use_kmalloc) { | 956 | if (use_kmalloc) { |
860 | /* during init, this shouldn't fail */ | 957 | /* During init, this shouldn't fail */ |
861 | region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); | 958 | region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); |
862 | BUG_ON(!region); | 959 | BUG_ON(!region); |
863 | } else | 960 | } else { |
864 | /* This allocation cannot fail */ | 961 | /* This allocation cannot fail */ |
865 | region = memblock_virt_alloc(sizeof(struct nosave_region), 0); | 962 | region = memblock_virt_alloc(sizeof(struct nosave_region), 0); |
963 | } | ||
866 | region->start_pfn = start_pfn; | 964 | region->start_pfn = start_pfn; |
867 | region->end_pfn = end_pfn; | 965 | region->end_pfn = end_pfn; |
868 | list_add_tail(&region->list, &nosave_regions); | 966 | list_add_tail(&region->list, &nosave_regions); |
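__register_nosave_region() is normally reached through the register_nosave_region() wrapper from early arch or firmware code, before the page allocator is up (hence the memblock fallback above). A hedged usage sketch; the PFN range is made up for illustration:

    /* Keep a firmware-owned range out of the hibernation image. */
    register_nosave_region(PFN_DOWN(0x000a0000), PFN_UP(0x00100000));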
@@ -923,10 +1021,12 @@ static void swsusp_unset_page_forbidden(struct page *page) | |||
923 | } | 1021 | } |
924 | 1022 | ||
925 | /** | 1023 | /** |
926 | * mark_nosave_pages - set bits corresponding to the page frames the | 1024 | * mark_nosave_pages - Mark pages that should not be saved. |
927 | * contents of which should not be saved in a given bitmap. | 1025 | * @bm: Memory bitmap. |
1026 | * | ||
1027 | * Set the bits in @bm that correspond to the page frames the contents of which | ||
1028 | * should not be saved. | ||
928 | */ | 1029 | */ |
929 | |||
930 | static void mark_nosave_pages(struct memory_bitmap *bm) | 1030 | static void mark_nosave_pages(struct memory_bitmap *bm) |
931 | { | 1031 | { |
932 | struct nosave_region *region; | 1032 | struct nosave_region *region; |
@@ -956,13 +1056,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
956 | } | 1056 | } |
957 | 1057 | ||
958 | /** | 1058 | /** |
959 | * create_basic_memory_bitmaps - create bitmaps needed for marking page | 1059 | * create_basic_memory_bitmaps - Create bitmaps to hold basic page information. |
960 | * frames that should not be saved and free page frames. The pointers | 1060 | * |
961 | * forbidden_pages_map and free_pages_map are only modified if everything | 1061 | * Create bitmaps needed for marking page frames that should not be saved and |
962 | * goes well, because we don't want the bits to be used before both bitmaps | 1062 | * free page frames. The forbidden_pages_map and free_pages_map pointers are |
963 | * are set up. | 1063 | * only modified if everything goes well, because we don't want the bits to be |
1064 | * touched before both bitmaps are set up. | ||
964 | */ | 1065 | */ |
965 | |||
966 | int create_basic_memory_bitmaps(void) | 1066 | int create_basic_memory_bitmaps(void) |
967 | { | 1067 | { |
968 | struct memory_bitmap *bm1, *bm2; | 1068 | struct memory_bitmap *bm1, *bm2; |
@@ -1007,12 +1107,12 @@ int create_basic_memory_bitmaps(void) | |||
1007 | } | 1107 | } |
1008 | 1108 | ||
1009 | /** | 1109 | /** |
1010 | * free_basic_memory_bitmaps - free memory bitmaps allocated by | 1110 | * free_basic_memory_bitmaps - Free memory bitmaps holding basic information. |
1011 | * create_basic_memory_bitmaps(). The auxiliary pointers are necessary | 1111 | * |
1012 | * so that the bitmaps themselves are not referred to while they are being | 1112 | * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The |
1013 | * freed. | 1113 | * auxiliary pointers are necessary so that the bitmaps themselves are not |
1114 | * referred to while they are being freed. | ||
1014 | */ | 1115 | */ |
1015 | |||
1016 | void free_basic_memory_bitmaps(void) | 1116 | void free_basic_memory_bitmaps(void) |
1017 | { | 1117 | { |
1018 | struct memory_bitmap *bm1, *bm2; | 1118 | struct memory_bitmap *bm1, *bm2; |
@@ -1033,11 +1133,13 @@ void free_basic_memory_bitmaps(void) | |||
1033 | } | 1133 | } |
1034 | 1134 | ||
1035 | /** | 1135 | /** |
1036 | * snapshot_additional_pages - estimate the number of additional pages | 1136 | * snapshot_additional_pages - Estimate the number of extra pages needed. |
1037 | * be needed for setting up the suspend image data structures for given | 1137 | * @zone: Memory zone to carry out the computation for. |
1038 | * zone (usually the returned value is greater than the exact number) | 1138 | * |
1139 | * Estimate the number of additional pages needed for setting up a hibernation | ||
1140 | * image data structures for @zone (usually, the returned value is greater than | ||
1141 | * the exact number). | ||
1039 | */ | 1142 | */ |
1040 | |||
1041 | unsigned int snapshot_additional_pages(struct zone *zone) | 1143 | unsigned int snapshot_additional_pages(struct zone *zone) |
1042 | { | 1144 | { |
1043 | unsigned int rtree, nodes; | 1145 | unsigned int rtree, nodes; |
@@ -1055,10 +1157,10 @@ unsigned int snapshot_additional_pages(struct zone *zone) | |||
1055 | 1157 | ||
1056 | #ifdef CONFIG_HIGHMEM | 1158 | #ifdef CONFIG_HIGHMEM |
1057 | /** | 1159 | /** |
1058 | * count_free_highmem_pages - compute the total number of free highmem | 1160 | * count_free_highmem_pages - Compute the total number of free highmem pages. |
1059 | * pages, system-wide. | 1161 | * |
1162 | * The returned number is system-wide. | ||
1060 | */ | 1163 | */ |
1061 | |||
1062 | static unsigned int count_free_highmem_pages(void) | 1164 | static unsigned int count_free_highmem_pages(void) |
1063 | { | 1165 | { |
1064 | struct zone *zone; | 1166 | struct zone *zone; |
@@ -1072,11 +1174,12 @@ static unsigned int count_free_highmem_pages(void) | |||
1072 | } | 1174 | } |
1073 | 1175 | ||
1074 | /** | 1176 | /** |
1075 | * saveable_highmem_page - Determine whether a highmem page should be | 1177 | * saveable_highmem_page - Check if a highmem page is saveable. |
1076 | * included in the suspend image. | ||
1077 | * | 1178 | * |
1078 | * We should save the page if it isn't Nosave or NosaveFree, or Reserved, | 1179 | * Determine whether a highmem page should be included in a hibernation image. |
1079 | * and it isn't a part of a free chunk of pages. | 1180 | * |
1181 | * We should save the page if it isn't Nosave or NosaveFree, or Reserved, | ||
1182 | * and it isn't part of a free chunk of pages. | ||
1080 | */ | 1183 | */ |
1081 | static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) | 1184 | static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) |
1082 | { | 1185 | { |
@@ -1102,10 +1205,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) | |||
1102 | } | 1205 | } |
1103 | 1206 | ||
1104 | /** | 1207 | /** |
1105 | * count_highmem_pages - compute the total number of saveable highmem | 1208 | * count_highmem_pages - Compute the total number of saveable highmem pages. |
1106 | * pages. | ||
1107 | */ | 1209 | */ |
1108 | |||
1109 | static unsigned int count_highmem_pages(void) | 1210 | static unsigned int count_highmem_pages(void) |
1110 | { | 1211 | { |
1111 | struct zone *zone; | 1212 | struct zone *zone; |
@@ -1133,12 +1234,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p) | |||
1133 | #endif /* CONFIG_HIGHMEM */ | 1234 | #endif /* CONFIG_HIGHMEM */ |
1134 | 1235 | ||
1135 | /** | 1236 | /** |
1136 | * saveable_page - Determine whether a non-highmem page should be included | 1237 | * saveable_page - Check if the given page is saveable. |
1137 | * in the suspend image. | ||
1138 | * | 1238 | * |
1139 | * We should save the page if it isn't Nosave, and is not in the range | 1239 | * Determine whether a non-highmem page should be included in a hibernation |
1140 | * of pages statically defined as 'unsaveable', and it isn't a part of | 1240 | * image. |
1141 | * a free chunk of pages. | 1241 | * |
1242 | * We should save the page if it isn't Nosave, and is not in the range | ||
1243 | * of pages statically defined as 'unsaveable', and it isn't part of | ||
1244 | * a free chunk of pages. | ||
1142 | */ | 1245 | */ |
1143 | static struct page *saveable_page(struct zone *zone, unsigned long pfn) | 1246 | static struct page *saveable_page(struct zone *zone, unsigned long pfn) |
1144 | { | 1247 | { |
@@ -1167,10 +1270,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) | |||
1167 | } | 1270 | } |
1168 | 1271 | ||
1169 | /** | 1272 | /** |
1170 | * count_data_pages - compute the total number of saveable non-highmem | 1273 | * count_data_pages - Compute the total number of saveable non-highmem pages. |
1171 | * pages. | ||
1172 | */ | 1274 | */ |
1173 | |||
1174 | static unsigned int count_data_pages(void) | 1275 | static unsigned int count_data_pages(void) |
1175 | { | 1276 | { |
1176 | struct zone *zone; | 1277 | struct zone *zone; |
@@ -1190,7 +1291,8 @@ static unsigned int count_data_pages(void) | |||
1190 | return n; | 1291 | return n; |
1191 | } | 1292 | } |
1192 | 1293 | ||
1193 | /* This is needed, because copy_page and memcpy are not usable for copying | 1294 | /* |
1295 | * This is needed, because copy_page and memcpy are not usable for copying | ||
1194 | * task structs. | 1296 | * task structs. |
1195 | */ | 1297 | */ |
1196 | static inline void do_copy_page(long *dst, long *src) | 1298 | static inline void do_copy_page(long *dst, long *src) |
@@ -1201,12 +1303,12 @@ static inline void do_copy_page(long *dst, long *src) | |||
1201 | *dst++ = *src++; | 1303 | *dst++ = *src++; |
1202 | } | 1304 | } |
1203 | 1305 | ||
1204 | |||
1205 | /** | 1306 | /** |
1206 | * safe_copy_page - check if the page we are going to copy is marked as | 1307 | * safe_copy_page - Copy a page in a safe way. |
1207 | * present in the kernel page tables (this always is the case if | 1308 | * |
1208 | * CONFIG_DEBUG_PAGEALLOC is not set and in that case | 1309 | * Check if the page we are going to copy is marked as present in the kernel |
1209 | * kernel_page_present() always returns 'true'). | 1310 | * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set |
1311 | * and in that case kernel_page_present() always returns 'true'). | ||
1210 | */ | 1312 | */ |
1211 | static void safe_copy_page(void *dst, struct page *s_page) | 1313 | static void safe_copy_page(void *dst, struct page *s_page) |
1212 | { | 1314 | { |
@@ -1219,10 +1321,8 @@ static void safe_copy_page(void *dst, struct page *s_page) | |||
1219 | } | 1321 | } |
1220 | } | 1322 | } |
1221 | 1323 | ||
1222 | |||
1223 | #ifdef CONFIG_HIGHMEM | 1324 | #ifdef CONFIG_HIGHMEM |
1224 | static inline struct page * | 1325 | static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn) |
1225 | page_is_saveable(struct zone *zone, unsigned long pfn) | ||
1226 | { | 1326 | { |
1227 | return is_highmem(zone) ? | 1327 | return is_highmem(zone) ? |
1228 | saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); | 1328 | saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); |
@@ -1243,7 +1343,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | |||
1243 | kunmap_atomic(src); | 1343 | kunmap_atomic(src); |
1244 | } else { | 1344 | } else { |
1245 | if (PageHighMem(d_page)) { | 1345 | if (PageHighMem(d_page)) { |
1246 | /* Page pointed to by src may contain some kernel | 1346 | /* |
1347 | * The page pointed to by src may contain some kernel | ||
1247 | * data modified by kmap_atomic() | 1348 | * data modified by kmap_atomic() |
1248 | */ | 1349 | */ |
1249 | safe_copy_page(buffer, s_page); | 1350 | safe_copy_page(buffer, s_page); |
@@ -1265,8 +1366,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | |||
1265 | } | 1366 | } |
1266 | #endif /* CONFIG_HIGHMEM */ | 1367 | #endif /* CONFIG_HIGHMEM */ |
1267 | 1368 | ||
1268 | static void | 1369 | static void copy_data_pages(struct memory_bitmap *copy_bm, |
1269 | copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) | 1370 | struct memory_bitmap *orig_bm) |
1270 | { | 1371 | { |
1271 | struct zone *zone; | 1372 | struct zone *zone; |
1272 | unsigned long pfn; | 1373 | unsigned long pfn; |
@@ -1315,12 +1416,11 @@ static struct memory_bitmap orig_bm; | |||
1315 | static struct memory_bitmap copy_bm; | 1416 | static struct memory_bitmap copy_bm; |
1316 | 1417 | ||
1317 | /** | 1418 | /** |
1318 | * swsusp_free - free pages allocated for the suspend. | 1419 | * swsusp_free - Free pages allocated for hibernation image. |
1319 | * | 1420 | * |
1320 | * Suspend pages are alocated before the atomic copy is made, so we | 1421 | * Image pages are allocated before snapshot creation, so they need to be |

1321 | * need to release them after the resume. | 1422 | * released after resume. |
1322 | */ | 1423 | */ |
1323 | |||
1324 | void swsusp_free(void) | 1424 | void swsusp_free(void) |
1325 | { | 1425 | { |
1326 | unsigned long fb_pfn, fr_pfn; | 1426 | unsigned long fb_pfn, fr_pfn; |
@@ -1351,6 +1451,7 @@ loop: | |||
1351 | 1451 | ||
1352 | memory_bm_clear_current(forbidden_pages_map); | 1452 | memory_bm_clear_current(forbidden_pages_map); |
1353 | memory_bm_clear_current(free_pages_map); | 1453 | memory_bm_clear_current(free_pages_map); |
1454 | hibernate_restore_unprotect_page(page_address(page)); | ||
1354 | __free_page(page); | 1455 | __free_page(page); |
1355 | goto loop; | 1456 | goto loop; |
1356 | } | 1457 | } |
@@ -1362,6 +1463,7 @@ out: | |||
1362 | buffer = NULL; | 1463 | buffer = NULL; |
1363 | alloc_normal = 0; | 1464 | alloc_normal = 0; |
1364 | alloc_highmem = 0; | 1465 | alloc_highmem = 0; |
1466 | hibernate_restore_protection_end(); | ||
1365 | } | 1467 | } |
1366 | 1468 | ||
1367 | /* Helper functions used for the shrinking of memory. */ | 1469 | /* Helper functions used for the shrinking of memory. */ |
@@ -1369,7 +1471,7 @@ out: | |||
1369 | #define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) | 1471 | #define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) |
1370 | 1472 | ||
1371 | /** | 1473 | /** |
1372 | * preallocate_image_pages - Allocate a number of pages for hibernation image | 1474 | * preallocate_image_pages - Allocate a number of pages for hibernation image. |
1373 | * @nr_pages: Number of page frames to allocate. | 1475 | * @nr_pages: Number of page frames to allocate. |
1374 | * @mask: GFP flags to use for the allocation. | 1476 | * @mask: GFP flags to use for the allocation. |
1375 | * | 1477 | * |
@@ -1419,7 +1521,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages) | |||
1419 | } | 1521 | } |
1420 | 1522 | ||
1421 | /** | 1523 | /** |
1422 | * __fraction - Compute (an approximation of) x * (multiplier / base) | 1524 | * __fraction - Compute (an approximation of) x * (multiplier / base). |
1423 | */ | 1525 | */ |
1424 | static unsigned long __fraction(u64 x, u64 multiplier, u64 base) | 1526 | static unsigned long __fraction(u64 x, u64 multiplier, u64 base) |
1425 | { | 1527 | { |
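The body of __fraction() does not appear in these hunks; the usual way to compute x * multiplier / base without overflowing on 32-bit is the multiply-then-do_div() pattern, sketched here (not the verbatim kernel code):

    static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
    {
            x *= multiplier;
            do_div(x, base);                /* 64-bit division helper */
            return (unsigned long)x;
    }

For example, preallocate_highmem_fraction() below uses it to split a page budget proportionally: __fraction(1000, 256, 1024) yields 250 pages when a quarter of the managed pages are highmem.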
@@ -1429,8 +1531,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base) | |||
1429 | } | 1531 | } |
1430 | 1532 | ||
1431 | static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, | 1533 | static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, |
1432 | unsigned long highmem, | 1534 | unsigned long highmem, |
1433 | unsigned long total) | 1535 | unsigned long total) |
1434 | { | 1536 | { |
1435 | unsigned long alloc = __fraction(nr_pages, highmem, total); | 1537 | unsigned long alloc = __fraction(nr_pages, highmem, total); |
1436 | 1538 | ||
@@ -1443,15 +1545,15 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) | |||
1443 | } | 1545 | } |
1444 | 1546 | ||
1445 | static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, | 1547 | static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, |
1446 | unsigned long highmem, | 1548 | unsigned long highmem, |
1447 | unsigned long total) | 1549 | unsigned long total) |
1448 | { | 1550 | { |
1449 | return 0; | 1551 | return 0; |
1450 | } | 1552 | } |
1451 | #endif /* CONFIG_HIGHMEM */ | 1553 | #endif /* CONFIG_HIGHMEM */ |
1452 | 1554 | ||
1453 | /** | 1555 | /** |
1454 | * free_unnecessary_pages - Release preallocated pages not needed for the image | 1556 | * free_unnecessary_pages - Release preallocated pages not needed for the image. |
1455 | */ | 1557 | */ |
1456 | static unsigned long free_unnecessary_pages(void) | 1558 | static unsigned long free_unnecessary_pages(void) |
1457 | { | 1559 | { |
@@ -1505,7 +1607,7 @@ static unsigned long free_unnecessary_pages(void) | |||
1505 | } | 1607 | } |
1506 | 1608 | ||
1507 | /** | 1609 | /** |
1508 | * minimum_image_size - Estimate the minimum acceptable size of an image | 1610 | * minimum_image_size - Estimate the minimum acceptable size of an image. |
1509 | * @saveable: Number of saveable pages in the system. | 1611 | * @saveable: Number of saveable pages in the system. |
1510 | * | 1612 | * |
1511 | * We want to avoid attempting to free too much memory too hard, so estimate the | 1613 | * We want to avoid attempting to free too much memory too hard, so estimate the |
@@ -1525,17 +1627,17 @@ static unsigned long minimum_image_size(unsigned long saveable) | |||
1525 | unsigned long size; | 1627 | unsigned long size; |
1526 | 1628 | ||
1527 | size = global_page_state(NR_SLAB_RECLAIMABLE) | 1629 | size = global_page_state(NR_SLAB_RECLAIMABLE) |
1528 | + global_page_state(NR_ACTIVE_ANON) | 1630 | + global_node_page_state(NR_ACTIVE_ANON) |
1529 | + global_page_state(NR_INACTIVE_ANON) | 1631 | + global_node_page_state(NR_INACTIVE_ANON) |
1530 | + global_page_state(NR_ACTIVE_FILE) | 1632 | + global_node_page_state(NR_ACTIVE_FILE) |
1531 | + global_page_state(NR_INACTIVE_FILE) | 1633 | + global_node_page_state(NR_INACTIVE_FILE) |
1532 | - global_page_state(NR_FILE_MAPPED); | 1634 | - global_node_page_state(NR_FILE_MAPPED); |
1533 | 1635 | ||
1534 | return saveable <= size ? 0 : saveable - size; | 1636 | return saveable <= size ? 0 : saveable - size; |
1535 | } | 1637 | } |
1536 | 1638 | ||
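Plugging hypothetical numbers into the estimate above makes the intent clearer (all values in 4 KB pages, chosen purely for illustration):

    reclaimable = 20000     /* NR_SLAB_RECLAIMABLE */
                + 50000     /* NR_ACTIVE_ANON      */
                + 30000     /* NR_INACTIVE_ANON    */
                + 60000     /* NR_ACTIVE_FILE      */
                + 40000     /* NR_INACTIVE_FILE    */
                - 10000;    /* NR_FILE_MAPPED      */
    /* reclaimable = 190000 pages; with saveable = 250000,     */
    /* minimum_image_size() returns 60000 pages, about 234 MB. */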
1537 | /** | 1639 | /** |
1538 | * hibernate_preallocate_memory - Preallocate memory for hibernation image | 1640 | * hibernate_preallocate_memory - Preallocate memory for hibernation image. |
1539 | * | 1641 | * |
1540 | * To create a hibernation image it is necessary to make a copy of every page | 1642 | * To create a hibernation image it is necessary to make a copy of every page |
1541 | * frame in use. We also need a number of page frames to be free during | 1643 | * frame in use. We also need a number of page frames to be free during |
@@ -1708,10 +1810,11 @@ int hibernate_preallocate_memory(void) | |||
1708 | 1810 | ||
1709 | #ifdef CONFIG_HIGHMEM | 1811 | #ifdef CONFIG_HIGHMEM |
1710 | /** | 1812 | /** |
1711 | * count_pages_for_highmem - compute the number of non-highmem pages | 1813 | * count_pages_for_highmem - Count non-highmem pages needed for copying highmem. |
1712 | * that will be necessary for creating copies of highmem pages. | 1814 | * |
1713 | */ | 1815 | * Compute the number of non-highmem pages that will be necessary for creating |
1714 | 1816 | * copies of highmem pages. | |
1817 | */ | ||
1715 | static unsigned int count_pages_for_highmem(unsigned int nr_highmem) | 1818 | static unsigned int count_pages_for_highmem(unsigned int nr_highmem) |
1716 | { | 1819 | { |
1717 | unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; | 1820 | unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; |
@@ -1724,15 +1827,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) | |||
1724 | return nr_highmem; | 1827 | return nr_highmem; |
1725 | } | 1828 | } |
1726 | #else | 1829 | #else |
1727 | static unsigned int | 1830 | static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; } |
1728 | count_pages_for_highmem(unsigned int nr_highmem) { return 0; } | ||
1729 | #endif /* CONFIG_HIGHMEM */ | 1831 | #endif /* CONFIG_HIGHMEM */ |
1730 | 1832 | ||
1731 | /** | 1833 | /** |
1732 | * enough_free_mem - Make sure we have enough free memory for the | 1834 | * enough_free_mem - Check if there is enough free memory for the image. |
1733 | * snapshot image. | ||
1734 | */ | 1835 | */ |
1735 | |||
1736 | static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) | 1836 | static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) |
1737 | { | 1837 | { |
1738 | struct zone *zone; | 1838 | struct zone *zone; |
@@ -1751,10 +1851,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) | |||
1751 | 1851 | ||
1752 | #ifdef CONFIG_HIGHMEM | 1852 | #ifdef CONFIG_HIGHMEM |
1753 | /** | 1853 | /** |
1754 | * get_highmem_buffer - if there are some highmem pages in the suspend | 1854 | * get_highmem_buffer - Allocate a buffer for highmem pages. |
1755 | * image, we may need the buffer to copy them and/or load their data. | 1855 | * |
1856 | * If there are some highmem pages in the hibernation image, we may need a | ||
1857 | * buffer to copy them and/or load their data. | ||
1756 | */ | 1858 | */ |
1757 | |||
1758 | static inline int get_highmem_buffer(int safe_needed) | 1859 | static inline int get_highmem_buffer(int safe_needed) |
1759 | { | 1860 | { |
1760 | buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); | 1861 | buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); |
@@ -1762,13 +1863,13 @@ static inline int get_highmem_buffer(int safe_needed) | |||
1762 | } | 1863 | } |
1763 | 1864 | ||
1764 | /** | 1865 | /** |
1765 | * alloc_highmem_image_pages - allocate some highmem pages for the image. | 1866 | * alloc_highmem_image_pages - Allocate some highmem pages for the image. |
1766 | * Try to allocate as many pages as needed, but if the number of free | 1867 | * |
1767 | * highmem pages is lesser than that, allocate them all. | 1868 | * Try to allocate as many pages as needed, but if the number of free highmem |
1869 | * pages is less than that, allocate them all. | ||
1768 | */ | 1870 | */ |
1769 | 1871 | static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, | |
1770 | static inline unsigned int | 1872 | unsigned int nr_highmem) |
1771 | alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) | ||
1772 | { | 1873 | { |
1773 | unsigned int to_alloc = count_free_highmem_pages(); | 1874 | unsigned int to_alloc = count_free_highmem_pages(); |
1774 | 1875 | ||
@@ -1787,25 +1888,24 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) | |||
1787 | #else | 1888 | #else |
1788 | static inline int get_highmem_buffer(int safe_needed) { return 0; } | 1889 | static inline int get_highmem_buffer(int safe_needed) { return 0; } |
1789 | 1890 | ||
1790 | static inline unsigned int | 1891 | static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, |
1791 | alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } | 1892 | unsigned int n) { return 0; } |
1792 | #endif /* CONFIG_HIGHMEM */ | 1893 | #endif /* CONFIG_HIGHMEM */ |
1793 | 1894 | ||
1794 | /** | 1895 | /** |
1795 | * swsusp_alloc - allocate memory for the suspend image | 1896 | * swsusp_alloc - Allocate memory for hibernation image. |
1796 | * | 1897 | * |
1797 | * We first try to allocate as many highmem pages as there are | 1898 | * We first try to allocate as many highmem pages as there are |
1798 | * saveable highmem pages in the system. If that fails, we allocate | 1899 | * saveable highmem pages in the system. If that fails, we allocate |
1799 | * non-highmem pages for the copies of the remaining highmem ones. | 1900 | * non-highmem pages for the copies of the remaining highmem ones. |
1800 | * | 1901 | * |
1801 | * In this approach it is likely that the copies of highmem pages will | 1902 | * In this approach it is likely that the copies of highmem pages will |
1802 | * also be located in the high memory, because of the way in which | 1903 | * also be located in the high memory, because of the way in which |
1803 | * copy_data_pages() works. | 1904 | * copy_data_pages() works. |
1804 | */ | 1905 | */ |
1805 | 1906 | static int swsusp_alloc(struct memory_bitmap *orig_bm, | |
1806 | static int | 1907 | struct memory_bitmap *copy_bm, |
1807 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | 1908 | unsigned int nr_pages, unsigned int nr_highmem) |
1808 | unsigned int nr_pages, unsigned int nr_highmem) | ||
1809 | { | 1909 | { |
1810 | if (nr_highmem > 0) { | 1910 | if (nr_highmem > 0) { |
1811 | if (get_highmem_buffer(PG_ANY)) | 1911 | if (get_highmem_buffer(PG_ANY)) |
@@ -1855,7 +1955,8 @@ asmlinkage __visible int swsusp_save(void) | |||
1855 | return -ENOMEM; | 1955 | return -ENOMEM; |
1856 | } | 1956 | } |
1857 | 1957 | ||
1858 | /* During allocating of suspend pagedir, new cold pages may appear. | 1958 | /* |
1959 | * During allocating of suspend pagedir, new cold pages may appear. | ||
1859 | * Kill them. | 1960 | * Kill them. |
1860 | */ | 1961 | */ |
1861 | drain_local_pages(NULL); | 1962 | drain_local_pages(NULL); |
@@ -1918,12 +2019,14 @@ static int init_header(struct swsusp_info *info) | |||
1918 | } | 2019 | } |
1919 | 2020 | ||
1920 | /** | 2021 | /** |
1921 | * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm | 2022 | * pack_pfns - Prepare PFNs for saving. |
1922 | * are stored in the array @buf[] (1 page at a time) | 2023 | * @bm: Memory bitmap. |
2024 | * @buf: Memory buffer to store the PFNs in. | ||
2025 | * | ||
2026 | * PFNs corresponding to set bits in @bm are stored in the area of memory | ||
2027 | * pointed to by @buf (1 page at a time). | ||
1923 | */ | 2028 | */ |
1924 | 2029 | static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) | |
1925 | static inline void | ||
1926 | pack_pfns(unsigned long *buf, struct memory_bitmap *bm) | ||
1927 | { | 2030 | { |
1928 | int j; | 2031 | int j; |
1929 | 2032 | ||
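The loop elided between these two hunks fills one page worth of PFNs per call; a sketch of that pattern (page_key_read() is the s390-only hook referenced elsewhere in this file):

    for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
            buf[j] = memory_bm_next_pfn(bm);
            if (unlikely(buf[j] == BM_END_OF_MAP))
                    break;
            /* Save page key for data page (s390 only). */
            page_key_read(buf + j);
    }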
@@ -1937,22 +2040,21 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
1937 | } | 2040 | } |
1938 | 2041 | ||
1939 | /** | 2042 | /** |
1940 | * snapshot_read_next - used for reading the system memory snapshot. | 2043 | * snapshot_read_next - Get the address to read the next image page from. |
2044 | * @handle: Snapshot handle to be used for the reading. | ||
1941 | * | 2045 | * |
1942 | * On the first call to it @handle should point to a zeroed | 2046 | * On the first call, @handle should point to a zeroed snapshot_handle |
1943 | * snapshot_handle structure. The structure gets updated and a pointer | 2047 | * structure. The structure gets populated then and a pointer to it should be |
1944 | * to it should be passed to this function every next time. | 2048 | * passed to this function every next time. |
1945 | * | 2049 | * |
1946 | * On success the function returns a positive number. Then, the caller | 2050 | * On success, the function returns a positive number. Then, the caller |
1947 | * is allowed to read up to the returned number of bytes from the memory | 2051 | * is allowed to read up to the returned number of bytes from the memory |
1948 | * location computed by the data_of() macro. | 2052 | * location computed by the data_of() macro. |
1949 | * | 2053 | * |
1950 | * The function returns 0 to indicate the end of data stream condition, | 2054 | * The function returns 0 to indicate the end of the data stream condition, |
1951 | * and a negative number is returned on error. In such cases the | 2055 | * and negative numbers are returned on errors. If that happens, the structure |
1952 | * structure pointed to by @handle is not updated and should not be used | 2056 | * pointed to by @handle is not updated and should not be used any more. |
1953 | * any more. | ||
1954 | */ | 2057 | */ |
1955 | |||
1956 | int snapshot_read_next(struct snapshot_handle *handle) | 2058 | int snapshot_read_next(struct snapshot_handle *handle) |
1957 | { | 2059 | { |
1958 | if (handle->cur > nr_meta_pages + nr_copy_pages) | 2060 | if (handle->cur > nr_meta_pages + nr_copy_pages) |
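As a usage illustration of the calling convention described above (consume_page() is a hypothetical stand-in for whatever the caller does with the data, e.g. writing it to swap):

    struct snapshot_handle handle;
    int ret;

    memset(&handle, 0, sizeof(handle));
    while ((ret = snapshot_read_next(&handle)) > 0) {
            /* Up to 'ret' bytes may be read from data_of(handle). */
            if (consume_page(data_of(handle), ret))
                    break;
    }
    if (ret < 0)
            pr_err("PM: snapshot read failed: %d\n", ret);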
@@ -1981,7 +2083,8 @@ int snapshot_read_next(struct snapshot_handle *handle) | |||
1981 | 2083 | ||
1982 | page = pfn_to_page(memory_bm_next_pfn(©_bm)); | 2084 | page = pfn_to_page(memory_bm_next_pfn(©_bm)); |
1983 | if (PageHighMem(page)) { | 2085 | if (PageHighMem(page)) { |
1984 | /* Highmem pages are copied to the buffer, | 2086 | /* |
2087 | * Highmem pages are copied to the buffer, | ||
1985 | * because we can't return with a kmapped | 2088 | * because we can't return with a kmapped |
1986 | * highmem page (we may not be called again). | 2089 | * highmem page (we may not be called again). |
1987 | */ | 2090 | */ |
@@ -1999,53 +2102,41 @@ int snapshot_read_next(struct snapshot_handle *handle) | |||
1999 | return PAGE_SIZE; | 2102 | return PAGE_SIZE; |
2000 | } | 2103 | } |
2001 | 2104 | ||
2002 | /** | 2105 | static void duplicate_memory_bitmap(struct memory_bitmap *dst, |
2003 | * mark_unsafe_pages - mark the pages that cannot be used for storing | 2106 | struct memory_bitmap *src) |
2004 | * the image during resume, because they conflict with the pages that | ||
2005 | * had been used before suspend | ||
2006 | */ | ||
2007 | |||
2008 | static int mark_unsafe_pages(struct memory_bitmap *bm) | ||
2009 | { | 2107 | { |
2010 | struct zone *zone; | 2108 | unsigned long pfn; |
2011 | unsigned long pfn, max_zone_pfn; | ||
2012 | 2109 | ||
2013 | /* Clear page flags */ | 2110 | memory_bm_position_reset(src); |
2014 | for_each_populated_zone(zone) { | 2111 | pfn = memory_bm_next_pfn(src); |
2015 | max_zone_pfn = zone_end_pfn(zone); | 2112 | while (pfn != BM_END_OF_MAP) { |
2016 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 2113 | memory_bm_set_bit(dst, pfn); |
2017 | if (pfn_valid(pfn)) | 2114 | pfn = memory_bm_next_pfn(src); |
2018 | swsusp_unset_page_free(pfn_to_page(pfn)); | ||
2019 | } | 2115 | } |
2020 | |||
2021 | /* Mark pages that correspond to the "original" pfns as "unsafe" */ | ||
2022 | memory_bm_position_reset(bm); | ||
2023 | do { | ||
2024 | pfn = memory_bm_next_pfn(bm); | ||
2025 | if (likely(pfn != BM_END_OF_MAP)) { | ||
2026 | if (likely(pfn_valid(pfn))) | ||
2027 | swsusp_set_page_free(pfn_to_page(pfn)); | ||
2028 | else | ||
2029 | return -EFAULT; | ||
2030 | } | ||
2031 | } while (pfn != BM_END_OF_MAP); | ||
2032 | |||
2033 | allocated_unsafe_pages = 0; | ||
2034 | |||
2035 | return 0; | ||
2036 | } | 2116 | } |
2037 | 2117 | ||
2038 | static void | 2118 | /** |
2039 | duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) | 2119 | * mark_unsafe_pages - Mark pages that were used before hibernation. |
2120 | * | ||
2121 | * Mark the pages that cannot be used for storing the image during restoration, | ||
2122 | * because they conflict with the pages that had been used before hibernation. | ||
2123 | */ | ||
2124 | static void mark_unsafe_pages(struct memory_bitmap *bm) | ||
2040 | { | 2125 | { |
2041 | unsigned long pfn; | 2126 | unsigned long pfn; |
2042 | 2127 | ||
2043 | memory_bm_position_reset(src); | 2128 | /* Clear the "free"/"unsafe" bit for all PFNs */ |
2044 | pfn = memory_bm_next_pfn(src); | 2129 | memory_bm_position_reset(free_pages_map); |
2130 | pfn = memory_bm_next_pfn(free_pages_map); | ||
2045 | while (pfn != BM_END_OF_MAP) { | 2131 | while (pfn != BM_END_OF_MAP) { |
2046 | memory_bm_set_bit(dst, pfn); | 2132 | memory_bm_clear_current(free_pages_map); |
2047 | pfn = memory_bm_next_pfn(src); | 2133 | pfn = memory_bm_next_pfn(free_pages_map); |
2048 | } | 2134 | } |
2135 | |||
2136 | /* Mark pages that correspond to the "original" PFNs as "unsafe" */ | ||
2137 | duplicate_memory_bitmap(free_pages_map, bm); | ||
2138 | |||
2139 | allocated_unsafe_pages = 0; | ||
2049 | } | 2140 | } |
2050 | 2141 | ||
2051 | static int check_header(struct swsusp_info *info) | 2142 | static int check_header(struct swsusp_info *info) |
@@ -2063,11 +2154,9 @@ static int check_header(struct swsusp_info *info) | |||
2063 | } | 2154 | } |
2064 | 2155 | ||
2065 | /** | 2156 | /** |
2066 | * load header - check the image header and copy data from it | 2157 | * load_header - Check the image header and copy the data from it. |
2067 | */ | 2158 | */ |
2068 | 2159 | static int load_header(struct swsusp_info *info) | |
2069 | static int | ||
2070 | load_header(struct swsusp_info *info) | ||
2071 | { | 2160 | { |
2072 | int error; | 2161 | int error; |
2073 | 2162 | ||
@@ -2081,8 +2170,12 @@ load_header(struct swsusp_info *info) | |||
2081 | } | 2170 | } |
2082 | 2171 | ||
2083 | /** | 2172 | /** |
2084 | * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set | 2173 | * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap. |
2085 | * the corresponding bit in the memory bitmap @bm | 2174 | * @bm: Memory bitmap. |
2175 | * @buf: Area of memory containing the PFNs. | ||
2176 | * | ||
2177 | * For each element of the array pointed to by @buf (1 page at a time), set the | ||
2178 | * corresponding bit in @bm. | ||
2086 | */ | 2179 | */ |
2087 | static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | 2180 | static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) |
2088 | { | 2181 | { |
@@ -2095,7 +2188,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
2095 | /* Extract and buffer page key for data page (s390 only). */ | 2188 | /* Extract and buffer page key for data page (s390 only). */ |
2096 | page_key_memorize(buf + j); | 2189 | page_key_memorize(buf + j); |
2097 | 2190 | ||
2098 | if (memory_bm_pfn_present(bm, buf[j])) | 2191 | if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) |
2099 | memory_bm_set_bit(bm, buf[j]); | 2192 | memory_bm_set_bit(bm, buf[j]); |
2100 | else | 2193 | else |
2101 | return -EFAULT; | 2194 | return -EFAULT; |
@@ -2104,13 +2197,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
2104 | return 0; | 2197 | return 0; |
2105 | } | 2198 | } |
2106 | 2199 | ||
2107 | /* List of "safe" pages that may be used to store data loaded from the suspend | ||
2108 | * image | ||
2109 | */ | ||
2110 | static struct linked_page *safe_pages_list; | ||
2111 | |||
2112 | #ifdef CONFIG_HIGHMEM | 2200 | #ifdef CONFIG_HIGHMEM |
2113 | /* struct highmem_pbe is used for creating the list of highmem pages that | 2201 | /* |
2202 | * struct highmem_pbe is used for creating the list of highmem pages that | ||
2114 | * should be restored atomically during the resume from disk, because the page | 2203 | * should be restored atomically during the resume from disk, because the page |
2115 | * frames they have occupied before the suspend are in use. | 2204 | * frames they have occupied before the suspend are in use. |
2116 | */ | 2205 | */ |
@@ -2120,7 +2209,8 @@ struct highmem_pbe { | |||
2120 | struct highmem_pbe *next; | 2209 | struct highmem_pbe *next; |
2121 | }; | 2210 | }; |
2122 | 2211 | ||
2123 | /* List of highmem PBEs needed for restoring the highmem pages that were | 2212 | /* |
2213 | * List of highmem PBEs needed for restoring the highmem pages that were | ||
2124 | * allocated before the suspend and included in the suspend image, but have | 2214 | * allocated before the suspend and included in the suspend image, but have |
2125 | * also been allocated by the "resume" kernel, so their contents cannot be | 2215 | * also been allocated by the "resume" kernel, so their contents cannot be |
2126 | * written directly to their "original" page frames. | 2216 | * written directly to their "original" page frames. |
@@ -2128,11 +2218,11 @@ struct highmem_pbe { | |||
2128 | static struct highmem_pbe *highmem_pblist; | 2218 | static struct highmem_pbe *highmem_pblist; |
2129 | 2219 | ||
2130 | /** | 2220 | /** |
2131 | * count_highmem_image_pages - compute the number of highmem pages in the | 2221 | * count_highmem_image_pages - Compute the number of highmem pages in the image. |
2132 | * suspend image. The bits in the memory bitmap @bm that correspond to the | 2222 | * @bm: Memory bitmap. |
2133 | * image pages are assumed to be set. | 2223 | * |
2224 | * The bits in @bm that correspond to image pages are assumed to be set. | ||
2134 | */ | 2225 | */ |
2135 | |||
2136 | static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) | 2226 | static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) |
2137 | { | 2227 | { |
2138 | unsigned long pfn; | 2228 | unsigned long pfn; |
@@ -2149,24 +2239,25 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) | |||
2149 | return cnt; | 2239 | return cnt; |
2150 | } | 2240 | } |
2151 | 2241 | ||
2152 | /** | ||
2153 | * prepare_highmem_image - try to allocate as many highmem pages as | ||
2154 | * there are highmem image pages (@nr_highmem_p points to the variable | ||
2155 | * containing the number of highmem image pages). The pages that are | ||
2156 | * "safe" (ie. will not be overwritten when the suspend image is | ||
2157 | * restored) have the corresponding bits set in @bm (it must be | ||
2158 | * unitialized). | ||
2159 | * | ||
2160 | * NOTE: This function should not be called if there are no highmem | ||
2161 | * image pages. | ||
2162 | */ | ||
2163 | |||
2164 | static unsigned int safe_highmem_pages; | 2242 | static unsigned int safe_highmem_pages; |
2165 | 2243 | ||
2166 | static struct memory_bitmap *safe_highmem_bm; | 2244 | static struct memory_bitmap *safe_highmem_bm; |
2167 | 2245 | ||
2168 | static int | 2246 | /** |
2169 | prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) | 2247 | * prepare_highmem_image - Allocate memory for loading highmem data from image. |
2248 | * @bm: Pointer to an uninitialized memory bitmap structure. | ||
2249 | * @nr_highmem_p: Pointer to the number of highmem image pages. | ||
2250 | * | ||
2251 | * Try to allocate as many highmem pages as there are highmem image pages | ||
2252 | * (@nr_highmem_p points to the variable containing the number of highmem image | ||
2253 | * pages). The pages that are "safe" (ie. will not be overwritten when the | ||
2254 | * hibernation image is restored entirely) have the corresponding bits set in | ||
2255 | * @bm (it must be uninitialized). | ||
2256 | * | ||
2257 | * NOTE: This function should not be called if there are no highmem image pages. | ||
2258 | */ | ||
2259 | static int prepare_highmem_image(struct memory_bitmap *bm, | ||
2260 | unsigned int *nr_highmem_p) | ||
2170 | { | 2261 | { |
2171 | unsigned int to_alloc; | 2262 | unsigned int to_alloc; |
2172 | 2263 | ||
@@ -2201,39 +2292,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) | |||
2201 | return 0; | 2292 | return 0; |
2202 | } | 2293 | } |
2203 | 2294 | ||
2295 | static struct page *last_highmem_page; | ||
2296 | |||
2204 | /** | 2297 | /** |
2205 | * get_highmem_page_buffer - for given highmem image page find the buffer | 2298 | * get_highmem_page_buffer - Prepare a buffer to store a highmem image page. |
2206 | * that suspend_write_next() should set for its caller to write to. | ||
2207 | * | 2299 | * |
2208 | * If the page is to be saved to its "original" page frame or a copy of | 2300 | * For a given highmem image page get a buffer that snapshot_write_next() should |
2209 | * the page is to be made in the highmem, @buffer is returned. Otherwise, | 2301 | * return to its caller to write to. |
2210 | * the copy of the page is to be made in normal memory, so the address of | ||
2211 | * the copy is returned. | ||
2212 | * | 2302 | * |
2213 | * If @buffer is returned, the caller of suspend_write_next() will write | 2303 | * If the page is to be saved to its "original" page frame or a copy of |
2214 | * the page's contents to @buffer, so they will have to be copied to the | 2304 | * the page is to be made in the highmem, @buffer is returned. Otherwise, |
2215 | * right location on the next call to suspend_write_next() and it is done | 2305 | * the copy of the page is to be made in normal memory, so the address of |
2216 | * with the help of copy_last_highmem_page(). For this purpose, if | 2306 | * the copy is returned. |
2217 | * @buffer is returned, @last_highmem page is set to the page to which | 2308 | * If @buffer is returned, the caller of snapshot_write_next() will write |
2218 | * the data will have to be copied from @buffer. | 2308 | * If @buffer is returned, the caller of suspend_write_next() will write |
2309 | * the page's contents to @buffer, so they will have to be copied to the | ||
2310 | * right location on the next call to suspend_write_next() and it is done | ||
2311 | * with the help of copy_last_highmem_page(). For this purpose, if | ||
2312 | * @buffer is returned, @last_highmem_page is set to the page to which | ||
2313 | * the data will have to be copied from @buffer. | ||
2219 | */ | 2314 | */ |
2220 | 2315 | static void *get_highmem_page_buffer(struct page *page, | |
2221 | static struct page *last_highmem_page; | 2316 | struct chain_allocator *ca) |
2222 | |||
2223 | static void * | ||
2224 | get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) | ||
2225 | { | 2317 | { |
2226 | struct highmem_pbe *pbe; | 2318 | struct highmem_pbe *pbe; |
2227 | void *kaddr; | 2319 | void *kaddr; |
2228 | 2320 | ||
2229 | if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { | 2321 | if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { |
2230 | /* We have allocated the "original" page frame and we can | 2322 | /* |
2323 | * We have allocated the "original" page frame and we can | ||
2231 | * use it directly to store the loaded page. | 2324 | * use it directly to store the loaded page. |
2232 | */ | 2325 | */ |
2233 | last_highmem_page = page; | 2326 | last_highmem_page = page; |
2234 | return buffer; | 2327 | return buffer; |
2235 | } | 2328 | } |
2236 | /* The "original" page frame has not been allocated and we have to | 2329 | /* |
2330 | * The "original" page frame has not been allocated and we have to | ||
2237 | * use a "safe" page frame to store the loaded page. | 2331 | * use a "safe" page frame to store the loaded page. |
2238 | */ | 2332 | */ |
2239 | pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); | 2333 | pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); |
@@ -2263,11 +2357,12 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) | |||
2263 | } | 2357 | } |
2264 | 2358 | ||
2265 | /** | 2359 | /** |
2266 | * copy_last_highmem_page - copy the contents of a highmem image from | 2360 | * copy_last_highmem_page - Copy the most recent highmem image page. |
2267 | * @buffer, where the caller of snapshot_write_next() has place them, | 2361 | * |
2268 | * to the right location represented by @last_highmem_page . | 2362 | * Copy the contents of a highmem image from @buffer, where the caller of |
2363 | * snapshot_write_next() has stored them, to the right location represented by | ||
2364 | * @last_highmem_page. | ||
2269 | */ | 2365 | */ |
2270 | |||
2271 | static void copy_last_highmem_page(void) | 2366 | static void copy_last_highmem_page(void) |
2272 | { | 2367 | { |
2273 | if (last_highmem_page) { | 2368 | if (last_highmem_page) { |
@@ -2294,17 +2389,13 @@ static inline void free_highmem_data(void) | |||
2294 | free_image_page(buffer, PG_UNSAFE_CLEAR); | 2389 | free_image_page(buffer, PG_UNSAFE_CLEAR); |
2295 | } | 2390 | } |
2296 | #else | 2391 | #else |
2297 | static unsigned int | 2392 | static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } |
2298 | count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } | ||
2299 | 2393 | ||
2300 | static inline int | 2394 | static inline int prepare_highmem_image(struct memory_bitmap *bm, |
2301 | prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) | 2395 | unsigned int *nr_highmem_p) { return 0; } |
2302 | { | ||
2303 | return 0; | ||
2304 | } | ||
2305 | 2396 | ||
2306 | static inline void * | 2397 | static inline void *get_highmem_page_buffer(struct page *page, |
2307 | get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) | 2398 | struct chain_allocator *ca) |
2308 | { | 2399 | { |
2309 | return ERR_PTR(-EINVAL); | 2400 | return ERR_PTR(-EINVAL); |
2310 | } | 2401 | } |
@@ -2314,27 +2405,27 @@ static inline int last_highmem_page_copied(void) { return 1; } | |||
2314 | static inline void free_highmem_data(void) {} | 2405 | static inline void free_highmem_data(void) {} |
2315 | #endif /* CONFIG_HIGHMEM */ | 2406 | #endif /* CONFIG_HIGHMEM */ |
2316 | 2407 | ||
2408 | #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) | ||
2409 | |||
2317 | /** | 2410 | /** |
2318 | * prepare_image - use the memory bitmap @bm to mark the pages that will | 2411 | * prepare_image - Make room for loading hibernation image. |
2319 | * be overwritten in the process of restoring the system memory state | 2412 | * @new_bm: Uninitialized memory bitmap structure. |
2320 | * from the suspend image ("unsafe" pages) and allocate memory for the | 2413 | * @bm: Memory bitmap with unsafe pages marked. |
2321 | * image. | 2414 | * |
2415 | * Use @bm to mark the pages that will be overwritten in the process of | ||
2416 | * restoring the system memory state from the suspend image ("unsafe" pages) | ||
2417 | * and allocate memory for the image. | ||
2322 | * | 2418 | * |
2323 | * The idea is to allocate a new memory bitmap first and then allocate | 2419 | * The idea is to allocate a new memory bitmap first and then allocate |
2324 | * as many pages as needed for the image data, but not to assign these | 2420 | * as many pages as needed for image data, but without specifying what those |
2325 | * pages to specific tasks initially. Instead, we just mark them as | 2421 | * pages will be used for just yet. Instead, we mark them all as allocated and |
2326 | * allocated and create a lists of "safe" pages that will be used | 2422 | * create a list of "safe" pages to be used later. On systems with high |
2327 | * later. On systems with high memory a list of "safe" highmem pages is | 2423 | * memory a list of "safe" highmem pages is created too. |
2328 | * also created. | ||
2329 | */ | 2424 | */ |
2330 | 2425 | static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |
2331 | #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) | ||
2332 | |||
2333 | static int | ||
2334 | prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | ||
2335 | { | 2426 | { |
2336 | unsigned int nr_pages, nr_highmem; | 2427 | unsigned int nr_pages, nr_highmem; |
2337 | struct linked_page *sp_list, *lp; | 2428 | struct linked_page *lp; |
2338 | int error; | 2429 | int error; |
2339 | 2430 | ||
2340 | /* If there is no highmem, the buffer will not be necessary */ | 2431 | /* If there is no highmem, the buffer will not be necessary */ |
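A quick sanity check on the PBES_PER_LINKED_PAGE constant introduced above, assuming a 64-bit kernel with 4 KB pages (sizes shown for illustration only):

    /* LINKED_PAGE_DATA_SIZE = PAGE_SIZE - sizeof(void *) = 4096 - 8 = 4088  */
    /* sizeof(struct pbe)    = 3 pointers                 = 24 bytes         */
    /* PBES_PER_LINKED_PAGE  = 4088 / 24                  = 170 PBEs/page    */

So reserving DIV_ROUND_UP(nr_pages, 170) linked pages, as the code below does, is enough for the chain_alloc() calls made later from get_buffer().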
@@ -2342,9 +2433,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
2342 | buffer = NULL; | 2433 | buffer = NULL; |
2343 | 2434 | ||
2344 | nr_highmem = count_highmem_image_pages(bm); | 2435 | nr_highmem = count_highmem_image_pages(bm); |
2345 | error = mark_unsafe_pages(bm); | 2436 | mark_unsafe_pages(bm); |
2346 | if (error) | ||
2347 | goto Free; | ||
2348 | 2437 | ||
2349 | error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); | 2438 | error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); |
2350 | if (error) | 2439 | if (error) |
@@ -2357,14 +2446,15 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
2357 | if (error) | 2446 | if (error) |
2358 | goto Free; | 2447 | goto Free; |
2359 | } | 2448 | } |
2360 | /* Reserve some safe pages for potential later use. | 2449 | /* |
2450 | * Reserve some safe pages for potential later use. | ||
2361 | * | 2451 | * |
2362 | * NOTE: This way we make sure there will be enough safe pages for the | 2452 | * NOTE: This way we make sure there will be enough safe pages for the |
2363 | * chain_alloc() in get_buffer(). It is a bit wasteful, but | 2453 | * chain_alloc() in get_buffer(). It is a bit wasteful, but |
2364 | * nr_copy_pages cannot be greater than 50% of the memory anyway. | 2454 | * nr_copy_pages cannot be greater than 50% of the memory anyway. |
2455 | * | ||
2456 | * nr_copy_pages cannot be less than allocated_unsafe_pages either. | ||
2365 | */ | 2457 | */ |
2366 | sp_list = NULL; | ||
2367 | /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ | ||
2368 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; | 2458 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; |
2369 | nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); | 2459 | nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); |
2370 | while (nr_pages > 0) { | 2460 | while (nr_pages > 0) { |
@@ -2373,12 +2463,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
2373 | error = -ENOMEM; | 2463 | error = -ENOMEM; |
2374 | goto Free; | 2464 | goto Free; |
2375 | } | 2465 | } |
2376 | lp->next = sp_list; | 2466 | lp->next = safe_pages_list; |
2377 | sp_list = lp; | 2467 | safe_pages_list = lp; |
2378 | nr_pages--; | 2468 | nr_pages--; |
2379 | } | 2469 | } |
2380 | /* Preallocate memory for the image */ | 2470 | /* Preallocate memory for the image */ |
2381 | safe_pages_list = NULL; | ||
2382 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; | 2471 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; |
2383 | while (nr_pages > 0) { | 2472 | while (nr_pages > 0) { |
2384 | lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); | 2473 | lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); |
@@ -2396,12 +2485,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
2396 | swsusp_set_page_free(virt_to_page(lp)); | 2485 | swsusp_set_page_free(virt_to_page(lp)); |
2397 | nr_pages--; | 2486 | nr_pages--; |
2398 | } | 2487 | } |
2399 | /* Free the reserved safe pages so that chain_alloc() can use them */ | ||
2400 | while (sp_list) { | ||
2401 | lp = sp_list->next; | ||
2402 | free_image_page(sp_list, PG_UNSAFE_CLEAR); | ||
2403 | sp_list = lp; | ||
2404 | } | ||
2405 | return 0; | 2488 | return 0; |
2406 | 2489 | ||
2407 | Free: | 2490 | Free: |
@@ -2410,10 +2493,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
2410 | } | 2493 | } |
2411 | 2494 | ||
2412 | /** | 2495 | /** |
2413 | * get_buffer - compute the address that snapshot_write_next() should | 2496 | * get_buffer - Get the address to store the next image data page. |
2414 | * set for its caller to write to. | 2497 | * |
2498 | * Get the address that snapshot_write_next() should return to its caller to | ||
2499 | * write to. | ||
2415 | */ | 2500 | */ |
2416 | |||
2417 | static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) | 2501 | static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) |
2418 | { | 2502 | { |
2419 | struct pbe *pbe; | 2503 | struct pbe *pbe; |
@@ -2428,12 +2512,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) | |||
2428 | return get_highmem_page_buffer(page, ca); | 2512 | return get_highmem_page_buffer(page, ca); |
2429 | 2513 | ||
2430 | if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) | 2514 | if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) |
2431 | /* We have allocated the "original" page frame and we can | 2515 | /* |
2516 | * We have allocated the "original" page frame and we can | ||
2432 | * use it directly to store the loaded page. | 2517 | * use it directly to store the loaded page. |
2433 | */ | 2518 | */ |
2434 | return page_address(page); | 2519 | return page_address(page); |
2435 | 2520 | ||
2436 | /* The "original" page frame has not been allocated and we have to | 2521 | /* |
2522 | * The "original" page frame has not been allocated and we have to | ||
2437 | * use a "safe" page frame to store the loaded page. | 2523 | * use a "safe" page frame to store the loaded page. |
2438 | */ | 2524 | */ |
2439 | pbe = chain_alloc(ca, sizeof(struct pbe)); | 2525 | pbe = chain_alloc(ca, sizeof(struct pbe)); |
@@ -2450,22 +2536,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) | |||
2450 | } | 2536 | } |
2451 | 2537 | ||
2452 | /** | 2538 | /** |
2453 | * snapshot_write_next - used for writing the system memory snapshot. | 2539 | * snapshot_write_next - Get the address to store the next image page. |
2540 | * @handle: Snapshot handle structure to guide the writing. | ||
2454 | * | 2541 | * |
2455 | * On the first call to it @handle should point to a zeroed | 2542 | * On the first call, @handle should point to a zeroed snapshot_handle |
2456 | * snapshot_handle structure. The structure gets updated and a pointer | 2543 | * structure. The structure gets populated then and a pointer to it should be |
2457 | * to it should be passed to this function every next time. | 2544 | * passed to this function every next time. |
2458 | * | 2545 | * |
2459 | * On success the function returns a positive number. Then, the caller | 2546 | * On success, the function returns a positive number. Then, the caller |
2460 | * is allowed to write up to the returned number of bytes to the memory | 2547 | * is allowed to write up to the returned number of bytes to the memory |
2461 | * location computed by the data_of() macro. | 2548 | * location computed by the data_of() macro. |
2462 | * | 2549 | * |
2463 | * The function returns 0 to indicate the "end of file" condition, | 2550 | * The function returns 0 to indicate the "end of file" condition. Negative |
2464 | * and a negative number is returned on error. In such cases the | 2551 | * numbers are returned on errors, in which case the structure pointed to by |
2465 | * structure pointed to by @handle is not updated and should not be used | 2552 | * @handle is not updated and should not be used any more. |
2466 | * any more. | ||
2467 | */ | 2553 | */ |
2468 | |||
2469 | int snapshot_write_next(struct snapshot_handle *handle) | 2554 | int snapshot_write_next(struct snapshot_handle *handle) |
2470 | { | 2555 | { |
2471 | static struct chain_allocator ca; | 2556 | static struct chain_allocator ca; |
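The restore-side loop implied by the kernel-doc above mirrors the read side; a sketch (load_page() is a hypothetical source of image data, e.g. a swap reader):

    struct snapshot_handle handle;
    int ret;

    memset(&handle, 0, sizeof(handle));
    while ((ret = snapshot_write_next(&handle)) > 0) {
            /* Page 1 is the header, the next nr_meta_pages hold PFNs, then data. */
            if (load_page(data_of(handle), ret))    /* fill up to 'ret' bytes */
                    break;
    }
    snapshot_write_finalize(&handle);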
@@ -2491,6 +2576,8 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
2491 | if (error) | 2576 | if (error) |
2492 | return error; | 2577 | return error; |
2493 | 2578 | ||
2579 | safe_pages_list = NULL; | ||
2580 | |||
2494 | error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); | 2581 | error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); |
2495 | if (error) | 2582 | if (error) |
2496 | return error; | 2583 | return error; |
@@ -2500,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
2500 | if (error) | 2587 | if (error) |
2501 | return error; | 2588 | return error; |
2502 | 2589 | ||
2590 | hibernate_restore_protection_begin(); | ||
2503 | } else if (handle->cur <= nr_meta_pages + 1) { | 2591 | } else if (handle->cur <= nr_meta_pages + 1) { |
2504 | error = unpack_orig_pfns(buffer, ©_bm); | 2592 | error = unpack_orig_pfns(buffer, ©_bm); |
2505 | if (error) | 2593 | if (error) |
@@ -2522,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
2522 | copy_last_highmem_page(); | 2610 | copy_last_highmem_page(); |
2523 | /* Restore page key for data page (s390 only). */ | 2611 | /* Restore page key for data page (s390 only). */ |
2524 | page_key_write(handle->buffer); | 2612 | page_key_write(handle->buffer); |
2613 | hibernate_restore_protect_page(handle->buffer); | ||
2525 | handle->buffer = get_buffer(&orig_bm, &ca); | 2614 | handle->buffer = get_buffer(&orig_bm, &ca); |
2526 | if (IS_ERR(handle->buffer)) | 2615 | if (IS_ERR(handle->buffer)) |
2527 | return PTR_ERR(handle->buffer); | 2616 | return PTR_ERR(handle->buffer); |
@@ -2533,22 +2622,23 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
2533 | } | 2622 | } |
2534 | 2623 | ||
2535 | /** | 2624 | /** |
2536 | * snapshot_write_finalize - must be called after the last call to | 2625 | * snapshot_write_finalize - Complete the loading of a hibernation image. |
2537 | * snapshot_write_next() in case the last page in the image happens | 2626 | * |
2538 | * to be a highmem page and its contents should be stored in the | 2627 | * Must be called after the last call to snapshot_write_next() in case the last |
2539 | * highmem. Additionally, it releases the memory that will not be | 2628 | * page in the image happens to be a highmem page and its contents should be |
2540 | * used any more. | 2629 | * stored in highmem. Additionally, it recycles bitmap memory that's not |
2630 | * necessary any more. | ||
2541 | */ | 2631 | */ |
2542 | |||
2543 | void snapshot_write_finalize(struct snapshot_handle *handle) | 2632 | void snapshot_write_finalize(struct snapshot_handle *handle) |
2544 | { | 2633 | { |
2545 | copy_last_highmem_page(); | 2634 | copy_last_highmem_page(); |
2546 | /* Restore page key for data page (s390 only). */ | 2635 | /* Restore page key for data page (s390 only). */ |
2547 | page_key_write(handle->buffer); | 2636 | page_key_write(handle->buffer); |
2548 | page_key_free(); | 2637 | page_key_free(); |
2549 | /* Free only if we have loaded the image entirely */ | 2638 | hibernate_restore_protect_page(handle->buffer); |
2639 | /* Do that only if we have loaded the image entirely */ | ||
2550 | if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { | 2640 | if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { |
2551 | memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); | 2641 | memory_bm_recycle(&orig_bm); |
2552 | free_highmem_data(); | 2642 | free_highmem_data(); |
2553 | } | 2643 | } |
2554 | } | 2644 | } |
@@ -2561,8 +2651,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle) | |||
2561 | 2651 | ||
2562 | #ifdef CONFIG_HIGHMEM | 2652 | #ifdef CONFIG_HIGHMEM |
2563 | /* Assumes that @buf is ready and points to a "safe" page */ | 2653 | /* Assumes that @buf is ready and points to a "safe" page */ |
2564 | static inline void | 2654 | static inline void swap_two_pages_data(struct page *p1, struct page *p2, |
2565 | swap_two_pages_data(struct page *p1, struct page *p2, void *buf) | 2655 | void *buf) |
2566 | { | 2656 | { |
2567 | void *kaddr1, *kaddr2; | 2657 | void *kaddr1, *kaddr2; |
2568 | 2658 | ||
@@ -2576,15 +2666,15 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) | |||
2576 | } | 2666 | } |
2577 | 2667 | ||
2578 | /** | 2668 | /** |
2579 | * restore_highmem - for each highmem page that was allocated before | 2669 | * restore_highmem - Put highmem image pages into their original locations. |
2580 | * the suspend and included in the suspend image, and also has been | 2670 | * |
2581 | * allocated by the "resume" kernel swap its current (ie. "before | 2671 | * For each highmem page that was in use before hibernation and is included in |
2582 | * resume") contents with the previous (ie. "before suspend") one. | 2672 | * the image, and also has been allocated by the "restore" kernel, swap its |
2673 | * current contents with the previous (ie. "before hibernation") ones. | ||
2583 | * | 2674 | * |
2584 | * If the resume eventually fails, we can call this function once | 2675 | * If the restore eventually fails, we can call this function once again and |
2585 | * again and restore the "before resume" highmem state. | 2676 | * restore the highmem state as seen by the restore kernel. |
2586 | */ | 2677 | */ |
2587 | |||
2588 | int restore_highmem(void) | 2678 | int restore_highmem(void) |
2589 | { | 2679 | { |
2590 | struct highmem_pbe *pbe = highmem_pblist; | 2680 | struct highmem_pbe *pbe = highmem_pblist; |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5b70d64b871e..0acab9d7f96f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -266,16 +266,18 @@ static int suspend_test(int level) | |||
266 | */ | 266 | */ |
267 | static int suspend_prepare(suspend_state_t state) | 267 | static int suspend_prepare(suspend_state_t state) |
268 | { | 268 | { |
269 | int error; | 269 | int error, nr_calls = 0; |
270 | 270 | ||
271 | if (!sleep_state_supported(state)) | 271 | if (!sleep_state_supported(state)) |
272 | return -EPERM; | 272 | return -EPERM; |
273 | 273 | ||
274 | pm_prepare_console(); | 274 | pm_prepare_console(); |
275 | 275 | ||
276 | error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); | 276 | error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls); |
277 | if (error) | 277 | if (error) { |
278 | nr_calls--; | ||
278 | goto Finish; | 279 | goto Finish; |
280 | } | ||
279 | 281 | ||
280 | trace_suspend_resume(TPS("freeze_processes"), 0, true); | 282 | trace_suspend_resume(TPS("freeze_processes"), 0, true); |
281 | error = suspend_freeze_processes(); | 283 | error = suspend_freeze_processes(); |
@@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state) | |||
286 | suspend_stats.failed_freeze++; | 288 | suspend_stats.failed_freeze++; |
287 | dpm_save_failed_step(SUSPEND_FREEZE); | 289 | dpm_save_failed_step(SUSPEND_FREEZE); |
288 | Finish: | 290 | Finish: |
289 | pm_notifier_call_chain(PM_POST_SUSPEND); | 291 | __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL); |
290 | pm_restore_console(); | 292 | pm_restore_console(); |
291 | return error; | 293 | return error; |
292 | } | 294 | } |
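The nr_calls bookkeeping above (and the matching change in kernel/power/user.c further down) follows one pattern: run the PREPARE chain, remember how many notifiers actually ran, and on failure re-notify only that many with the corresponding POST event, skipping the one that failed. A condensed sketch of the intent, not a verbatim copy of either call site (do_suspend_work() is a hypothetical placeholder):

    int error, nr_calls = 0;

    error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
    if (error)
            nr_calls--;     /* the notifier that failed gets no POST callback */
    else
            error = do_suspend_work();
    __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
    return error;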
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 160e1006640d..a3b1e617bcdc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio) | |||
261 | bio_put(bio); | 261 | bio_put(bio); |
262 | } | 262 | } |
263 | 263 | ||
264 | static int hib_submit_io(int rw, pgoff_t page_off, void *addr, | 264 | static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, |
265 | struct hib_bio_batch *hb) | 265 | struct hib_bio_batch *hb) |
266 | { | 266 | { |
267 | struct page *page = virt_to_page(addr); | 267 | struct page *page = virt_to_page(addr); |
@@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, | |||
271 | bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); | 271 | bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); |
272 | bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); | 272 | bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); |
273 | bio->bi_bdev = hib_resume_bdev; | 273 | bio->bi_bdev = hib_resume_bdev; |
274 | bio_set_op_attrs(bio, op, op_flags); | ||
274 | 275 | ||
275 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { | 276 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { |
276 | printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", | 277 | printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", |
@@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, | |||
283 | bio->bi_end_io = hib_end_io; | 284 | bio->bi_end_io = hib_end_io; |
284 | bio->bi_private = hb; | 285 | bio->bi_private = hb; |
285 | atomic_inc(&hb->count); | 286 | atomic_inc(&hb->count); |
286 | submit_bio(rw, bio); | 287 | submit_bio(bio); |
287 | } else { | 288 | } else { |
288 | error = submit_bio_wait(rw, bio); | 289 | error = submit_bio_wait(bio); |
289 | bio_put(bio); | 290 | bio_put(bio); |
290 | } | 291 | } |
291 | 292 | ||
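For readers who have not followed the block-layer rework this hunk adapts to: the data direction no longer travels as a separate rw argument to submit_bio()/submit_bio_wait(); it is stored in the bio itself beforehand. A minimal sketch of the convention now used throughout this file:

    bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
    bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
    bio->bi_bdev = hib_resume_bdev;
    bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);  /* op and flags live in the bio */
    /* ... attach the page with bio_add_page() ... */
    error = submit_bio_wait(bio);                   /* no rw argument any more */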
@@ -306,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) | |||
306 | { | 307 | { |
307 | int error; | 308 | int error; |
308 | 309 | ||
309 | hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); | 310 | hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, |
311 | swsusp_header, NULL); | ||
310 | if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || | 312 | if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || |
311 | !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { | 313 | !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { |
312 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); | 314 | memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); |
@@ -315,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) | |||
315 | swsusp_header->flags = flags; | 317 | swsusp_header->flags = flags; |
316 | if (flags & SF_CRC32_MODE) | 318 | if (flags & SF_CRC32_MODE) |
317 | swsusp_header->crc32 = handle->crc32; | 319 | swsusp_header->crc32 = handle->crc32; |
318 | error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, | 320 | error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, |
319 | swsusp_header, NULL); | 321 | swsusp_resume_block, swsusp_header, NULL); |
320 | } else { | 322 | } else { |
321 | printk(KERN_ERR "PM: Swap header not found!\n"); | 323 | printk(KERN_ERR "PM: Swap header not found!\n"); |
322 | error = -ENODEV; | 324 | error = -ENODEV; |
@@ -348,6 +350,12 @@ static int swsusp_swap_check(void) | |||
348 | if (res < 0) | 350 | if (res < 0) |
349 | blkdev_put(hib_resume_bdev, FMODE_WRITE); | 351 | blkdev_put(hib_resume_bdev, FMODE_WRITE); |
350 | 352 | ||
353 | /* | ||
354 | * Update the resume device to the one actually used, | ||
355 | * so the test_resume mode can use it in case it is | ||
356 | * invoked from hibernate() to test the snapshot. | ||
357 | */ | ||
358 | swsusp_resume_device = hib_resume_bdev->bd_dev; | ||
351 | return res; | 359 | return res; |
352 | } | 360 | } |
353 | 361 | ||
@@ -389,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) | |||
389 | } else { | 397 | } else { |
390 | src = buf; | 398 | src = buf; |
391 | } | 399 | } |
392 | return hib_submit_io(WRITE_SYNC, offset, src, hb); | 400 | return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb); |
393 | } | 401 | } |
394 | 402 | ||
395 | static void release_swap_writer(struct swap_map_handle *handle) | 403 | static void release_swap_writer(struct swap_map_handle *handle) |
@@ -992,7 +1000,8 @@ static int get_swap_reader(struct swap_map_handle *handle, | |||
992 | return -ENOMEM; | 1000 | return -ENOMEM; |
993 | } | 1001 | } |
994 | 1002 | ||
995 | error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL); | 1003 | error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, |
1004 | tmp->map, NULL); | ||
996 | if (error) { | 1005 | if (error) { |
997 | release_swap_reader(handle); | 1006 | release_swap_reader(handle); |
998 | return error; | 1007 | return error; |
@@ -1016,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, | |||
1016 | offset = handle->cur->entries[handle->k]; | 1025 | offset = handle->cur->entries[handle->k]; |
1017 | if (!offset) | 1026 | if (!offset) |
1018 | return -EFAULT; | 1027 | return -EFAULT; |
1019 | error = hib_submit_io(READ_SYNC, offset, buf, hb); | 1028 | error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb); |
1020 | if (error) | 1029 | if (error) |
1021 | return error; | 1030 | return error; |
1022 | if (++handle->k >= MAP_PAGE_ENTRIES) { | 1031 | if (++handle->k >= MAP_PAGE_ENTRIES) { |
@@ -1525,7 +1534,8 @@ int swsusp_check(void) | |||
1525 | if (!IS_ERR(hib_resume_bdev)) { | 1534 | if (!IS_ERR(hib_resume_bdev)) { |
1526 | set_blocksize(hib_resume_bdev, PAGE_SIZE); | 1535 | set_blocksize(hib_resume_bdev, PAGE_SIZE); |
1527 | clear_page(swsusp_header); | 1536 | clear_page(swsusp_header); |
1528 | error = hib_submit_io(READ_SYNC, swsusp_resume_block, | 1537 | error = hib_submit_io(REQ_OP_READ, READ_SYNC, |
1538 | swsusp_resume_block, | ||
1529 | swsusp_header, NULL); | 1539 | swsusp_header, NULL); |
1530 | if (error) | 1540 | if (error) |
1531 | goto put; | 1541 | goto put; |
@@ -1533,7 +1543,8 @@ int swsusp_check(void) | |||
1533 | if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { | 1543 | if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { |
1534 | memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); | 1544 | memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); |
1535 | /* Reset swap signature now */ | 1545 | /* Reset swap signature now */ |
1536 | error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, | 1546 | error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, |
1547 | swsusp_resume_block, | ||
1537 | swsusp_header, NULL); | 1548 | swsusp_header, NULL); |
1538 | } else { | 1549 | } else { |
1539 | error = -EINVAL; | 1550 | error = -EINVAL; |
@@ -1577,10 +1588,12 @@ int swsusp_unmark(void) | |||
1577 | { | 1588 | { |
1578 | int error; | 1589 | int error; |
1579 | 1590 | ||
1580 | hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); | 1591 | hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, |
1592 | swsusp_header, NULL); | ||
1581 | if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { | 1593 | if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { |
1582 | memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); | 1594 | memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); |
1583 | error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, | 1595 | error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, |
1596 | swsusp_resume_block, | ||
1584 | swsusp_header, NULL); | 1597 | swsusp_header, NULL); |
1585 | } else { | 1598 | } else { |
1586 | printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); | 1599 | printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); |
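The swap.c hunks above split hib_submit_io()'s single rw argument into a request operation (REQ_OP_READ/REQ_OP_WRITE) plus separate sync flags, following the block layer's op/flags separation. A hedged sketch of the assumed updated signature and a hypothetical caller (read_swsusp_header() is made up for illustration; the helper is static to kernel/power/swap.c, so details may differ):

	/*
	 * Assumed updated signature:
	 *
	 *   int hib_submit_io(int op, int op_flags, pgoff_t page_off,
	 *                     void *addr, struct hib_bio_batch *hb);
	 */

	/* Hypothetical wrapper showing a call with the split arguments: */
	static int read_swsusp_header(struct swsusp_header *header)
	{
		return hib_submit_io(REQ_OP_READ, READ_SYNC,
				     swsusp_resume_block, header, NULL);
	}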
diff --git a/kernel/power/user.c b/kernel/power/user.c index 526e8911460a..35310b627388 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1); | |||
47 | static int snapshot_open(struct inode *inode, struct file *filp) | 47 | static int snapshot_open(struct inode *inode, struct file *filp) |
48 | { | 48 | { |
49 | struct snapshot_data *data; | 49 | struct snapshot_data *data; |
50 | int error; | 50 | int error, nr_calls = 0; |
51 | 51 | ||
52 | if (!hibernation_available()) | 52 | if (!hibernation_available()) |
53 | return -EPERM; | 53 | return -EPERM; |
@@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
74 | swap_type_of(swsusp_resume_device, 0, NULL) : -1; | 74 | swap_type_of(swsusp_resume_device, 0, NULL) : -1; |
75 | data->mode = O_RDONLY; | 75 | data->mode = O_RDONLY; |
76 | data->free_bitmaps = false; | 76 | data->free_bitmaps = false; |
77 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); | 77 | error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); |
78 | if (error) | 78 | if (error) |
79 | pm_notifier_call_chain(PM_POST_HIBERNATION); | 79 | __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL); |
80 | } else { | 80 | } else { |
81 | /* | 81 | /* |
82 | * Resuming. We may need to wait for the image device to | 82 | * Resuming. We may need to wait for the image device to |
@@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
86 | 86 | ||
87 | data->swap = -1; | 87 | data->swap = -1; |
88 | data->mode = O_WRONLY; | 88 | data->mode = O_WRONLY; |
89 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); | 89 | error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); |
90 | if (!error) { | 90 | if (!error) { |
91 | error = create_basic_memory_bitmaps(); | 91 | error = create_basic_memory_bitmaps(); |
92 | data->free_bitmaps = !error; | 92 | data->free_bitmaps = !error; |
93 | } | 93 | } else |
94 | nr_calls--; | ||
95 | |||
94 | if (error) | 96 | if (error) |
95 | pm_notifier_call_chain(PM_POST_RESTORE); | 97 | __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); |
96 | } | 98 | } |
97 | if (error) | 99 | if (error) |
98 | atomic_inc(&snapshot_device_available); | 100 | atomic_inc(&snapshot_device_available); |
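The snapshot_open() hunk above switches to a notifier variant that reports how many callbacks actually ran, so a failed PREPARE chain only rolls back the callbacks that were invoked. A hedged sketch of how such a helper is assumed to be built on __blocking_notifier_call_chain(); the real __pm_notifier_call_chain() is expected to live in kernel/power/main.c and may differ in detail, and demo_prepare() below is a made-up caller:

	/*
	 * Assumed implementation sketch: run at most nr_to_call callbacks on
	 * the PM notifier chain and report how many were invoked via *nr_calls.
	 */
	static int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
					    int *nr_calls)
	{
		int ret;

		ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
						     nr_to_call, nr_calls);
		return notifier_to_errno(ret);
	}

	/* Usage pattern from the hunk above: undo only what actually ran. */
	static int demo_prepare(void)
	{
		int nr_calls = 0;
		int error;

		error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1,
						 &nr_calls);
		if (error)	/* nr_calls counts the failed callback too */
			__pm_notifier_call_chain(PM_POST_HIBERNATION,
						 --nr_calls, NULL);
		return error;
	}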
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60cdf6386763..d4de33934dac 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl) | |||
3177 | { | 3177 | { |
3178 | dump_stack_print_info(log_lvl); | 3178 | dump_stack_print_info(log_lvl); |
3179 | 3179 | ||
3180 | printk("%stask: %p ti: %p task.ti: %p\n", | 3180 | printk("%stask: %p task.stack: %p\n", |
3181 | log_lvl, current, current_thread_info(), | 3181 | log_lvl, current, task_stack_page(current)); |
3182 | task_thread_info(current)); | ||
3183 | } | 3182 | } |
3184 | 3183 | ||
3185 | #endif | 3184 | #endif |
diff --git a/kernel/profile.c b/kernel/profile.c index c2199e9901c9..2dbccf2d806c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -328,68 +328,57 @@ out: | |||
328 | put_cpu(); | 328 | put_cpu(); |
329 | } | 329 | } |
330 | 330 | ||
331 | static int profile_cpu_callback(struct notifier_block *info, | 331 | static int profile_dead_cpu(unsigned int cpu) |
332 | unsigned long action, void *__cpu) | ||
333 | { | 332 | { |
334 | int node, cpu = (unsigned long)__cpu; | ||
335 | struct page *page; | 333 | struct page *page; |
334 | int i; | ||
336 | 335 | ||
337 | switch (action) { | 336 | if (prof_cpu_mask != NULL) |
338 | case CPU_UP_PREPARE: | 337 | cpumask_clear_cpu(cpu, prof_cpu_mask); |
339 | case CPU_UP_PREPARE_FROZEN: | 338 | |
340 | node = cpu_to_mem(cpu); | 339 | for (i = 0; i < 2; i++) { |
341 | per_cpu(cpu_profile_flip, cpu) = 0; | 340 | if (per_cpu(cpu_profile_hits, cpu)[i]) { |
342 | if (!per_cpu(cpu_profile_hits, cpu)[1]) { | 341 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); |
343 | page = __alloc_pages_node(node, | 342 | per_cpu(cpu_profile_hits, cpu)[i] = NULL; |
344 | GFP_KERNEL | __GFP_ZERO, | ||
345 | 0); | ||
346 | if (!page) | ||
347 | return notifier_from_errno(-ENOMEM); | ||
348 | per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); | ||
349 | } | ||
350 | if (!per_cpu(cpu_profile_hits, cpu)[0]) { | ||
351 | page = __alloc_pages_node(node, | ||
352 | GFP_KERNEL | __GFP_ZERO, | ||
353 | 0); | ||
354 | if (!page) | ||
355 | goto out_free; | ||
356 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); | ||
357 | } | ||
358 | break; | ||
359 | out_free: | ||
360 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | ||
361 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | ||
362 | __free_page(page); | ||
363 | return notifier_from_errno(-ENOMEM); | ||
364 | case CPU_ONLINE: | ||
365 | case CPU_ONLINE_FROZEN: | ||
366 | if (prof_cpu_mask != NULL) | ||
367 | cpumask_set_cpu(cpu, prof_cpu_mask); | ||
368 | break; | ||
369 | case CPU_UP_CANCELED: | ||
370 | case CPU_UP_CANCELED_FROZEN: | ||
371 | case CPU_DEAD: | ||
372 | case CPU_DEAD_FROZEN: | ||
373 | if (prof_cpu_mask != NULL) | ||
374 | cpumask_clear_cpu(cpu, prof_cpu_mask); | ||
375 | if (per_cpu(cpu_profile_hits, cpu)[0]) { | ||
376 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); | ||
377 | per_cpu(cpu_profile_hits, cpu)[0] = NULL; | ||
378 | __free_page(page); | 343 | __free_page(page); |
379 | } | 344 | } |
380 | if (per_cpu(cpu_profile_hits, cpu)[1]) { | 345 | } |
381 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | 346 | return 0; |
382 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | 347 | } |
383 | __free_page(page); | 348 | |
349 | static int profile_prepare_cpu(unsigned int cpu) | ||
350 | { | ||
351 | int i, node = cpu_to_mem(cpu); | ||
352 | struct page *page; | ||
353 | |||
354 | per_cpu(cpu_profile_flip, cpu) = 0; | ||
355 | |||
356 | for (i = 0; i < 2; i++) { | ||
357 | if (per_cpu(cpu_profile_hits, cpu)[i]) | ||
358 | continue; | ||
359 | |||
360 | page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
361 | if (!page) { | ||
362 | profile_dead_cpu(cpu); | ||
363 | return -ENOMEM; | ||
384 | } | 364 | } |
385 | break; | 365 | per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); |
366 | |||
386 | } | 367 | } |
387 | return NOTIFY_OK; | 368 | return 0; |
369 | } | ||
370 | |||
371 | static int profile_online_cpu(unsigned int cpu) | ||
372 | { | ||
373 | if (prof_cpu_mask != NULL) | ||
374 | cpumask_set_cpu(cpu, prof_cpu_mask); | ||
375 | |||
376 | return 0; | ||
388 | } | 377 | } |
378 | |||
389 | #else /* !CONFIG_SMP */ | 379 | #else /* !CONFIG_SMP */ |
390 | #define profile_flip_buffers() do { } while (0) | 380 | #define profile_flip_buffers() do { } while (0) |
391 | #define profile_discard_flip_buffers() do { } while (0) | 381 | #define profile_discard_flip_buffers() do { } while (0) |
392 | #define profile_cpu_callback NULL | ||
393 | 382 | ||
394 | static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) | 383 | static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) |
395 | { | 384 | { |
@@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = { | |||
531 | .llseek = default_llseek, | 520 | .llseek = default_llseek, |
532 | }; | 521 | }; |
533 | 522 | ||
534 | #ifdef CONFIG_SMP | 523 | int __ref create_proc_profile(void) |
535 | static void profile_nop(void *unused) | ||
536 | { | ||
537 | } | ||
538 | |||
539 | static int create_hash_tables(void) | ||
540 | { | 524 | { |
541 | int cpu; | 525 | struct proc_dir_entry *entry; |
542 | 526 | #ifdef CONFIG_SMP | |
543 | for_each_online_cpu(cpu) { | 527 | enum cpuhp_state online_state; |
544 | int node = cpu_to_mem(cpu); | ||
545 | struct page *page; | ||
546 | |||
547 | page = __alloc_pages_node(node, | ||
548 | GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, | ||
549 | 0); | ||
550 | if (!page) | ||
551 | goto out_cleanup; | ||
552 | per_cpu(cpu_profile_hits, cpu)[1] | ||
553 | = (struct profile_hit *)page_address(page); | ||
554 | page = __alloc_pages_node(node, | ||
555 | GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, | ||
556 | 0); | ||
557 | if (!page) | ||
558 | goto out_cleanup; | ||
559 | per_cpu(cpu_profile_hits, cpu)[0] | ||
560 | = (struct profile_hit *)page_address(page); | ||
561 | } | ||
562 | return 0; | ||
563 | out_cleanup: | ||
564 | prof_on = 0; | ||
565 | smp_mb(); | ||
566 | on_each_cpu(profile_nop, NULL, 1); | ||
567 | for_each_online_cpu(cpu) { | ||
568 | struct page *page; | ||
569 | |||
570 | if (per_cpu(cpu_profile_hits, cpu)[0]) { | ||
571 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); | ||
572 | per_cpu(cpu_profile_hits, cpu)[0] = NULL; | ||
573 | __free_page(page); | ||
574 | } | ||
575 | if (per_cpu(cpu_profile_hits, cpu)[1]) { | ||
576 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | ||
577 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | ||
578 | __free_page(page); | ||
579 | } | ||
580 | } | ||
581 | return -1; | ||
582 | } | ||
583 | #else | ||
584 | #define create_hash_tables() ({ 0; }) | ||
585 | #endif | 528 | #endif |
586 | 529 | ||
587 | int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ | ||
588 | { | ||
589 | struct proc_dir_entry *entry; | ||
590 | int err = 0; | 530 | int err = 0; |
591 | 531 | ||
592 | if (!prof_on) | 532 | if (!prof_on) |
593 | return 0; | 533 | return 0; |
594 | 534 | #ifdef CONFIG_SMP | |
595 | cpu_notifier_register_begin(); | 535 | err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", |
596 | 536 | profile_prepare_cpu, profile_dead_cpu); | |
597 | if (create_hash_tables()) { | 537 | if (err) |
598 | err = -ENOMEM; | 538 | return err; |
599 | goto out; | 539 | |
600 | } | 540 | err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", |
601 | 541 | profile_online_cpu, NULL); | |
542 | if (err < 0) | ||
543 | goto err_state_prep; | ||
544 | online_state = err; | ||
545 | err = 0; | ||
546 | #endif | ||
602 | entry = proc_create("profile", S_IWUSR | S_IRUGO, | 547 | entry = proc_create("profile", S_IWUSR | S_IRUGO, |
603 | NULL, &proc_profile_operations); | 548 | NULL, &proc_profile_operations); |
604 | if (!entry) | 549 | if (!entry) |
605 | goto out; | 550 | goto err_state_onl; |
606 | proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); | 551 | proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); |
607 | __hotcpu_notifier(profile_cpu_callback, 0); | ||
608 | 552 | ||
609 | out: | 553 | return err; |
610 | cpu_notifier_register_done(); | 554 | err_state_onl: |
555 | #ifdef CONFIG_SMP | ||
556 | cpuhp_remove_state(online_state); | ||
557 | err_state_prep: | ||
558 | cpuhp_remove_state(CPUHP_PROFILE_PREPARE); | ||
559 | #endif | ||
611 | return err; | 560 | return err; |
612 | } | 561 | } |
613 | subsys_initcall(create_proc_profile); | 562 | subsys_initcall(create_proc_profile); |
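The profile.c rewrite above replaces the old CPU notifier with two hotplug states: CPUHP_PROFILE_PREPARE allocates (and, on teardown, frees) the per-CPU hit buffers, while a dynamically allocated online state only maintains prof_cpu_mask. A minimal hedged sketch of the same registration pattern with hypothetical demo_* callbacks, showing how the two cpuhp_setup_state() calls and the error unwind fit together:

	#include <linux/cpuhotplug.h>

	static int demo_prepare_cpu(unsigned int cpu)
	{
		/* would allocate per-CPU state here; runs before the CPU comes up */
		return 0;
	}

	static int demo_dead_cpu(unsigned int cpu)
	{
		/* teardown partner of the prepare step; would free per-CPU state */
		return 0;
	}

	static int demo_online_cpu(unsigned int cpu)
	{
		/* runs on the hotplugged CPU once it is online */
		return 0;
	}

	static int __init demo_init(void)
	{
		int ret;

		/* Static slot: known state constant, explicit teardown callback. */
		ret = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
					demo_prepare_cpu, demo_dead_cpu);
		if (ret)
			return ret;

		/* Dynamic slot: a positive return value is the allocated state. */
		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
					demo_online_cpu, NULL);
		if (ret < 0) {
			cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
			return ret;
		}
		return 0;
	}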
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cee0d8393ed..d38ab08a3fe7 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c | |||
@@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); | |||
58 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ | 58 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ |
59 | do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) | 59 | do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) |
60 | 60 | ||
61 | torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); | 61 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); |
62 | torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); | 62 | torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); |
63 | torture_param(int, nreaders, -1, "Number of RCU reader threads"); | 63 | torture_param(int, nreaders, -1, "Number of RCU reader threads"); |
64 | torture_param(int, nwriters, -1, "Number of RCU updater threads"); | 64 | torture_param(int, nwriters, -1, "Number of RCU updater threads"); |
@@ -96,12 +96,7 @@ static int rcu_perf_writer_state; | |||
96 | #define MAX_MEAS 10000 | 96 | #define MAX_MEAS 10000 |
97 | #define MIN_MEAS 100 | 97 | #define MIN_MEAS 100 |
98 | 98 | ||
99 | #if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) | 99 | static int perf_runnable = IS_ENABLED(MODULE); |
100 | #define RCUPERF_RUNNABLE_INIT 1 | ||
101 | #else | ||
102 | #define RCUPERF_RUNNABLE_INIT 0 | ||
103 | #endif | ||
104 | static int perf_runnable = RCUPERF_RUNNABLE_INIT; | ||
105 | module_param(perf_runnable, int, 0444); | 100 | module_param(perf_runnable, int, 0444); |
106 | MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); | 101 | MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); |
107 | 102 | ||
@@ -363,8 +358,6 @@ rcu_perf_writer(void *arg) | |||
363 | u64 *wdpp = writer_durations[me]; | 358 | u64 *wdpp = writer_durations[me]; |
364 | 359 | ||
365 | VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); | 360 | VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); |
366 | WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); | ||
367 | WARN_ON(rcu_gp_is_normal() && gp_exp); | ||
368 | WARN_ON(!wdpp); | 361 | WARN_ON(!wdpp); |
369 | set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); | 362 | set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); |
370 | sp.sched_priority = 1; | 363 | sp.sched_priority = 1; |
@@ -631,12 +624,24 @@ rcu_perf_init(void) | |||
631 | firsterr = -ENOMEM; | 624 | firsterr = -ENOMEM; |
632 | goto unwind; | 625 | goto unwind; |
633 | } | 626 | } |
627 | if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { | ||
628 | VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); | ||
629 | firsterr = -EINVAL; | ||
630 | goto unwind; | ||
631 | } | ||
632 | if (rcu_gp_is_normal() && gp_exp) { | ||
633 | VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); | ||
634 | firsterr = -EINVAL; | ||
635 | goto unwind; | ||
636 | } | ||
634 | for (i = 0; i < nrealwriters; i++) { | 637 | for (i = 0; i < nrealwriters; i++) { |
635 | writer_durations[i] = | 638 | writer_durations[i] = |
636 | kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), | 639 | kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), |
637 | GFP_KERNEL); | 640 | GFP_KERNEL); |
638 | if (!writer_durations[i]) | 641 | if (!writer_durations[i]) { |
642 | firsterr = -ENOMEM; | ||
639 | goto unwind; | 643 | goto unwind; |
644 | } | ||
640 | firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, | 645 | firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, |
641 | writer_tasks[i]); | 646 | writer_tasks[i]); |
642 | if (firsterr) | 647 | if (firsterr) |
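The rcuperf hunks above replace the RCUPERF_RUNNABLE_INIT #ifdef block with IS_ENABLED(MODULE) and move the expedited/normal consistency checks from the writer kthread into rcu_perf_init(), so a misconfigured test now fails initialization with -EINVAL instead of warning at run time. A tiny illustration of the IS_ENABLED() idiom, with foo_runnable as a placeholder name:

	#include <linux/kconfig.h>

	/*
	 * IS_ENABLED(x) evaluates to 1 when x is defined to 1 (as MODULE is
	 * for modular builds) or when x_MODULE is defined, and to 0 otherwise,
	 * so it can replace #if defined(...) blocks around initializers.
	 */
	static int foo_runnable = IS_ENABLED(MODULE);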
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a732eb..971e2b138063 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void) | |||
182 | return rcu_torture_writer_state_names[i]; | 182 | return rcu_torture_writer_state_names[i]; |
183 | } | 183 | } |
184 | 184 | ||
185 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) | 185 | static int torture_runnable = IS_ENABLED(MODULE); |
186 | #define RCUTORTURE_RUNNABLE_INIT 1 | ||
187 | #else | ||
188 | #define RCUTORTURE_RUNNABLE_INIT 0 | ||
189 | #endif | ||
190 | static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; | ||
191 | module_param(torture_runnable, int, 0444); | 186 | module_param(torture_runnable, int, 0444); |
192 | MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); | 187 | MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); |
193 | 188 | ||
@@ -1476,7 +1471,7 @@ static int rcu_torture_barrier_cbs(void *arg) | |||
1476 | break; | 1471 | break; |
1477 | /* | 1472 | /* |
1478 | * The above smp_load_acquire() ensures barrier_phase load | 1473 | * The above smp_load_acquire() ensures barrier_phase load |
1479 | * is ordered before the folloiwng ->call(). | 1474 | * is ordered before the following ->call(). |
1480 | */ | 1475 | */ |
1481 | local_irq_disable(); /* Just to test no-irq call_rcu(). */ | 1476 | local_irq_disable(); /* Just to test no-irq call_rcu(). */ |
1482 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); | 1477 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7f1bc4f817c..5d80925e7fc8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -125,12 +125,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | |||
125 | /* Number of rcu_nodes at specified level. */ | 125 | /* Number of rcu_nodes at specified level. */ |
126 | static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; | 126 | static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; |
127 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ | 127 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ |
128 | /* panic() on RCU Stall sysctl. */ | ||
129 | int sysctl_panic_on_rcu_stall __read_mostly; | ||
128 | 130 | ||
129 | /* | 131 | /* |
130 | * The rcu_scheduler_active variable transitions from zero to one just | 132 | * The rcu_scheduler_active variable transitions from zero to one just |
131 | * before the first task is spawned. So when this variable is zero, RCU | 133 | * before the first task is spawned. So when this variable is zero, RCU |
132 | * can assume that there is but one task, allowing RCU to (for example) | 134 | * can assume that there is but one task, allowing RCU to (for example) |
133 | * optimize synchronize_sched() to a simple barrier(). When this variable | 135 | * optimize synchronize_rcu() to a simple barrier(). When this variable |
134 | * is one, RCU must actually do all the hard work required to detect real | 136 | * is one, RCU must actually do all the hard work required to detect real |
135 | * grace periods. This variable is also used to suppress boot-time false | 137 | * grace periods. This variable is also used to suppress boot-time false |
136 | * positives from lockdep-RCU error checking. | 138 | * positives from lockdep-RCU error checking. |
@@ -159,6 +161,7 @@ static void invoke_rcu_core(void); | |||
159 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 161 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
160 | static void rcu_report_exp_rdp(struct rcu_state *rsp, | 162 | static void rcu_report_exp_rdp(struct rcu_state *rsp, |
161 | struct rcu_data *rdp, bool wake); | 163 | struct rcu_data *rdp, bool wake); |
164 | static void sync_sched_exp_online_cleanup(int cpu); | ||
162 | 165 | ||
163 | /* rcuc/rcub kthread realtime priority */ | 166 | /* rcuc/rcub kthread realtime priority */ |
164 | #ifdef CONFIG_RCU_KTHREAD_PRIO | 167 | #ifdef CONFIG_RCU_KTHREAD_PRIO |
@@ -1070,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching); | |||
1070 | * offline to continue to use RCU for one jiffy after marking itself | 1073 | * offline to continue to use RCU for one jiffy after marking itself |
1071 | * offline in the cpu_online_mask. This leniency is necessary given the | 1074 | * offline in the cpu_online_mask. This leniency is necessary given the |
1072 | * non-atomic nature of the online and offline processing, for example, | 1075 | * non-atomic nature of the online and offline processing, for example, |
1073 | * the fact that a CPU enters the scheduler after completing the CPU_DYING | 1076 | * the fact that a CPU enters the scheduler after completing the teardown |
1074 | * notifiers. | 1077 | * of the CPU. |
1075 | * | 1078 | * |
1076 | * This is also why RCU internally marks CPUs online during the | 1079 | * This is also why RCU internally marks CPUs online during the |
1077 | * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. | 1080 | * preparation phase and offline after the CPU has been taken down. |
1078 | * | 1081 | * |
1079 | * Disable checking if in an NMI handler because we cannot safely report | 1082 | * Disable checking if in an NMI handler because we cannot safely report |
1080 | * errors from NMI handlers anyway. | 1083 | * errors from NMI handlers anyway. |
@@ -1284,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) | |||
1284 | rcu_for_each_leaf_node(rsp, rnp) { | 1287 | rcu_for_each_leaf_node(rsp, rnp) { |
1285 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 1288 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
1286 | if (rnp->qsmask != 0) { | 1289 | if (rnp->qsmask != 0) { |
1287 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 1290 | for_each_leaf_node_possible_cpu(rnp, cpu) |
1288 | if (rnp->qsmask & (1UL << cpu)) | 1291 | if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) |
1289 | dump_cpu_task(rnp->grplo + cpu); | 1292 | dump_cpu_task(cpu); |
1290 | } | 1293 | } |
1291 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1294 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1292 | } | 1295 | } |
@@ -1311,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) | |||
1311 | } | 1314 | } |
1312 | } | 1315 | } |
1313 | 1316 | ||
1317 | static inline void panic_on_rcu_stall(void) | ||
1318 | { | ||
1319 | if (sysctl_panic_on_rcu_stall) | ||
1320 | panic("RCU Stall\n"); | ||
1321 | } | ||
1322 | |||
1314 | static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | 1323 | static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) |
1315 | { | 1324 | { |
1316 | int cpu; | 1325 | int cpu; |
@@ -1351,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
1351 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 1360 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
1352 | ndetected += rcu_print_task_stall(rnp); | 1361 | ndetected += rcu_print_task_stall(rnp); |
1353 | if (rnp->qsmask != 0) { | 1362 | if (rnp->qsmask != 0) { |
1354 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 1363 | for_each_leaf_node_possible_cpu(rnp, cpu) |
1355 | if (rnp->qsmask & (1UL << cpu)) { | 1364 | if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { |
1356 | print_cpu_stall_info(rsp, | 1365 | print_cpu_stall_info(rsp, cpu); |
1357 | rnp->grplo + cpu); | ||
1358 | ndetected++; | 1366 | ndetected++; |
1359 | } | 1367 | } |
1360 | } | 1368 | } |
@@ -1390,6 +1398,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
1390 | 1398 | ||
1391 | rcu_check_gp_kthread_starvation(rsp); | 1399 | rcu_check_gp_kthread_starvation(rsp); |
1392 | 1400 | ||
1401 | panic_on_rcu_stall(); | ||
1402 | |||
1393 | force_quiescent_state(rsp); /* Kick them all. */ | 1403 | force_quiescent_state(rsp); /* Kick them all. */ |
1394 | } | 1404 | } |
1395 | 1405 | ||
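The tree.c changes above add a sysctl_panic_on_rcu_stall flag and call panic_on_rcu_stall() from the stall-warning paths (here and in print_cpu_stall() in the next hunk), so a stall can be turned into an immediate crash dump. The sysctl table entry itself is not part of this hunk; a hedged sketch of what such an entry in kernel/sysctl.c would typically look like:

	/* Assumed kernel/sysctl.c entry; not shown in this diff. */
	{
		.procname	= "panic_on_rcu_stall",
		.data		= &sysctl_panic_on_rcu_stall,
		.maxlen		= sizeof(sysctl_panic_on_rcu_stall),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &one,
	},

With such an entry in place, writing 1 to the corresponding /proc/sys/kernel knob (path assumed) arms the panic.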
@@ -1430,6 +1440,8 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
1430 | jiffies + 3 * rcu_jiffies_till_stall_check() + 3); | 1440 | jiffies + 3 * rcu_jiffies_till_stall_check() + 3); |
1431 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1441 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1432 | 1442 | ||
1443 | panic_on_rcu_stall(); | ||
1444 | |||
1433 | /* | 1445 | /* |
1434 | * Attempt to revive the RCU machinery by forcing a context switch. | 1446 | * Attempt to revive the RCU machinery by forcing a context switch. |
1435 | * | 1447 | * |
@@ -1989,8 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) | |||
1989 | * of the tree within the rsp->node[] array. Note that other CPUs | 2001 | * of the tree within the rsp->node[] array. Note that other CPUs |
1990 | * will access only the leaves of the hierarchy, thus seeing that no | 2002 | * will access only the leaves of the hierarchy, thus seeing that no |
1991 | * grace period is in progress, at least until the corresponding | 2003 | * grace period is in progress, at least until the corresponding |
1992 | * leaf node has been initialized. In addition, we have excluded | 2004 | * leaf node has been initialized. |
1993 | * CPU-hotplug operations. | ||
1994 | * | 2005 | * |
1995 | * The grace period cannot complete until the initialization | 2006 | * The grace period cannot complete until the initialization |
1996 | * process finishes, because this kthread handles both. | 2007 | * process finishes, because this kthread handles both. |
@@ -2872,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
2872 | unsigned long *maxj), | 2883 | unsigned long *maxj), |
2873 | bool *isidle, unsigned long *maxj) | 2884 | bool *isidle, unsigned long *maxj) |
2874 | { | 2885 | { |
2875 | unsigned long bit; | ||
2876 | int cpu; | 2886 | int cpu; |
2877 | unsigned long flags; | 2887 | unsigned long flags; |
2878 | unsigned long mask; | 2888 | unsigned long mask; |
@@ -2907,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
2907 | continue; | 2917 | continue; |
2908 | } | 2918 | } |
2909 | } | 2919 | } |
2910 | cpu = rnp->grplo; | 2920 | for_each_leaf_node_possible_cpu(rnp, cpu) { |
2911 | bit = 1; | 2921 | unsigned long bit = leaf_node_cpu_bit(rnp, cpu); |
2912 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | ||
2913 | if ((rnp->qsmask & bit) != 0) { | 2922 | if ((rnp->qsmask & bit) != 0) { |
2914 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | 2923 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) |
2915 | mask |= bit; | 2924 | mask |= bit; |
@@ -3448,549 +3457,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s) | |||
3448 | return ULONG_CMP_GE(READ_ONCE(*sp), s); | 3457 | return ULONG_CMP_GE(READ_ONCE(*sp), s); |
3449 | } | 3458 | } |
3450 | 3459 | ||
3451 | /* Wrapper functions for expedited grace periods. */ | ||
3452 | static void rcu_exp_gp_seq_start(struct rcu_state *rsp) | ||
3453 | { | ||
3454 | rcu_seq_start(&rsp->expedited_sequence); | ||
3455 | } | ||
3456 | static void rcu_exp_gp_seq_end(struct rcu_state *rsp) | ||
3457 | { | ||
3458 | rcu_seq_end(&rsp->expedited_sequence); | ||
3459 | smp_mb(); /* Ensure that consecutive grace periods serialize. */ | ||
3460 | } | ||
3461 | static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) | ||
3462 | { | ||
3463 | unsigned long s; | ||
3464 | |||
3465 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | ||
3466 | s = rcu_seq_snap(&rsp->expedited_sequence); | ||
3467 | trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); | ||
3468 | return s; | ||
3469 | } | ||
3470 | static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) | ||
3471 | { | ||
3472 | return rcu_seq_done(&rsp->expedited_sequence, s); | ||
3473 | } | ||
3474 | |||
3475 | /* | ||
3476 | * Reset the ->expmaskinit values in the rcu_node tree to reflect any | ||
3477 | * recent CPU-online activity. Note that these masks are not cleared | ||
3478 | * when CPUs go offline, so they reflect the union of all CPUs that have | ||
3479 | * ever been online. This means that this function normally takes its | ||
3480 | * no-work-to-do fastpath. | ||
3481 | */ | ||
3482 | static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) | ||
3483 | { | ||
3484 | bool done; | ||
3485 | unsigned long flags; | ||
3486 | unsigned long mask; | ||
3487 | unsigned long oldmask; | ||
3488 | int ncpus = READ_ONCE(rsp->ncpus); | ||
3489 | struct rcu_node *rnp; | ||
3490 | struct rcu_node *rnp_up; | ||
3491 | |||
3492 | /* If no new CPUs onlined since last time, nothing to do. */ | ||
3493 | if (likely(ncpus == rsp->ncpus_snap)) | ||
3494 | return; | ||
3495 | rsp->ncpus_snap = ncpus; | ||
3496 | |||
3497 | /* | ||
3498 | * Each pass through the following loop propagates newly onlined | ||
3499 | * CPUs for the current rcu_node structure up the rcu_node tree. | ||
3500 | */ | ||
3501 | rcu_for_each_leaf_node(rsp, rnp) { | ||
3502 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
3503 | if (rnp->expmaskinit == rnp->expmaskinitnext) { | ||
3504 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
3505 | continue; /* No new CPUs, nothing to do. */ | ||
3506 | } | ||
3507 | |||
3508 | /* Update this node's mask, track old value for propagation. */ | ||
3509 | oldmask = rnp->expmaskinit; | ||
3510 | rnp->expmaskinit = rnp->expmaskinitnext; | ||
3511 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
3512 | |||
3513 | /* If was already nonzero, nothing to propagate. */ | ||
3514 | if (oldmask) | ||
3515 | continue; | ||
3516 | |||
3517 | /* Propagate the new CPU up the tree. */ | ||
3518 | mask = rnp->grpmask; | ||
3519 | rnp_up = rnp->parent; | ||
3520 | done = false; | ||
3521 | while (rnp_up) { | ||
3522 | raw_spin_lock_irqsave_rcu_node(rnp_up, flags); | ||
3523 | if (rnp_up->expmaskinit) | ||
3524 | done = true; | ||
3525 | rnp_up->expmaskinit |= mask; | ||
3526 | raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); | ||
3527 | if (done) | ||
3528 | break; | ||
3529 | mask = rnp_up->grpmask; | ||
3530 | rnp_up = rnp_up->parent; | ||
3531 | } | ||
3532 | } | ||
3533 | } | ||
3534 | |||
3535 | /* | ||
3536 | * Reset the ->expmask values in the rcu_node tree in preparation for | ||
3537 | * a new expedited grace period. | ||
3538 | */ | ||
3539 | static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) | ||
3540 | { | ||
3541 | unsigned long flags; | ||
3542 | struct rcu_node *rnp; | ||
3543 | |||
3544 | sync_exp_reset_tree_hotplug(rsp); | ||
3545 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
3546 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
3547 | WARN_ON_ONCE(rnp->expmask); | ||
3548 | rnp->expmask = rnp->expmaskinit; | ||
3549 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
3550 | } | ||
3551 | } | ||
3552 | |||
3553 | /* | ||
3554 | * Return non-zero if there is no RCU expedited grace period in progress | ||
3555 | * for the specified rcu_node structure, in other words, if all CPUs and | ||
3556 | * tasks covered by the specified rcu_node structure have done their bit | ||
3557 | * for the current expedited grace period. Works only for preemptible | ||
3558 | * RCU -- other RCU implementation use other means. | ||
3559 | * | ||
3560 | * Caller must hold the rcu_state's exp_mutex. | ||
3561 | */ | ||
3562 | static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | ||
3563 | { | ||
3564 | return rnp->exp_tasks == NULL && | ||
3565 | READ_ONCE(rnp->expmask) == 0; | ||
3566 | } | ||
3567 | |||
3568 | /* | ||
3569 | * Report the exit from RCU read-side critical section for the last task | ||
3570 | * that queued itself during or before the current expedited preemptible-RCU | ||
3571 | * grace period. This event is reported either to the rcu_node structure on | ||
3572 | * which the task was queued or to one of that rcu_node structure's ancestors, | ||
3573 | * recursively up the tree. (Calm down, calm down, we do the recursion | ||
3574 | * iteratively!) | ||
3575 | * | ||
3576 | * Caller must hold the rcu_state's exp_mutex and the specified rcu_node | ||
3577 | * structure's ->lock. | ||
3578 | */ | ||
3579 | static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
3580 | bool wake, unsigned long flags) | ||
3581 | __releases(rnp->lock) | ||
3582 | { | ||
3583 | unsigned long mask; | ||
3584 | |||
3585 | for (;;) { | ||
3586 | if (!sync_rcu_preempt_exp_done(rnp)) { | ||
3587 | if (!rnp->expmask) | ||
3588 | rcu_initiate_boost(rnp, flags); | ||
3589 | else | ||
3590 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
3591 | break; | ||
3592 | } | ||
3593 | if (rnp->parent == NULL) { | ||
3594 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
3595 | if (wake) { | ||
3596 | smp_mb(); /* EGP done before wake_up(). */ | ||
3597 | swake_up(&rsp->expedited_wq); | ||
3598 | } | ||
3599 | break; | ||
3600 | } | ||
3601 | mask = rnp->grpmask; | ||
3602 | raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ | ||
3603 | rnp = rnp->parent; | ||
3604 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ | ||
3605 | WARN_ON_ONCE(!(rnp->expmask & mask)); | ||
3606 | rnp->expmask &= ~mask; | ||
3607 | } | ||
3608 | } | ||
3609 | |||
3610 | /* | ||
3611 | * Report expedited quiescent state for specified node. This is a | ||
3612 | * lock-acquisition wrapper function for __rcu_report_exp_rnp(). | ||
3613 | * | ||
3614 | * Caller must hold the rcu_state's exp_mutex. | ||
3615 | */ | ||
3616 | static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, | ||
3617 | struct rcu_node *rnp, bool wake) | ||
3618 | { | ||
3619 | unsigned long flags; | ||
3620 | |||
3621 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
3622 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); | ||
3623 | } | ||
3624 | |||
3625 | /* | ||
3626 | * Report expedited quiescent state for multiple CPUs, all covered by the | ||
3627 | * specified leaf rcu_node structure. Caller must hold the rcu_state's | ||
3628 | * exp_mutex. | ||
3629 | */ | ||
3630 | static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, | ||
3631 | unsigned long mask, bool wake) | ||
3632 | { | ||
3633 | unsigned long flags; | ||
3634 | |||
3635 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
3636 | if (!(rnp->expmask & mask)) { | ||
3637 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
3638 | return; | ||
3639 | } | ||
3640 | rnp->expmask &= ~mask; | ||
3641 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ | ||
3642 | } | ||
3643 | |||
3644 | /* | ||
3645 | * Report expedited quiescent state for specified rcu_data (CPU). | ||
3646 | */ | ||
3647 | static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, | ||
3648 | bool wake) | ||
3649 | { | ||
3650 | rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); | ||
3651 | } | ||
3652 | |||
3653 | /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ | ||
3654 | static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, | ||
3655 | unsigned long s) | ||
3656 | { | ||
3657 | if (rcu_exp_gp_seq_done(rsp, s)) { | ||
3658 | trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); | ||
3659 | /* Ensure test happens before caller kfree(). */ | ||
3660 | smp_mb__before_atomic(); /* ^^^ */ | ||
3661 | atomic_long_inc(stat); | ||
3662 | return true; | ||
3663 | } | ||
3664 | return false; | ||
3665 | } | ||
3666 | |||
3667 | /* | ||
3668 | * Funnel-lock acquisition for expedited grace periods. Returns true | ||
3669 | * if some other task completed an expedited grace period that this task | ||
3670 | * can piggy-back on, and with no mutex held. Otherwise, returns false | ||
3671 | * with the mutex held, indicating that the caller must actually do the | ||
3672 | * expedited grace period. | ||
3673 | */ | ||
3674 | static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | ||
3675 | { | ||
3676 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | ||
3677 | struct rcu_node *rnp = rdp->mynode; | ||
3678 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
3679 | |||
3680 | /* Low-contention fastpath. */ | ||
3681 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && | ||
3682 | (rnp == rnp_root || | ||
3683 | ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && | ||
3684 | !mutex_is_locked(&rsp->exp_mutex) && | ||
3685 | mutex_trylock(&rsp->exp_mutex)) | ||
3686 | goto fastpath; | ||
3687 | |||
3688 | /* | ||
3689 | * Each pass through the following loop works its way up | ||
3690 | * the rcu_node tree, returning if others have done the work or | ||
3691 | * otherwise falls through to acquire rsp->exp_mutex. The mapping | ||
3692 | * from CPU to rcu_node structure can be inexact, as it is just | ||
3693 | * promoting locality and is not strictly needed for correctness. | ||
3694 | */ | ||
3695 | for (; rnp != NULL; rnp = rnp->parent) { | ||
3696 | if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) | ||
3697 | return true; | ||
3698 | |||
3699 | /* Work not done, either wait here or go up. */ | ||
3700 | spin_lock(&rnp->exp_lock); | ||
3701 | if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { | ||
3702 | |||
3703 | /* Someone else doing GP, so wait for them. */ | ||
3704 | spin_unlock(&rnp->exp_lock); | ||
3705 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, | ||
3706 | rnp->grplo, rnp->grphi, | ||
3707 | TPS("wait")); | ||
3708 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | ||
3709 | sync_exp_work_done(rsp, | ||
3710 | &rdp->exp_workdone2, s)); | ||
3711 | return true; | ||
3712 | } | ||
3713 | rnp->exp_seq_rq = s; /* Followers can wait on us. */ | ||
3714 | spin_unlock(&rnp->exp_lock); | ||
3715 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, | ||
3716 | rnp->grphi, TPS("nxtlvl")); | ||
3717 | } | ||
3718 | mutex_lock(&rsp->exp_mutex); | ||
3719 | fastpath: | ||
3720 | if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { | ||
3721 | mutex_unlock(&rsp->exp_mutex); | ||
3722 | return true; | ||
3723 | } | ||
3724 | rcu_exp_gp_seq_start(rsp); | ||
3725 | trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); | ||
3726 | return false; | ||
3727 | } | ||
3728 | |||
3729 | /* Invoked on each online non-idle CPU for expedited quiescent state. */ | ||
3730 | static void sync_sched_exp_handler(void *data) | ||
3731 | { | ||
3732 | struct rcu_data *rdp; | ||
3733 | struct rcu_node *rnp; | ||
3734 | struct rcu_state *rsp = data; | ||
3735 | |||
3736 | rdp = this_cpu_ptr(rsp->rda); | ||
3737 | rnp = rdp->mynode; | ||
3738 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || | ||
3739 | __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) | ||
3740 | return; | ||
3741 | if (rcu_is_cpu_rrupt_from_idle()) { | ||
3742 | rcu_report_exp_rdp(&rcu_sched_state, | ||
3743 | this_cpu_ptr(&rcu_sched_data), true); | ||
3744 | return; | ||
3745 | } | ||
3746 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); | ||
3747 | resched_cpu(smp_processor_id()); | ||
3748 | } | ||
3749 | |||
3750 | /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ | ||
3751 | static void sync_sched_exp_online_cleanup(int cpu) | ||
3752 | { | ||
3753 | struct rcu_data *rdp; | ||
3754 | int ret; | ||
3755 | struct rcu_node *rnp; | ||
3756 | struct rcu_state *rsp = &rcu_sched_state; | ||
3757 | |||
3758 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
3759 | rnp = rdp->mynode; | ||
3760 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) | ||
3761 | return; | ||
3762 | ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); | ||
3763 | WARN_ON_ONCE(ret); | ||
3764 | } | ||
3765 | |||
3766 | /* | ||
3767 | * Select the nodes that the upcoming expedited grace period needs | ||
3768 | * to wait for. | ||
3769 | */ | ||
3770 | static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | ||
3771 | smp_call_func_t func) | ||
3772 | { | ||
3773 | int cpu; | ||
3774 | unsigned long flags; | ||
3775 | unsigned long mask; | ||
3776 | unsigned long mask_ofl_test; | ||
3777 | unsigned long mask_ofl_ipi; | ||
3778 | int ret; | ||
3779 | struct rcu_node *rnp; | ||
3780 | |||
3781 | sync_exp_reset_tree(rsp); | ||
3782 | rcu_for_each_leaf_node(rsp, rnp) { | ||
3783 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
3784 | |||
3785 | /* Each pass checks a CPU for identity, offline, and idle. */ | ||
3786 | mask_ofl_test = 0; | ||
3787 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { | ||
3788 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
3789 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
3790 | |||
3791 | if (raw_smp_processor_id() == cpu || | ||
3792 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | ||
3793 | mask_ofl_test |= rdp->grpmask; | ||
3794 | } | ||
3795 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; | ||
3796 | |||
3797 | /* | ||
3798 | * Need to wait for any blocked tasks as well. Note that | ||
3799 | * additional blocking tasks will also block the expedited | ||
3800 | * GP until such time as the ->expmask bits are cleared. | ||
3801 | */ | ||
3802 | if (rcu_preempt_has_tasks(rnp)) | ||
3803 | rnp->exp_tasks = rnp->blkd_tasks.next; | ||
3804 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
3805 | |||
3806 | /* IPI the remaining CPUs for expedited quiescent state. */ | ||
3807 | mask = 1; | ||
3808 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { | ||
3809 | if (!(mask_ofl_ipi & mask)) | ||
3810 | continue; | ||
3811 | retry_ipi: | ||
3812 | ret = smp_call_function_single(cpu, func, rsp, 0); | ||
3813 | if (!ret) { | ||
3814 | mask_ofl_ipi &= ~mask; | ||
3815 | continue; | ||
3816 | } | ||
3817 | /* Failed, raced with offline. */ | ||
3818 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
3819 | if (cpu_online(cpu) && | ||
3820 | (rnp->expmask & mask)) { | ||
3821 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
3822 | schedule_timeout_uninterruptible(1); | ||
3823 | if (cpu_online(cpu) && | ||
3824 | (rnp->expmask & mask)) | ||
3825 | goto retry_ipi; | ||
3826 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
3827 | } | ||
3828 | if (!(rnp->expmask & mask)) | ||
3829 | mask_ofl_ipi &= ~mask; | ||
3830 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
3831 | } | ||
3832 | /* Report quiescent states for those that went offline. */ | ||
3833 | mask_ofl_test |= mask_ofl_ipi; | ||
3834 | if (mask_ofl_test) | ||
3835 | rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); | ||
3836 | } | ||
3837 | } | ||
3838 | |||
3839 | static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | ||
3840 | { | ||
3841 | int cpu; | ||
3842 | unsigned long jiffies_stall; | ||
3843 | unsigned long jiffies_start; | ||
3844 | unsigned long mask; | ||
3845 | int ndetected; | ||
3846 | struct rcu_node *rnp; | ||
3847 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
3848 | int ret; | ||
3849 | |||
3850 | jiffies_stall = rcu_jiffies_till_stall_check(); | ||
3851 | jiffies_start = jiffies; | ||
3852 | |||
3853 | for (;;) { | ||
3854 | ret = swait_event_timeout( | ||
3855 | rsp->expedited_wq, | ||
3856 | sync_rcu_preempt_exp_done(rnp_root), | ||
3857 | jiffies_stall); | ||
3858 | if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) | ||
3859 | return; | ||
3860 | if (ret < 0) { | ||
3861 | /* Hit a signal, disable CPU stall warnings. */ | ||
3862 | swait_event(rsp->expedited_wq, | ||
3863 | sync_rcu_preempt_exp_done(rnp_root)); | ||
3864 | return; | ||
3865 | } | ||
3866 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", | ||
3867 | rsp->name); | ||
3868 | ndetected = 0; | ||
3869 | rcu_for_each_leaf_node(rsp, rnp) { | ||
3870 | ndetected += rcu_print_task_exp_stall(rnp); | ||
3871 | mask = 1; | ||
3872 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { | ||
3873 | struct rcu_data *rdp; | ||
3874 | |||
3875 | if (!(rnp->expmask & mask)) | ||
3876 | continue; | ||
3877 | ndetected++; | ||
3878 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
3879 | pr_cont(" %d-%c%c%c", cpu, | ||
3880 | "O."[!!cpu_online(cpu)], | ||
3881 | "o."[!!(rdp->grpmask & rnp->expmaskinit)], | ||
3882 | "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); | ||
3883 | } | ||
3884 | mask <<= 1; | ||
3885 | } | ||
3886 | pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", | ||
3887 | jiffies - jiffies_start, rsp->expedited_sequence, | ||
3888 | rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); | ||
3889 | if (ndetected) { | ||
3890 | pr_err("blocking rcu_node structures:"); | ||
3891 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
3892 | if (rnp == rnp_root) | ||
3893 | continue; /* printed unconditionally */ | ||
3894 | if (sync_rcu_preempt_exp_done(rnp)) | ||
3895 | continue; | ||
3896 | pr_cont(" l=%u:%d-%d:%#lx/%c", | ||
3897 | rnp->level, rnp->grplo, rnp->grphi, | ||
3898 | rnp->expmask, | ||
3899 | ".T"[!!rnp->exp_tasks]); | ||
3900 | } | ||
3901 | pr_cont("\n"); | ||
3902 | } | ||
3903 | rcu_for_each_leaf_node(rsp, rnp) { | ||
3904 | mask = 1; | ||
3905 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { | ||
3906 | if (!(rnp->expmask & mask)) | ||
3907 | continue; | ||
3908 | dump_cpu_task(cpu); | ||
3909 | } | ||
3910 | } | ||
3911 | jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; | ||
3912 | } | ||
3913 | } | ||
3914 | |||
3915 | /* | ||
3916 | * Wait for the current expedited grace period to complete, and then | ||
3917 | * wake up everyone who piggybacked on the just-completed expedited | ||
3918 | * grace period. Also update all the ->exp_seq_rq counters as needed | ||
3919 | * in order to avoid counter-wrap problems. | ||
3920 | */ | ||
3921 | static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | ||
3922 | { | ||
3923 | struct rcu_node *rnp; | ||
3924 | |||
3925 | synchronize_sched_expedited_wait(rsp); | ||
3926 | rcu_exp_gp_seq_end(rsp); | ||
3927 | trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); | ||
3928 | |||
3929 | /* | ||
3930 | * Switch over to wakeup mode, allowing the next GP, but -only- the | ||
3931 | * next GP, to proceed. | ||
3932 | */ | ||
3933 | mutex_lock(&rsp->exp_wake_mutex); | ||
3934 | mutex_unlock(&rsp->exp_mutex); | ||
3935 | |||
3936 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
3937 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { | ||
3938 | spin_lock(&rnp->exp_lock); | ||
3939 | /* Recheck, avoid hang in case someone just arrived. */ | ||
3940 | if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) | ||
3941 | rnp->exp_seq_rq = s; | ||
3942 | spin_unlock(&rnp->exp_lock); | ||
3943 | } | ||
3944 | wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); | ||
3945 | } | ||
3946 | trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); | ||
3947 | mutex_unlock(&rsp->exp_wake_mutex); | ||
3948 | } | ||
3949 | |||
3950 | /** | ||
3951 | * synchronize_sched_expedited - Brute-force RCU-sched grace period | ||
3952 | * | ||
3953 | * Wait for an RCU-sched grace period to elapse, but use a "big hammer" | ||
3954 | * approach to force the grace period to end quickly. This consumes | ||
3955 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
3956 | * so is thus not recommended for any sort of common-case code. In fact, | ||
3957 | * if you are using synchronize_sched_expedited() in a loop, please | ||
3958 | * restructure your code to batch your updates, and then use a single | ||
3959 | * synchronize_sched() instead. | ||
3960 | * | ||
3961 | * This implementation can be thought of as an application of sequence | ||
3962 | * locking to expedited grace periods, but using the sequence counter to | ||
3963 | * determine when someone else has already done the work instead of for | ||
3964 | * retrying readers. | ||
3965 | */ | ||
3966 | void synchronize_sched_expedited(void) | ||
3967 | { | ||
3968 | unsigned long s; | ||
3969 | struct rcu_state *rsp = &rcu_sched_state; | ||
3970 | |||
3971 | /* If only one CPU, this is automatically a grace period. */ | ||
3972 | if (rcu_blocking_is_gp()) | ||
3973 | return; | ||
3974 | |||
3975 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
3976 | if (rcu_gp_is_normal()) { | ||
3977 | wait_rcu_gp(call_rcu_sched); | ||
3978 | return; | ||
3979 | } | ||
3980 | |||
3981 | /* Take a snapshot of the sequence number. */ | ||
3982 | s = rcu_exp_gp_seq_snap(rsp); | ||
3983 | if (exp_funnel_lock(rsp, s)) | ||
3984 | return; /* Someone else did our work for us. */ | ||
3985 | |||
3986 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
3987 | sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); | ||
3988 | |||
3989 | /* Wait and clean up, including waking everyone. */ | ||
3990 | rcu_exp_wait_wake(rsp, s); | ||
3991 | } | ||
3992 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
3993 | |||
3994 | /* | 3460 | /* |
3995 | * Check to see if there is any immediate RCU-related work to be done | 3461 | * Check to see if there is any immediate RCU-related work to be done |
3996 | * by the current CPU, for the specified type of RCU, returning 1 if so. | 3462 | * by the current CPU, for the specified type of RCU, returning 1 if so. |
@@ -4281,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
4281 | 3747 | ||
4282 | /* Set up local state, ensuring consistent view of global state. */ | 3748 | /* Set up local state, ensuring consistent view of global state. */ |
4283 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 3749 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
4284 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); | 3750 | rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); |
4285 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 3751 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
4286 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); | 3752 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); |
4287 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | 3753 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); |
@@ -4340,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
4340 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 3806 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
4341 | } | 3807 | } |
4342 | 3808 | ||
4343 | static void rcu_prepare_cpu(int cpu) | 3809 | int rcutree_prepare_cpu(unsigned int cpu) |
4344 | { | 3810 | { |
4345 | struct rcu_state *rsp; | 3811 | struct rcu_state *rsp; |
4346 | 3812 | ||
4347 | for_each_rcu_flavor(rsp) | 3813 | for_each_rcu_flavor(rsp) |
4348 | rcu_init_percpu_data(cpu, rsp); | 3814 | rcu_init_percpu_data(cpu, rsp); |
3815 | |||
3816 | rcu_prepare_kthreads(cpu); | ||
3817 | rcu_spawn_all_nocb_kthreads(cpu); | ||
3818 | |||
3819 | return 0; | ||
3820 | } | ||
3821 | |||
3822 | static void rcutree_affinity_setting(unsigned int cpu, int outgoing) | ||
3823 | { | ||
3824 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | ||
3825 | |||
3826 | rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); | ||
3827 | } | ||
3828 | |||
3829 | int rcutree_online_cpu(unsigned int cpu) | ||
3830 | { | ||
3831 | sync_sched_exp_online_cleanup(cpu); | ||
3832 | rcutree_affinity_setting(cpu, -1); | ||
3833 | return 0; | ||
3834 | } | ||
3835 | |||
3836 | int rcutree_offline_cpu(unsigned int cpu) | ||
3837 | { | ||
3838 | rcutree_affinity_setting(cpu, cpu); | ||
3839 | return 0; | ||
3840 | } | ||
3841 | |||
3842 | |||
3843 | int rcutree_dying_cpu(unsigned int cpu) | ||
3844 | { | ||
3845 | struct rcu_state *rsp; | ||
3846 | |||
3847 | for_each_rcu_flavor(rsp) | ||
3848 | rcu_cleanup_dying_cpu(rsp); | ||
3849 | return 0; | ||
3850 | } | ||
3851 | |||
3852 | int rcutree_dead_cpu(unsigned int cpu) | ||
3853 | { | ||
3854 | struct rcu_state *rsp; | ||
3855 | |||
3856 | for_each_rcu_flavor(rsp) { | ||
3857 | rcu_cleanup_dead_cpu(cpu, rsp); | ||
3858 | do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); | ||
3859 | } | ||
3860 | return 0; | ||
4349 | } | 3861 | } |
4350 | 3862 | ||
4351 | #ifdef CONFIG_HOTPLUG_CPU | 3863 | #ifdef CONFIG_HOTPLUG_CPU |
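The hunk above breaks the old rcu_cpu_notify() switch statement (removed further below) into per-phase callbacks -- rcutree_prepare_cpu(), rcutree_online_cpu(), rcutree_offline_cpu(), rcutree_dying_cpu() and rcutree_dead_cpu() -- each taking a plain CPU number and returning 0 on success, as the hotplug state machine expects. They are presumably invoked from the cpuhp core's state table rather than registered in this file; purely for illustration, a hedged sketch of how the prepare/dead and online/offline pairs could be attached by hand (CPUHP_RCUTREE_PREP and the demo function are assumed names):

	static int __init demo_register_rcutree_states(void)
	{
		int ret;

		/* Prepare on bring-up, dead-callback on teardown (assumed slot). */
		ret = cpuhp_setup_state_nocalls(CPUHP_RCUTREE_PREP, "RCU/tree:prepare",
						rcutree_prepare_cpu, rcutree_dead_cpu);
		if (ret)
			return ret;

		/* Online/offline pair in a dynamically allocated AP state. */
		ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "RCU/tree:online",
						rcutree_online_cpu, rcutree_offline_cpu);
		return ret < 0 ? ret : 0;
	}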
@@ -4364,9 +3876,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | |||
4364 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 3876 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
4365 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ | 3877 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
4366 | 3878 | ||
4367 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) | ||
4368 | return; | ||
4369 | |||
4370 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ | 3879 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ |
4371 | mask = rdp->grpmask; | 3880 | mask = rdp->grpmask; |
4372 | raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ | 3881 | raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ |
@@ -4388,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu) | |||
4388 | } | 3897 | } |
4389 | #endif | 3898 | #endif |
4390 | 3899 | ||
4391 | /* | ||
4392 | * Handle CPU online/offline notification events. | ||
4393 | */ | ||
4394 | int rcu_cpu_notify(struct notifier_block *self, | ||
4395 | unsigned long action, void *hcpu) | ||
4396 | { | ||
4397 | long cpu = (long)hcpu; | ||
4398 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | ||
4399 | struct rcu_node *rnp = rdp->mynode; | ||
4400 | struct rcu_state *rsp; | ||
4401 | |||
4402 | switch (action) { | ||
4403 | case CPU_UP_PREPARE: | ||
4404 | case CPU_UP_PREPARE_FROZEN: | ||
4405 | rcu_prepare_cpu(cpu); | ||
4406 | rcu_prepare_kthreads(cpu); | ||
4407 | rcu_spawn_all_nocb_kthreads(cpu); | ||
4408 | break; | ||
4409 | case CPU_ONLINE: | ||
4410 | case CPU_DOWN_FAILED: | ||
4411 | sync_sched_exp_online_cleanup(cpu); | ||
4412 | rcu_boost_kthread_setaffinity(rnp, -1); | ||
4413 | break; | ||
4414 | case CPU_DOWN_PREPARE: | ||
4415 | rcu_boost_kthread_setaffinity(rnp, cpu); | ||
4416 | break; | ||
4417 | case CPU_DYING: | ||
4418 | case CPU_DYING_FROZEN: | ||
4419 | for_each_rcu_flavor(rsp) | ||
4420 | rcu_cleanup_dying_cpu(rsp); | ||
4421 | break; | ||
4422 | case CPU_DEAD: | ||
4423 | case CPU_DEAD_FROZEN: | ||
4424 | case CPU_UP_CANCELED: | ||
4425 | case CPU_UP_CANCELED_FROZEN: | ||
4426 | for_each_rcu_flavor(rsp) { | ||
4427 | rcu_cleanup_dead_cpu(cpu, rsp); | ||
4428 | do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); | ||
4429 | } | ||
4430 | break; | ||
4431 | default: | ||
4432 | break; | ||
4433 | } | ||
4434 | return NOTIFY_OK; | ||
4435 | } | ||
4436 | |||
4437 | static int rcu_pm_notify(struct notifier_block *self, | 3900 | static int rcu_pm_notify(struct notifier_block *self, |
4438 | unsigned long action, void *hcpu) | 3901 | unsigned long action, void *hcpu) |
4439 | { | 3902 | { |
@@ -4745,10 +4208,10 @@ void __init rcu_init(void) | |||
4745 | * this is called early in boot, before either interrupts | 4208 | * this is called early in boot, before either interrupts |
4746 | * or the scheduler are operational. | 4209 | * or the scheduler are operational. |
4747 | */ | 4210 | */ |
4748 | cpu_notifier(rcu_cpu_notify, 0); | ||
4749 | pm_notifier(rcu_pm_notify, 0); | 4211 | pm_notifier(rcu_pm_notify, 0); |
4750 | for_each_online_cpu(cpu) | 4212 | for_each_online_cpu(cpu) |
4751 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 4213 | rcutree_prepare_cpu(cpu); |
4752 | } | 4214 | } |
4753 | 4215 | ||
4216 | #include "tree_exp.h" | ||
4754 | #include "tree_plugin.h" | 4217 | #include "tree_plugin.h" |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e3959f5e6ddf..f714f873bf9d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -254,6 +254,13 @@ struct rcu_node { | |||
254 | } ____cacheline_internodealigned_in_smp; | 254 | } ____cacheline_internodealigned_in_smp; |
255 | 255 | ||
256 | /* | 256 | /* |
257 | * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and | ||
258 | * are indexed relative to this interval rather than the global CPU ID space. | ||
259 | * This generates the bit for a CPU in node-local masks. | ||
260 | */ | ||
261 | #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) | ||
262 | |||
263 | /* | ||
257 | * Do a full breadth-first scan of the rcu_node structures for the | 264 | * Do a full breadth-first scan of the rcu_node structures for the |
258 | * specified rcu_state structure. | 265 | * specified rcu_state structure. |
259 | */ | 266 | */ |
@@ -281,6 +288,14 @@ struct rcu_node { | |||
281 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | 288 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) |
282 | 289 | ||
283 | /* | 290 | /* |
291 | * Iterate over all possible CPUs in a leaf RCU node. | ||
292 | */ | ||
293 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ | ||
294 | for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ | ||
295 | cpu <= rnp->grphi; \ | ||
296 | cpu = cpumask_next((cpu), cpu_possible_mask)) | ||
297 | |||
298 | /* | ||
284 | * Union to allow "aggregate OR" operation on the need for a quiescent | 299 | * Union to allow "aggregate OR" operation on the need for a quiescent |
285 | * state by the normal and expedited grace periods. | 300 | * state by the normal and expedited grace periods. |
286 | */ | 301 | */ |
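The two macros added to tree.h above -- leaf_node_cpu_bit() and for_each_leaf_node_possible_cpu() -- are what the tree.c hunks use to replace the open-coded "cpu = rnp->grplo; bit = 1; ... bit <<= 1" loops. A small illustrative use, with rnp standing for a leaf rcu_node as in the code above:

	unsigned long mask = 0;
	int cpu;

	/*
	 * Visit every possible CPU covered by this leaf node and build the
	 * node-local bitmask of the ones that are currently online.
	 */
	for_each_leaf_node_possible_cpu(rnp, cpu)
		if (cpu_online(cpu))
			mask |= leaf_node_cpu_bit(rnp, cpu);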
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h new file mode 100644 index 000000000000..6d86ab6ec2c9 --- /dev/null +++ b/kernel/rcu/tree_exp.h | |||
@@ -0,0 +1,655 @@ | |||
1 | /* | ||
2 | * RCU expedited grace periods | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, you can access it online at | ||
16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2016 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
21 | */ | ||
22 | |||
23 | /* Wrapper functions for expedited grace periods. */ | ||
24 | static void rcu_exp_gp_seq_start(struct rcu_state *rsp) | ||
25 | { | ||
26 | rcu_seq_start(&rsp->expedited_sequence); | ||
27 | } | ||
28 | static void rcu_exp_gp_seq_end(struct rcu_state *rsp) | ||
29 | { | ||
30 | rcu_seq_end(&rsp->expedited_sequence); | ||
31 | smp_mb(); /* Ensure that consecutive grace periods serialize. */ | ||
32 | } | ||
33 | static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) | ||
34 | { | ||
35 | unsigned long s; | ||
36 | |||
37 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | ||
38 | s = rcu_seq_snap(&rsp->expedited_sequence); | ||
39 | trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); | ||
40 | return s; | ||
41 | } | ||
42 | static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) | ||
43 | { | ||
44 | return rcu_seq_done(&rsp->expedited_sequence, s); | ||
45 | } | ||
46 | |||
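These four wrappers lean on the rcu_seq convention used by ->expedited_sequence: the counter is odd while an expedited grace period is in flight and even when idle, and a snapshot names the counter value at which the caller's request is fully covered. The user-space model below is this sketch's reading of that arithmetic (local names, plain >= instead of the kernel's wrap-safe ULONG_CMP_GE), not a copy of the kernel helpers:

    #include <stdio.h>

    /* Illustrative model of the expedited sequence counter: odd while a
     * grace period (GP) is in progress, even when idle.
     */
    static unsigned long seq;

    static void seq_start(void) { seq++; }   /* now odd: GP running */
    static void seq_end(void)   { seq++; }   /* now even: GP complete */

    /* Value the counter must reach before the caller is covered by a
     * full GP that began after the snapshot was taken.
     */
    static unsigned long seq_snap(void) { return (seq + 3) & ~0x1UL; }

    static int seq_done(unsigned long s) { return seq >= s; }

    int main(void)
    {
            unsigned long s;

            seq_start();            /* a GP is already in flight (seq = 1) */
            s = seq_snap();         /* 4: must outlast this GP and one more */
            seq_end();              /* seq = 2: not enough */
            printf("done after 1 GP? %d\n", seq_done(s));   /* 0 */
            seq_start();
            seq_end();              /* seq = 4: a full GP after the snapshot */
            printf("done after 2 GPs? %d\n", seq_done(s));  /* 1 */
            return 0;
    }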
47 | /* | ||
48 | * Reset the ->expmaskinit values in the rcu_node tree to reflect any | ||
49 | * recent CPU-online activity. Note that these masks are not cleared | ||
50 | * when CPUs go offline, so they reflect the union of all CPUs that have | ||
51 | * ever been online. This means that this function normally takes its | ||
52 | * no-work-to-do fastpath. | ||
53 | */ | ||
54 | static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) | ||
55 | { | ||
56 | bool done; | ||
57 | unsigned long flags; | ||
58 | unsigned long mask; | ||
59 | unsigned long oldmask; | ||
60 | int ncpus = READ_ONCE(rsp->ncpus); | ||
61 | struct rcu_node *rnp; | ||
62 | struct rcu_node *rnp_up; | ||
63 | |||
64 | /* If no new CPUs onlined since last time, nothing to do. */ | ||
65 | if (likely(ncpus == rsp->ncpus_snap)) | ||
66 | return; | ||
67 | rsp->ncpus_snap = ncpus; | ||
68 | |||
69 | /* | ||
70 | * Each pass through the following loop propagates newly onlined | ||
71 | * CPUs for the current rcu_node structure up the rcu_node tree. | ||
72 | */ | ||
73 | rcu_for_each_leaf_node(rsp, rnp) { | ||
74 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
75 | if (rnp->expmaskinit == rnp->expmaskinitnext) { | ||
76 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
77 | continue; /* No new CPUs, nothing to do. */ | ||
78 | } | ||
79 | |||
80 | /* Update this node's mask, track old value for propagation. */ | ||
81 | oldmask = rnp->expmaskinit; | ||
82 | rnp->expmaskinit = rnp->expmaskinitnext; | ||
83 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
84 | |||
85 | /* If was already nonzero, nothing to propagate. */ | ||
86 | if (oldmask) | ||
87 | continue; | ||
88 | |||
89 | /* Propagate the new CPU up the tree. */ | ||
90 | mask = rnp->grpmask; | ||
91 | rnp_up = rnp->parent; | ||
92 | done = false; | ||
93 | while (rnp_up) { | ||
94 | raw_spin_lock_irqsave_rcu_node(rnp_up, flags); | ||
95 | if (rnp_up->expmaskinit) | ||
96 | done = true; | ||
97 | rnp_up->expmaskinit |= mask; | ||
98 | raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); | ||
99 | if (done) | ||
100 | break; | ||
101 | mask = rnp_up->grpmask; | ||
102 | rnp_up = rnp_up->parent; | ||
103 | } | ||
104 | } | ||
105 | } | ||
106 | |||
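The propagation loop above stops as soon as it reaches an ancestor whose ->expmaskinit was already nonzero, because that ancestor's own bit must already be set all the way to the root. A toy two-level model of that early-terminating walk (node layout and bit values invented for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    struct node {
            struct node *parent;
            unsigned long maskinit;   /* stands in for ->expmaskinit */
            unsigned long grpmask;    /* this node's bit in its parent */
    };

    /* Set @cpubit in @leaf and push the leaf's presence toward the root,
     * stopping once an already-initialized ancestor has been updated.
     */
    static void propagate(struct node *leaf, unsigned long cpubit)
    {
            struct node *np = leaf;
            unsigned long mask = cpubit;
            bool done = false;

            while (np) {
                    if (np->maskinit)
                            done = true;   /* ancestors already know about us */
                    np->maskinit |= mask;
                    if (done)
                            break;
                    mask = np->grpmask;    /* our bit in the parent's mask */
                    np = np->parent;
            }
    }

    int main(void)
    {
            struct node root  = { .parent = NULL,  .grpmask = 0 };
            struct node leaf0 = { .parent = &root, .grpmask = 0x1 };
            struct node leaf1 = { .parent = &root, .grpmask = 0x2 };

            propagate(&leaf0, 0x4);   /* first CPU: walk reaches the root */
            propagate(&leaf1, 0x1);   /* different leaf: also reaches the root */
            propagate(&leaf0, 0x8);   /* second CPU on leaf0: stops at the leaf */
            printf("root=%#lx leaf0=%#lx leaf1=%#lx\n",
                   root.maskinit, leaf0.maskinit, leaf1.maskinit);
            /* prints root=0x3 leaf0=0xc leaf1=0x1 */
            return 0;
    }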
107 | /* | ||
108 | * Reset the ->expmask values in the rcu_node tree in preparation for | ||
109 | * a new expedited grace period. | ||
110 | */ | ||
111 | static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) | ||
112 | { | ||
113 | unsigned long flags; | ||
114 | struct rcu_node *rnp; | ||
115 | |||
116 | sync_exp_reset_tree_hotplug(rsp); | ||
117 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
118 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
119 | WARN_ON_ONCE(rnp->expmask); | ||
120 | rnp->expmask = rnp->expmaskinit; | ||
121 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
122 | } | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * Return non-zero if there is no RCU expedited grace period in progress | ||
127 | * for the specified rcu_node structure, in other words, if all CPUs and | ||
128 | * tasks covered by the specified rcu_node structure have done their bit | ||
129 | * for the current expedited grace period. Works only for preemptible | ||
130 | * RCU -- other RCU implementations use other means. | ||
131 | * | ||
132 | * Caller must hold the rcu_state's exp_mutex. | ||
133 | */ | ||
134 | static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | ||
135 | { | ||
136 | return rnp->exp_tasks == NULL && | ||
137 | READ_ONCE(rnp->expmask) == 0; | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * Report the exit from RCU read-side critical section for the last task | ||
142 | * that queued itself during or before the current expedited preemptible-RCU | ||
143 | * grace period. This event is reported either to the rcu_node structure on | ||
144 | * which the task was queued or to one of that rcu_node structure's ancestors, | ||
145 | * recursively up the tree. (Calm down, calm down, we do the recursion | ||
146 | * iteratively!) | ||
147 | * | ||
148 | * Caller must hold the rcu_state's exp_mutex and the specified rcu_node | ||
149 | * structure's ->lock. | ||
150 | */ | ||
151 | static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
152 | bool wake, unsigned long flags) | ||
153 | __releases(rnp->lock) | ||
154 | { | ||
155 | unsigned long mask; | ||
156 | |||
157 | for (;;) { | ||
158 | if (!sync_rcu_preempt_exp_done(rnp)) { | ||
159 | if (!rnp->expmask) | ||
160 | rcu_initiate_boost(rnp, flags); | ||
161 | else | ||
162 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
163 | break; | ||
164 | } | ||
165 | if (rnp->parent == NULL) { | ||
166 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
167 | if (wake) { | ||
168 | smp_mb(); /* EGP done before wake_up(). */ | ||
169 | swake_up(&rsp->expedited_wq); | ||
170 | } | ||
171 | break; | ||
172 | } | ||
173 | mask = rnp->grpmask; | ||
174 | raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ | ||
175 | rnp = rnp->parent; | ||
176 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ | ||
177 | WARN_ON_ONCE(!(rnp->expmask & mask)); | ||
178 | rnp->expmask &= ~mask; | ||
179 | } | ||
180 | } | ||
181 | |||
182 | /* | ||
183 | * Report expedited quiescent state for specified node. This is a | ||
184 | * lock-acquisition wrapper function for __rcu_report_exp_rnp(). | ||
185 | * | ||
186 | * Caller must hold the rcu_state's exp_mutex. | ||
187 | */ | ||
188 | static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, | ||
189 | struct rcu_node *rnp, bool wake) | ||
190 | { | ||
191 | unsigned long flags; | ||
192 | |||
193 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
194 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); | ||
195 | } | ||
196 | |||
197 | /* | ||
198 | * Report expedited quiescent state for multiple CPUs, all covered by the | ||
199 | * specified leaf rcu_node structure. Caller must hold the rcu_state's | ||
200 | * exp_mutex. | ||
201 | */ | ||
202 | static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, | ||
203 | unsigned long mask, bool wake) | ||
204 | { | ||
205 | unsigned long flags; | ||
206 | |||
207 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
208 | if (!(rnp->expmask & mask)) { | ||
209 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
210 | return; | ||
211 | } | ||
212 | rnp->expmask &= ~mask; | ||
213 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ | ||
214 | } | ||
215 | |||
216 | /* | ||
217 | * Report expedited quiescent state for specified rcu_data (CPU). | ||
218 | */ | ||
219 | static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, | ||
220 | bool wake) | ||
221 | { | ||
222 | rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); | ||
223 | } | ||
224 | |||
225 | /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ | ||
226 | static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, | ||
227 | unsigned long s) | ||
228 | { | ||
229 | if (rcu_exp_gp_seq_done(rsp, s)) { | ||
230 | trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); | ||
231 | /* Ensure test happens before caller kfree(). */ | ||
232 | smp_mb__before_atomic(); /* ^^^ */ | ||
233 | atomic_long_inc(stat); | ||
234 | return true; | ||
235 | } | ||
236 | return false; | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * Funnel-lock acquisition for expedited grace periods. Returns true | ||
241 | * if some other task completed an expedited grace period that this task | ||
242 | * can piggy-back on, and with no mutex held. Otherwise, returns false | ||
243 | * with the mutex held, indicating that the caller must actually do the | ||
244 | * expedited grace period. | ||
245 | */ | ||
246 | static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | ||
247 | { | ||
248 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | ||
249 | struct rcu_node *rnp = rdp->mynode; | ||
250 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
251 | |||
252 | /* Low-contention fastpath. */ | ||
253 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && | ||
254 | (rnp == rnp_root || | ||
255 | ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && | ||
256 | mutex_trylock(&rsp->exp_mutex)) | ||
257 | goto fastpath; | ||
258 | |||
259 | /* | ||
260 | * Each pass through the following loop works its way up | ||
261 | * the rcu_node tree, returning if others have done the work or | ||
262 | * otherwise falls through to acquire rsp->exp_mutex. The mapping | ||
263 | * from CPU to rcu_node structure can be inexact, as it is just | ||
264 | * promoting locality and is not strictly needed for correctness. | ||
265 | */ | ||
266 | for (; rnp != NULL; rnp = rnp->parent) { | ||
267 | if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) | ||
268 | return true; | ||
269 | |||
270 | /* Work not done, either wait here or go up. */ | ||
271 | spin_lock(&rnp->exp_lock); | ||
272 | if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { | ||
273 | |||
274 | /* Someone else doing GP, so wait for them. */ | ||
275 | spin_unlock(&rnp->exp_lock); | ||
276 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, | ||
277 | rnp->grplo, rnp->grphi, | ||
278 | TPS("wait")); | ||
279 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | ||
280 | sync_exp_work_done(rsp, | ||
281 | &rdp->exp_workdone2, s)); | ||
282 | return true; | ||
283 | } | ||
284 | rnp->exp_seq_rq = s; /* Followers can wait on us. */ | ||
285 | spin_unlock(&rnp->exp_lock); | ||
286 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, | ||
287 | rnp->grphi, TPS("nxtlvl")); | ||
288 | } | ||
289 | mutex_lock(&rsp->exp_mutex); | ||
290 | fastpath: | ||
291 | if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { | ||
292 | mutex_unlock(&rsp->exp_mutex); | ||
293 | return true; | ||
294 | } | ||
295 | rcu_exp_gp_seq_start(rsp); | ||
296 | trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); | ||
297 | return false; | ||
298 | } | ||
299 | |||
300 | /* Invoked on each online non-idle CPU for expedited quiescent state. */ | ||
301 | static void sync_sched_exp_handler(void *data) | ||
302 | { | ||
303 | struct rcu_data *rdp; | ||
304 | struct rcu_node *rnp; | ||
305 | struct rcu_state *rsp = data; | ||
306 | |||
307 | rdp = this_cpu_ptr(rsp->rda); | ||
308 | rnp = rdp->mynode; | ||
309 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || | ||
310 | __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) | ||
311 | return; | ||
312 | if (rcu_is_cpu_rrupt_from_idle()) { | ||
313 | rcu_report_exp_rdp(&rcu_sched_state, | ||
314 | this_cpu_ptr(&rcu_sched_data), true); | ||
315 | return; | ||
316 | } | ||
317 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); | ||
318 | resched_cpu(smp_processor_id()); | ||
319 | } | ||
320 | |||
321 | /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ | ||
322 | static void sync_sched_exp_online_cleanup(int cpu) | ||
323 | { | ||
324 | struct rcu_data *rdp; | ||
325 | int ret; | ||
326 | struct rcu_node *rnp; | ||
327 | struct rcu_state *rsp = &rcu_sched_state; | ||
328 | |||
329 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
330 | rnp = rdp->mynode; | ||
331 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) | ||
332 | return; | ||
333 | ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); | ||
334 | WARN_ON_ONCE(ret); | ||
335 | } | ||
336 | |||
337 | /* | ||
338 | * Select the nodes that the upcoming expedited grace period needs | ||
339 | * to wait for. | ||
340 | */ | ||
341 | static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | ||
342 | smp_call_func_t func) | ||
343 | { | ||
344 | int cpu; | ||
345 | unsigned long flags; | ||
346 | unsigned long mask_ofl_test; | ||
347 | unsigned long mask_ofl_ipi; | ||
348 | int ret; | ||
349 | struct rcu_node *rnp; | ||
350 | |||
351 | sync_exp_reset_tree(rsp); | ||
352 | rcu_for_each_leaf_node(rsp, rnp) { | ||
353 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
354 | |||
355 | /* Each pass checks a CPU for identity, offline, and idle. */ | ||
356 | mask_ofl_test = 0; | ||
357 | for_each_leaf_node_possible_cpu(rnp, cpu) { | ||
358 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
359 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
360 | |||
361 | if (raw_smp_processor_id() == cpu || | ||
362 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | ||
363 | mask_ofl_test |= rdp->grpmask; | ||
364 | } | ||
365 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; | ||
366 | |||
367 | /* | ||
368 | * Need to wait for any blocked tasks as well. Note that | ||
369 | * additional blocking tasks will also block the expedited | ||
370 | * GP until such time as the ->expmask bits are cleared. | ||
371 | */ | ||
372 | if (rcu_preempt_has_tasks(rnp)) | ||
373 | rnp->exp_tasks = rnp->blkd_tasks.next; | ||
374 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
375 | |||
376 | /* IPI the remaining CPUs for expedited quiescent state. */ | ||
377 | for_each_leaf_node_possible_cpu(rnp, cpu) { | ||
378 | unsigned long mask = leaf_node_cpu_bit(rnp, cpu); | ||
379 | if (!(mask_ofl_ipi & mask)) | ||
380 | continue; | ||
381 | retry_ipi: | ||
382 | ret = smp_call_function_single(cpu, func, rsp, 0); | ||
383 | if (!ret) { | ||
384 | mask_ofl_ipi &= ~mask; | ||
385 | continue; | ||
386 | } | ||
387 | /* Failed, raced with offline. */ | ||
388 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
389 | if (cpu_online(cpu) && | ||
390 | (rnp->expmask & mask)) { | ||
391 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
392 | schedule_timeout_uninterruptible(1); | ||
393 | if (cpu_online(cpu) && | ||
394 | (rnp->expmask & mask)) | ||
395 | goto retry_ipi; | ||
396 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
397 | } | ||
398 | if (!(rnp->expmask & mask)) | ||
399 | mask_ofl_ipi &= ~mask; | ||
400 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
401 | } | ||
402 | /* Report quiescent states for those that went offline. */ | ||
403 | mask_ofl_test |= mask_ofl_ipi; | ||
404 | if (mask_ofl_test) | ||
405 | rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); | ||
406 | } | ||
407 | } | ||
408 | |||
409 | static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | ||
410 | { | ||
411 | int cpu; | ||
412 | unsigned long jiffies_stall; | ||
413 | unsigned long jiffies_start; | ||
414 | unsigned long mask; | ||
415 | int ndetected; | ||
416 | struct rcu_node *rnp; | ||
417 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
418 | int ret; | ||
419 | |||
420 | jiffies_stall = rcu_jiffies_till_stall_check(); | ||
421 | jiffies_start = jiffies; | ||
422 | |||
423 | for (;;) { | ||
424 | ret = swait_event_timeout( | ||
425 | rsp->expedited_wq, | ||
426 | sync_rcu_preempt_exp_done(rnp_root), | ||
427 | jiffies_stall); | ||
428 | if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) | ||
429 | return; | ||
430 | if (ret < 0) { | ||
431 | /* Hit a signal, disable CPU stall warnings. */ | ||
432 | swait_event(rsp->expedited_wq, | ||
433 | sync_rcu_preempt_exp_done(rnp_root)); | ||
434 | return; | ||
435 | } | ||
436 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", | ||
437 | rsp->name); | ||
438 | ndetected = 0; | ||
439 | rcu_for_each_leaf_node(rsp, rnp) { | ||
440 | ndetected += rcu_print_task_exp_stall(rnp); | ||
441 | for_each_leaf_node_possible_cpu(rnp, cpu) { | ||
442 | struct rcu_data *rdp; | ||
443 | |||
444 | mask = leaf_node_cpu_bit(rnp, cpu); | ||
445 | if (!(rnp->expmask & mask)) | ||
446 | continue; | ||
447 | ndetected++; | ||
448 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
449 | pr_cont(" %d-%c%c%c", cpu, | ||
450 | "O."[!!cpu_online(cpu)], | ||
451 | "o."[!!(rdp->grpmask & rnp->expmaskinit)], | ||
452 | "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); | ||
453 | } | ||
454 | } | ||
455 | pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", | ||
456 | jiffies - jiffies_start, rsp->expedited_sequence, | ||
457 | rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); | ||
458 | if (ndetected) { | ||
459 | pr_err("blocking rcu_node structures:"); | ||
460 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
461 | if (rnp == rnp_root) | ||
462 | continue; /* printed unconditionally */ | ||
463 | if (sync_rcu_preempt_exp_done(rnp)) | ||
464 | continue; | ||
465 | pr_cont(" l=%u:%d-%d:%#lx/%c", | ||
466 | rnp->level, rnp->grplo, rnp->grphi, | ||
467 | rnp->expmask, | ||
468 | ".T"[!!rnp->exp_tasks]); | ||
469 | } | ||
470 | pr_cont("\n"); | ||
471 | } | ||
472 | rcu_for_each_leaf_node(rsp, rnp) { | ||
473 | for_each_leaf_node_possible_cpu(rnp, cpu) { | ||
474 | mask = leaf_node_cpu_bit(rnp, cpu); | ||
475 | if (!(rnp->expmask & mask)) | ||
476 | continue; | ||
477 | dump_cpu_task(cpu); | ||
478 | } | ||
479 | } | ||
480 | jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; | ||
481 | } | ||
482 | } | ||
483 | |||
484 | /* | ||
485 | * Wait for the current expedited grace period to complete, and then | ||
486 | * wake up everyone who piggybacked on the just-completed expedited | ||
487 | * grace period. Also update all the ->exp_seq_rq counters as needed | ||
488 | * in order to avoid counter-wrap problems. | ||
489 | */ | ||
490 | static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | ||
491 | { | ||
492 | struct rcu_node *rnp; | ||
493 | |||
494 | synchronize_sched_expedited_wait(rsp); | ||
495 | rcu_exp_gp_seq_end(rsp); | ||
496 | trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); | ||
497 | |||
498 | /* | ||
499 | * Switch over to wakeup mode, allowing the next GP, but -only- the | ||
500 | * next GP, to proceed. | ||
501 | */ | ||
502 | mutex_lock(&rsp->exp_wake_mutex); | ||
503 | mutex_unlock(&rsp->exp_mutex); | ||
504 | |||
505 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
506 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { | ||
507 | spin_lock(&rnp->exp_lock); | ||
508 | /* Recheck, avoid hang in case someone just arrived. */ | ||
509 | if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) | ||
510 | rnp->exp_seq_rq = s; | ||
511 | spin_unlock(&rnp->exp_lock); | ||
512 | } | ||
513 | wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); | ||
514 | } | ||
515 | trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); | ||
516 | mutex_unlock(&rsp->exp_wake_mutex); | ||
517 | } | ||
518 | |||
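Both exp_funnel_lock() and rcu_exp_wait_wake() index into rnp->exp_wq[] with (s >> 1) & 0x3: dropping the in-progress bit leaves a grace-period count, and folding it into four slots keeps waiters for consecutive grace periods on distinct wait queues. A quick arithmetic illustration (standalone, nothing kernel-specific):

    #include <stdio.h>

    int main(void)
    {
            unsigned long s;

            /* Snapshots handed back by rcu_exp_gp_seq_snap() are even:
             * bit 0 only flags a grace period in progress.  Shifting it
             * away and masking with 0x3 spreads successive grace periods
             * over four wait queues, so a wakeup for one grace period
             * cannot be confused with one for the next three.
             */
            for (s = 2; s <= 10; s += 2)
                    printf("snapshot %2lu -> exp_wq[%lu]\n", s, (s >> 1) & 0x3);
            return 0;
    }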
519 | /** | ||
520 | * synchronize_sched_expedited - Brute-force RCU-sched grace period | ||
521 | * | ||
522 | * Wait for an RCU-sched grace period to elapse, but use a "big hammer" | ||
523 | * approach to force the grace period to end quickly. This consumes | ||
524 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
525 | * so is thus not recommended for any sort of common-case code. In fact, | ||
526 | * if you are using synchronize_sched_expedited() in a loop, please | ||
527 | * restructure your code to batch your updates, and then use a single | ||
528 | * synchronize_sched() instead. | ||
529 | * | ||
530 | * This implementation can be thought of as an application of sequence | ||
531 | * locking to expedited grace periods, but using the sequence counter to | ||
532 | * determine when someone else has already done the work instead of for | ||
533 | * retrying readers. | ||
534 | */ | ||
535 | void synchronize_sched_expedited(void) | ||
536 | { | ||
537 | unsigned long s; | ||
538 | struct rcu_state *rsp = &rcu_sched_state; | ||
539 | |||
540 | /* If only one CPU, this is automatically a grace period. */ | ||
541 | if (rcu_blocking_is_gp()) | ||
542 | return; | ||
543 | |||
544 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
545 | if (rcu_gp_is_normal()) { | ||
546 | wait_rcu_gp(call_rcu_sched); | ||
547 | return; | ||
548 | } | ||
549 | |||
550 | /* Take a snapshot of the sequence number. */ | ||
551 | s = rcu_exp_gp_seq_snap(rsp); | ||
552 | if (exp_funnel_lock(rsp, s)) | ||
553 | return; /* Someone else did our work for us. */ | ||
554 | |||
555 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
556 | sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); | ||
557 | |||
558 | /* Wait and clean up, including waking everyone. */ | ||
559 | rcu_exp_wait_wake(rsp, s); | ||
560 | } | ||
561 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
562 | |||
563 | #ifdef CONFIG_PREEMPT_RCU | ||
564 | |||
565 | /* | ||
566 | * Remote handler for smp_call_function_single(). If there is an | ||
567 | * RCU read-side critical section in effect, request that the | ||
568 | * next rcu_read_unlock() record the quiescent state up the | ||
569 | * ->expmask fields in the rcu_node tree. Otherwise, immediately | ||
570 | * report the quiescent state. | ||
571 | */ | ||
572 | static void sync_rcu_exp_handler(void *info) | ||
573 | { | ||
574 | struct rcu_data *rdp; | ||
575 | struct rcu_state *rsp = info; | ||
576 | struct task_struct *t = current; | ||
577 | |||
578 | /* | ||
579 | * Within an RCU read-side critical section, request that the next | ||
580 | * rcu_read_unlock() report. Unless this RCU read-side critical | ||
581 | * section has already blocked, in which case it is already set | ||
582 | * up for the expedited grace period to wait on it. | ||
583 | */ | ||
584 | if (t->rcu_read_lock_nesting > 0 && | ||
585 | !t->rcu_read_unlock_special.b.blocked) { | ||
586 | t->rcu_read_unlock_special.b.exp_need_qs = true; | ||
587 | return; | ||
588 | } | ||
589 | |||
590 | /* | ||
591 | * We are either exiting an RCU read-side critical section (negative | ||
592 | * values of t->rcu_read_lock_nesting) or are not in one at all | ||
593 | * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU | ||
594 | * read-side critical section that blocked before this expedited | ||
595 | * grace period started. Either way, we can immediately report | ||
596 | * the quiescent state. | ||
597 | */ | ||
598 | rdp = this_cpu_ptr(rsp->rda); | ||
599 | rcu_report_exp_rdp(rsp, rdp, true); | ||
600 | } | ||
601 | |||
602 | /** | ||
603 | * synchronize_rcu_expedited - Brute-force RCU grace period | ||
604 | * | ||
605 | * Wait for an RCU-preempt grace period, but expedite it. The basic | ||
606 | * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler | ||
607 | * checks whether the CPU is in an RCU-preempt critical section, and | ||
608 | * if so, it sets a flag that causes the outermost rcu_read_unlock() | ||
609 | * to report the quiescent state. On the other hand, if the CPU is | ||
610 | * not in an RCU read-side critical section, the IPI handler reports | ||
611 | * the quiescent state immediately. | ||
612 | * | ||
613 | * Although this is a great improvement over previous expedited | ||
614 | * implementations, it is still unfriendly to real-time workloads, so is | ||
615 | * thus not recommended for any sort of common-case code. In fact, if | ||
616 | * you are using synchronize_rcu_expedited() in a loop, please restructure | ||
617 | * your code to batch your updates, and then use a single synchronize_rcu() | ||
618 | * instead. | ||
619 | */ | ||
620 | void synchronize_rcu_expedited(void) | ||
621 | { | ||
622 | struct rcu_state *rsp = rcu_state_p; | ||
623 | unsigned long s; | ||
624 | |||
625 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
626 | if (rcu_gp_is_normal()) { | ||
627 | wait_rcu_gp(call_rcu); | ||
628 | return; | ||
629 | } | ||
630 | |||
631 | s = rcu_exp_gp_seq_snap(rsp); | ||
632 | if (exp_funnel_lock(rsp, s)) | ||
633 | return; /* Someone else did our work for us. */ | ||
634 | |||
635 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
636 | sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); | ||
637 | |||
638 | /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ | ||
639 | rcu_exp_wait_wake(rsp, s); | ||
640 | } | ||
641 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
642 | |||
643 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
644 | |||
645 | /* | ||
646 | * Wait for an rcu-preempt grace period, but make it happen quickly. | ||
647 | * But because preemptible RCU does not exist, map to rcu-sched. | ||
648 | */ | ||
649 | void synchronize_rcu_expedited(void) | ||
650 | { | ||
651 | synchronize_sched_expedited(); | ||
652 | } | ||
653 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
654 | |||
655 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ff1cd4e1188d..0082fce402a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void) | |||
79 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); | 79 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); |
80 | if (IS_ENABLED(CONFIG_PROVE_RCU)) | 80 | if (IS_ENABLED(CONFIG_PROVE_RCU)) |
81 | pr_info("\tRCU lockdep checking is enabled.\n"); | 81 | pr_info("\tRCU lockdep checking is enabled.\n"); |
82 | if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) | ||
83 | pr_info("\tRCU torture testing starts during boot.\n"); | ||
84 | if (RCU_NUM_LVLS >= 4) | 82 | if (RCU_NUM_LVLS >= 4) |
85 | pr_info("\tFour(or more)-level hierarchy is enabled.\n"); | 83 | pr_info("\tFour(or more)-level hierarchy is enabled.\n"); |
86 | if (RCU_FANOUT_LEAF != 16) | 84 | if (RCU_FANOUT_LEAF != 16) |
@@ -681,84 +679,6 @@ void synchronize_rcu(void) | |||
681 | } | 679 | } |
682 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 680 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
683 | 681 | ||
684 | /* | ||
685 | * Remote handler for smp_call_function_single(). If there is an | ||
686 | * RCU read-side critical section in effect, request that the | ||
687 | * next rcu_read_unlock() record the quiescent state up the | ||
688 | * ->expmask fields in the rcu_node tree. Otherwise, immediately | ||
689 | * report the quiescent state. | ||
690 | */ | ||
691 | static void sync_rcu_exp_handler(void *info) | ||
692 | { | ||
693 | struct rcu_data *rdp; | ||
694 | struct rcu_state *rsp = info; | ||
695 | struct task_struct *t = current; | ||
696 | |||
697 | /* | ||
698 | * Within an RCU read-side critical section, request that the next | ||
699 | * rcu_read_unlock() report. Unless this RCU read-side critical | ||
700 | * section has already blocked, in which case it is already set | ||
701 | * up for the expedited grace period to wait on it. | ||
702 | */ | ||
703 | if (t->rcu_read_lock_nesting > 0 && | ||
704 | !t->rcu_read_unlock_special.b.blocked) { | ||
705 | t->rcu_read_unlock_special.b.exp_need_qs = true; | ||
706 | return; | ||
707 | } | ||
708 | |||
709 | /* | ||
710 | * We are either exiting an RCU read-side critical section (negative | ||
711 | * values of t->rcu_read_lock_nesting) or are not in one at all | ||
712 | * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU | ||
713 | * read-side critical section that blocked before this expedited | ||
714 | * grace period started. Either way, we can immediately report | ||
715 | * the quiescent state. | ||
716 | */ | ||
717 | rdp = this_cpu_ptr(rsp->rda); | ||
718 | rcu_report_exp_rdp(rsp, rdp, true); | ||
719 | } | ||
720 | |||
721 | /** | ||
722 | * synchronize_rcu_expedited - Brute-force RCU grace period | ||
723 | * | ||
724 | * Wait for an RCU-preempt grace period, but expedite it. The basic | ||
725 | * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler | ||
726 | * checks whether the CPU is in an RCU-preempt critical section, and | ||
727 | * if so, it sets a flag that causes the outermost rcu_read_unlock() | ||
728 | * to report the quiescent state. On the other hand, if the CPU is | ||
729 | * not in an RCU read-side critical section, the IPI handler reports | ||
730 | * the quiescent state immediately. | ||
731 | * | ||
732 | * Although this is a greate improvement over previous expedited | ||
733 | * implementations, it is still unfriendly to real-time workloads, so is | ||
734 | * thus not recommended for any sort of common-case code. In fact, if | ||
735 | * you are using synchronize_rcu_expedited() in a loop, please restructure | ||
736 | * your code to batch your updates, and then Use a single synchronize_rcu() | ||
737 | * instead. | ||
738 | */ | ||
739 | void synchronize_rcu_expedited(void) | ||
740 | { | ||
741 | struct rcu_state *rsp = rcu_state_p; | ||
742 | unsigned long s; | ||
743 | |||
744 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
745 | if (rcu_gp_is_normal()) { | ||
746 | wait_rcu_gp(call_rcu); | ||
747 | return; | ||
748 | } | ||
749 | |||
750 | s = rcu_exp_gp_seq_snap(rsp); | ||
751 | if (exp_funnel_lock(rsp, s)) | ||
752 | return; /* Someone else did our work for us. */ | ||
753 | |||
754 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
755 | sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); | ||
756 | |||
757 | /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ | ||
758 | rcu_exp_wait_wake(rsp, s); | ||
759 | } | ||
760 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
761 | |||
762 | /** | 682 | /** |
763 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. | 683 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. |
764 | * | 684 | * |
@@ -883,16 +803,6 @@ static void rcu_preempt_check_callbacks(void) | |||
883 | } | 803 | } |
884 | 804 | ||
885 | /* | 805 | /* |
886 | * Wait for an rcu-preempt grace period, but make it happen quickly. | ||
887 | * But because preemptible RCU does not exist, map to rcu-sched. | ||
888 | */ | ||
889 | void synchronize_rcu_expedited(void) | ||
890 | { | ||
891 | synchronize_sched_expedited(); | ||
892 | } | ||
893 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
894 | |||
895 | /* | ||
896 | * Because preemptible RCU does not exist, rcu_barrier() is just | 806 | * Because preemptible RCU does not exist, rcu_barrier() is just |
897 | * another name for rcu_barrier_sched(). | 807 | * another name for rcu_barrier_sched(). |
898 | */ | 808 | */ |
@@ -1254,8 +1164,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |||
1254 | return; | 1164 | return; |
1255 | if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) | 1165 | if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) |
1256 | return; | 1166 | return; |
1257 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | 1167 | for_each_leaf_node_possible_cpu(rnp, cpu) |
1258 | if ((mask & 0x1) && cpu != outgoingcpu) | 1168 | if ((mask & leaf_node_cpu_bit(rnp, cpu)) && |
1169 | cpu != outgoingcpu) | ||
1259 | cpumask_set_cpu(cpu, cm); | 1170 | cpumask_set_cpu(cpu, cm); |
1260 | if (cpumask_weight(cm) == 0) | 1171 | if (cpumask_weight(cm) == 0) |
1261 | cpumask_setall(cm); | 1172 | cpumask_setall(cm); |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3e888cd5a594..f0d8322bc3ec 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; | |||
528 | module_param(rcu_task_stall_timeout, int, 0644); | 528 | module_param(rcu_task_stall_timeout, int, 0644); |
529 | 529 | ||
530 | static void rcu_spawn_tasks_kthread(void); | 530 | static void rcu_spawn_tasks_kthread(void); |
531 | static struct task_struct *rcu_tasks_kthread_ptr; | ||
531 | 532 | ||
532 | /* | 533 | /* |
533 | * Post an RCU-tasks callback. First call must be from process context | 534 | * Post an RCU-tasks callback. First call must be from process context |
@@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) | |||
537 | { | 538 | { |
538 | unsigned long flags; | 539 | unsigned long flags; |
539 | bool needwake; | 540 | bool needwake; |
541 | bool havetask = READ_ONCE(rcu_tasks_kthread_ptr); | ||
540 | 542 | ||
541 | rhp->next = NULL; | 543 | rhp->next = NULL; |
542 | rhp->func = func; | 544 | rhp->func = func; |
@@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) | |||
545 | *rcu_tasks_cbs_tail = rhp; | 547 | *rcu_tasks_cbs_tail = rhp; |
546 | rcu_tasks_cbs_tail = &rhp->next; | 548 | rcu_tasks_cbs_tail = &rhp->next; |
547 | raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); | 549 | raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); |
548 | if (needwake) { | 550 | /* We can't create the thread unless interrupts are enabled. */ |
551 | if ((needwake && havetask) || | ||
552 | (!havetask && !irqs_disabled_flags(flags))) { | ||
549 | rcu_spawn_tasks_kthread(); | 553 | rcu_spawn_tasks_kthread(); |
550 | wake_up(&rcu_tasks_cbs_wq); | 554 | wake_up(&rcu_tasks_cbs_wq); |
551 | } | 555 | } |
@@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg) | |||
790 | static void rcu_spawn_tasks_kthread(void) | 794 | static void rcu_spawn_tasks_kthread(void) |
791 | { | 795 | { |
792 | static DEFINE_MUTEX(rcu_tasks_kthread_mutex); | 796 | static DEFINE_MUTEX(rcu_tasks_kthread_mutex); |
793 | static struct task_struct *rcu_tasks_kthread_ptr; | ||
794 | struct task_struct *t; | 797 | struct task_struct *t; |
795 | 798 | ||
796 | if (READ_ONCE(rcu_tasks_kthread_ptr)) { | 799 | if (READ_ONCE(rcu_tasks_kthread_ptr)) { |
diff --git a/kernel/relay.c b/kernel/relay.c index 074994bcfa9b..04d7cf3ef8cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -614,6 +614,7 @@ free_bufs: | |||
614 | 614 | ||
615 | kref_put(&chan->kref, relay_destroy_channel); | 615 | kref_put(&chan->kref, relay_destroy_channel); |
616 | mutex_unlock(&relay_channels_mutex); | 616 | mutex_unlock(&relay_channels_mutex); |
617 | kfree(chan); | ||
617 | return NULL; | 618 | return NULL; |
618 | } | 619 | } |
619 | EXPORT_SYMBOL_GPL(relay_open); | 620 | EXPORT_SYMBOL_GPL(relay_open); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f2cae4620c7..5c883fe8e440 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1536 | for (;;) { | 1536 | for (;;) { |
1537 | /* Any allowed, online CPU? */ | 1537 | /* Any allowed, online CPU? */ |
1538 | for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { | 1538 | for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { |
1539 | if (!cpu_active(dest_cpu)) | 1539 | if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) |
1540 | continue; | ||
1541 | if (!cpu_online(dest_cpu)) | ||
1540 | continue; | 1542 | continue; |
1541 | goto out; | 1543 | goto out; |
1542 | } | 1544 | } |
@@ -1935,7 +1937,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
1935 | * chain to provide order. Instead we do: | 1937 | * chain to provide order. Instead we do: |
1936 | * | 1938 | * |
1937 | * 1) smp_store_release(X->on_cpu, 0) | 1939 | * 1) smp_store_release(X->on_cpu, 0) |
1938 | * 2) smp_cond_acquire(!X->on_cpu) | 1940 | * 2) smp_cond_load_acquire(!X->on_cpu) |
1939 | * | 1941 | * |
1940 | * Example: | 1942 | * Example: |
1941 | * | 1943 | * |
@@ -1946,7 +1948,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
1946 | * sched-out X | 1948 | * sched-out X |
1947 | * smp_store_release(X->on_cpu, 0); | 1949 | * smp_store_release(X->on_cpu, 0); |
1948 | * | 1950 | * |
1949 | * smp_cond_acquire(!X->on_cpu); | 1951 | * smp_cond_load_acquire(&X->on_cpu, !VAL); |
1950 | * X->state = WAKING | 1952 | * X->state = WAKING |
1951 | * set_task_cpu(X,2) | 1953 | * set_task_cpu(X,2) |
1952 | * | 1954 | * |
@@ -1972,7 +1974,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
1972 | * This means that any means of doing remote wakeups must order the CPU doing | 1974 | * This means that any means of doing remote wakeups must order the CPU doing |
1973 | * the wakeup against the CPU the task is going to end up running on. This, | 1975 | * the wakeup against the CPU the task is going to end up running on. This, |
1974 | * however, is already required for the regular Program-Order guarantee above, | 1976 | * however, is already required for the regular Program-Order guarantee above, |
1975 | * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). | 1977 | * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). |
1976 | * | 1978 | * |
1977 | */ | 1979 | */ |
1978 | 1980 | ||
@@ -2045,7 +2047,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2045 | * This ensures that tasks getting woken will be fully ordered against | 2047 | * This ensures that tasks getting woken will be fully ordered against |
2046 | * their previous state and preserve Program Order. | 2048 | * their previous state and preserve Program Order. |
2047 | */ | 2049 | */ |
2048 | smp_cond_acquire(!p->on_cpu); | 2050 | smp_cond_load_acquire(&p->on_cpu, !VAL); |
2049 | 2051 | ||
2050 | p->sched_contributes_to_load = !!task_contributes_to_load(p); | 2052 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2051 | p->state = TASK_WAKING; | 2053 | p->state = TASK_WAKING; |
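The primitive renamed in this hunk, smp_cond_load_acquire(), spins on a location until a condition on its value holds and then provides ACQUIRE ordering, pairing with the smp_store_release() on the scheduling-out CPU. A user-space analogue with C11 atomics (only the ordering skeleton, without the kernel primitive's waiting optimizations; names are local to this sketch):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int on_cpu = 1;   /* stand-in for p->on_cpu */
    static int task_state;          /* protected by the release/acquire pair */

    static void *prev_cpu(void *arg)
    {
            task_state = 42;                 /* work done during sched-out */
            atomic_store_explicit(&on_cpu, 0, memory_order_release);
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, prev_cpu, NULL);

            /* smp_cond_load_acquire(&p->on_cpu, !VAL) boils down to this:
             * spin until the condition holds, with ACQUIRE ordering so
             * every write made before the store-release is visible here.
             */
            while (atomic_load_explicit(&on_cpu, memory_order_acquire))
                    ;

            printf("task_state = %d\n", task_state);   /* always 42 */
            pthread_join(t, NULL);
            return 0;
    }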
@@ -2253,9 +2255,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, | |||
2253 | #endif | 2255 | #endif |
2254 | #endif | 2256 | #endif |
2255 | 2257 | ||
2258 | #ifdef CONFIG_SCHEDSTATS | ||
2259 | |||
2256 | DEFINE_STATIC_KEY_FALSE(sched_schedstats); | 2260 | DEFINE_STATIC_KEY_FALSE(sched_schedstats); |
2261 | static bool __initdata __sched_schedstats = false; | ||
2257 | 2262 | ||
2258 | #ifdef CONFIG_SCHEDSTATS | ||
2259 | static void set_schedstats(bool enabled) | 2263 | static void set_schedstats(bool enabled) |
2260 | { | 2264 | { |
2261 | if (enabled) | 2265 | if (enabled) |
@@ -2278,11 +2282,16 @@ static int __init setup_schedstats(char *str) | |||
2278 | if (!str) | 2282 | if (!str) |
2279 | goto out; | 2283 | goto out; |
2280 | 2284 | ||
2285 | /* | ||
2286 | * This code is called before jump labels have been set up, so we can't | ||
2287 | * change the static branch directly just yet. Instead set a temporary | ||
2288 | * variable so init_schedstats() can do it later. | ||
2289 | */ | ||
2281 | if (!strcmp(str, "enable")) { | 2290 | if (!strcmp(str, "enable")) { |
2282 | set_schedstats(true); | 2291 | __sched_schedstats = true; |
2283 | ret = 1; | 2292 | ret = 1; |
2284 | } else if (!strcmp(str, "disable")) { | 2293 | } else if (!strcmp(str, "disable")) { |
2285 | set_schedstats(false); | 2294 | __sched_schedstats = false; |
2286 | ret = 1; | 2295 | ret = 1; |
2287 | } | 2296 | } |
2288 | out: | 2297 | out: |
@@ -2293,6 +2302,11 @@ out: | |||
2293 | } | 2302 | } |
2294 | __setup("schedstats=", setup_schedstats); | 2303 | __setup("schedstats=", setup_schedstats); |
2295 | 2304 | ||
2305 | static void __init init_schedstats(void) | ||
2306 | { | ||
2307 | set_schedstats(__sched_schedstats); | ||
2308 | } | ||
2309 | |||
2296 | #ifdef CONFIG_PROC_SYSCTL | 2310 | #ifdef CONFIG_PROC_SYSCTL |
2297 | int sysctl_schedstats(struct ctl_table *table, int write, | 2311 | int sysctl_schedstats(struct ctl_table *table, int write, |
2298 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2312 | void __user *buffer, size_t *lenp, loff_t *ppos) |
@@ -2313,8 +2327,10 @@ int sysctl_schedstats(struct ctl_table *table, int write, | |||
2313 | set_schedstats(state); | 2327 | set_schedstats(state); |
2314 | return err; | 2328 | return err; |
2315 | } | 2329 | } |
2316 | #endif | 2330 | #endif /* CONFIG_PROC_SYSCTL */ |
2317 | #endif | 2331 | #else /* !CONFIG_SCHEDSTATS */ |
2332 | static inline void init_schedstats(void) {} | ||
2333 | #endif /* CONFIG_SCHEDSTATS */ | ||
2318 | 2334 | ||
2319 | /* | 2335 | /* |
2320 | * fork()/clone()-time setup: | 2336 | * fork()/clone()-time setup: |
@@ -2326,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2326 | 2342 | ||
2327 | __sched_fork(clone_flags, p); | 2343 | __sched_fork(clone_flags, p); |
2328 | /* | 2344 | /* |
2329 | * We mark the process as running here. This guarantees that | 2345 | * We mark the process as NEW here. This guarantees that |
2330 | * nobody will actually run it, and a signal or other external | 2346 | * nobody will actually run it, and a signal or other external |
2331 | * event cannot wake it up and insert it on the runqueue either. | 2347 | * event cannot wake it up and insert it on the runqueue either. |
2332 | */ | 2348 | */ |
2333 | p->state = TASK_RUNNING; | 2349 | p->state = TASK_NEW; |
2334 | 2350 | ||
2335 | /* | 2351 | /* |
2336 | * Make sure we do not leak PI boosting priority to the child. | 2352 | * Make sure we do not leak PI boosting priority to the child. |
@@ -2367,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2367 | p->sched_class = &fair_sched_class; | 2383 | p->sched_class = &fair_sched_class; |
2368 | } | 2384 | } |
2369 | 2385 | ||
2370 | if (p->sched_class->task_fork) | 2386 | init_entity_runnable_average(&p->se); |
2371 | p->sched_class->task_fork(p); | ||
2372 | 2387 | ||
2373 | /* | 2388 | /* |
2374 | * The child is not yet in the pid-hash so no cgroup attach races, | 2389 | * The child is not yet in the pid-hash so no cgroup attach races, |
@@ -2378,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2378 | * Silence PROVE_RCU. | 2393 | * Silence PROVE_RCU. |
2379 | */ | 2394 | */ |
2380 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2395 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2381 | set_task_cpu(p, cpu); | 2396 | /* |
2397 | * We're setting the cpu for the first time, we don't migrate, | ||
2398 | * so use __set_task_cpu(). | ||
2399 | */ | ||
2400 | __set_task_cpu(p, cpu); | ||
2401 | if (p->sched_class->task_fork) | ||
2402 | p->sched_class->task_fork(p); | ||
2382 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2403 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2383 | 2404 | ||
2384 | #ifdef CONFIG_SCHED_INFO | 2405 | #ifdef CONFIG_SCHED_INFO |
@@ -2510,21 +2531,22 @@ void wake_up_new_task(struct task_struct *p) | |||
2510 | struct rq_flags rf; | 2531 | struct rq_flags rf; |
2511 | struct rq *rq; | 2532 | struct rq *rq; |
2512 | 2533 | ||
2513 | /* Initialize new task's runnable average */ | ||
2514 | init_entity_runnable_average(&p->se); | ||
2515 | raw_spin_lock_irqsave(&p->pi_lock, rf.flags); | 2534 | raw_spin_lock_irqsave(&p->pi_lock, rf.flags); |
2535 | p->state = TASK_RUNNING; | ||
2516 | #ifdef CONFIG_SMP | 2536 | #ifdef CONFIG_SMP |
2517 | /* | 2537 | /* |
2518 | * Fork balancing, do it here and not earlier because: | 2538 | * Fork balancing, do it here and not earlier because: |
2519 | * - cpus_allowed can change in the fork path | 2539 | * - cpus_allowed can change in the fork path |
2520 | * - any previously selected cpu might disappear through hotplug | 2540 | * - any previously selected cpu might disappear through hotplug |
2541 | * | ||
2542 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, | ||
2543 | * as we're not fully set-up yet. | ||
2521 | */ | 2544 | */ |
2522 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); | 2545 | __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
2523 | #endif | 2546 | #endif |
2524 | /* Post initialize new task's util average when its cfs_rq is set */ | 2547 | rq = __task_rq_lock(p, &rf); |
2525 | post_init_entity_util_avg(&p->se); | 2548 | post_init_entity_util_avg(&p->se); |
2526 | 2549 | ||
2527 | rq = __task_rq_lock(p, &rf); | ||
2528 | activate_task(rq, p, 0); | 2550 | activate_task(rq, p, 0); |
2529 | p->on_rq = TASK_ON_RQ_QUEUED; | 2551 | p->on_rq = TASK_ON_RQ_QUEUED; |
2530 | trace_sched_wakeup_new(p); | 2552 | trace_sched_wakeup_new(p); |
@@ -3146,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3146 | pr_cont("\n"); | 3168 | pr_cont("\n"); |
3147 | } | 3169 | } |
3148 | #endif | 3170 | #endif |
3171 | if (panic_on_warn) | ||
3172 | panic("scheduling while atomic\n"); | ||
3173 | |||
3149 | dump_stack(); | 3174 | dump_stack(); |
3150 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); | 3175 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); |
3151 | } | 3176 | } |
@@ -3156,7 +3181,8 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3156 | static inline void schedule_debug(struct task_struct *prev) | 3181 | static inline void schedule_debug(struct task_struct *prev) |
3157 | { | 3182 | { |
3158 | #ifdef CONFIG_SCHED_STACK_END_CHECK | 3183 | #ifdef CONFIG_SCHED_STACK_END_CHECK |
3159 | BUG_ON(task_stack_end_corrupted(prev)); | 3184 | if (task_stack_end_corrupted(prev)) |
3185 | panic("corrupted stack end detected inside scheduler\n"); | ||
3160 | #endif | 3186 | #endif |
3161 | 3187 | ||
3162 | if (unlikely(in_atomic_preempt_off())) { | 3188 | if (unlikely(in_atomic_preempt_off())) { |
@@ -4736,7 +4762,8 @@ out_unlock: | |||
4736 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4762 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4737 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 4763 | * @user_mask_ptr: user-space pointer to hold the current cpu mask |
4738 | * | 4764 | * |
4739 | * Return: 0 on success. An error code otherwise. | 4765 | * Return: size of CPU mask copied to user_mask_ptr on success. An |
4766 | * error code otherwise. | ||
4740 | */ | 4767 | */ |
4741 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | 4768 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, |
4742 | unsigned long __user *, user_mask_ptr) | 4769 | unsigned long __user *, user_mask_ptr) |
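The corrected return-value note matters mainly to callers of the raw syscall: glibc's sched_getaffinity() wrapper returns 0 on success, but the syscall itself returns the number of bytes it copied into the user-supplied mask. A small check of that behaviour on Linux (minimal error handling, illustrative only):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            cpu_set_t mask;
            long ret;

            /* Raw syscall: the return value is the number of bytes written
             * into &mask (the kernel's cpumask size), not 0 as with the
             * glibc sched_getaffinity() wrapper.
             */
            ret = syscall(SYS_sched_getaffinity, 0, sizeof(mask), &mask);
            if (ret < 0) {
                    perror("sched_getaffinity");
                    return 1;
            }
            printf("kernel copied %ld bytes; %d CPUs allowed\n",
                   ret, CPU_COUNT(&mask));
            return 0;
    }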
@@ -5133,14 +5160,16 @@ void show_state_filter(unsigned long state_filter) | |||
5133 | /* | 5160 | /* |
5134 | * reset the NMI-timeout, listing all files on a slow | 5161 | * reset the NMI-timeout, listing all files on a slow |
5135 | * console might take a lot of time: | 5162 | * console might take a lot of time: |
5163 | * Also, reset softlockup watchdogs on all CPUs, because | ||
5164 | * another CPU might be blocked waiting for us to process | ||
5165 | * an IPI. | ||
5136 | */ | 5166 | */ |
5137 | touch_nmi_watchdog(); | 5167 | touch_nmi_watchdog(); |
5168 | touch_all_softlockup_watchdogs(); | ||
5138 | if (!state_filter || (p->state & state_filter)) | 5169 | if (!state_filter || (p->state & state_filter)) |
5139 | sched_show_task(p); | 5170 | sched_show_task(p); |
5140 | } | 5171 | } |
5141 | 5172 | ||
5142 | touch_all_softlockup_watchdogs(); | ||
5143 | |||
5144 | #ifdef CONFIG_SCHED_DEBUG | 5173 | #ifdef CONFIG_SCHED_DEBUG |
5145 | if (!state_filter) | 5174 | if (!state_filter) |
5146 | sysrq_sched_debug_show(); | 5175 | sysrq_sched_debug_show(); |
@@ -5376,13 +5405,15 @@ void idle_task_exit(void) | |||
5376 | /* | 5405 | /* |
5377 | * Since this CPU is going 'away' for a while, fold any nr_active delta | 5406 | * Since this CPU is going 'away' for a while, fold any nr_active delta |
5378 | * we might have. Assumes we're called after migrate_tasks() so that the | 5407 | * we might have. Assumes we're called after migrate_tasks() so that the |
5379 | * nr_active count is stable. | 5408 | * nr_active count is stable. We need to take the teardown thread which |
5409 | * is calling this into account, so we hand in adjust = 1 to the load | ||
5410 | * calculation. | ||
5380 | * | 5411 | * |
5381 | * Also see the comment "Global load-average calculations". | 5412 | * Also see the comment "Global load-average calculations". |
5382 | */ | 5413 | */ |
5383 | static void calc_load_migrate(struct rq *rq) | 5414 | static void calc_load_migrate(struct rq *rq) |
5384 | { | 5415 | { |
5385 | long delta = calc_load_fold_active(rq); | 5416 | long delta = calc_load_fold_active(rq, 1); |
5386 | if (delta) | 5417 | if (delta) |
5387 | atomic_long_add(delta, &calc_load_tasks); | 5418 | atomic_long_add(delta, &calc_load_tasks); |
5388 | } | 5419 | } |
@@ -7213,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) | |||
7213 | struct rq *rq = cpu_rq(cpu); | 7244 | struct rq *rq = cpu_rq(cpu); |
7214 | 7245 | ||
7215 | rq->calc_load_update = calc_load_update; | 7246 | rq->calc_load_update = calc_load_update; |
7216 | account_reset_rq(rq); | ||
7217 | update_max_interval(); | 7247 | update_max_interval(); |
7218 | } | 7248 | } |
7219 | 7249 | ||
@@ -7487,6 +7517,8 @@ void __init sched_init(void) | |||
7487 | #endif | 7517 | #endif |
7488 | init_sched_fair_class(); | 7518 | init_sched_fair_class(); |
7489 | 7519 | ||
7520 | init_schedstats(); | ||
7521 | |||
7490 | scheduler_running = 1; | 7522 | scheduler_running = 1; |
7491 | } | 7523 | } |
7492 | 7524 | ||
@@ -7691,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) | |||
7691 | INIT_LIST_HEAD(&tg->children); | 7723 | INIT_LIST_HEAD(&tg->children); |
7692 | list_add_rcu(&tg->siblings, &parent->children); | 7724 | list_add_rcu(&tg->siblings, &parent->children); |
7693 | spin_unlock_irqrestore(&task_group_lock, flags); | 7725 | spin_unlock_irqrestore(&task_group_lock, flags); |
7726 | |||
7727 | online_fair_sched_group(tg); | ||
7694 | } | 7728 | } |
7695 | 7729 | ||
7696 | /* rcu callback to free various structures associated with a task group */ | 7730 | /* rcu callback to free various structures associated with a task group */ |
@@ -7719,27 +7753,9 @@ void sched_offline_group(struct task_group *tg) | |||
7719 | spin_unlock_irqrestore(&task_group_lock, flags); | 7753 | spin_unlock_irqrestore(&task_group_lock, flags); |
7720 | } | 7754 | } |
7721 | 7755 | ||
7722 | /* change task's runqueue when it moves between groups. | 7756 | static void sched_change_group(struct task_struct *tsk, int type) |
7723 | * The caller of this function should have put the task in its new group | ||
7724 | * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to | ||
7725 | * reflect its new group. | ||
7726 | */ | ||
7727 | void sched_move_task(struct task_struct *tsk) | ||
7728 | { | 7757 | { |
7729 | struct task_group *tg; | 7758 | struct task_group *tg; |
7730 | int queued, running; | ||
7731 | struct rq_flags rf; | ||
7732 | struct rq *rq; | ||
7733 | |||
7734 | rq = task_rq_lock(tsk, &rf); | ||
7735 | |||
7736 | running = task_current(rq, tsk); | ||
7737 | queued = task_on_rq_queued(tsk); | ||
7738 | |||
7739 | if (queued) | ||
7740 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); | ||
7741 | if (unlikely(running)) | ||
7742 | put_prev_task(rq, tsk); | ||
7743 | 7759 | ||
7744 | /* | 7760 | /* |
7745 | * All callers are synchronized by task_rq_lock(); we do not use RCU | 7761 | * All callers are synchronized by task_rq_lock(); we do not use RCU |
@@ -7752,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk) | |||
7752 | tsk->sched_task_group = tg; | 7768 | tsk->sched_task_group = tg; |
7753 | 7769 | ||
7754 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7770 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7755 | if (tsk->sched_class->task_move_group) | 7771 | if (tsk->sched_class->task_change_group) |
7756 | tsk->sched_class->task_move_group(tsk); | 7772 | tsk->sched_class->task_change_group(tsk, type); |
7757 | else | 7773 | else |
7758 | #endif | 7774 | #endif |
7759 | set_task_rq(tsk, task_cpu(tsk)); | 7775 | set_task_rq(tsk, task_cpu(tsk)); |
7776 | } | ||
7777 | |||
7778 | /* | ||
7779 | * Change task's runqueue when it moves between groups. | ||
7780 | * | ||
7781 | * The caller of this function should have put the task in its new group by | ||
7782 | * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect | ||
7783 | * its new group. | ||
7784 | */ | ||
7785 | void sched_move_task(struct task_struct *tsk) | ||
7786 | { | ||
7787 | int queued, running; | ||
7788 | struct rq_flags rf; | ||
7789 | struct rq *rq; | ||
7790 | |||
7791 | rq = task_rq_lock(tsk, &rf); | ||
7792 | |||
7793 | running = task_current(rq, tsk); | ||
7794 | queued = task_on_rq_queued(tsk); | ||
7795 | |||
7796 | if (queued) | ||
7797 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); | ||
7798 | if (unlikely(running)) | ||
7799 | put_prev_task(rq, tsk); | ||
7800 | |||
7801 | sched_change_group(tsk, TASK_MOVE_GROUP); | ||
7760 | 7802 | ||
7761 | if (unlikely(running)) | 7803 | if (unlikely(running)) |
7762 | tsk->sched_class->set_curr_task(rq); | 7804 | tsk->sched_class->set_curr_task(rq); |
@@ -8184,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) | |||
8184 | sched_free_group(tg); | 8226 | sched_free_group(tg); |
8185 | } | 8227 | } |
8186 | 8228 | ||
8229 | /* | ||
8230 | * This is called before wake_up_new_task(), therefore we really only | ||
8231 | * have to set its group bits, all the other stuff does not apply. | ||
8232 | */ | ||
8187 | static void cpu_cgroup_fork(struct task_struct *task) | 8233 | static void cpu_cgroup_fork(struct task_struct *task) |
8188 | { | 8234 | { |
8189 | sched_move_task(task); | 8235 | struct rq_flags rf; |
8236 | struct rq *rq; | ||
8237 | |||
8238 | rq = task_rq_lock(task, &rf); | ||
8239 | |||
8240 | sched_change_group(task, TASK_SET_GROUP); | ||
8241 | |||
8242 | task_rq_unlock(rq, task, &rf); | ||
8190 | } | 8243 | } |
8191 | 8244 | ||
8192 | static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) | 8245 | static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) |
8193 | { | 8246 | { |
8194 | struct task_struct *task; | 8247 | struct task_struct *task; |
8195 | struct cgroup_subsys_state *css; | 8248 | struct cgroup_subsys_state *css; |
8249 | int ret = 0; | ||
8196 | 8250 | ||
8197 | cgroup_taskset_for_each(task, css, tset) { | 8251 | cgroup_taskset_for_each(task, css, tset) { |
8198 | #ifdef CONFIG_RT_GROUP_SCHED | 8252 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -8203,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) | |||
8203 | if (task->sched_class != &fair_sched_class) | 8257 | if (task->sched_class != &fair_sched_class) |
8204 | return -EINVAL; | 8258 | return -EINVAL; |
8205 | #endif | 8259 | #endif |
8260 | /* | ||
8261 | * Serialize against wake_up_new_task() such that if it's | ||
8262 | * running, we're sure to observe its full state. | ||
8263 | */ | ||
8264 | raw_spin_lock_irq(&task->pi_lock); | ||
8265 | /* | ||
8266 | * Avoid calling sched_move_task() before wake_up_new_task() | ||
8267 | * has happened. This would lead to problems with PELT, due to | ||
8268 | * move wanting to detach+attach while we're not attached yet. | ||
8269 | */ | ||
8270 | if (task->state == TASK_NEW) | ||
8271 | ret = -EINVAL; | ||
8272 | raw_spin_unlock_irq(&task->pi_lock); | ||
8273 | |||
8274 | if (ret) | ||
8275 | break; | ||
8206 | } | 8276 | } |
8207 | return 0; | 8277 | return ret; |
8208 | } | 8278 | } |
8209 | 8279 | ||
8210 | static void cpu_cgroup_attach(struct cgroup_taskset *tset) | 8280 | static void cpu_cgroup_attach(struct cgroup_taskset *tset) |
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 41f85c4d0938..bc0b309c3f19 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -25,15 +25,13 @@ enum cpuacct_stat_index { | |||
25 | CPUACCT_STAT_NSTATS, | 25 | CPUACCT_STAT_NSTATS, |
26 | }; | 26 | }; |
27 | 27 | ||
28 | enum cpuacct_usage_index { | 28 | static const char * const cpuacct_stat_desc[] = { |
29 | CPUACCT_USAGE_USER, /* ... user mode */ | 29 | [CPUACCT_STAT_USER] = "user", |
30 | CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ | 30 | [CPUACCT_STAT_SYSTEM] = "system", |
31 | |||
32 | CPUACCT_USAGE_NRUSAGE, | ||
33 | }; | 31 | }; |
34 | 32 | ||
35 | struct cpuacct_usage { | 33 | struct cpuacct_usage { |
36 | u64 usages[CPUACCT_USAGE_NRUSAGE]; | 34 | u64 usages[CPUACCT_STAT_NSTATS]; |
37 | }; | 35 | }; |
38 | 36 | ||
39 | /* track cpu usage of a group of tasks and its child groups */ | 37 | /* track cpu usage of a group of tasks and its child groups */ |
@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) | |||
108 | } | 106 | } |
109 | 107 | ||
110 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, | 108 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, |
111 | enum cpuacct_usage_index index) | 109 | enum cpuacct_stat_index index) |
112 | { | 110 | { |
113 | struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | 111 | struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); |
114 | u64 data; | 112 | u64 data; |
115 | 113 | ||
116 | /* | 114 | /* |
117 | * We allow index == CPUACCT_USAGE_NRUSAGE here to read | 115 | * We allow index == CPUACCT_STAT_NSTATS here to read |
118 | * the sum of usages. | 116 | * the sum of usages. |
119 | */ | 117 | */ |
120 | BUG_ON(index > CPUACCT_USAGE_NRUSAGE); | 118 | BUG_ON(index > CPUACCT_STAT_NSTATS); |
121 | 119 | ||
122 | #ifndef CONFIG_64BIT | 120 | #ifndef CONFIG_64BIT |
123 | /* | 121 | /* |
@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, | |||
126 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | 124 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); |
127 | #endif | 125 | #endif |
128 | 126 | ||
129 | if (index == CPUACCT_USAGE_NRUSAGE) { | 127 | if (index == CPUACCT_STAT_NSTATS) { |
130 | int i = 0; | 128 | int i = 0; |
131 | 129 | ||
132 | data = 0; | 130 | data = 0; |
133 | for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) | 131 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) |
134 | data += cpuusage->usages[i]; | 132 | data += cpuusage->usages[i]; |
135 | } else { | 133 | } else { |
136 | data = cpuusage->usages[index]; | 134 | data = cpuusage->usages[index]; |
@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
155 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | 153 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); |
156 | #endif | 154 | #endif |
157 | 155 | ||
158 | for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) | 156 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) |
159 | cpuusage->usages[i] = val; | 157 | cpuusage->usages[i] = val; |
160 | 158 | ||
161 | #ifndef CONFIG_64BIT | 159 | #ifndef CONFIG_64BIT |
@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
165 | 163 | ||
166 | /* return total cpu usage (in nanoseconds) of a group */ | 164 | /* return total cpu usage (in nanoseconds) of a group */ |
167 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, | 165 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, |
168 | enum cpuacct_usage_index index) | 166 | enum cpuacct_stat_index index) |
169 | { | 167 | { |
170 | struct cpuacct *ca = css_ca(css); | 168 | struct cpuacct *ca = css_ca(css); |
171 | u64 totalcpuusage = 0; | 169 | u64 totalcpuusage = 0; |
@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css, | |||
180 | static u64 cpuusage_user_read(struct cgroup_subsys_state *css, | 178 | static u64 cpuusage_user_read(struct cgroup_subsys_state *css, |
181 | struct cftype *cft) | 179 | struct cftype *cft) |
182 | { | 180 | { |
183 | return __cpuusage_read(css, CPUACCT_USAGE_USER); | 181 | return __cpuusage_read(css, CPUACCT_STAT_USER); |
184 | } | 182 | } |
185 | 183 | ||
186 | static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, | 184 | static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, |
187 | struct cftype *cft) | 185 | struct cftype *cft) |
188 | { | 186 | { |
189 | return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); | 187 | return __cpuusage_read(css, CPUACCT_STAT_SYSTEM); |
190 | } | 188 | } |
191 | 189 | ||
192 | static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) | 190 | static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) |
193 | { | 191 | { |
194 | return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); | 192 | return __cpuusage_read(css, CPUACCT_STAT_NSTATS); |
195 | } | 193 | } |
196 | 194 | ||
197 | static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, | 195 | static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, |
@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, | |||
213 | } | 211 | } |
214 | 212 | ||
215 | static int __cpuacct_percpu_seq_show(struct seq_file *m, | 213 | static int __cpuacct_percpu_seq_show(struct seq_file *m, |
216 | enum cpuacct_usage_index index) | 214 | enum cpuacct_stat_index index) |
217 | { | 215 | { |
218 | struct cpuacct *ca = css_ca(seq_css(m)); | 216 | struct cpuacct *ca = css_ca(seq_css(m)); |
219 | u64 percpu; | 217 | u64 percpu; |
@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m, | |||
229 | 227 | ||
230 | static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) | 228 | static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) |
231 | { | 229 | { |
232 | return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); | 230 | return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER); |
233 | } | 231 | } |
234 | 232 | ||
235 | static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) | 233 | static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) |
236 | { | 234 | { |
237 | return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); | 235 | return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM); |
238 | } | 236 | } |
239 | 237 | ||
240 | static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) | 238 | static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) |
241 | { | 239 | { |
242 | return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); | 240 | return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); |
243 | } | 241 | } |
244 | 242 | ||
245 | static const char * const cpuacct_stat_desc[] = { | 243 | static int cpuacct_all_seq_show(struct seq_file *m, void *V) |
246 | [CPUACCT_STAT_USER] = "user", | 244 | { |
247 | [CPUACCT_STAT_SYSTEM] = "system", | 245 | struct cpuacct *ca = css_ca(seq_css(m)); |
248 | }; | 246 | int index; |
247 | int cpu; | ||
248 | |||
249 | seq_puts(m, "cpu"); | ||
250 | for (index = 0; index < CPUACCT_STAT_NSTATS; index++) | ||
251 | seq_printf(m, " %s", cpuacct_stat_desc[index]); | ||
252 | seq_puts(m, "\n"); | ||
253 | |||
254 | for_each_possible_cpu(cpu) { | ||
255 | struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
256 | |||
257 | seq_printf(m, "%d", cpu); | ||
258 | |||
259 | for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { | ||
260 | #ifndef CONFIG_64BIT | ||
261 | /* | ||
262 | * Take rq->lock to make 64-bit read safe on 32-bit | ||
263 | * platforms. | ||
264 | */ | ||
265 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
266 | #endif | ||
267 | |||
268 | seq_printf(m, " %llu", cpuusage->usages[index]); | ||
269 | |||
270 | #ifndef CONFIG_64BIT | ||
271 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
272 | #endif | ||
273 | } | ||
274 | seq_puts(m, "\n"); | ||
275 | } | ||
276 | return 0; | ||
277 | } | ||
249 | 278 | ||
250 | static int cpuacct_stats_show(struct seq_file *sf, void *v) | 279 | static int cpuacct_stats_show(struct seq_file *sf, void *v) |
251 | { | 280 | { |
252 | struct cpuacct *ca = css_ca(seq_css(sf)); | 281 | struct cpuacct *ca = css_ca(seq_css(sf)); |
282 | s64 val[CPUACCT_STAT_NSTATS]; | ||
253 | int cpu; | 283 | int cpu; |
254 | s64 val = 0; | 284 | int stat; |
255 | 285 | ||
286 | memset(val, 0, sizeof(val)); | ||
256 | for_each_possible_cpu(cpu) { | 287 | for_each_possible_cpu(cpu) { |
257 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | 288 | u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; |
258 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
259 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
260 | } | ||
261 | val = cputime64_to_clock_t(val); | ||
262 | seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
263 | 289 | ||
264 | val = 0; | 290 | val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; |
265 | for_each_possible_cpu(cpu) { | 291 | val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; |
266 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | 292 | val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; |
267 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | 293 | val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; |
268 | val += kcpustat->cpustat[CPUTIME_IRQ]; | 294 | val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; |
269 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
270 | } | 295 | } |
271 | 296 | ||
272 | val = cputime64_to_clock_t(val); | 297 | for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { |
273 | seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | 298 | seq_printf(sf, "%s %lld\n", |
299 | cpuacct_stat_desc[stat], | ||
300 | cputime64_to_clock_t(val[stat])); | ||
301 | } | ||
274 | 302 | ||
275 | return 0; | 303 | return 0; |
276 | } | 304 | } |
@@ -302,6 +330,10 @@ static struct cftype files[] = { | |||
302 | .seq_show = cpuacct_percpu_sys_seq_show, | 330 | .seq_show = cpuacct_percpu_sys_seq_show, |
303 | }, | 331 | }, |
304 | { | 332 | { |
333 | .name = "usage_all", | ||
334 | .seq_show = cpuacct_all_seq_show, | ||
335 | }, | ||
336 | { | ||
305 | .name = "stat", | 337 | .name = "stat", |
306 | .seq_show = cpuacct_stats_show, | 338 | .seq_show = cpuacct_stats_show, |
307 | }, | 339 | }, |
@@ -316,11 +348,11 @@ static struct cftype files[] = { | |||
316 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) | 348 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) |
317 | { | 349 | { |
318 | struct cpuacct *ca; | 350 | struct cpuacct *ca; |
319 | int index = CPUACCT_USAGE_SYSTEM; | 351 | int index = CPUACCT_STAT_SYSTEM; |
320 | struct pt_regs *regs = task_pt_regs(tsk); | 352 | struct pt_regs *regs = task_pt_regs(tsk); |
321 | 353 | ||
322 | if (regs && user_mode(regs)) | 354 | if (regs && user_mode(regs)) |
323 | index = CPUACCT_USAGE_USER; | 355 | index = CPUACCT_STAT_USER; |
324 | 356 | ||
325 | rcu_read_lock(); | 357 | rcu_read_lock(); |
326 | 358 | ||
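cpuacct_all_seq_show() above prints a "cpu user system" header followed by one "<cpu> <user_ns> <system_ns>" line per possible CPU, with cumulative nanoseconds in each column. A hedged userspace reader for that layout (the mount path is an assumption for a typical v1 cpuacct hierarchy; error handling is minimal):

    #include <stdio.h>

    int main(void)
    {
        /* Assumed v1 mount point; the file name comes from the new cftype entry. */
        FILE *f = fopen("/sys/fs/cgroup/cpuacct/cpuacct.usage_all", "r");
        char header[64];
        int cpu;
        unsigned long long user_ns, sys_ns;

        if (!f) {
            perror("cpuacct.usage_all");
            return 1;
        }
        if (!fgets(header, sizeof(header), f)) {   /* "cpu user system" */
            fclose(f);
            return 1;
        }

        while (fscanf(f, "%d %llu %llu", &cpu, &user_ns, &sys_ns) == 3)
            printf("cpu%d: user=%llu ns system=%llu ns\n", cpu, user_ns, sys_ns);

        fclose(f);
        return 0;
    }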
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 14c4aa25cc45..a84641b222c1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
@@ -47,6 +47,8 @@ struct sugov_cpu { | |||
47 | struct update_util_data update_util; | 47 | struct update_util_data update_util; |
48 | struct sugov_policy *sg_policy; | 48 | struct sugov_policy *sg_policy; |
49 | 49 | ||
50 | unsigned int cached_raw_freq; | ||
51 | |||
50 | /* The fields below are only needed when sharing a policy. */ | 52 | /* The fields below are only needed when sharing a policy. */ |
51 | unsigned long util; | 53 | unsigned long util; |
52 | unsigned long max; | 54 | unsigned long max; |
@@ -106,7 +108,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, | |||
106 | 108 | ||
107 | /** | 109 | /** |
108 | * get_next_freq - Compute a new frequency for a given cpufreq policy. | 110 | * get_next_freq - Compute a new frequency for a given cpufreq policy. |
109 | * @policy: cpufreq policy object to compute the new frequency for. | 111 | * @sg_cpu: schedutil cpu object to compute the new frequency for. |
110 | * @util: Current CPU utilization. | 112 | * @util: Current CPU utilization. |
111 | * @max: CPU capacity. | 113 | * @max: CPU capacity. |
112 | * | 114 | * |
@@ -121,14 +123,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, | |||
121 | * next_freq = C * curr_freq * util_raw / max | 123 | * next_freq = C * curr_freq * util_raw / max |
122 | * | 124 | * |
123 | * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. | 125 | * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. |
126 | * | ||
127 | * The lowest driver-supported frequency which is equal to or greater than the raw | ||
128 | * next_freq (as calculated above) is returned, subject to policy min/max and | ||
129 | * cpufreq driver limitations. | ||
124 | */ | 130 | */ |
125 | static unsigned int get_next_freq(struct cpufreq_policy *policy, | 131 | static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, |
126 | unsigned long util, unsigned long max) | 132 | unsigned long max) |
127 | { | 133 | { |
134 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | ||
135 | struct cpufreq_policy *policy = sg_policy->policy; | ||
128 | unsigned int freq = arch_scale_freq_invariant() ? | 136 | unsigned int freq = arch_scale_freq_invariant() ? |
129 | policy->cpuinfo.max_freq : policy->cur; | 137 | policy->cpuinfo.max_freq : policy->cur; |
130 | 138 | ||
131 | return (freq + (freq >> 2)) * util / max; | 139 | freq = (freq + (freq >> 2)) * util / max; |
140 | |||
141 | if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) | ||
142 | return sg_policy->next_freq; | ||
143 | sg_cpu->cached_raw_freq = freq; | ||
144 | return cpufreq_driver_resolve_freq(policy, freq); | ||
132 | } | 145 | } |
133 | 146 | ||
134 | static void sugov_update_single(struct update_util_data *hook, u64 time, | 147 | static void sugov_update_single(struct update_util_data *hook, u64 time, |
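get_next_freq() above implements next_freq = 1.25 * curr_freq * util / max (the 1.25 comes from freq + (freq >> 2)) and remembers the raw result in cached_raw_freq, so the driver's frequency table is only consulted when that raw value changes. A standalone sketch of the arithmetic and the short-circuit, with resolve_freq() as a stand-in for cpufreq_driver_resolve_freq() and the per-policy state folded into globals for brevity:

    #include <limits.h>
    #include <stdio.h>

    static unsigned int cached_raw_freq;
    static unsigned int next_freq = UINT_MAX;   /* UINT_MAX: nothing committed yet */

    /* Stand-in for cpufreq_driver_resolve_freq(): round up to a 100 MHz step. */
    static unsigned int resolve_freq(unsigned int raw_khz)
    {
        return (raw_khz + 99999) / 100000 * 100000;
    }

    static unsigned int get_next_freq(unsigned int curr_khz, unsigned long util,
                                      unsigned long max)
    {
        unsigned int freq = (curr_khz + (curr_khz >> 2)) * util / max;

        if (freq == cached_raw_freq && next_freq != UINT_MAX)
            return next_freq;           /* raw value unchanged: skip the driver */
        cached_raw_freq = freq;
        next_freq = resolve_freq(freq); /* in the kernel, sugov_update_commit() stores this */
        return next_freq;
    }

    int main(void)
    {
        /* 2 GHz current frequency, utilization 600 out of a capacity of 1024. */
        printf("%u kHz\n", get_next_freq(2000000, 600, 1024));
        printf("%u kHz\n", get_next_freq(2000000, 600, 1024)); /* served from the cache */
        return 0;
    }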
@@ -143,13 +156,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, | |||
143 | return; | 156 | return; |
144 | 157 | ||
145 | next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : | 158 | next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : |
146 | get_next_freq(policy, util, max); | 159 | get_next_freq(sg_cpu, util, max); |
147 | sugov_update_commit(sg_policy, time, next_f); | 160 | sugov_update_commit(sg_policy, time, next_f); |
148 | } | 161 | } |
149 | 162 | ||
150 | static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, | 163 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, |
151 | unsigned long util, unsigned long max) | 164 | unsigned long util, unsigned long max) |
152 | { | 165 | { |
166 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | ||
153 | struct cpufreq_policy *policy = sg_policy->policy; | 167 | struct cpufreq_policy *policy = sg_policy->policy; |
154 | unsigned int max_f = policy->cpuinfo.max_freq; | 168 | unsigned int max_f = policy->cpuinfo.max_freq; |
155 | u64 last_freq_update_time = sg_policy->last_freq_update_time; | 169 | u64 last_freq_update_time = sg_policy->last_freq_update_time; |
@@ -189,7 +203,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, | |||
189 | } | 203 | } |
190 | } | 204 | } |
191 | 205 | ||
192 | return get_next_freq(policy, util, max); | 206 | return get_next_freq(sg_cpu, util, max); |
193 | } | 207 | } |
194 | 208 | ||
195 | static void sugov_update_shared(struct update_util_data *hook, u64 time, | 209 | static void sugov_update_shared(struct update_util_data *hook, u64 time, |
@@ -206,7 +220,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, | |||
206 | sg_cpu->last_update = time; | 220 | sg_cpu->last_update = time; |
207 | 221 | ||
208 | if (sugov_should_update_freq(sg_policy, time)) { | 222 | if (sugov_should_update_freq(sg_policy, time)) { |
209 | next_f = sugov_next_freq_shared(sg_policy, util, max); | 223 | next_f = sugov_next_freq_shared(sg_cpu, util, max); |
210 | sugov_update_commit(sg_policy, time, next_f); | 224 | sugov_update_commit(sg_policy, time, next_f); |
211 | } | 225 | } |
212 | 226 | ||
@@ -394,7 +408,7 @@ static int sugov_init(struct cpufreq_policy *policy) | |||
394 | return ret; | 408 | return ret; |
395 | } | 409 | } |
396 | 410 | ||
397 | static int sugov_exit(struct cpufreq_policy *policy) | 411 | static void sugov_exit(struct cpufreq_policy *policy) |
398 | { | 412 | { |
399 | struct sugov_policy *sg_policy = policy->governor_data; | 413 | struct sugov_policy *sg_policy = policy->governor_data; |
400 | struct sugov_tunables *tunables = sg_policy->tunables; | 414 | struct sugov_tunables *tunables = sg_policy->tunables; |
@@ -412,7 +426,6 @@ static int sugov_exit(struct cpufreq_policy *policy) | |||
412 | mutex_unlock(&global_tunables_lock); | 426 | mutex_unlock(&global_tunables_lock); |
413 | 427 | ||
414 | sugov_policy_free(sg_policy); | 428 | sugov_policy_free(sg_policy); |
415 | return 0; | ||
416 | } | 429 | } |
417 | 430 | ||
418 | static int sugov_start(struct cpufreq_policy *policy) | 431 | static int sugov_start(struct cpufreq_policy *policy) |
@@ -434,6 +447,7 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
434 | sg_cpu->util = ULONG_MAX; | 447 | sg_cpu->util = ULONG_MAX; |
435 | sg_cpu->max = 0; | 448 | sg_cpu->max = 0; |
436 | sg_cpu->last_update = 0; | 449 | sg_cpu->last_update = 0; |
450 | sg_cpu->cached_raw_freq = 0; | ||
437 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, | 451 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, |
438 | sugov_update_shared); | 452 | sugov_update_shared); |
439 | } else { | 453 | } else { |
@@ -444,7 +458,7 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
444 | return 0; | 458 | return 0; |
445 | } | 459 | } |
446 | 460 | ||
447 | static int sugov_stop(struct cpufreq_policy *policy) | 461 | static void sugov_stop(struct cpufreq_policy *policy) |
448 | { | 462 | { |
449 | struct sugov_policy *sg_policy = policy->governor_data; | 463 | struct sugov_policy *sg_policy = policy->governor_data; |
450 | unsigned int cpu; | 464 | unsigned int cpu; |
@@ -456,53 +470,29 @@ static int sugov_stop(struct cpufreq_policy *policy) | |||
456 | 470 | ||
457 | irq_work_sync(&sg_policy->irq_work); | 471 | irq_work_sync(&sg_policy->irq_work); |
458 | cancel_work_sync(&sg_policy->work); | 472 | cancel_work_sync(&sg_policy->work); |
459 | return 0; | ||
460 | } | 473 | } |
461 | 474 | ||
462 | static int sugov_limits(struct cpufreq_policy *policy) | 475 | static void sugov_limits(struct cpufreq_policy *policy) |
463 | { | 476 | { |
464 | struct sugov_policy *sg_policy = policy->governor_data; | 477 | struct sugov_policy *sg_policy = policy->governor_data; |
465 | 478 | ||
466 | if (!policy->fast_switch_enabled) { | 479 | if (!policy->fast_switch_enabled) { |
467 | mutex_lock(&sg_policy->work_lock); | 480 | mutex_lock(&sg_policy->work_lock); |
468 | 481 | cpufreq_policy_apply_limits(policy); | |
469 | if (policy->max < policy->cur) | ||
470 | __cpufreq_driver_target(policy, policy->max, | ||
471 | CPUFREQ_RELATION_H); | ||
472 | else if (policy->min > policy->cur) | ||
473 | __cpufreq_driver_target(policy, policy->min, | ||
474 | CPUFREQ_RELATION_L); | ||
475 | |||
476 | mutex_unlock(&sg_policy->work_lock); | 482 | mutex_unlock(&sg_policy->work_lock); |
477 | } | 483 | } |
478 | 484 | ||
479 | sg_policy->need_freq_update = true; | 485 | sg_policy->need_freq_update = true; |
480 | return 0; | ||
481 | } | ||
482 | |||
483 | int sugov_governor(struct cpufreq_policy *policy, unsigned int event) | ||
484 | { | ||
485 | if (event == CPUFREQ_GOV_POLICY_INIT) { | ||
486 | return sugov_init(policy); | ||
487 | } else if (policy->governor_data) { | ||
488 | switch (event) { | ||
489 | case CPUFREQ_GOV_POLICY_EXIT: | ||
490 | return sugov_exit(policy); | ||
491 | case CPUFREQ_GOV_START: | ||
492 | return sugov_start(policy); | ||
493 | case CPUFREQ_GOV_STOP: | ||
494 | return sugov_stop(policy); | ||
495 | case CPUFREQ_GOV_LIMITS: | ||
496 | return sugov_limits(policy); | ||
497 | } | ||
498 | } | ||
499 | return -EINVAL; | ||
500 | } | 486 | } |
501 | 487 | ||
502 | static struct cpufreq_governor schedutil_gov = { | 488 | static struct cpufreq_governor schedutil_gov = { |
503 | .name = "schedutil", | 489 | .name = "schedutil", |
504 | .governor = sugov_governor, | ||
505 | .owner = THIS_MODULE, | 490 | .owner = THIS_MODULE, |
491 | .init = sugov_init, | ||
492 | .exit = sugov_exit, | ||
493 | .start = sugov_start, | ||
494 | .stop = sugov_stop, | ||
495 | .limits = sugov_limits, | ||
506 | }; | 496 | }; |
507 | 497 | ||
508 | static int __init sugov_module_init(void) | 498 | static int __init sugov_module_init(void) |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c5498d5..1934f658c036 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); | |||
49 | */ | 49 | */ |
50 | void irqtime_account_irq(struct task_struct *curr) | 50 | void irqtime_account_irq(struct task_struct *curr) |
51 | { | 51 | { |
52 | unsigned long flags; | ||
53 | s64 delta; | 52 | s64 delta; |
54 | int cpu; | 53 | int cpu; |
55 | 54 | ||
56 | if (!sched_clock_irqtime) | 55 | if (!sched_clock_irqtime) |
57 | return; | 56 | return; |
58 | 57 | ||
59 | local_irq_save(flags); | ||
60 | |||
61 | cpu = smp_processor_id(); | 58 | cpu = smp_processor_id(); |
62 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | 59 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); |
63 | __this_cpu_add(irq_start_time, delta); | 60 | __this_cpu_add(irq_start_time, delta); |
@@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr) | |||
75 | __this_cpu_add(cpu_softirq_time, delta); | 72 | __this_cpu_add(cpu_softirq_time, delta); |
76 | 73 | ||
77 | irq_time_write_end(); | 74 | irq_time_write_end(); |
78 | local_irq_restore(flags); | ||
79 | } | 75 | } |
80 | EXPORT_SYMBOL_GPL(irqtime_account_irq); | 76 | EXPORT_SYMBOL_GPL(irqtime_account_irq); |
81 | 77 | ||
82 | static int irqtime_account_hi_update(void) | 78 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) |
83 | { | 79 | { |
84 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 80 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
85 | unsigned long flags; | 81 | unsigned long flags; |
86 | u64 latest_ns; | 82 | cputime_t irq_cputime; |
87 | int ret = 0; | ||
88 | 83 | ||
89 | local_irq_save(flags); | 84 | local_irq_save(flags); |
90 | latest_ns = this_cpu_read(cpu_hardirq_time); | 85 | irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - |
91 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) | 86 | cpustat[CPUTIME_IRQ]; |
92 | ret = 1; | 87 | irq_cputime = min(irq_cputime, maxtime); |
88 | cpustat[CPUTIME_IRQ] += irq_cputime; | ||
93 | local_irq_restore(flags); | 89 | local_irq_restore(flags); |
94 | return ret; | 90 | return irq_cputime; |
95 | } | 91 | } |
96 | 92 | ||
97 | static int irqtime_account_si_update(void) | 93 | static cputime_t irqtime_account_si_update(cputime_t maxtime) |
98 | { | 94 | { |
99 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 95 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
100 | unsigned long flags; | 96 | unsigned long flags; |
101 | u64 latest_ns; | 97 | cputime_t softirq_cputime; |
102 | int ret = 0; | ||
103 | 98 | ||
104 | local_irq_save(flags); | 99 | local_irq_save(flags); |
105 | latest_ns = this_cpu_read(cpu_softirq_time); | 100 | softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - |
106 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) | 101 | cpustat[CPUTIME_SOFTIRQ]; |
107 | ret = 1; | 102 | softirq_cputime = min(softirq_cputime, maxtime); |
103 | cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; | ||
108 | local_irq_restore(flags); | 104 | local_irq_restore(flags); |
109 | return ret; | 105 | return softirq_cputime; |
110 | } | 106 | } |
111 | 107 | ||
112 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 108 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
113 | 109 | ||
114 | #define sched_clock_irqtime (0) | 110 | #define sched_clock_irqtime (0) |
115 | 111 | ||
112 | static cputime_t irqtime_account_hi_update(cputime_t dummy) | ||
113 | { | ||
114 | return 0; | ||
115 | } | ||
116 | |||
117 | static cputime_t irqtime_account_si_update(cputime_t dummy) | ||
118 | { | ||
119 | return 0; | ||
120 | } | ||
121 | |||
116 | #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ | 122 | #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ |
117 | 123 | ||
118 | static inline void task_group_account_field(struct task_struct *p, int index, | 124 | static inline void task_group_account_field(struct task_struct *p, int index, |
@@ -257,29 +263,42 @@ void account_idle_time(cputime_t cputime) | |||
257 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | 263 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; |
258 | } | 264 | } |
259 | 265 | ||
260 | static __always_inline bool steal_account_process_tick(void) | 266 | static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) |
261 | { | 267 | { |
262 | #ifdef CONFIG_PARAVIRT | 268 | #ifdef CONFIG_PARAVIRT |
263 | if (static_key_false(¶virt_steal_enabled)) { | 269 | if (static_key_false(¶virt_steal_enabled)) { |
270 | cputime_t steal_cputime; | ||
264 | u64 steal; | 271 | u64 steal; |
265 | unsigned long steal_jiffies; | ||
266 | 272 | ||
267 | steal = paravirt_steal_clock(smp_processor_id()); | 273 | steal = paravirt_steal_clock(smp_processor_id()); |
268 | steal -= this_rq()->prev_steal_time; | 274 | steal -= this_rq()->prev_steal_time; |
269 | 275 | ||
270 | /* | 276 | steal_cputime = min(nsecs_to_cputime(steal), maxtime); |
271 | * steal is in nsecs but our caller is expecting steal | 277 | account_steal_time(steal_cputime); |
272 | * time in jiffies. Lets cast the result to jiffies | 278 | this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); |
273 | * granularity and account the rest on the next rounds. | ||
274 | */ | ||
275 | steal_jiffies = nsecs_to_jiffies(steal); | ||
276 | this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); | ||
277 | 279 | ||
278 | account_steal_time(jiffies_to_cputime(steal_jiffies)); | 280 | return steal_cputime; |
279 | return steal_jiffies; | ||
280 | } | 281 | } |
281 | #endif | 282 | #endif |
282 | return false; | 283 | return 0; |
284 | } | ||
285 | |||
286 | /* | ||
287 | * Account how much elapsed time was spent in steal, irq, or softirq time. | ||
288 | */ | ||
289 | static inline cputime_t account_other_time(cputime_t max) | ||
290 | { | ||
291 | cputime_t accounted; | ||
292 | |||
293 | accounted = steal_account_process_time(max); | ||
294 | |||
295 | if (accounted < max) | ||
296 | accounted += irqtime_account_hi_update(max - accounted); | ||
297 | |||
298 | if (accounted < max) | ||
299 | accounted += irqtime_account_si_update(max - accounted); | ||
300 | |||
301 | return accounted; | ||
283 | } | 302 | } |
284 | 303 | ||
285 | /* | 304 | /* |
@@ -342,21 +361,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
342 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 361 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
343 | struct rq *rq, int ticks) | 362 | struct rq *rq, int ticks) |
344 | { | 363 | { |
345 | cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); | 364 | u64 cputime = (__force u64) cputime_one_jiffy * ticks; |
346 | u64 cputime = (__force u64) cputime_one_jiffy; | 365 | cputime_t scaled, other; |
347 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
348 | 366 | ||
349 | if (steal_account_process_tick()) | 367 | /* |
368 | * When returning from idle, many ticks can get accounted at | ||
369 | * once, including some ticks of steal, irq, and softirq time. | ||
370 | * Subtract those ticks from the amount of time accounted to | ||
371 | * idle, or potentially user or system time. Due to rounding, | ||
372 | * other time can exceed ticks occasionally. | ||
373 | */ | ||
374 | other = account_other_time(cputime); | ||
375 | if (other >= cputime) | ||
350 | return; | 376 | return; |
377 | cputime -= other; | ||
378 | scaled = cputime_to_scaled(cputime); | ||
351 | 379 | ||
352 | cputime *= ticks; | 380 | if (this_cpu_ksoftirqd() == p) { |
353 | scaled *= ticks; | ||
354 | |||
355 | if (irqtime_account_hi_update()) { | ||
356 | cpustat[CPUTIME_IRQ] += cputime; | ||
357 | } else if (irqtime_account_si_update()) { | ||
358 | cpustat[CPUTIME_SOFTIRQ] += cputime; | ||
359 | } else if (this_cpu_ksoftirqd() == p) { | ||
360 | /* | 381 | /* |
361 | * ksoftirqd time does not get accounted in cpu_softirq_time. | 382 | * ksoftirqd time does not get accounted in cpu_softirq_time. |
362 | * So, we have to handle it separately here. | 383 | * So, we have to handle it separately here. |
@@ -406,6 +427,10 @@ void vtime_common_task_switch(struct task_struct *prev) | |||
406 | } | 427 | } |
407 | #endif | 428 | #endif |
408 | 429 | ||
430 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | ||
431 | |||
432 | |||
433 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | ||
409 | /* | 434 | /* |
410 | * Archs that account the whole time spent in the idle task | 435 | * Archs that account the whole time spent in the idle task |
411 | * (outside irq) as idle time can rely on this and just implement | 436 | * (outside irq) as idle time can rely on this and just implement |
@@ -415,33 +440,16 @@ void vtime_common_task_switch(struct task_struct *prev) | |||
415 | * vtime_account(). | 440 | * vtime_account(). |
416 | */ | 441 | */ |
417 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | 442 | #ifndef __ARCH_HAS_VTIME_ACCOUNT |
418 | void vtime_common_account_irq_enter(struct task_struct *tsk) | 443 | void vtime_account_irq_enter(struct task_struct *tsk) |
419 | { | 444 | { |
420 | if (!in_interrupt()) { | 445 | if (!in_interrupt() && is_idle_task(tsk)) |
421 | /* | 446 | vtime_account_idle(tsk); |
422 | * If we interrupted user, context_tracking_in_user() | 447 | else |
423 | * is 1 because the context tracking don't hook | 448 | vtime_account_system(tsk); |
424 | * on irq entry/exit. This way we know if | ||
425 | * we need to flush user time on kernel entry. | ||
426 | */ | ||
427 | if (context_tracking_in_user()) { | ||
428 | vtime_account_user(tsk); | ||
429 | return; | ||
430 | } | ||
431 | |||
432 | if (is_idle_task(tsk)) { | ||
433 | vtime_account_idle(tsk); | ||
434 | return; | ||
435 | } | ||
436 | } | ||
437 | vtime_account_system(tsk); | ||
438 | } | 449 | } |
439 | EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); | 450 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); |
440 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 451 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
441 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | ||
442 | 452 | ||
443 | |||
444 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | ||
445 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 453 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
446 | { | 454 | { |
447 | *ut = p->utime; | 455 | *ut = p->utime; |
@@ -466,7 +474,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
466 | */ | 474 | */ |
467 | void account_process_tick(struct task_struct *p, int user_tick) | 475 | void account_process_tick(struct task_struct *p, int user_tick) |
468 | { | 476 | { |
469 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 477 | cputime_t cputime, scaled, steal; |
470 | struct rq *rq = this_rq(); | 478 | struct rq *rq = this_rq(); |
471 | 479 | ||
472 | if (vtime_accounting_cpu_enabled()) | 480 | if (vtime_accounting_cpu_enabled()) |
@@ -477,26 +485,21 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
477 | return; | 485 | return; |
478 | } | 486 | } |
479 | 487 | ||
480 | if (steal_account_process_tick()) | 488 | cputime = cputime_one_jiffy; |
489 | steal = steal_account_process_time(cputime); | ||
490 | |||
491 | if (steal >= cputime) | ||
481 | return; | 492 | return; |
482 | 493 | ||
494 | cputime -= steal; | ||
495 | scaled = cputime_to_scaled(cputime); | ||
496 | |||
483 | if (user_tick) | 497 | if (user_tick) |
484 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 498 | account_user_time(p, cputime, scaled); |
485 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 499 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
486 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | 500 | account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); |
487 | one_jiffy_scaled); | ||
488 | else | 501 | else |
489 | account_idle_time(cputime_one_jiffy); | 502 | account_idle_time(cputime); |
490 | } | ||
491 | |||
492 | /* | ||
493 | * Account multiple ticks of steal time. | ||
494 | * @p: the process from which the cpu time has been stolen | ||
495 | * @ticks: number of stolen ticks | ||
496 | */ | ||
497 | void account_steal_ticks(unsigned long ticks) | ||
498 | { | ||
499 | account_steal_time(jiffies_to_cputime(ticks)); | ||
500 | } | 503 | } |
501 | 504 | ||
502 | /* | 505 | /* |
@@ -681,12 +684,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) | |||
681 | static cputime_t get_vtime_delta(struct task_struct *tsk) | 684 | static cputime_t get_vtime_delta(struct task_struct *tsk) |
682 | { | 685 | { |
683 | unsigned long now = READ_ONCE(jiffies); | 686 | unsigned long now = READ_ONCE(jiffies); |
684 | unsigned long delta = now - tsk->vtime_snap; | 687 | cputime_t delta, other; |
685 | 688 | ||
689 | delta = jiffies_to_cputime(now - tsk->vtime_snap); | ||
690 | other = account_other_time(delta); | ||
686 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); | 691 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); |
687 | tsk->vtime_snap = now; | 692 | tsk->vtime_snap = now; |
688 | 693 | ||
689 | return jiffies_to_cputime(delta); | 694 | return delta - other; |
690 | } | 695 | } |
691 | 696 | ||
692 | static void __vtime_account_system(struct task_struct *tsk) | 697 | static void __vtime_account_system(struct task_struct *tsk) |
@@ -706,16 +711,6 @@ void vtime_account_system(struct task_struct *tsk) | |||
706 | write_seqcount_end(&tsk->vtime_seqcount); | 711 | write_seqcount_end(&tsk->vtime_seqcount); |
707 | } | 712 | } |
708 | 713 | ||
709 | void vtime_gen_account_irq_exit(struct task_struct *tsk) | ||
710 | { | ||
711 | write_seqcount_begin(&tsk->vtime_seqcount); | ||
712 | if (vtime_delta(tsk)) | ||
713 | __vtime_account_system(tsk); | ||
714 | if (context_tracking_in_user()) | ||
715 | tsk->vtime_snap_whence = VTIME_USER; | ||
716 | write_seqcount_end(&tsk->vtime_seqcount); | ||
717 | } | ||
718 | |||
719 | void vtime_account_user(struct task_struct *tsk) | 714 | void vtime_account_user(struct task_struct *tsk) |
720 | { | 715 | { |
721 | cputime_t delta_cpu; | 716 | cputime_t delta_cpu; |
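The new account_other_time() helper above spends an elapsed-time budget on steal, hardirq and softirq time, in that order, capping each contribution so the three together never exceed the budget. A small sketch of that shape with invented deltas (the spend() helper and the numbers are illustrative; only the min-and-subtract structure follows the hunk):

    #include <stdio.h>

    typedef unsigned long long cputime_t;

    static cputime_t min_ct(cputime_t a, cputime_t b) { return a < b ? a : b; }

    /* Charge at most @max out of @pending, like steal_account_process_time()
     * and the irqtime_account_*_update() helpers do. */
    static cputime_t spend(cputime_t pending, cputime_t max, const char *what)
    {
        cputime_t got = min_ct(pending, max);

        printf("%-8s charged %llu of %llu pending\n", what, got, pending);
        return got;
    }

    static cputime_t account_other_time(cputime_t max)
    {
        cputime_t accounted = spend(3, max, "steal");

        if (accounted < max)
            accounted += spend(5, max - accounted, "hardirq");
        if (accounted < max)
            accounted += spend(4, max - accounted, "softirq");
        return accounted;
    }

    int main(void)
    {
        cputime_t tick = 10;    /* elapsed budget, e.g. one tick's worth */
        cputime_t other = account_other_time(tick);

        printf("other=%llu, left for user/system/idle=%llu\n", other, tick - other);
        return 0;
    }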
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index cf905f655ba1..2a0a9995256d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -427,19 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
427 | SPLIT_NS(p->se.vruntime), | 427 | SPLIT_NS(p->se.vruntime), |
428 | (long long)(p->nvcsw + p->nivcsw), | 428 | (long long)(p->nvcsw + p->nivcsw), |
429 | p->prio); | 429 | p->prio); |
430 | #ifdef CONFIG_SCHEDSTATS | 430 | |
431 | if (schedstat_enabled()) { | ||
432 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | ||
433 | SPLIT_NS(p->se.statistics.wait_sum), | ||
434 | SPLIT_NS(p->se.sum_exec_runtime), | ||
435 | SPLIT_NS(p->se.statistics.sum_sleep_runtime)); | ||
436 | } | ||
437 | #else | ||
438 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | 431 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
439 | 0LL, 0L, | 432 | SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), |
440 | SPLIT_NS(p->se.sum_exec_runtime), | 433 | SPLIT_NS(p->se.sum_exec_runtime), |
441 | 0LL, 0L); | 434 | SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); |
442 | #endif | 435 | |
443 | #ifdef CONFIG_NUMA_BALANCING | 436 | #ifdef CONFIG_NUMA_BALANCING |
444 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); | 437 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); |
445 | #endif | 438 | #endif |
@@ -886,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
886 | 879 | ||
887 | nr_switches = p->nvcsw + p->nivcsw; | 880 | nr_switches = p->nvcsw + p->nivcsw; |
888 | 881 | ||
889 | #ifdef CONFIG_SCHEDSTATS | ||
890 | P(se.nr_migrations); | 882 | P(se.nr_migrations); |
891 | 883 | ||
884 | #ifdef CONFIG_SCHEDSTATS | ||
892 | if (schedstat_enabled()) { | 885 | if (schedstat_enabled()) { |
893 | u64 avg_atom, avg_per_cpu; | 886 | u64 avg_atom, avg_per_cpu; |
894 | 887 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..4088eedea763 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
690 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ | 690 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ |
691 | } | 691 | } |
692 | 692 | ||
693 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | ||
694 | static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); | ||
695 | static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); | ||
696 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); | ||
697 | |||
693 | /* | 698 | /* |
694 | * With new tasks being created, their initial util_avgs are extrapolated | 699 | * With new tasks being created, their initial util_avgs are extrapolated |
695 | * based on the cfs_rq's current util_avg: | 700 | * based on the cfs_rq's current util_avg: |
@@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
720 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 725 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
721 | struct sched_avg *sa = &se->avg; | 726 | struct sched_avg *sa = &se->avg; |
722 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; | 727 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; |
728 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
729 | int tg_update; | ||
723 | 730 | ||
724 | if (cap > 0) { | 731 | if (cap > 0) { |
725 | if (cfs_rq->avg.util_avg != 0) { | 732 | if (cfs_rq->avg.util_avg != 0) { |
@@ -733,18 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
733 | } | 740 | } |
734 | sa->util_sum = sa->util_avg * LOAD_AVG_MAX; | 741 | sa->util_sum = sa->util_avg * LOAD_AVG_MAX; |
735 | } | 742 | } |
743 | |||
744 | if (entity_is_task(se)) { | ||
745 | struct task_struct *p = task_of(se); | ||
746 | if (p->sched_class != &fair_sched_class) { | ||
747 | /* | ||
748 | * For !fair tasks do: | ||
749 | * | ||
750 | update_cfs_rq_load_avg(now, cfs_rq, false); | ||
751 | attach_entity_load_avg(cfs_rq, se); | ||
752 | switched_from_fair(rq, p); | ||
753 | * | ||
754 | * such that the next switched_to_fair() has the | ||
755 | * expected state. | ||
756 | */ | ||
757 | se->avg.last_update_time = now; | ||
758 | return; | ||
759 | } | ||
760 | } | ||
761 | |||
762 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | ||
763 | attach_entity_load_avg(cfs_rq, se); | ||
764 | if (tg_update) | ||
765 | update_tg_load_avg(cfs_rq, false); | ||
736 | } | 766 | } |
737 | 767 | ||
738 | static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); | 768 | #else /* !CONFIG_SMP */ |
739 | static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); | ||
740 | #else | ||
741 | void init_entity_runnable_average(struct sched_entity *se) | 769 | void init_entity_runnable_average(struct sched_entity *se) |
742 | { | 770 | { |
743 | } | 771 | } |
744 | void post_init_entity_util_avg(struct sched_entity *se) | 772 | void post_init_entity_util_avg(struct sched_entity *se) |
745 | { | 773 | { |
746 | } | 774 | } |
747 | #endif | 775 | static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) |
776 | { | ||
777 | } | ||
778 | #endif /* CONFIG_SMP */ | ||
748 | 779 | ||
749 | /* | 780 | /* |
750 | * Update the current task's runtime statistics. | 781 | * Update the current task's runtime statistics. |
@@ -1305,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env, | |||
1305 | { | 1336 | { |
1306 | if (env->best_task) | 1337 | if (env->best_task) |
1307 | put_task_struct(env->best_task); | 1338 | put_task_struct(env->best_task); |
1339 | if (p) | ||
1340 | get_task_struct(p); | ||
1308 | 1341 | ||
1309 | env->best_task = p; | 1342 | env->best_task = p; |
1310 | env->best_imp = imp; | 1343 | env->best_imp = imp; |
@@ -1372,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1372 | long imp = env->p->numa_group ? groupimp : taskimp; | 1405 | long imp = env->p->numa_group ? groupimp : taskimp; |
1373 | long moveimp = imp; | 1406 | long moveimp = imp; |
1374 | int dist = env->dist; | 1407 | int dist = env->dist; |
1375 | bool assigned = false; | ||
1376 | 1408 | ||
1377 | rcu_read_lock(); | 1409 | rcu_read_lock(); |
1378 | 1410 | cur = task_rcu_dereference(&dst_rq->curr); | |
1379 | raw_spin_lock_irq(&dst_rq->lock); | 1411 | if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) |
1380 | cur = dst_rq->curr; | ||
1381 | /* | ||
1382 | * No need to move the exiting task or idle task. | ||
1383 | */ | ||
1384 | if ((cur->flags & PF_EXITING) || is_idle_task(cur)) | ||
1385 | cur = NULL; | 1412 | cur = NULL; |
1386 | else { | ||
1387 | /* | ||
1388 | * The task_struct must be protected here to protect the | ||
1389 | * p->numa_faults access in the task_weight since the | ||
1390 | * numa_faults could already be freed in the following path: | ||
1391 | * finish_task_switch() | ||
1392 | * --> put_task_struct() | ||
1393 | * --> __put_task_struct() | ||
1394 | * --> task_numa_free() | ||
1395 | */ | ||
1396 | get_task_struct(cur); | ||
1397 | } | ||
1398 | |||
1399 | raw_spin_unlock_irq(&dst_rq->lock); | ||
1400 | 1413 | ||
1401 | /* | 1414 | /* |
1402 | * Because we have preemption enabled we can get migrated around and | 1415 | * Because we have preemption enabled we can get migrated around and |
@@ -1479,7 +1492,6 @@ balance: | |||
1479 | */ | 1492 | */ |
1480 | if (!load_too_imbalanced(src_load, dst_load, env)) { | 1493 | if (!load_too_imbalanced(src_load, dst_load, env)) { |
1481 | imp = moveimp - 1; | 1494 | imp = moveimp - 1; |
1482 | put_task_struct(cur); | ||
1483 | cur = NULL; | 1495 | cur = NULL; |
1484 | goto assign; | 1496 | goto assign; |
1485 | } | 1497 | } |
@@ -1505,16 +1517,9 @@ balance: | |||
1505 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | 1517 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); |
1506 | 1518 | ||
1507 | assign: | 1519 | assign: |
1508 | assigned = true; | ||
1509 | task_numa_assign(env, cur, imp); | 1520 | task_numa_assign(env, cur, imp); |
1510 | unlock: | 1521 | unlock: |
1511 | rcu_read_unlock(); | 1522 | rcu_read_unlock(); |
1512 | /* | ||
1513 | * The dst_rq->curr isn't assigned. The protection for task_struct is | ||
1514 | * finished. | ||
1515 | */ | ||
1516 | if (cur && !assigned) | ||
1517 | put_task_struct(cur); | ||
1518 | } | 1523 | } |
1519 | 1524 | ||
1520 | static void task_numa_find_cpu(struct task_numa_env *env, | 1525 | static void task_numa_find_cpu(struct task_numa_env *env, |
@@ -2499,28 +2504,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
2499 | 2504 | ||
2500 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2505 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2501 | # ifdef CONFIG_SMP | 2506 | # ifdef CONFIG_SMP |
2502 | static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | 2507 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) |
2503 | { | 2508 | { |
2504 | long tg_weight; | 2509 | long tg_weight, load, shares; |
2505 | 2510 | ||
2506 | /* | 2511 | /* |
2507 | * Use this CPU's real-time load instead of the last load contribution | 2512 | * This really should be: cfs_rq->avg.load_avg, but instead we use |
2508 | * as the updating of the contribution is delayed, and we will use the | 2513 | * cfs_rq->load.weight, which is its upper bound. This helps ramp up |
2509 | * the real-time load to calc the share. See update_tg_load_avg(). | 2514 | * the shares for small weight interactive tasks. |
2510 | */ | 2515 | */ |
2511 | tg_weight = atomic_long_read(&tg->load_avg); | 2516 | load = scale_load_down(cfs_rq->load.weight); |
2512 | tg_weight -= cfs_rq->tg_load_avg_contrib; | ||
2513 | tg_weight += cfs_rq->load.weight; | ||
2514 | 2517 | ||
2515 | return tg_weight; | 2518 | tg_weight = atomic_long_read(&tg->load_avg); |
2516 | } | ||
2517 | |||
2518 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | ||
2519 | { | ||
2520 | long tg_weight, load, shares; | ||
2521 | 2519 | ||
2522 | tg_weight = calc_tg_weight(tg, cfs_rq); | 2520 | /* Ensure tg_weight >= load */ |
2523 | load = cfs_rq->load.weight; | 2521 | tg_weight -= cfs_rq->tg_load_avg_contrib; |
2522 | tg_weight += load; | ||
2524 | 2523 | ||
2525 | shares = (tg->shares * load); | 2524 | shares = (tg->shares * load); |
2526 | if (tg_weight) | 2525 | if (tg_weight) |
@@ -2539,6 +2538,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | |||
2539 | return tg->shares; | 2538 | return tg->shares; |
2540 | } | 2539 | } |
2541 | # endif /* CONFIG_SMP */ | 2540 | # endif /* CONFIG_SMP */ |
2541 | |||
2542 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | 2542 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, |
2543 | unsigned long weight) | 2543 | unsigned long weight) |
2544 | { | 2544 | { |
@@ -2873,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se, | |||
2873 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} | 2873 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} |
2874 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 2874 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
2875 | 2875 | ||
2876 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | ||
2877 | |||
2878 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 2876 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) |
2879 | { | 2877 | { |
2880 | struct rq *rq = rq_of(cfs_rq); | 2878 | struct rq *rq = rq_of(cfs_rq); |
@@ -2904,7 +2902,40 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
2904 | } | 2902 | } |
2905 | } | 2903 | } |
2906 | 2904 | ||
2907 | /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ | 2905 | /* |
2906 | * Unsigned subtract and clamp on underflow. | ||
2907 | * | ||
2908 | * Explicitly do a load-store to ensure the intermediate value never hits | ||
2909 | * memory. This allows lockless observations without ever seeing the negative | ||
2910 | * values. | ||
2911 | */ | ||
2912 | #define sub_positive(_ptr, _val) do { \ | ||
2913 | typeof(_ptr) ptr = (_ptr); \ | ||
2914 | typeof(*ptr) val = (_val); \ | ||
2915 | typeof(*ptr) res, var = READ_ONCE(*ptr); \ | ||
2916 | res = var - val; \ | ||
2917 | if (res > var) \ | ||
2918 | res = 0; \ | ||
2919 | WRITE_ONCE(*ptr, res); \ | ||
2920 | } while (0) | ||
2921 | |||
2922 | /** | ||
2923 | * update_cfs_rq_load_avg - update the cfs_rq's load/util averages | ||
2924 | * @now: current time, as per cfs_rq_clock_task() | ||
2925 | * @cfs_rq: cfs_rq to update | ||
2926 | * @update_freq: should we call cfs_rq_util_change() or will the call do so | ||
2927 | * | ||
2928 | * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) | ||
2929 | * avg. The immediate corollary is that all (fair) tasks must be attached, see | ||
2930 | * post_init_entity_util_avg(). | ||
2931 | * | ||
2932 | * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. | ||
2933 | * | ||
2934 | * Returns true if the load decayed or we removed utilization. It is expected | ||
2935 | * that one calls update_tg_load_avg() on this condition, but after you've | ||
2936 | * modified the cfs_rq avg (attach/detach), such that we propagate the new | ||
2937 | * avg up. | ||
2938 | */ | ||
2908 | static inline int | 2939 | static inline int |
2909 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | 2940 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) |
2910 | { | 2941 | { |
@@ -2913,15 +2944,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
2913 | 2944 | ||
2914 | if (atomic_long_read(&cfs_rq->removed_load_avg)) { | 2945 | if (atomic_long_read(&cfs_rq->removed_load_avg)) { |
2915 | s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); | 2946 | s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); |
2916 | sa->load_avg = max_t(long, sa->load_avg - r, 0); | 2947 | sub_positive(&sa->load_avg, r); |
2917 | sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); | 2948 | sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); |
2918 | removed_load = 1; | 2949 | removed_load = 1; |
2919 | } | 2950 | } |
2920 | 2951 | ||
2921 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { | 2952 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { |
2922 | long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); | 2953 | long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); |
2923 | sa->util_avg = max_t(long, sa->util_avg - r, 0); | 2954 | sub_positive(&sa->util_avg, r); |
2924 | sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); | 2955 | sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); |
2925 | removed_util = 1; | 2956 | removed_util = 1; |
2926 | } | 2957 | } |
2927 | 2958 | ||
@@ -2959,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) | |||
2959 | update_tg_load_avg(cfs_rq, 0); | 2990 | update_tg_load_avg(cfs_rq, 0); |
2960 | } | 2991 | } |
2961 | 2992 | ||
2993 | /** | ||
2994 | * attach_entity_load_avg - attach this entity to its cfs_rq load avg | ||
2995 | * @cfs_rq: cfs_rq to attach to | ||
2996 | * @se: sched_entity to attach | ||
2997 | * | ||
2998 | * Must call update_cfs_rq_load_avg() before this, since we rely on | ||
2999 | * cfs_rq->avg.last_update_time being current. | ||
3000 | */ | ||
2962 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3001 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
2963 | { | 3002 | { |
2964 | if (!sched_feat(ATTACH_AGE_LOAD)) | 3003 | if (!sched_feat(ATTACH_AGE_LOAD)) |
@@ -2967,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
2967 | /* | 3006 | /* |
2968 | * If we got migrated (either between CPUs or between cgroups) we'll | 3007 | * If we got migrated (either between CPUs or between cgroups) we'll |
2969 | * have aged the average right before clearing @last_update_time. | 3008 | * have aged the average right before clearing @last_update_time. |
3009 | * | ||
3010 | * Or we're fresh through post_init_entity_util_avg(). | ||
2970 | */ | 3011 | */ |
2971 | if (se->avg.last_update_time) { | 3012 | if (se->avg.last_update_time) { |
2972 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | 3013 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), |
@@ -2988,16 +3029,24 @@ skip_aging: | |||
2988 | cfs_rq_util_change(cfs_rq); | 3029 | cfs_rq_util_change(cfs_rq); |
2989 | } | 3030 | } |
2990 | 3031 | ||
3032 | /** | ||
3033 | * detach_entity_load_avg - detach this entity from its cfs_rq load avg | ||
3034 | * @cfs_rq: cfs_rq to detach from | ||
3035 | * @se: sched_entity to detach | ||
3036 | * | ||
3037 | * Must call update_cfs_rq_load_avg() before this, since we rely on | ||
3038 | * cfs_rq->avg.last_update_time being current. | ||
3039 | */ | ||
2991 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3040 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
2992 | { | 3041 | { |
2993 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | 3042 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), |
2994 | &se->avg, se->on_rq * scale_load_down(se->load.weight), | 3043 | &se->avg, se->on_rq * scale_load_down(se->load.weight), |
2995 | cfs_rq->curr == se, NULL); | 3044 | cfs_rq->curr == se, NULL); |
2996 | 3045 | ||
2997 | cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); | 3046 | sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); |
2998 | cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); | 3047 | sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); |
2999 | cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); | 3048 | sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); |
3000 | cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); | 3049 | sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); |
3001 | 3050 | ||
3002 | cfs_rq_util_change(cfs_rq); | 3051 | cfs_rq_util_change(cfs_rq); |
3003 | } | 3052 | } |
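sub_positive() above replaces the max_t()-based clamping in detach_entity_load_avg() and update_cfs_rq_load_avg(): an unsigned subtract that clamps to zero on underflow, with an explicit load and store so a lockless reader never sees a wrapped intermediate value. A userspace adaptation that drops READ_ONCE()/WRITE_ONCE() (plain accesses here), so only the clamping behaviour is demonstrated:

    #include <stdio.h>

    /* Userspace sketch of sub_positive(): subtract, clamp to 0 on underflow. */
    #define sub_positive(_ptr, _val) do {                   \
        typeof(_ptr) ptr = (_ptr);                          \
        typeof(*ptr) val = (_val);                          \
        typeof(*ptr) res, var = *ptr;                       \
        res = var - val;                                    \
        if (res > var)          /* unsigned wrap-around */  \
            res = 0;                                        \
        *ptr = res;                                         \
    } while (0)

    int main(void)
    {
        unsigned long load_avg = 100;

        sub_positive(&load_avg, 30UL);   /* 100 - 30 -> 70 */
        printf("%lu\n", load_avg);

        sub_positive(&load_avg, 500UL);  /* would wrap -> clamped to 0 */
        printf("%lu\n", load_avg);
        return 0;
    }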
@@ -3072,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se) | |||
3072 | u64 last_update_time; | 3121 | u64 last_update_time; |
3073 | 3122 | ||
3074 | /* | 3123 | /* |
3075 | * Newly created task or never used group entity should not be removed | 3124 | * tasks cannot exit without having gone through wake_up_new_task() -> |
3076 | * from its (source) cfs_rq | 3125 | * post_init_entity_util_avg() which will have added things to the |
3126 | * cfs_rq, so we can remove unconditionally. | ||
3127 | * | ||
3128 | * Similarly for groups, they will have passed through | ||
3129 | * post_init_entity_util_avg() before unregister_sched_fair_group() | ||
3130 | * calls this. | ||
3077 | */ | 3131 | */ |
3078 | if (se->avg.last_update_time == 0) | ||
3079 | return; | ||
3080 | 3132 | ||
3081 | last_update_time = cfs_rq_last_update_time(cfs_rq); | 3133 | last_update_time = cfs_rq_last_update_time(cfs_rq); |
3082 | 3134 | ||
@@ -3099,6 +3151,12 @@ static int idle_balance(struct rq *this_rq); | |||
3099 | 3151 | ||
3100 | #else /* CONFIG_SMP */ | 3152 | #else /* CONFIG_SMP */ |
3101 | 3153 | ||
3154 | static inline int | ||
3155 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | ||
3156 | { | ||
3157 | return 0; | ||
3158 | } | ||
3159 | |||
3102 | static inline void update_load_avg(struct sched_entity *se, int not_used) | 3160 | static inline void update_load_avg(struct sched_entity *se, int not_used) |
3103 | { | 3161 | { |
3104 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3162 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
@@ -3246,7 +3304,7 @@ static inline void check_schedstat_required(void) | |||
3246 | trace_sched_stat_iowait_enabled() || | 3304 | trace_sched_stat_iowait_enabled() || |
3247 | trace_sched_stat_blocked_enabled() || | 3305 | trace_sched_stat_blocked_enabled() || |
3248 | trace_sched_stat_runtime_enabled()) { | 3306 | trace_sched_stat_runtime_enabled()) { |
3249 | pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " | 3307 | printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " |
3250 | "stat_blocked and stat_runtime require the " | 3308 | "stat_blocked and stat_runtime require the " |
3251 | "kernel parameter schedstats=enabled or " | 3309 | "kernel parameter schedstats=enabled or " |
3252 | "kernel.sched_schedstats=1\n"); | 3310 | "kernel.sched_schedstats=1\n"); |
@@ -3688,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | |||
3688 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | 3746 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) |
3689 | { | 3747 | { |
3690 | if (unlikely(cfs_rq->throttle_count)) | 3748 | if (unlikely(cfs_rq->throttle_count)) |
3691 | return cfs_rq->throttled_clock_task; | 3749 | return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; |
3692 | 3750 | ||
3693 | return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; | 3751 | return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; |
3694 | } | 3752 | } |
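With the change above, a throttled cfs_rq reports throttled_clock_task - throttled_clock_task_time, so the task clock freezes at the value it had when throttling began instead of jumping back over previously accumulated throttled time. A quick numeric sketch (field names follow the hunk; the values are invented and rq_clock_task is passed in rather than read from the runqueue):

    #include <stdio.h>

    typedef unsigned long long u64;

    struct cfs_rq {
        int throttle_count;
        u64 throttled_clock_task;       /* rq task clock when throttling began */
        u64 throttled_clock_task_time;  /* total time spent throttled so far  */
    };

    static u64 cfs_rq_clock_task(const struct cfs_rq *cfs_rq, u64 rq_clock_task)
    {
        if (cfs_rq->throttle_count)
            return cfs_rq->throttled_clock_task -
                   cfs_rq->throttled_clock_task_time;

        return rq_clock_task - cfs_rq->throttled_clock_task_time;
    }

    int main(void)
    {
        struct cfs_rq rq = {
            .throttled_clock_task      = 1000,
            .throttled_clock_task_time = 200,
        };

        printf("running:   %llu\n", cfs_rq_clock_task(&rq, 1000)); /* 800 */

        rq.throttle_count = 1;                 /* clock now frozen at 800 */
        printf("throttled: %llu\n", cfs_rq_clock_task(&rq, 1500)); /* 800 */
        return 0;
    }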
@@ -3826,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) | |||
3826 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | 3884 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; |
3827 | 3885 | ||
3828 | cfs_rq->throttle_count--; | 3886 | cfs_rq->throttle_count--; |
3829 | #ifdef CONFIG_SMP | ||
3830 | if (!cfs_rq->throttle_count) { | 3887 | if (!cfs_rq->throttle_count) { |
3831 | /* adjust cfs_rq_clock_task() */ | 3888 | /* adjust cfs_rq_clock_task() */ |
3832 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - | 3889 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - |
3833 | cfs_rq->throttled_clock_task; | 3890 | cfs_rq->throttled_clock_task; |
3834 | } | 3891 | } |
3835 | #endif | ||
3836 | 3892 | ||
3837 | return 0; | 3893 | return 0; |
3838 | } | 3894 | } |
@@ -4199,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | |||
4199 | throttle_cfs_rq(cfs_rq); | 4255 | throttle_cfs_rq(cfs_rq); |
4200 | } | 4256 | } |
4201 | 4257 | ||
4258 | static void sync_throttle(struct task_group *tg, int cpu) | ||
4259 | { | ||
4260 | struct cfs_rq *pcfs_rq, *cfs_rq; | ||
4261 | |||
4262 | if (!cfs_bandwidth_used()) | ||
4263 | return; | ||
4264 | |||
4265 | if (!tg->parent) | ||
4266 | return; | ||
4267 | |||
4268 | cfs_rq = tg->cfs_rq[cpu]; | ||
4269 | pcfs_rq = tg->parent->cfs_rq[cpu]; | ||
4270 | |||
4271 | cfs_rq->throttle_count = pcfs_rq->throttle_count; | ||
4272 | pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); | ||
4273 | } | ||
4274 | |||
4202 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | 4275 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ |
4203 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 4276 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
4204 | { | 4277 | { |
@@ -4338,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | |||
4338 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} | 4411 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} |
4339 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } | 4412 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } |
4340 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 4413 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} |
4414 | static inline void sync_throttle(struct task_group *tg, int cpu) {} | ||
4341 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 4415 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
4342 | 4416 | ||
4343 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | 4417 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) |
@@ -4446,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4446 | * | 4520 | * |
4447 | * note: in the case of encountering a throttled cfs_rq we will | 4521 | * note: in the case of encountering a throttled cfs_rq we will |
4448 | * post the final h_nr_running increment below. | 4522 | * post the final h_nr_running increment below. |
4449 | */ | 4523 | */ |
4450 | if (cfs_rq_throttled(cfs_rq)) | 4524 | if (cfs_rq_throttled(cfs_rq)) |
4451 | break; | 4525 | break; |
4452 | cfs_rq->h_nr_running++; | 4526 | cfs_rq->h_nr_running++; |
@@ -4500,15 +4574,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4500 | 4574 | ||
4501 | /* Don't dequeue parent if it has other entities besides us */ | 4575 | /* Don't dequeue parent if it has other entities besides us */ |
4502 | if (cfs_rq->load.weight) { | 4576 | if (cfs_rq->load.weight) { |
4577 | /* Avoid re-evaluating load for this entity: */ | ||
4578 | se = parent_entity(se); | ||
4503 | /* | 4579 | /* |
4504 | * Bias pick_next to pick a task from this cfs_rq, as | 4580 | * Bias pick_next to pick a task from this cfs_rq, as |
4505 | * p is sleeping when it is within its sched_slice. | 4581 | * p is sleeping when it is within its sched_slice. |
4506 | */ | 4582 | */ |
4507 | if (task_sleep && parent_entity(se)) | 4583 | if (task_sleep && se && !throttled_hierarchy(cfs_rq)) |
4508 | set_next_buddy(parent_entity(se)); | 4584 | set_next_buddy(se); |
4509 | |||
4510 | /* avoid re-evaluating load for this entity */ | ||
4511 | se = parent_entity(se); | ||
4512 | break; | 4585 | break; |
4513 | } | 4586 | } |
4514 | flags |= DEQUEUE_SLEEP; | 4587 | flags |= DEQUEUE_SLEEP; |
@@ -4910,19 +4983,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
4910 | return wl; | 4983 | return wl; |
4911 | 4984 | ||
4912 | for_each_sched_entity(se) { | 4985 | for_each_sched_entity(se) { |
4913 | long w, W; | 4986 | struct cfs_rq *cfs_rq = se->my_q; |
4987 | long W, w = cfs_rq_load_avg(cfs_rq); | ||
4914 | 4988 | ||
4915 | tg = se->my_q->tg; | 4989 | tg = cfs_rq->tg; |
4916 | 4990 | ||
4917 | /* | 4991 | /* |
4918 | * W = @wg + \Sum rw_j | 4992 | * W = @wg + \Sum rw_j |
4919 | */ | 4993 | */ |
4920 | W = wg + calc_tg_weight(tg, se->my_q); | 4994 | W = wg + atomic_long_read(&tg->load_avg); |
4995 | |||
4996 | /* Ensure \Sum rw_j >= rw_i */ | ||
4997 | W -= cfs_rq->tg_load_avg_contrib; | ||
4998 | W += w; | ||
4921 | 4999 | ||
4922 | /* | 5000 | /* |
4923 | * w = rw_i + @wl | 5001 | * w = rw_i + @wl |
4924 | */ | 5002 | */ |
4925 | w = cfs_rq_load_avg(se->my_q) + wl; | 5003 | w += wl; |
4926 | 5004 | ||
4927 | /* | 5005 | /* |
4928 | * wl = S * s'_i; see (2) | 5006 | * wl = S * s'_i; see (2) |
@@ -8283,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p) | |||
8283 | { | 8361 | { |
8284 | struct cfs_rq *cfs_rq; | 8362 | struct cfs_rq *cfs_rq; |
8285 | struct sched_entity *se = &p->se, *curr; | 8363 | struct sched_entity *se = &p->se, *curr; |
8286 | int this_cpu = smp_processor_id(); | ||
8287 | struct rq *rq = this_rq(); | 8364 | struct rq *rq = this_rq(); |
8288 | unsigned long flags; | ||
8289 | |||
8290 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8291 | 8365 | ||
8366 | raw_spin_lock(&rq->lock); | ||
8292 | update_rq_clock(rq); | 8367 | update_rq_clock(rq); |
8293 | 8368 | ||
8294 | cfs_rq = task_cfs_rq(current); | 8369 | cfs_rq = task_cfs_rq(current); |
8295 | curr = cfs_rq->curr; | 8370 | curr = cfs_rq->curr; |
8296 | 8371 | if (curr) { | |
8297 | /* | 8372 | update_curr(cfs_rq); |
8298 | * Not only the cpu but also the task_group of the parent might have | ||
8299 | * been changed after parent->se.parent,cfs_rq were copied to | ||
8300 | * child->se.parent,cfs_rq. So call __set_task_cpu() to make those | ||
8301 | * of child point to valid ones. | ||
8302 | */ | ||
8303 | rcu_read_lock(); | ||
8304 | __set_task_cpu(p, this_cpu); | ||
8305 | rcu_read_unlock(); | ||
8306 | |||
8307 | update_curr(cfs_rq); | ||
8308 | |||
8309 | if (curr) | ||
8310 | se->vruntime = curr->vruntime; | 8373 | se->vruntime = curr->vruntime; |
8374 | } | ||
8311 | place_entity(cfs_rq, se, 1); | 8375 | place_entity(cfs_rq, se, 1); |
8312 | 8376 | ||
8313 | if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { | 8377 | if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { |
@@ -8320,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p) | |||
8320 | } | 8384 | } |
8321 | 8385 | ||
8322 | se->vruntime -= cfs_rq->min_vruntime; | 8386 | se->vruntime -= cfs_rq->min_vruntime; |
8323 | 8387 | raw_spin_unlock(&rq->lock); | |
8324 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8325 | } | 8388 | } |
8326 | 8389 | ||
8327 | /* | 8390 | /* |
@@ -8377,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
8377 | { | 8440 | { |
8378 | struct sched_entity *se = &p->se; | 8441 | struct sched_entity *se = &p->se; |
8379 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8442 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
8443 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
8444 | int tg_update; | ||
8380 | 8445 | ||
8381 | if (!vruntime_normalized(p)) { | 8446 | if (!vruntime_normalized(p)) { |
8382 | /* | 8447 | /* |
@@ -8388,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
8388 | } | 8453 | } |
8389 | 8454 | ||
8390 | /* Catch up with the cfs_rq and remove our load when we leave */ | 8455 | /* Catch up with the cfs_rq and remove our load when we leave */ |
8456 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | ||
8391 | detach_entity_load_avg(cfs_rq, se); | 8457 | detach_entity_load_avg(cfs_rq, se); |
8458 | if (tg_update) | ||
8459 | update_tg_load_avg(cfs_rq, false); | ||
8392 | } | 8460 | } |
8393 | 8461 | ||
8394 | static void attach_task_cfs_rq(struct task_struct *p) | 8462 | static void attach_task_cfs_rq(struct task_struct *p) |
8395 | { | 8463 | { |
8396 | struct sched_entity *se = &p->se; | 8464 | struct sched_entity *se = &p->se; |
8397 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8465 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
8466 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
8467 | int tg_update; | ||
8398 | 8468 | ||
8399 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8469 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8400 | /* | 8470 | /* |
@@ -8405,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
8405 | #endif | 8475 | #endif |
8406 | 8476 | ||
8407 | /* Synchronize task with its cfs_rq */ | 8477 | /* Synchronize task with its cfs_rq */ |
8478 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | ||
8408 | attach_entity_load_avg(cfs_rq, se); | 8479 | attach_entity_load_avg(cfs_rq, se); |
8480 | if (tg_update) | ||
8481 | update_tg_load_avg(cfs_rq, false); | ||
8409 | 8482 | ||
8410 | if (!vruntime_normalized(p)) | 8483 | if (!vruntime_normalized(p)) |
8411 | se->vruntime += cfs_rq->min_vruntime; | 8484 | se->vruntime += cfs_rq->min_vruntime; |
@@ -8465,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
8465 | } | 8538 | } |
8466 | 8539 | ||
8467 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8540 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8541 | static void task_set_group_fair(struct task_struct *p) | ||
8542 | { | ||
8543 | struct sched_entity *se = &p->se; | ||
8544 | |||
8545 | set_task_rq(p, task_cpu(p)); | ||
8546 | se->depth = se->parent ? se->parent->depth + 1 : 0; | ||
8547 | } | ||
8548 | |||
8468 | static void task_move_group_fair(struct task_struct *p) | 8549 | static void task_move_group_fair(struct task_struct *p) |
8469 | { | 8550 | { |
8470 | detach_task_cfs_rq(p); | 8551 | detach_task_cfs_rq(p); |
@@ -8477,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p) | |||
8477 | attach_task_cfs_rq(p); | 8558 | attach_task_cfs_rq(p); |
8478 | } | 8559 | } |
8479 | 8560 | ||
8561 | static void task_change_group_fair(struct task_struct *p, int type) | ||
8562 | { | ||
8563 | switch (type) { | ||
8564 | case TASK_SET_GROUP: | ||
8565 | task_set_group_fair(p); | ||
8566 | break; | ||
8567 | |||
8568 | case TASK_MOVE_GROUP: | ||
8569 | task_move_group_fair(p); | ||
8570 | break; | ||
8571 | } | ||
8572 | } | ||
8573 | |||
8480 | void free_fair_sched_group(struct task_group *tg) | 8574 | void free_fair_sched_group(struct task_group *tg) |
8481 | { | 8575 | { |
8482 | int i; | 8576 | int i; |
@@ -8496,8 +8590,9 @@ void free_fair_sched_group(struct task_group *tg) | |||
8496 | 8590 | ||
8497 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | 8591 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) |
8498 | { | 8592 | { |
8499 | struct cfs_rq *cfs_rq; | ||
8500 | struct sched_entity *se; | 8593 | struct sched_entity *se; |
8594 | struct cfs_rq *cfs_rq; | ||
8595 | struct rq *rq; | ||
8501 | int i; | 8596 | int i; |
8502 | 8597 | ||
8503 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 8598 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8512,6 +8607,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8512 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | 8607 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); |
8513 | 8608 | ||
8514 | for_each_possible_cpu(i) { | 8609 | for_each_possible_cpu(i) { |
8610 | rq = cpu_rq(i); | ||
8611 | |||
8515 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8612 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8516 | GFP_KERNEL, cpu_to_node(i)); | 8613 | GFP_KERNEL, cpu_to_node(i)); |
8517 | if (!cfs_rq) | 8614 | if (!cfs_rq) |
@@ -8525,7 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8525 | init_cfs_rq(cfs_rq); | 8622 | init_cfs_rq(cfs_rq); |
8526 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | 8623 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8527 | init_entity_runnable_average(se); | 8624 | init_entity_runnable_average(se); |
8528 | post_init_entity_util_avg(se); | ||
8529 | } | 8625 | } |
8530 | 8626 | ||
8531 | return 1; | 8627 | return 1; |
@@ -8536,6 +8632,23 @@ err: | |||
8536 | return 0; | 8632 | return 0; |
8537 | } | 8633 | } |
8538 | 8634 | ||
8635 | void online_fair_sched_group(struct task_group *tg) | ||
8636 | { | ||
8637 | struct sched_entity *se; | ||
8638 | struct rq *rq; | ||
8639 | int i; | ||
8640 | |||
8641 | for_each_possible_cpu(i) { | ||
8642 | rq = cpu_rq(i); | ||
8643 | se = tg->se[i]; | ||
8644 | |||
8645 | raw_spin_lock_irq(&rq->lock); | ||
8646 | post_init_entity_util_avg(se); | ||
8647 | sync_throttle(tg, i); | ||
8648 | raw_spin_unlock_irq(&rq->lock); | ||
8649 | } | ||
8650 | } | ||
8651 | |||
8539 | void unregister_fair_sched_group(struct task_group *tg) | 8652 | void unregister_fair_sched_group(struct task_group *tg) |
8540 | { | 8653 | { |
8541 | unsigned long flags; | 8654 | unsigned long flags; |
@@ -8640,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8640 | return 1; | 8753 | return 1; |
8641 | } | 8754 | } |
8642 | 8755 | ||
8756 | void online_fair_sched_group(struct task_group *tg) { } | ||
8757 | |||
8643 | void unregister_fair_sched_group(struct task_group *tg) { } | 8758 | void unregister_fair_sched_group(struct task_group *tg) { } |
8644 | 8759 | ||
8645 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8760 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
@@ -8699,7 +8814,7 @@ const struct sched_class fair_sched_class = { | |||
8699 | .update_curr = update_curr_fair, | 8814 | .update_curr = update_curr_fair, |
8700 | 8815 | ||
8701 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8816 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8702 | .task_move_group = task_move_group_fair, | 8817 | .task_change_group = task_change_group_fair, |
8703 | #endif | 8818 | #endif |
8704 | }; | 8819 | }; |
8705 | 8820 | ||
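Editor's note on the effective_load() hunk above: it stops calling calc_tg_weight() and instead reads tg->load_avg directly, then corrects it so the group-wide sum can never be smaller than the freshly sampled load of the current cfs_rq. Restated with the notation the hunk's own comments use (rw_i^{contrib} is this sketch's name for cfs_rq->tg_load_avg_contrib, not notation from the source):

    W = w_g + \sum_j rw_j - rw_i^{contrib} + rw_i
    w = rw_i + w_l

In words: the stale per-cfs_rq contribution inside \sum_j rw_j is swapped for the up-to-date cfs_rq_load_avg() value before W and w feed the existing wl = S * s'_i step.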
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index bd12c6c714ec..9fb873cfc75c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -127,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, | |||
127 | */ | 127 | */ |
128 | static void cpuidle_idle_call(void) | 128 | static void cpuidle_idle_call(void) |
129 | { | 129 | { |
130 | struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); | 130 | struct cpuidle_device *dev = cpuidle_get_device(); |
131 | struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); | 131 | struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); |
132 | int next_state, entered_state; | 132 | int next_state, entered_state; |
133 | 133 | ||
@@ -201,6 +201,8 @@ exit_idle: | |||
201 | */ | 201 | */ |
202 | static void cpu_idle_loop(void) | 202 | static void cpu_idle_loop(void) |
203 | { | 203 | { |
204 | int cpu = smp_processor_id(); | ||
205 | |||
204 | while (1) { | 206 | while (1) { |
205 | /* | 207 | /* |
206 | * If the arch has a polling bit, we maintain an invariant: | 208 | * If the arch has a polling bit, we maintain an invariant: |
@@ -219,7 +221,7 @@ static void cpu_idle_loop(void) | |||
219 | check_pgt_cache(); | 221 | check_pgt_cache(); |
220 | rmb(); | 222 | rmb(); |
221 | 223 | ||
222 | if (cpu_is_offline(smp_processor_id())) { | 224 | if (cpu_is_offline(cpu)) { |
223 | cpuhp_report_idle_dead(); | 225 | cpuhp_report_idle_dead(); |
224 | arch_cpu_idle_dead(); | 226 | arch_cpu_idle_dead(); |
225 | } | 227 | } |
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index b0b93fd33af9..a2d6eb71f06b 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c | |||
@@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
78 | loads[2] = (avenrun[2] + offset) << shift; | 78 | loads[2] = (avenrun[2] + offset) << shift; |
79 | } | 79 | } |
80 | 80 | ||
81 | long calc_load_fold_active(struct rq *this_rq) | 81 | long calc_load_fold_active(struct rq *this_rq, long adjust) |
82 | { | 82 | { |
83 | long nr_active, delta = 0; | 83 | long nr_active, delta = 0; |
84 | 84 | ||
85 | nr_active = this_rq->nr_running; | 85 | nr_active = this_rq->nr_running - adjust; |
86 | nr_active += (long)this_rq->nr_uninterruptible; | 86 | nr_active += (long)this_rq->nr_uninterruptible; |
87 | 87 | ||
88 | if (nr_active != this_rq->calc_load_active) { | 88 | if (nr_active != this_rq->calc_load_active) { |
@@ -188,7 +188,7 @@ void calc_load_enter_idle(void) | |||
188 | * We're going into NOHZ mode, if there's any pending delta, fold it | 188 | * We're going into NOHZ mode, if there's any pending delta, fold it |
189 | * into the pending idle delta. | 189 | * into the pending idle delta. |
190 | */ | 190 | */ |
191 | delta = calc_load_fold_active(this_rq); | 191 | delta = calc_load_fold_active(this_rq, 0); |
192 | if (delta) { | 192 | if (delta) { |
193 | int idx = calc_load_write_idx(); | 193 | int idx = calc_load_write_idx(); |
194 | 194 | ||
@@ -389,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq) | |||
389 | if (time_before(jiffies, this_rq->calc_load_update)) | 389 | if (time_before(jiffies, this_rq->calc_load_update)) |
390 | return; | 390 | return; |
391 | 391 | ||
392 | delta = calc_load_fold_active(this_rq); | 392 | delta = calc_load_fold_active(this_rq, 0); |
393 | if (delta) | 393 | if (delta) |
394 | atomic_long_add(delta, &calc_load_tasks); | 394 | atomic_long_add(delta, &calc_load_tasks); |
395 | 395 | ||
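Editor's note: calc_load_fold_active() now takes an adjust argument that is subtracted from nr_running before the fold; the call sites in this file pass 0, so behaviour here is unchanged, and the parameter simply lets a caller discount tasks that should not be counted. A minimal userspace mock of the fold arithmetic (struct and field names invented for this sketch, not from the patch):

    #include <stdio.h>

    /* Mock of the per-runqueue fields the fold reads and updates. */
    struct mock_rq {
        long nr_running;
        long nr_uninterruptible;
        long calc_load_active;   /* value folded last time */
    };

    /* Mirrors the new calc_load_fold_active(rq, adjust) arithmetic. */
    static long fold_active(struct mock_rq *rq, long adjust)
    {
        long nr_active, delta = 0;

        nr_active = rq->nr_running - adjust;
        nr_active += rq->nr_uninterruptible;

        if (nr_active != rq->calc_load_active) {
            delta = nr_active - rq->calc_load_active;
            rq->calc_load_active = nr_active;
        }
        return delta;
    }

    int main(void)
    {
        struct mock_rq rq = { .nr_running = 3, .nr_uninterruptible = 1,
                              .calc_load_active = 2 };

        printf("delta (adjust=0): %ld\n", fold_active(&rq, 0)); /* 4 - 2 =  2 */
        printf("delta (adjust=1): %ld\n", fold_active(&rq, 1)); /* 3 - 4 = -1 */
        return 0;
    }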
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..c64fc5114004 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -28,7 +28,7 @@ extern unsigned long calc_load_update; | |||
28 | extern atomic_long_t calc_load_tasks; | 28 | extern atomic_long_t calc_load_tasks; |
29 | 29 | ||
30 | extern void calc_global_load_tick(struct rq *this_rq); | 30 | extern void calc_global_load_tick(struct rq *this_rq); |
31 | extern long calc_load_fold_active(struct rq *this_rq); | 31 | extern long calc_load_fold_active(struct rq *this_rq, long adjust); |
32 | 32 | ||
33 | #ifdef CONFIG_SMP | 33 | #ifdef CONFIG_SMP |
34 | extern void cpu_load_update_active(struct rq *this_rq); | 34 | extern void cpu_load_update_active(struct rq *this_rq); |
@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); | |||
321 | 321 | ||
322 | extern void free_fair_sched_group(struct task_group *tg); | 322 | extern void free_fair_sched_group(struct task_group *tg); |
323 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); | 323 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); |
324 | extern void online_fair_sched_group(struct task_group *tg); | ||
324 | extern void unregister_fair_sched_group(struct task_group *tg); | 325 | extern void unregister_fair_sched_group(struct task_group *tg); |
325 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 326 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
326 | struct sched_entity *se, int cpu, | 327 | struct sched_entity *se, int cpu, |
@@ -1113,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
1113 | * In particular, the load of prev->state in finish_task_switch() must | 1114 | * In particular, the load of prev->state in finish_task_switch() must |
1114 | * happen before this. | 1115 | * happen before this. |
1115 | * | 1116 | * |
1116 | * Pairs with the smp_cond_acquire() in try_to_wake_up(). | 1117 | * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). |
1117 | */ | 1118 | */ |
1118 | smp_store_release(&prev->on_cpu, 0); | 1119 | smp_store_release(&prev->on_cpu, 0); |
1119 | #endif | 1120 | #endif |
@@ -1246,8 +1247,11 @@ struct sched_class { | |||
1246 | 1247 | ||
1247 | void (*update_curr) (struct rq *rq); | 1248 | void (*update_curr) (struct rq *rq); |
1248 | 1249 | ||
1250 | #define TASK_SET_GROUP 0 | ||
1251 | #define TASK_MOVE_GROUP 1 | ||
1252 | |||
1249 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1253 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1250 | void (*task_move_group) (struct task_struct *p); | 1254 | void (*task_change_group) (struct task_struct *p, int type); |
1251 | #endif | 1255 | #endif |
1252 | }; | 1256 | }; |
1253 | 1257 | ||
@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {} | |||
1809 | #else /* arch_scale_freq_capacity */ | 1813 | #else /* arch_scale_freq_capacity */ |
1810 | #define arch_scale_freq_invariant() (false) | 1814 | #define arch_scale_freq_invariant() (false) |
1811 | #endif | 1815 | #endif |
1812 | |||
1813 | static inline void account_reset_rq(struct rq *rq) | ||
1814 | { | ||
1815 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1816 | rq->prev_irq_time = 0; | ||
1817 | #endif | ||
1818 | #ifdef CONFIG_PARAVIRT | ||
1819 | rq->prev_steal_time = 0; | ||
1820 | #endif | ||
1821 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
1822 | rq->prev_steal_time_rq = 0; | ||
1823 | #endif | ||
1824 | } | ||
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 70b3b6a20fb0..78955cbea31c 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -33,6 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
33 | # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) | 33 | # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) |
34 | # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) | 34 | # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) |
35 | # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | 35 | # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) |
36 | # define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) | ||
37 | |||
36 | #else /* !CONFIG_SCHEDSTATS */ | 38 | #else /* !CONFIG_SCHEDSTATS */ |
37 | static inline void | 39 | static inline void |
38 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | 40 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) |
@@ -47,6 +49,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
47 | # define schedstat_inc(rq, field) do { } while (0) | 49 | # define schedstat_inc(rq, field) do { } while (0) |
48 | # define schedstat_add(rq, field, amt) do { } while (0) | 50 | # define schedstat_add(rq, field, amt) do { } while (0) |
49 | # define schedstat_set(var, val) do { } while (0) | 51 | # define schedstat_set(var, val) do { } while (0) |
52 | # define schedstat_val(rq, field) 0 | ||
50 | #endif | 53 | #endif |
51 | 54 | ||
52 | #ifdef CONFIG_SCHED_INFO | 55 | #ifdef CONFIG_SCHED_INFO |
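Editor's note: the new schedstat_val() macro gives readers of schedstat fields a single expression that evaluates to the field when schedstats are enabled and to 0 otherwise, including on !CONFIG_SCHEDSTATS builds. A toy userspace mock of the same pattern (MOCK_SCHEDSTATS and the runtime flag are stand-ins invented for this sketch):

    #include <stdio.h>

    /* Build-time switch standing in for CONFIG_SCHEDSTATS. */
    #define MOCK_SCHEDSTATS 1

    static int schedstat_runtime_enabled;   /* stands in for schedstat_enabled() */

    #if MOCK_SCHEDSTATS
    # define schedstat_val(rq, field)  (schedstat_runtime_enabled ? (rq)->field : 0)
    #else
    # define schedstat_val(rq, field)  0
    #endif

    struct mock_rq { unsigned long yld_count; };

    int main(void)
    {
        struct mock_rq rq = { .yld_count = 42 };

        printf("disabled: %lu\n", (unsigned long)schedstat_val(&rq, yld_count)); /* 0  */
        schedstat_runtime_enabled = 1;
        printf("enabled:  %lu\n", (unsigned long)schedstat_val(&rq, yld_count)); /* 42 */
        return 0;
    }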
diff --git a/kernel/signal.c b/kernel/signal.c index 96e9bc40667f..af21afc00d08 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) | |||
2751 | * @ts: upper bound on process time suspension | 2751 | * @ts: upper bound on process time suspension |
2752 | */ | 2752 | */ |
2753 | int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | 2753 | int do_sigtimedwait(const sigset_t *which, siginfo_t *info, |
2754 | const struct timespec *ts) | 2754 | const struct timespec *ts) |
2755 | { | 2755 | { |
2756 | ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX }; | ||
2756 | struct task_struct *tsk = current; | 2757 | struct task_struct *tsk = current; |
2757 | long timeout = MAX_SCHEDULE_TIMEOUT; | ||
2758 | sigset_t mask = *which; | 2758 | sigset_t mask = *which; |
2759 | int sig; | 2759 | int sig, ret = 0; |
2760 | 2760 | ||
2761 | if (ts) { | 2761 | if (ts) { |
2762 | if (!timespec_valid(ts)) | 2762 | if (!timespec_valid(ts)) |
2763 | return -EINVAL; | 2763 | return -EINVAL; |
2764 | timeout = timespec_to_jiffies(ts); | 2764 | timeout = timespec_to_ktime(*ts); |
2765 | /* | 2765 | to = &timeout; |
2766 | * We can be close to the next tick, add another one | ||
2767 | * to ensure we will wait at least the time asked for. | ||
2768 | */ | ||
2769 | if (ts->tv_sec || ts->tv_nsec) | ||
2770 | timeout++; | ||
2771 | } | 2766 | } |
2772 | 2767 | ||
2773 | /* | 2768 | /* |
@@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | |||
2778 | 2773 | ||
2779 | spin_lock_irq(&tsk->sighand->siglock); | 2774 | spin_lock_irq(&tsk->sighand->siglock); |
2780 | sig = dequeue_signal(tsk, &mask, info); | 2775 | sig = dequeue_signal(tsk, &mask, info); |
2781 | if (!sig && timeout) { | 2776 | if (!sig && timeout.tv64) { |
2782 | /* | 2777 | /* |
2783 | * None ready, temporarily unblock those we're interested | 2778 | * None ready, temporarily unblock those we're interested |
2784 | * while we are sleeping in so that we'll be awakened when | 2779 | * while we are sleeping in so that we'll be awakened when |
@@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | |||
2790 | recalc_sigpending(); | 2785 | recalc_sigpending(); |
2791 | spin_unlock_irq(&tsk->sighand->siglock); | 2786 | spin_unlock_irq(&tsk->sighand->siglock); |
2792 | 2787 | ||
2793 | timeout = freezable_schedule_timeout_interruptible(timeout); | 2788 | __set_current_state(TASK_INTERRUPTIBLE); |
2794 | 2789 | ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, | |
2790 | HRTIMER_MODE_REL); | ||
2795 | spin_lock_irq(&tsk->sighand->siglock); | 2791 | spin_lock_irq(&tsk->sighand->siglock); |
2796 | __set_task_blocked(tsk, &tsk->real_blocked); | 2792 | __set_task_blocked(tsk, &tsk->real_blocked); |
2797 | sigemptyset(&tsk->real_blocked); | 2793 | sigemptyset(&tsk->real_blocked); |
@@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | |||
2801 | 2797 | ||
2802 | if (sig) | 2798 | if (sig) |
2803 | return sig; | 2799 | return sig; |
2804 | return timeout ? -EINTR : -EAGAIN; | 2800 | return ret ? -EINTR : -EAGAIN; |
2805 | } | 2801 | } |
2806 | 2802 | ||
2807 | /** | 2803 | /** |
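Editor's note: the do_sigtimedwait() hunk above replaces the jiffies-based schedule_timeout() wait (which padded non-zero timeouts by one tick) with freezable_schedule_hrtimeout_range() plus the task's timer slack, so the timeout handed to sigtimedwait(2) is no longer quantized to whole ticks. A small userspace probe, not part of the patch, that drives this path and reports how long the wait actually took:

    #define _POSIX_C_SOURCE 200809L
    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        sigset_t set;
        siginfo_t info;
        struct timespec timeout = { .tv_sec = 0, .tv_nsec = 5 * 1000 * 1000 }; /* 5 ms */
        struct timespec t0, t1;
        int ret;

        /* Wait for a signal that is never sent, so we always hit the timeout. */
        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);
        sigprocmask(SIG_BLOCK, &set, NULL);

        clock_gettime(CLOCK_MONOTONIC, &t0);
        ret = sigtimedwait(&set, &info, &timeout);   /* expect -1 with EAGAIN */
        clock_gettime(CLOCK_MONOTONIC, &t1);

        printf("sigtimedwait() = %d, slept %.3f ms\n", ret,
               (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) / 1e6);
        return 0;
    }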
diff --git a/kernel/smp.c b/kernel/smp.c index 74165443c240..3aa642d39c03 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); | |||
33 | 33 | ||
34 | static void flush_smp_call_function_queue(bool warn_cpu_offline); | 34 | static void flush_smp_call_function_queue(bool warn_cpu_offline); |
35 | 35 | ||
36 | static int | 36 | int smpcfd_prepare_cpu(unsigned int cpu) |
37 | hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
38 | { | 37 | { |
39 | long cpu = (long)hcpu; | ||
40 | struct call_function_data *cfd = &per_cpu(cfd_data, cpu); | 38 | struct call_function_data *cfd = &per_cpu(cfd_data, cpu); |
41 | 39 | ||
42 | switch (action) { | 40 | if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, |
43 | case CPU_UP_PREPARE: | 41 | cpu_to_node(cpu))) |
44 | case CPU_UP_PREPARE_FROZEN: | 42 | return -ENOMEM; |
45 | if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, | 43 | cfd->csd = alloc_percpu(struct call_single_data); |
46 | cpu_to_node(cpu))) | 44 | if (!cfd->csd) { |
47 | return notifier_from_errno(-ENOMEM); | ||
48 | cfd->csd = alloc_percpu(struct call_single_data); | ||
49 | if (!cfd->csd) { | ||
50 | free_cpumask_var(cfd->cpumask); | ||
51 | return notifier_from_errno(-ENOMEM); | ||
52 | } | ||
53 | break; | ||
54 | |||
55 | #ifdef CONFIG_HOTPLUG_CPU | ||
56 | case CPU_UP_CANCELED: | ||
57 | case CPU_UP_CANCELED_FROZEN: | ||
58 | /* Fall-through to the CPU_DEAD[_FROZEN] case. */ | ||
59 | |||
60 | case CPU_DEAD: | ||
61 | case CPU_DEAD_FROZEN: | ||
62 | free_cpumask_var(cfd->cpumask); | 45 | free_cpumask_var(cfd->cpumask); |
63 | free_percpu(cfd->csd); | 46 | return -ENOMEM; |
64 | break; | 47 | } |
65 | 48 | ||
66 | case CPU_DYING: | 49 | return 0; |
67 | case CPU_DYING_FROZEN: | 50 | } |
68 | /* | 51 | |
69 | * The IPIs for the smp-call-function callbacks queued by other | 52 | int smpcfd_dead_cpu(unsigned int cpu) |
70 | * CPUs might arrive late, either due to hardware latencies or | 53 | { |
71 | * because this CPU disabled interrupts (inside stop-machine) | 54 | struct call_function_data *cfd = &per_cpu(cfd_data, cpu); |
72 | * before the IPIs were sent. So flush out any pending callbacks | ||
73 | * explicitly (without waiting for the IPIs to arrive), to | ||
74 | * ensure that the outgoing CPU doesn't go offline with work | ||
75 | * still pending. | ||
76 | */ | ||
77 | flush_smp_call_function_queue(false); | ||
78 | break; | ||
79 | #endif | ||
80 | }; | ||
81 | 55 | ||
82 | return NOTIFY_OK; | 56 | free_cpumask_var(cfd->cpumask); |
57 | free_percpu(cfd->csd); | ||
58 | return 0; | ||
83 | } | 59 | } |
84 | 60 | ||
85 | static struct notifier_block hotplug_cfd_notifier = { | 61 | int smpcfd_dying_cpu(unsigned int cpu) |
86 | .notifier_call = hotplug_cfd, | 62 | { |
87 | }; | 63 | /* |
64 | * The IPIs for the smp-call-function callbacks queued by other | ||
65 | * CPUs might arrive late, either due to hardware latencies or | ||
66 | * because this CPU disabled interrupts (inside stop-machine) | ||
67 | * before the IPIs were sent. So flush out any pending callbacks | ||
68 | * explicitly (without waiting for the IPIs to arrive), to | ||
69 | * ensure that the outgoing CPU doesn't go offline with work | ||
70 | * still pending. | ||
71 | */ | ||
72 | flush_smp_call_function_queue(false); | ||
73 | return 0; | ||
74 | } | ||
88 | 75 | ||
89 | void __init call_function_init(void) | 76 | void __init call_function_init(void) |
90 | { | 77 | { |
91 | void *cpu = (void *)(long)smp_processor_id(); | ||
92 | int i; | 78 | int i; |
93 | 79 | ||
94 | for_each_possible_cpu(i) | 80 | for_each_possible_cpu(i) |
95 | init_llist_head(&per_cpu(call_single_queue, i)); | 81 | init_llist_head(&per_cpu(call_single_queue, i)); |
96 | 82 | ||
97 | hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); | 83 | smpcfd_prepare_cpu(smp_processor_id()); |
98 | register_cpu_notifier(&hotplug_cfd_notifier); | ||
99 | } | 84 | } |
100 | 85 | ||
101 | /* | 86 | /* |
@@ -107,7 +92,7 @@ void __init call_function_init(void) | |||
107 | */ | 92 | */ |
108 | static __always_inline void csd_lock_wait(struct call_single_data *csd) | 93 | static __always_inline void csd_lock_wait(struct call_single_data *csd) |
109 | { | 94 | { |
110 | smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); | 95 | smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); |
111 | } | 96 | } |
112 | 97 | ||
113 | static __always_inline void csd_lock(struct call_single_data *csd) | 98 | static __always_inline void csd_lock(struct call_single_data *csd) |
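Editor's note: csd_lock_wait() above moves from smp_cond_acquire() to smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)), i.e. spin until an acquire-ordered load of the flags word shows the lock bit clear. A rough userspace analogue using C11 atomics (it does not model the architecture-specific wait hints the kernel primitive may use; the names are invented for this sketch):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define CSD_FLAG_LOCK 0x01

    static atomic_uint flags = CSD_FLAG_LOCK;

    /* Spin until an acquire load shows the LOCK bit clear, like csd_lock_wait(). */
    static void csd_lock_wait_like(void)
    {
        while (atomic_load_explicit(&flags, memory_order_acquire) & CSD_FLAG_LOCK)
            ;   /* everything written before the release below is visible after this */
    }

    /* Publish with release semantics, like the unlock side clearing the flag. */
    static void *unlocker(void *arg)
    {
        atomic_fetch_and_explicit(&flags, ~CSD_FLAG_LOCK, memory_order_release);
        return arg;
    }

    int main(void)
    {
        pthread_t t;

        if (pthread_create(&t, NULL, unlocker, NULL) != 0)
            return 1;
        csd_lock_wait_like();
        pthread_join(t, NULL);
        printf("lock bit cleared\n");
        return 0;
    }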
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87b2fc38398b..53954631a4e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1205,6 +1205,17 @@ static struct ctl_table kern_table[] = { | |||
1205 | .extra2 = &one, | 1205 | .extra2 = &one, |
1206 | }, | 1206 | }, |
1207 | #endif | 1207 | #endif |
1208 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) | ||
1209 | { | ||
1210 | .procname = "panic_on_rcu_stall", | ||
1211 | .data = &sysctl_panic_on_rcu_stall, | ||
1212 | .maxlen = sizeof(sysctl_panic_on_rcu_stall), | ||
1213 | .mode = 0644, | ||
1214 | .proc_handler = proc_dointvec_minmax, | ||
1215 | .extra1 = &zero, | ||
1216 | .extra2 = &one, | ||
1217 | }, | ||
1218 | #endif | ||
1208 | { } | 1219 | { } |
1209 | }; | 1220 | }; |
1210 | 1221 | ||
@@ -1497,8 +1508,8 @@ static struct ctl_table vm_table[] = { | |||
1497 | #ifdef CONFIG_NUMA | 1508 | #ifdef CONFIG_NUMA |
1498 | { | 1509 | { |
1499 | .procname = "zone_reclaim_mode", | 1510 | .procname = "zone_reclaim_mode", |
1500 | .data = &zone_reclaim_mode, | 1511 | .data = &node_reclaim_mode, |
1501 | .maxlen = sizeof(zone_reclaim_mode), | 1512 | .maxlen = sizeof(node_reclaim_mode), |
1502 | .mode = 0644, | 1513 | .mode = 0644, |
1503 | .proc_handler = proc_dointvec, | 1514 | .proc_handler = proc_dointvec, |
1504 | .extra1 = &zero, | 1515 | .extra1 = &zero, |
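Editor's note: the new kern_table entry exposes sysctl_panic_on_rcu_stall as kernel.panic_on_rcu_stall, i.e. /proc/sys/kernel/panic_on_rcu_stall, with values clamped to 0 or 1. A minimal root-only snippet that flips the knob from userspace; whether the file exists at all depends on the kernel being built with CONFIG_TREE_RCU or CONFIG_PREEMPT_RCU, as the #if above shows:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/kernel/panic_on_rcu_stall", "w");

        if (!f) {
            perror("panic_on_rcu_stall");   /* knob missing or not root */
            return 1;
        }
        fputs("1\n", f);   /* 1 = panic when an RCU stall is detected */
        fclose(f);
        return 0;
    }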
diff --git a/kernel/task_work.c b/kernel/task_work.c index 53fa971d000d..6ab4842b00e8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c | |||
@@ -108,7 +108,6 @@ void task_work_run(void) | |||
108 | * fail, but it can play with *work and other entries. | 108 | * fail, but it can play with *work and other entries. |
109 | */ | 109 | */ |
110 | raw_spin_unlock_wait(&task->pi_lock); | 110 | raw_spin_unlock_wait(&task->pi_lock); |
111 | smp_mb(); | ||
112 | 111 | ||
113 | do { | 112 | do { |
114 | next = work->next; | 113 | next = work->next; |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e840ed867a5d..c3aad685bbc0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -30,7 +30,6 @@ | |||
30 | * struct alarm_base - Alarm timer bases | 30 | * struct alarm_base - Alarm timer bases |
31 | * @lock: Lock for syncrhonized access to the base | 31 | * @lock: Lock for syncrhonized access to the base |
32 | * @timerqueue: Timerqueue head managing the list of events | 32 | * @timerqueue: Timerqueue head managing the list of events |
33 | * @timer: hrtimer used to schedule events while running | ||
34 | * @gettime: Function to read the time correlating to the base | 33 | * @gettime: Function to read the time correlating to the base |
35 | * @base_clockid: clockid for the base | 34 | * @base_clockid: clockid for the base |
36 | */ | 35 | */ |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a9b76a40319e..2c5bc77c0bb0 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu) | |||
645 | #endif | 645 | #endif |
646 | 646 | ||
647 | #ifdef CONFIG_SYSFS | 647 | #ifdef CONFIG_SYSFS |
648 | struct bus_type clockevents_subsys = { | 648 | static struct bus_type clockevents_subsys = { |
649 | .name = "clockevents", | 649 | .name = "clockevents", |
650 | .dev_name = "clockevent", | 650 | .dev_name = "clockevent", |
651 | }; | 651 | }; |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 56ece145a814..6a5a310a1a53 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -669,10 +669,12 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
669 | struct list_head *entry = &clocksource_list; | 669 | struct list_head *entry = &clocksource_list; |
670 | struct clocksource *tmp; | 670 | struct clocksource *tmp; |
671 | 671 | ||
672 | list_for_each_entry(tmp, &clocksource_list, list) | 672 | list_for_each_entry(tmp, &clocksource_list, list) { |
673 | /* Keep track of the place, where to insert */ | 673 | /* Keep track of the place, where to insert */ |
674 | if (tmp->rating >= cs->rating) | 674 | if (tmp->rating < cs->rating) |
675 | entry = &tmp->list; | 675 | break; |
676 | entry = &tmp->list; | ||
677 | } | ||
676 | list_add(&cs->list, entry); | 678 | list_add(&cs->list, entry); |
677 | } | 679 | } |
678 | 680 | ||
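Editor's note: the clocksource_enqueue() change makes the insertion scan stop at the first entry with a lower rating instead of walking the whole list; the result is still a list sorted by descending rating, with equal-rating clocksources kept in registration order. A self-contained userspace model of that insertion, using a plain singly linked list and invented names rather than list_head:

    #include <stdio.h>

    struct cs {
        const char *name;
        int rating;
        struct cs *next;
    };

    /*
     * Insert @new so the list stays sorted by descending rating, stopping
     * at the first lower-rated entry (equal ratings keep arrival order).
     */
    static void enqueue(struct cs **head, struct cs *new)
    {
        struct cs **link = head;

        while (*link && (*link)->rating >= new->rating)
            link = &(*link)->next;   /* advance past higher and equal ratings */
        new->next = *link;
        *link = new;
    }

    int main(void)
    {
        struct cs a = { "jiffies", 1, NULL }, b = { "hpet", 250, NULL },
                  c = { "tsc", 300, NULL }, d = { "hpet2", 250, NULL };
        struct cs *head = NULL, *p;

        enqueue(&head, &a);
        enqueue(&head, &b);
        enqueue(&head, &c);
        enqueue(&head, &d);   /* equal rating: lands after "hpet" */

        for (p = head; p; p = p->next)
            printf("%s (%d)\n", p->name, p->rating);
        return 0;
    }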
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e99df0ff1d42..9ba7c820fc23 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) | |||
177 | #endif | 177 | #endif |
178 | } | 178 | } |
179 | 179 | ||
180 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) | 180 | #ifdef CONFIG_NO_HZ_COMMON |
181 | static inline | 181 | static inline |
182 | struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, | 182 | struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, |
183 | int pinned) | 183 | int pinned) |
@@ -1590,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, | |||
1590 | /* | 1590 | /* |
1591 | * Functions related to boot-time initialization: | 1591 | * Functions related to boot-time initialization: |
1592 | */ | 1592 | */ |
1593 | static void init_hrtimers_cpu(int cpu) | 1593 | int hrtimers_prepare_cpu(unsigned int cpu) |
1594 | { | 1594 | { |
1595 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); | 1595 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); |
1596 | int i; | 1596 | int i; |
@@ -1602,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu) | |||
1602 | 1602 | ||
1603 | cpu_base->cpu = cpu; | 1603 | cpu_base->cpu = cpu; |
1604 | hrtimer_init_hres(cpu_base); | 1604 | hrtimer_init_hres(cpu_base); |
1605 | return 0; | ||
1605 | } | 1606 | } |
1606 | 1607 | ||
1607 | #ifdef CONFIG_HOTPLUG_CPU | 1608 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -1636,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, | |||
1636 | } | 1637 | } |
1637 | } | 1638 | } |
1638 | 1639 | ||
1639 | static void migrate_hrtimers(int scpu) | 1640 | int hrtimers_dead_cpu(unsigned int scpu) |
1640 | { | 1641 | { |
1641 | struct hrtimer_cpu_base *old_base, *new_base; | 1642 | struct hrtimer_cpu_base *old_base, *new_base; |
1642 | int i; | 1643 | int i; |
@@ -1665,45 +1666,14 @@ static void migrate_hrtimers(int scpu) | |||
1665 | /* Check, if we got expired work to do */ | 1666 | /* Check, if we got expired work to do */ |
1666 | __hrtimer_peek_ahead_timers(); | 1667 | __hrtimer_peek_ahead_timers(); |
1667 | local_irq_enable(); | 1668 | local_irq_enable(); |
1669 | return 0; | ||
1668 | } | 1670 | } |
1669 | 1671 | ||
1670 | #endif /* CONFIG_HOTPLUG_CPU */ | 1672 | #endif /* CONFIG_HOTPLUG_CPU */ |
1671 | 1673 | ||
1672 | static int hrtimer_cpu_notify(struct notifier_block *self, | ||
1673 | unsigned long action, void *hcpu) | ||
1674 | { | ||
1675 | int scpu = (long)hcpu; | ||
1676 | |||
1677 | switch (action) { | ||
1678 | |||
1679 | case CPU_UP_PREPARE: | ||
1680 | case CPU_UP_PREPARE_FROZEN: | ||
1681 | init_hrtimers_cpu(scpu); | ||
1682 | break; | ||
1683 | |||
1684 | #ifdef CONFIG_HOTPLUG_CPU | ||
1685 | case CPU_DEAD: | ||
1686 | case CPU_DEAD_FROZEN: | ||
1687 | migrate_hrtimers(scpu); | ||
1688 | break; | ||
1689 | #endif | ||
1690 | |||
1691 | default: | ||
1692 | break; | ||
1693 | } | ||
1694 | |||
1695 | return NOTIFY_OK; | ||
1696 | } | ||
1697 | |||
1698 | static struct notifier_block hrtimers_nb = { | ||
1699 | .notifier_call = hrtimer_cpu_notify, | ||
1700 | }; | ||
1701 | |||
1702 | void __init hrtimers_init(void) | 1674 | void __init hrtimers_init(void) |
1703 | { | 1675 | { |
1704 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, | 1676 | hrtimers_prepare_cpu(smp_processor_id()); |
1705 | (void *)(long)smp_processor_id()); | ||
1706 | register_cpu_notifier(&hrtimers_nb); | ||
1707 | } | 1677 | } |
1708 | 1678 | ||
1709 | /** | 1679 | /** |
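Editor's note: like the kernel/smp.c hunk earlier, this converts a switch-based CPU notifier into named per-phase callbacks, hrtimers_prepare_cpu() and hrtimers_dead_cpu(), which return plain error codes instead of NOTIFY_* values; how those callbacks are wired into the hotplug state machine is not part of the hunks shown here. A toy userspace analogy of the structural change, with an invented hotplug_step struct standing in for the real state table, purely to contrast one-function-per-phase against a multiplexed notifier:

    #include <stdio.h>

    /* Per-phase callbacks, analogous to hrtimers_prepare_cpu()/hrtimers_dead_cpu(). */
    static int demo_prepare_cpu(unsigned int cpu)
    {
        printf("prepare cpu %u\n", cpu);
        return 0;                 /* plain errno-style return, no NOTIFY_* */
    }

    static int demo_dead_cpu(unsigned int cpu)
    {
        printf("tear down cpu %u\n", cpu);
        return 0;
    }

    /* Toy "hotplug core": one slot per phase instead of one switch statement. */
    struct hotplug_step {
        const char *name;
        int (*prepare)(unsigned int cpu);
        int (*dead)(unsigned int cpu);
    };

    static struct hotplug_step demo_step = {
        .name    = "demo:prepare",
        .prepare = demo_prepare_cpu,
        .dead    = demo_dead_cpu,
    };

    int main(void)
    {
        int ret = demo_step.prepare(0);   /* bringup path */

        if (ret == 0)
            demo_step.dead(0);            /* teardown path */
        return ret;
    }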
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1cafba860b08..39008d78927a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
@@ -777,6 +777,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
777 | timer->it.cpu.expires = 0; | 777 | timer->it.cpu.expires = 0; |
778 | sample_to_timespec(timer->it_clock, timer->it.cpu.expires, | 778 | sample_to_timespec(timer->it_clock, timer->it.cpu.expires, |
779 | &itp->it_value); | 779 | &itp->it_value); |
780 | return; | ||
780 | } else { | 781 | } else { |
781 | cpu_timer_sample_group(timer->it_clock, p, &now); | 782 | cpu_timer_sample_group(timer->it_clock, p, &now); |
782 | unlock_task_sighand(p, &flags); | 783 | unlock_task_sighand(p, &flags); |
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index e622ba365a13..b0928ab3270f 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c | |||
@@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) | |||
43 | int allowed_error_ns = usecs * 5; | 43 | int allowed_error_ns = usecs * 5; |
44 | 44 | ||
45 | for (i = 0; i < iters; ++i) { | 45 | for (i = 0; i < iters; ++i) { |
46 | struct timespec ts1, ts2; | 46 | s64 kt1, kt2; |
47 | int time_passed; | 47 | int time_passed; |
48 | 48 | ||
49 | ktime_get_ts(&ts1); | 49 | kt1 = ktime_get_ns(); |
50 | udelay(usecs); | 50 | udelay(usecs); |
51 | ktime_get_ts(&ts2); | 51 | kt2 = ktime_get_ns(); |
52 | time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); | 52 | time_passed = kt2 - kt1; |
53 | 53 | ||
54 | if (i == 0 || time_passed < min) | 54 | if (i == 0 || time_passed < min) |
55 | min = time_passed; | 55 | min = time_passed; |
@@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v) | |||
87 | if (usecs > 0 && iters > 0) { | 87 | if (usecs > 0 && iters > 0) { |
88 | return udelay_test_single(s, usecs, iters); | 88 | return udelay_test_single(s, usecs, iters); |
89 | } else if (usecs == 0) { | 89 | } else if (usecs == 0) { |
90 | struct timespec ts; | 90 | struct timespec64 ts; |
91 | 91 | ||
92 | ktime_get_ts(&ts); | 92 | ktime_get_ts64(&ts); |
93 | seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", | 93 | seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n", |
94 | loops_per_jiffy, ts.tv_sec, ts.tv_nsec); | 94 | loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec); |
95 | seq_puts(s, "usage:\n"); | 95 | seq_puts(s, "usage:\n"); |
96 | seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); | 96 | seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); |
97 | seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); | 97 | seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); |
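Editor's note: udelay_test_single() now brackets the delay with two ktime_get_ns() reads instead of ktime_get_ts() plus timespec arithmetic. The equivalent userspace shape, shown here timing nanosleep() with CLOCK_MONOTONIC because udelay() has no userspace counterpart (an illustration, not code from the patch):

    #define _POSIX_C_SOURCE 200809L
    #include <stdio.h>
    #include <time.h>

    /* Read a monotonic clock as a single nanosecond count, like ktime_get_ns(). */
    static long long monotonic_ns(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    int main(void)
    {
        struct timespec delay = { .tv_sec = 0, .tv_nsec = 100 * 1000 }; /* 100 us */
        long long t1, t2;

        t1 = monotonic_ns();
        nanosleep(&delay, NULL);   /* stand-in for udelay(100) */
        t2 = monotonic_ns();

        printf("asked for 100 us, measured %lld ns\n", t2 - t1);
        return 0;
    }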
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 53d7184da0be..690b797f522e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c | |||
@@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | |||
75 | } | 75 | } |
76 | 76 | ||
77 | static struct clock_event_device ce_broadcast_hrtimer = { | 77 | static struct clock_event_device ce_broadcast_hrtimer = { |
78 | .name = "bc_hrtimer", | ||
78 | .set_state_shutdown = bc_shutdown, | 79 | .set_state_shutdown = bc_shutdown, |
79 | .set_next_ktime = bc_set_next, | 80 | .set_next_ktime = bc_set_next, |
80 | .features = CLOCK_EVT_FEAT_ONESHOT | | 81 | .features = CLOCK_EVT_FEAT_ONESHOT | |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 966a5a6fdd0a..f738251000fe 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { } | |||
164 | DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); | 164 | DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); |
165 | 165 | ||
166 | extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); | 166 | extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); |
167 | void timer_clear_idle(void); | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 536ada80f6dd..204fdc86863d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -31,7 +31,7 @@ | |||
31 | #include <trace/events/timer.h> | 31 | #include <trace/events/timer.h> |
32 | 32 | ||
33 | /* | 33 | /* |
34 | * Per cpu nohz control structure | 34 | * Per-CPU nohz control structure |
35 | */ | 35 | */ |
36 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 36 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); |
37 | 37 | ||
@@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now) | |||
61 | if (delta.tv64 < tick_period.tv64) | 61 | if (delta.tv64 < tick_period.tv64) |
62 | return; | 62 | return; |
63 | 63 | ||
64 | /* Reevalute with jiffies_lock held */ | 64 | /* Reevaluate with jiffies_lock held */ |
65 | write_seqlock(&jiffies_lock); | 65 | write_seqlock(&jiffies_lock); |
66 | 66 | ||
67 | delta = ktime_sub(now, last_jiffies_update); | 67 | delta = ktime_sub(now, last_jiffies_update); |
@@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now) | |||
116 | #ifdef CONFIG_NO_HZ_COMMON | 116 | #ifdef CONFIG_NO_HZ_COMMON |
117 | /* | 117 | /* |
118 | * Check if the do_timer duty was dropped. We don't care about | 118 | * Check if the do_timer duty was dropped. We don't care about |
119 | * concurrency: This happens only when the cpu in charge went | 119 | * concurrency: This happens only when the CPU in charge went |
120 | * into a long sleep. If two cpus happen to assign themself to | 120 | * into a long sleep. If two CPUs happen to assign themselves to |
121 | * this duty, then the jiffies update is still serialized by | 121 | * this duty, then the jiffies update is still serialized by |
122 | * jiffies_lock. | 122 | * jiffies_lock. |
123 | */ | 123 | */ |
@@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi | |||
349 | /* | 349 | /* |
350 | * Re-evaluate the need for the tick as we switch the current task. | 350 | * Re-evaluate the need for the tick as we switch the current task. |
351 | * It might need the tick due to per task/process properties: | 351 | * It might need the tick due to per task/process properties: |
352 | * perf events, posix cpu timers, ... | 352 | * perf events, posix CPU timers, ... |
353 | */ | 353 | */ |
354 | void __tick_nohz_task_switch(void) | 354 | void __tick_nohz_task_switch(void) |
355 | { | 355 | { |
@@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void) | |||
509 | * | 509 | * |
510 | * In case the sched_tick was stopped on this CPU, we have to check if jiffies | 510 | * In case the sched_tick was stopped on this CPU, we have to check if jiffies |
511 | * must be updated. Otherwise an interrupt handler could use a stale jiffy | 511 | * must be updated. Otherwise an interrupt handler could use a stale jiffy |
512 | * value. We do this unconditionally on any cpu, as we don't know whether the | 512 | * value. We do this unconditionally on any CPU, as we don't know whether the |
513 | * cpu, which has the update task assigned is in a long sleep. | 513 | * CPU, which has the update task assigned is in a long sleep. |
514 | */ | 514 | */ |
515 | static void tick_nohz_update_jiffies(ktime_t now) | 515 | static void tick_nohz_update_jiffies(ktime_t now) |
516 | { | 516 | { |
@@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now) | |||
526 | } | 526 | } |
527 | 527 | ||
528 | /* | 528 | /* |
529 | * Updates the per cpu time idle statistics counters | 529 | * Updates the per-CPU time idle statistics counters |
530 | */ | 530 | */ |
531 | static void | 531 | static void |
532 | update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) | 532 | update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) |
@@ -566,12 +566,12 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) | |||
566 | } | 566 | } |
567 | 567 | ||
568 | /** | 568 | /** |
569 | * get_cpu_idle_time_us - get the total idle time of a cpu | 569 | * get_cpu_idle_time_us - get the total idle time of a CPU |
570 | * @cpu: CPU number to query | 570 | * @cpu: CPU number to query |
571 | * @last_update_time: variable to store update time in. Do not update | 571 | * @last_update_time: variable to store update time in. Do not update |
572 | * counters if NULL. | 572 | * counters if NULL. |
573 | * | 573 | * |
574 | * Return the cummulative idle time (since boot) for a given | 574 | * Return the cumulative idle time (since boot) for a given |
575 | * CPU, in microseconds. | 575 | * CPU, in microseconds. |
576 | * | 576 | * |
577 | * This time is measured via accounting rather than sampling, | 577 | * This time is measured via accounting rather than sampling, |
@@ -607,12 +607,12 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | |||
607 | EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); | 607 | EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); |
608 | 608 | ||
609 | /** | 609 | /** |
610 | * get_cpu_iowait_time_us - get the total iowait time of a cpu | 610 | * get_cpu_iowait_time_us - get the total iowait time of a CPU |
611 | * @cpu: CPU number to query | 611 | * @cpu: CPU number to query |
612 | * @last_update_time: variable to store update time in. Do not update | 612 | * @last_update_time: variable to store update time in. Do not update |
613 | * counters if NULL. | 613 | * counters if NULL. |
614 | * | 614 | * |
615 | * Return the cummulative iowait time (since boot) for a given | 615 | * Return the cumulative iowait time (since boot) for a given |
616 | * CPU, in microseconds. | 616 | * CPU, in microseconds. |
617 | * | 617 | * |
618 | * This time is measured via accounting rather than sampling, | 618 | * This time is measured via accounting rather than sampling, |
@@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
700 | delta = next_tick - basemono; | 700 | delta = next_tick - basemono; |
701 | if (delta <= (u64)TICK_NSEC) { | 701 | if (delta <= (u64)TICK_NSEC) { |
702 | tick.tv64 = 0; | 702 | tick.tv64 = 0; |
703 | |||
704 | /* | ||
705 | * Tell the timer code that the base is not idle, i.e. undo | ||
706 | * the effect of get_next_timer_interrupt(): | ||
707 | */ | ||
708 | timer_clear_idle(); | ||
703 | /* | 709 | /* |
704 | * We've not stopped the tick yet, and there's a timer in the | 710 | * We've not stopped the tick yet, and there's a timer in the |
705 | * next period, so no point in stopping it either, bail. | 711 | * next period, so no point in stopping it either, bail. |
@@ -726,14 +732,14 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
726 | } | 732 | } |
727 | 733 | ||
728 | /* | 734 | /* |
729 | * If this cpu is the one which updates jiffies, then give up | 735 | * If this CPU is the one which updates jiffies, then give up |
730 | * the assignment and let it be taken by the cpu which runs | 736 | * the assignment and let it be taken by the CPU which runs |
731 | * the tick timer next, which might be this cpu as well. If we | 737 | * the tick timer next, which might be this CPU as well. If we |
732 | * don't drop this here the jiffies might be stale and | 738 | * don't drop this here the jiffies might be stale and |
733 | * do_timer() never invoked. Keep track of the fact that it | 739 | * do_timer() never invoked. Keep track of the fact that it |
734 | * was the one which had the do_timer() duty last. If this cpu | 740 | * was the one which had the do_timer() duty last. If this CPU |
735 | * is the one which had the do_timer() duty last, we limit the | 741 | * is the one which had the do_timer() duty last, we limit the |
736 | * sleep time to the timekeeping max_deferement value. | 742 | * sleep time to the timekeeping max_deferment value. |
737 | * Otherwise we can sleep as long as we want. | 743 | * Otherwise we can sleep as long as we want. |
738 | */ | 744 | */ |
739 | delta = timekeeping_max_deferment(); | 745 | delta = timekeeping_max_deferment(); |
@@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | |||
809 | tick_do_update_jiffies64(now); | 815 | tick_do_update_jiffies64(now); |
810 | cpu_load_update_nohz_stop(); | 816 | cpu_load_update_nohz_stop(); |
811 | 817 | ||
818 | /* | ||
819 | * Clear the timer idle flag, so we avoid IPIs on remote queueing and | ||
820 | * the clock forward checks in the enqueue path: | ||
821 | */ | ||
822 | timer_clear_idle(); | ||
823 | |||
812 | calc_load_exit_idle(); | 824 | calc_load_exit_idle(); |
813 | touch_softlockup_watchdog_sched(); | 825 | touch_softlockup_watchdog_sched(); |
814 | /* | 826 | /* |
@@ -841,9 +853,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) | |||
841 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | 853 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) |
842 | { | 854 | { |
843 | /* | 855 | /* |
844 | * If this cpu is offline and it is the one which updates | 856 | * If this CPU is offline and it is the one which updates |
845 | * jiffies, then give up the assignment and let it be taken by | 857 | * jiffies, then give up the assignment and let it be taken by |
846 | * the cpu which runs the tick timer next. If we don't drop | 858 | * the CPU which runs the tick timer next. If we don't drop |
847 | * this here the jiffies might be stale and do_timer() never | 859 | * this here the jiffies might be stale and do_timer() never |
848 | * invoked. | 860 | * invoked. |
849 | */ | 861 | */ |
@@ -896,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) | |||
896 | ktime_t now, expires; | 908 | ktime_t now, expires; |
897 | int cpu = smp_processor_id(); | 909 | int cpu = smp_processor_id(); |
898 | 910 | ||
899 | now = tick_nohz_start_idle(ts); | ||
900 | |||
901 | if (can_stop_idle_tick(cpu, ts)) { | 911 | if (can_stop_idle_tick(cpu, ts)) { |
902 | int was_stopped = ts->tick_stopped; | 912 | int was_stopped = ts->tick_stopped; |
903 | 913 | ||
914 | now = tick_nohz_start_idle(ts); | ||
904 | ts->idle_calls++; | 915 | ts->idle_calls++; |
905 | 916 | ||
906 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); | 917 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); |
@@ -933,11 +944,11 @@ void tick_nohz_idle_enter(void) | |||
933 | WARN_ON_ONCE(irqs_disabled()); | 944 | WARN_ON_ONCE(irqs_disabled()); |
934 | 945 | ||
935 | /* | 946 | /* |
936 | * Update the idle state in the scheduler domain hierarchy | 947 | * Update the idle state in the scheduler domain hierarchy |
937 | * when tick_nohz_stop_sched_tick() is called from the idle loop. | 948 | * when tick_nohz_stop_sched_tick() is called from the idle loop. |
938 | * State will be updated to busy during the first busy tick after | 949 | * State will be updated to busy during the first busy tick after |
939 | * exiting idle. | 950 | * exiting idle. |
940 | */ | 951 | */ |
941 | set_cpu_sd_state_idle(); | 952 | set_cpu_sd_state_idle(); |
942 | 953 | ||
943 | local_irq_disable(); | 954 | local_irq_disable(); |
@@ -1092,35 +1103,6 @@ static void tick_nohz_switch_to_nohz(void) | |||
1092 | tick_nohz_activate(ts, NOHZ_MODE_LOWRES); | 1103 | tick_nohz_activate(ts, NOHZ_MODE_LOWRES); |
1093 | } | 1104 | } |
1094 | 1105 | ||
1095 | /* | ||
1096 | * When NOHZ is enabled and the tick is stopped, we need to kick the | ||
1097 | * tick timer from irq_enter() so that the jiffies update is kept | ||
1098 | * alive during long running softirqs. That's ugly as hell, but | ||
1099 | * correctness is key even if we need to fix the offending softirq in | ||
1100 | * the first place. | ||
1101 | * | ||
1102 | * Note, this is different to tick_nohz_restart. We just kick the | ||
1103 | * timer and do not touch the other magic bits which need to be done | ||
1104 | * when idle is left. | ||
1105 | */ | ||
1106 | static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) | ||
1107 | { | ||
1108 | #if 0 | ||
1109 | /* Switch back to 2.6.27 behaviour */ | ||
1110 | ktime_t delta; | ||
1111 | |||
1112 | /* | ||
1113 | * Do not touch the tick device, when the next expiry is either | ||
1114 | * already reached or less/equal than the tick period. | ||
1115 | */ | ||
1116 | delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); | ||
1117 | if (delta.tv64 <= tick_period.tv64) | ||
1118 | return; | ||
1119 | |||
1120 | tick_nohz_restart(ts, now); | ||
1121 | #endif | ||
1122 | } | ||
1123 | |||
1124 | static inline void tick_nohz_irq_enter(void) | 1106 | static inline void tick_nohz_irq_enter(void) |
1125 | { | 1107 | { |
1126 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); | 1108 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
@@ -1131,10 +1113,8 @@ static inline void tick_nohz_irq_enter(void) | |||
1131 | now = ktime_get(); | 1113 | now = ktime_get(); |
1132 | if (ts->idle_active) | 1114 | if (ts->idle_active) |
1133 | tick_nohz_stop_idle(ts, now); | 1115 | tick_nohz_stop_idle(ts, now); |
1134 | if (ts->tick_stopped) { | 1116 | if (ts->tick_stopped) |
1135 | tick_nohz_update_jiffies(now); | 1117 | tick_nohz_update_jiffies(now); |
1136 | tick_nohz_kick_tick(ts, now); | ||
1137 | } | ||
1138 | } | 1118 | } |
1139 | 1119 | ||
1140 | #else | 1120 | #else |
@@ -1211,7 +1191,7 @@ void tick_setup_sched_timer(void) | |||
1211 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 1191 | hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
1212 | ts->sched_timer.function = tick_sched_timer; | 1192 | ts->sched_timer.function = tick_sched_timer; |
1213 | 1193 | ||
1214 | /* Get the next period (per cpu) */ | 1194 | /* Get the next period (per-CPU) */ |
1215 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 1195 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
1216 | 1196 | ||
1217 | /* Offset the tick to avert jiffies_lock contention. */ | 1197 | /* Offset the tick to avert jiffies_lock contention. */ |
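Editor's note: most of the tick-sched.c changes above are comment cleanups plus the new timer_clear_idle() calls; the touched kerneldoc for get_cpu_idle_time_us()/get_cpu_iowait_time_us() describes per-CPU idle and iowait accounting. As a loose userspace illustration only (the /proc/stat columns are cputime ticks, not the microsecond values those helpers return), one way to peek at the same quantities:

    #include <stdio.h>

    /*
     * Print the idle and iowait columns of the "cpu0" line in /proc/stat
     * (values are in USER_HZ ticks; a rough userspace cousin of the per-CPU
     * accounting the kerneldoc above talks about).
     */
    int main(void)
    {
        char line[256];
        FILE *f = fopen("/proc/stat", "r");

        if (!f)
            return 1;
        while (fgets(line, sizeof(line), f)) {
            unsigned long long user, nice, sys, idle, iowait;

            if (sscanf(line, "cpu0 %llu %llu %llu %llu %llu",
                       &user, &nice, &sys, &idle, &iowait) == 5) {
                printf("cpu0 idle=%llu iowait=%llu (USER_HZ ticks)\n",
                       idle, iowait);
                break;
            }
        }
        fclose(f);
        return 0;
    }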
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 86628e755f38..7142580ad94f 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c | |||
@@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = { | |||
67 | #define SECS_PER_DAY (SECS_PER_HOUR * 24) | 67 | #define SECS_PER_DAY (SECS_PER_HOUR * 24) |
68 | 68 | ||
69 | /** | 69 | /** |
70 | * time_to_tm - converts the calendar time to local broken-down time | 70 | * time64_to_tm - converts the calendar time to local broken-down time |
71 | * | 71 | * |
72 | * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, | 72 | * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, |
73 | * Coordinated Universal Time (UTC). | 73 | * Coordinated Universal Time (UTC). |
74 | * @offset offset seconds adding to totalsecs. | 74 | * @offset offset seconds adding to totalsecs. |
75 | * @result pointer to struct tm variable to receive broken-down time | 75 | * @result pointer to struct tm variable to receive broken-down time |
76 | */ | 76 | */ |
77 | void time_to_tm(time_t totalsecs, int offset, struct tm *result) | 77 | void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) |
78 | { | 78 | { |
79 | long days, rem, y; | 79 | long days, rem, y; |
80 | int remainder; | ||
80 | const unsigned short *ip; | 81 | const unsigned short *ip; |
81 | 82 | ||
82 | days = totalsecs / SECS_PER_DAY; | 83 | days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); |
83 | rem = totalsecs % SECS_PER_DAY; | 84 | rem = remainder; |
84 | rem += offset; | 85 | rem += offset; |
85 | while (rem < 0) { | 86 | while (rem < 0) { |
86 | rem += SECS_PER_DAY; | 87 | rem += SECS_PER_DAY; |
@@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result) | |||
124 | result->tm_mon = y; | 125 | result->tm_mon = y; |
125 | result->tm_mday = days + 1; | 126 | result->tm_mday = days + 1; |
126 | } | 127 | } |
127 | EXPORT_SYMBOL(time_to_tm); | 128 | EXPORT_SYMBOL(time64_to_tm); |
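
Editor's note: the conversion above replaces plain '/' and '%' on a time_t with div_s64_rem() on a time64_t, so the day/remainder split also works for 64-bit second counts on 32-bit machines without open-coded 64-bit division. Below is a small userspace sketch of that split; div_s64_rem_emul() only emulates the kernel helper's semantics (s64 quotient, s32 remainder), and the timestamp is an arbitrary post-2038 value chosen for illustration.

/*
 * Userspace sketch: emulates the kernel's div_s64_rem() to show the
 * days/remainder split used by time64_to_tm() above. The emulation is an
 * assumption for illustration; in the kernel the helper lives in
 * <linux/math64.h>.
 */
#include <stdio.h>
#include <stdint.h>

#define SECS_PER_DAY (60 * 60 * 24)

static int64_t div_s64_rem_emul(int64_t dividend, int32_t divisor, int32_t *remainder)
{
	*remainder = (int32_t)(dividend % divisor);
	return dividend / divisor;
}

int main(void)
{
	/* A post-2038 timestamp that no longer fits in a 32-bit time_t. */
	int64_t totalsecs = 4102444800LL;	/* 2100-01-01 00:00:00 UTC */
	int32_t rem;
	int64_t days = div_s64_rem_emul(totalsecs, SECS_PER_DAY, &rem);

	printf("days since epoch: %lld, seconds into day: %d\n",
	       (long long)days, rem);
	return 0;
}
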
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25cd3d4f..3b65746c7f15 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -480,10 +480,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) | |||
480 | * users are removed, this can be killed. | 480 | * users are removed, this can be killed. |
481 | */ | 481 | */ |
482 | remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); | 482 | remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); |
483 | tk->tkr_mono.xtime_nsec -= remainder; | 483 | if (remainder != 0) { |
484 | tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; | 484 | tk->tkr_mono.xtime_nsec -= remainder; |
485 | tk->ntp_error += remainder << tk->ntp_error_shift; | 485 | tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; |
486 | tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; | 486 | tk->ntp_error += remainder << tk->ntp_error_shift; |
487 | tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; | ||
488 | } | ||
487 | } | 489 | } |
488 | #else | 490 | #else |
489 | #define old_vsyscall_fixup(tk) | 491 | #define old_vsyscall_fixup(tk) |
@@ -2186,6 +2188,7 @@ struct timespec64 get_monotonic_coarse64(void) | |||
2186 | 2188 | ||
2187 | return now; | 2189 | return now; |
2188 | } | 2190 | } |
2191 | EXPORT_SYMBOL(get_monotonic_coarse64); | ||
2189 | 2192 | ||
2190 | /* | 2193 | /* |
2191 | * Must hold jiffies_lock | 2194 | * Must hold jiffies_lock |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3a95f9728778..555670a5143c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -59,43 +59,153 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; | |||
59 | EXPORT_SYMBOL(jiffies_64); | 59 | EXPORT_SYMBOL(jiffies_64); |
60 | 60 | ||
61 | /* | 61 | /* |
62 | * per-CPU timer vector definitions: | 62 | * The timer wheel has LVL_DEPTH array levels. Each level provides an array of |
63 | LVL_SIZE buckets. Each level is driven by its own clock and therefore each | ||
64 | * level has a different granularity. | ||
65 | * | ||
66 | * The level granularity is: LVL_CLK_DIV ^ lvl | ||
67 | * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) | ||
68 | * | ||
69 | * The array level of a newly armed timer depends on the relative expiry | ||
70 | * time. The farther the expiry time is away the higher the array level and | ||
71 | * therefor the granularity becomes. | ||
72 | * | ||
73 | * Contrary to the original timer wheel implementation, which aims for 'exact' | ||
74 | * expiry of the timers, this implementation removes the need for recascading | ||
75 | * the timers into the lower array levels. The previous 'classic' timer wheel | ||
76 | * implementation of the kernel already violated the 'exact' expiry by adding | ||
77 | * slack to the expiry time to provide batched expiration. The granularity | ||
78 | * levels provide implicit batching. | ||
79 | * | ||
80 | * This is an optimization of the original timer wheel implementation for the | ||
81 | * majority of the timer wheel use cases: timeouts. The vast majority of | ||
82 | * timeout timers (networking, disk I/O ...) are canceled before expiry. If | ||
83 | * the timeout expires it indicates that normal operation is disturbed, so it | ||
84 | * does not matter much whether the timeout comes with a slight delay. | ||
85 | * | ||
86 | * The only exceptions to this are networking timers with a small expiry | ||
87 | * time. They rely on the granularity. Those fit into the first wheel level, | ||
88 | * which has HZ granularity. | ||
89 | * | ||
90 | * We don't have cascading anymore. Timers with an expiry time above the | ||
91 | * capacity of the last wheel level are force-expired at the maximum timeout | ||
92 | * value of the last wheel level. From data sampling we know that the maximum | ||
93 | * value observed is 5 days (network connection tracking), so this should not | ||
94 | * be an issue. | ||
95 | * | ||
96 | * The currently chosen array constants are a good compromise between | ||
97 | * array size and granularity. | ||
98 | * | ||
99 | * This results in the following granularity and range levels: | ||
100 | * | ||
101 | * HZ 1000 steps | ||
102 | * Level Offset Granularity Range | ||
103 | * 0 0 1 ms 0 ms - 63 ms | ||
104 | * 1 64 8 ms 64 ms - 511 ms | ||
105 | * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) | ||
106 | * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) | ||
107 | * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) | ||
108 | * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) | ||
109 | * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) | ||
110 | * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) | ||
111 | * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) | ||
112 | * | ||
113 | * HZ 300 | ||
114 | * Level Offset Granularity Range | ||
115 | * 0 0 3 ms 0 ms - 210 ms | ||
116 | * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) | ||
117 | * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) | ||
118 | * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) | ||
119 | * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) | ||
120 | * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) | ||
121 | * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) | ||
122 | * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) | ||
123 | * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) | ||
124 | * | ||
125 | * HZ 250 | ||
126 | * Level Offset Granularity Range | ||
127 | * 0 0 4 ms 0 ms - 255 ms | ||
128 | * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) | ||
129 | * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) | ||
130 | * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) | ||
131 | * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) | ||
132 | * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) | ||
133 | * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) | ||
134 | * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) | ||
135 | * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) | ||
136 | * | ||
137 | * HZ 100 | ||
138 | * Level Offset Granularity Range | ||
139 | * 0 0 10 ms 0 ms - 630 ms | ||
140 | * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) | ||
141 | * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) | ||
142 | * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) | ||
143 | * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) | ||
144 | * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) | ||
145 | * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) | ||
146 | * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) | ||
63 | */ | 147 | */ |
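
Editor's note: the tables above follow mechanically from LVL_CLK_SHIFT and LVL_BITS defined further down in this patch. A minimal userspace sketch that derives the per-level offset, granularity and covered delta range for one HZ value (constants copied from the patch; the HZ value and output format are illustrative only):

/*
 * Illustrative sketch only: derives the per-level offset, granularity and
 * delta range from the LVL_* constants introduced further down in this
 * patch, for one HZ value. Not kernel code.
 */
#include <stdio.h>

#define LVL_CLK_SHIFT	3
#define LVL_BITS	6
#define LVL_SIZE	(1UL << LVL_BITS)
#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n))
#define LVL_OFFS(n)	((n) * LVL_SIZE)
#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

int main(void)
{
	unsigned long hz = 250;			/* tabulate for HZ=250 */
	int depth = (hz > 100) ? 9 : 8;
	int lvl;

	for (lvl = 0; lvl < depth; lvl++) {
		unsigned long first = lvl ? LVL_START(lvl) : 0;
		unsigned long last = LVL_START(lvl + 1) - 1;
		unsigned long long gran_ms =
			(unsigned long long)LVL_GRAN(lvl) * 1000 / hz;

		printf("level %d: offset %3lu  granularity %8llu ms  deltas %lu..%lu jiffies\n",
		       lvl, LVL_OFFS(lvl), gran_ms, first, last);
	}
	return 0;
}
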
64 | #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) | ||
65 | #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) | ||
66 | #define TVN_SIZE (1 << TVN_BITS) | ||
67 | #define TVR_SIZE (1 << TVR_BITS) | ||
68 | #define TVN_MASK (TVN_SIZE - 1) | ||
69 | #define TVR_MASK (TVR_SIZE - 1) | ||
70 | #define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) | ||
71 | |||
72 | struct tvec { | ||
73 | struct hlist_head vec[TVN_SIZE]; | ||
74 | }; | ||
75 | 148 | ||
76 | struct tvec_root { | 149 | /* Clock divisor for the next level */ |
77 | struct hlist_head vec[TVR_SIZE]; | 150 | #define LVL_CLK_SHIFT 3 |
78 | }; | 151 | #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) |
152 | #define LVL_CLK_MASK (LVL_CLK_DIV - 1) | ||
153 | #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) | ||
154 | #define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) | ||
79 | 155 | ||
80 | struct tvec_base { | 156 | /* |
81 | spinlock_t lock; | 157 | * The time start value for each level to select the bucket at enqueue |
82 | struct timer_list *running_timer; | 158 | * time. |
83 | unsigned long timer_jiffies; | 159 | */ |
84 | unsigned long next_timer; | 160 | #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) |
85 | unsigned long active_timers; | 161 | |
86 | unsigned long all_timers; | 162 | /* Size of each clock level */ |
87 | int cpu; | 163 | #define LVL_BITS 6 |
88 | bool migration_enabled; | 164 | #define LVL_SIZE (1UL << LVL_BITS) |
89 | bool nohz_active; | 165 | #define LVL_MASK (LVL_SIZE - 1) |
90 | struct tvec_root tv1; | 166 | #define LVL_OFFS(n) ((n) * LVL_SIZE) |
91 | struct tvec tv2; | 167 | |
92 | struct tvec tv3; | 168 | /* Level depth */ |
93 | struct tvec tv4; | 169 | #if HZ > 100 |
94 | struct tvec tv5; | 170 | # define LVL_DEPTH 9 |
95 | } ____cacheline_aligned; | 171 | # else |
172 | # define LVL_DEPTH 8 | ||
173 | #endif | ||
174 | |||
175 | /* The cutoff (max. capacity of the wheel) */ | ||
176 | #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) | ||
177 | #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) | ||
178 | |||
179 | /* | ||
180 | * The resulting wheel size. If NOHZ is configured we allocate two | ||
181 | * wheels so we have separate storage for the deferrable timers. | ||
182 | */ | ||
183 | #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) | ||
184 | |||
185 | #ifdef CONFIG_NO_HZ_COMMON | ||
186 | # define NR_BASES 2 | ||
187 | # define BASE_STD 0 | ||
188 | # define BASE_DEF 1 | ||
189 | #else | ||
190 | # define NR_BASES 1 | ||
191 | # define BASE_STD 0 | ||
192 | # define BASE_DEF 0 | ||
193 | #endif | ||
96 | 194 | ||
195 | struct timer_base { | ||
196 | spinlock_t lock; | ||
197 | struct timer_list *running_timer; | ||
198 | unsigned long clk; | ||
199 | unsigned long next_expiry; | ||
200 | unsigned int cpu; | ||
201 | bool migration_enabled; | ||
202 | bool nohz_active; | ||
203 | bool is_idle; | ||
204 | DECLARE_BITMAP(pending_map, WHEEL_SIZE); | ||
205 | struct hlist_head vectors[WHEEL_SIZE]; | ||
206 | } ____cacheline_aligned; | ||
97 | 207 | ||
98 | static DEFINE_PER_CPU(struct tvec_base, tvec_bases); | 208 | static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); |
99 | 209 | ||
100 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) | 210 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) |
101 | unsigned int sysctl_timer_migration = 1; | 211 | unsigned int sysctl_timer_migration = 1; |
@@ -106,15 +216,17 @@ void timers_update_migration(bool update_nohz) | |||
106 | unsigned int cpu; | 216 | unsigned int cpu; |
107 | 217 | ||
108 | /* Avoid the loop, if nothing to update */ | 218 | /* Avoid the loop, if nothing to update */ |
109 | if (this_cpu_read(tvec_bases.migration_enabled) == on) | 219 | if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on) |
110 | return; | 220 | return; |
111 | 221 | ||
112 | for_each_possible_cpu(cpu) { | 222 | for_each_possible_cpu(cpu) { |
113 | per_cpu(tvec_bases.migration_enabled, cpu) = on; | 223 | per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; |
224 | per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on; | ||
114 | per_cpu(hrtimer_bases.migration_enabled, cpu) = on; | 225 | per_cpu(hrtimer_bases.migration_enabled, cpu) = on; |
115 | if (!update_nohz) | 226 | if (!update_nohz) |
116 | continue; | 227 | continue; |
117 | per_cpu(tvec_bases.nohz_active, cpu) = true; | 228 | per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; |
229 | per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true; | ||
118 | per_cpu(hrtimer_bases.nohz_active, cpu) = true; | 230 | per_cpu(hrtimer_bases.nohz_active, cpu) = true; |
119 | } | 231 | } |
120 | } | 232 | } |
@@ -133,20 +245,6 @@ int timer_migration_handler(struct ctl_table *table, int write, | |||
133 | mutex_unlock(&mutex); | 245 | mutex_unlock(&mutex); |
134 | return ret; | 246 | return ret; |
135 | } | 247 | } |
136 | |||
137 | static inline struct tvec_base *get_target_base(struct tvec_base *base, | ||
138 | int pinned) | ||
139 | { | ||
140 | if (pinned || !base->migration_enabled) | ||
141 | return this_cpu_ptr(&tvec_bases); | ||
142 | return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); | ||
143 | } | ||
144 | #else | ||
145 | static inline struct tvec_base *get_target_base(struct tvec_base *base, | ||
146 | int pinned) | ||
147 | { | ||
148 | return this_cpu_ptr(&tvec_bases); | ||
149 | } | ||
150 | #endif | 248 | #endif |
151 | 249 | ||
152 | static unsigned long round_jiffies_common(unsigned long j, int cpu, | 250 | static unsigned long round_jiffies_common(unsigned long j, int cpu, |
@@ -351,101 +449,126 @@ unsigned long round_jiffies_up_relative(unsigned long j) | |||
351 | } | 449 | } |
352 | EXPORT_SYMBOL_GPL(round_jiffies_up_relative); | 450 | EXPORT_SYMBOL_GPL(round_jiffies_up_relative); |
353 | 451 | ||
354 | /** | 452 | |
355 | * set_timer_slack - set the allowed slack for a timer | 453 | static inline unsigned int timer_get_idx(struct timer_list *timer) |
356 | * @timer: the timer to be modified | ||
357 | * @slack_hz: the amount of time (in jiffies) allowed for rounding | ||
358 | * | ||
359 | * Set the amount of time, in jiffies, that a certain timer has | ||
360 | * in terms of slack. By setting this value, the timer subsystem | ||
361 | * will schedule the actual timer somewhere between | ||
362 | * the time mod_timer() asks for, and that time plus the slack. | ||
363 | * | ||
364 | * By setting the slack to -1, a percentage of the delay is used | ||
365 | * instead. | ||
366 | */ | ||
367 | void set_timer_slack(struct timer_list *timer, int slack_hz) | ||
368 | { | 454 | { |
369 | timer->slack = slack_hz; | 455 | return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; |
370 | } | 456 | } |
371 | EXPORT_SYMBOL_GPL(set_timer_slack); | ||
372 | 457 | ||
373 | static void | 458 | static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) |
374 | __internal_add_timer(struct tvec_base *base, struct timer_list *timer) | ||
375 | { | 459 | { |
376 | unsigned long expires = timer->expires; | 460 | timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | |
377 | unsigned long idx = expires - base->timer_jiffies; | 461 | idx << TIMER_ARRAYSHIFT; |
378 | struct hlist_head *vec; | 462 | } |
379 | 463 | ||
380 | if (idx < TVR_SIZE) { | 464 | /* |
381 | int i = expires & TVR_MASK; | 465 | * Helper function to calculate the array index for a given expiry |
382 | vec = base->tv1.vec + i; | 466 | * time. |
383 | } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { | 467 | */ |
384 | int i = (expires >> TVR_BITS) & TVN_MASK; | 468 | static inline unsigned calc_index(unsigned expires, unsigned lvl) |
385 | vec = base->tv2.vec + i; | 469 | { |
386 | } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { | 470 | expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); |
387 | int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; | 471 | return LVL_OFFS(lvl) + (expires & LVL_MASK); |
388 | vec = base->tv3.vec + i; | 472 | } |
389 | } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { | 473 | |
390 | int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; | 474 | static int calc_wheel_index(unsigned long expires, unsigned long clk) |
391 | vec = base->tv4.vec + i; | 475 | { |
392 | } else if ((signed long) idx < 0) { | 476 | unsigned long delta = expires - clk; |
393 | /* | 477 | unsigned int idx; |
394 | * Can happen if you add a timer with expires == jiffies, | 478 | |
395 | * or you set a timer to go off in the past | 479 | if (delta < LVL_START(1)) { |
396 | */ | 480 | idx = calc_index(expires, 0); |
397 | vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); | 481 | } else if (delta < LVL_START(2)) { |
482 | idx = calc_index(expires, 1); | ||
483 | } else if (delta < LVL_START(3)) { | ||
484 | idx = calc_index(expires, 2); | ||
485 | } else if (delta < LVL_START(4)) { | ||
486 | idx = calc_index(expires, 3); | ||
487 | } else if (delta < LVL_START(5)) { | ||
488 | idx = calc_index(expires, 4); | ||
489 | } else if (delta < LVL_START(6)) { | ||
490 | idx = calc_index(expires, 5); | ||
491 | } else if (delta < LVL_START(7)) { | ||
492 | idx = calc_index(expires, 6); | ||
493 | } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { | ||
494 | idx = calc_index(expires, 7); | ||
495 | } else if ((long) delta < 0) { | ||
496 | idx = clk & LVL_MASK; | ||
398 | } else { | 497 | } else { |
399 | int i; | 498 | /* |
400 | /* If the timeout is larger than MAX_TVAL (on 64-bit | 499 | * Force obscenely large timeouts to expire at the |
401 | * architectures or with CONFIG_BASE_SMALL=1) then we | 500 | * capacity limit of the wheel. |
402 | * use the maximum timeout. | ||
403 | */ | 501 | */ |
404 | if (idx > MAX_TVAL) { | 502 | if (expires >= WHEEL_TIMEOUT_CUTOFF) |
405 | idx = MAX_TVAL; | 503 | expires = WHEEL_TIMEOUT_MAX; |
406 | expires = idx + base->timer_jiffies; | 504 | |
407 | } | 505 | idx = calc_index(expires, LVL_DEPTH - 1); |
408 | i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; | ||
409 | vec = base->tv5.vec + i; | ||
410 | } | 506 | } |
507 | return idx; | ||
508 | } | ||
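
Editor's note: because calc_index() adds LVL_GRAN(lvl) before shifting, an expiry is always rounded up to the next bucket boundary of its level, so a timer can fire up to one level granularity late but never early. A standalone sketch of the bucket selection follows; the explicit if/else chain is collapsed into a loop here and the negative-delta and cutoff cases are omitted, so this is illustrative rather than a drop-in replica.

/*
 * Standalone sketch of the bucket selection, with the constants copied
 * from this patch and LVL_DEPTH fixed at 9 (HZ > 100).
 */
#include <stdio.h>

#define LVL_CLK_SHIFT	3
#define LVL_BITS	6
#define LVL_SIZE	(1UL << LVL_BITS)
#define LVL_MASK	(LVL_SIZE - 1)
#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n))
#define LVL_OFFS(n)	((n) * LVL_SIZE)
#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
#define LVL_DEPTH	9

static unsigned int calc_index(unsigned long expires, unsigned int lvl)
{
	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
	return LVL_OFFS(lvl) + (expires & LVL_MASK);
}

static unsigned int calc_wheel_index(unsigned long expires, unsigned long clk)
{
	unsigned long delta = expires - clk;
	unsigned int lvl;

	for (lvl = 0; lvl < LVL_DEPTH - 1; lvl++)
		if (delta < LVL_START(lvl + 1))
			return calc_index(expires, lvl);
	return calc_index(expires, LVL_DEPTH - 1);
}

int main(void)
{
	unsigned long clk = 1000;	/* arbitrary base clock */
	unsigned long deltas[] = { 1, 62, 63, 500, 5000 };
	unsigned int i;

	for (i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++)
		printf("delta %5lu -> bucket %u\n", deltas[i],
		       calc_wheel_index(clk + deltas[i], clk));
	return 0;
}

With clk = 1000, a delta of 62 still lands in level 0 (one-jiffy buckets), while a delta of 63 already moves to level 1 and is rounded up to the next 8-jiffy boundary.
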
509 | |||
510 | /* | ||
511 | * Enqueue the timer into the hash bucket, mark it pending in | ||
512 | * the bitmap and store the index in the timer flags. | ||
513 | */ | ||
514 | static void enqueue_timer(struct timer_base *base, struct timer_list *timer, | ||
515 | unsigned int idx) | ||
516 | { | ||
517 | hlist_add_head(&timer->entry, base->vectors + idx); | ||
518 | __set_bit(idx, base->pending_map); | ||
519 | timer_set_idx(timer, idx); | ||
520 | } | ||
521 | |||
522 | static void | ||
523 | __internal_add_timer(struct timer_base *base, struct timer_list *timer) | ||
524 | { | ||
525 | unsigned int idx; | ||
411 | 526 | ||
412 | hlist_add_head(&timer->entry, vec); | 527 | idx = calc_wheel_index(timer->expires, base->clk); |
528 | enqueue_timer(base, timer, idx); | ||
413 | } | 529 | } |
414 | 530 | ||
415 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | 531 | static void |
532 | trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) | ||
416 | { | 533 | { |
417 | /* Advance base->jiffies, if the base is empty */ | 534 | if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) |
418 | if (!base->all_timers++) | 535 | return; |
419 | base->timer_jiffies = jiffies; | ||
420 | 536 | ||
421 | __internal_add_timer(base, timer); | ||
422 | /* | 537 | /* |
423 | * Update base->active_timers and base->next_timer | 538 | * TODO: This wants some optimizing similar to the code below, but we |
539 | * will do that when we switch from push to pull for deferrable timers. | ||
424 | */ | 540 | */ |
425 | if (!(timer->flags & TIMER_DEFERRABLE)) { | 541 | if (timer->flags & TIMER_DEFERRABLE) { |
426 | if (!base->active_timers++ || | 542 | if (tick_nohz_full_cpu(base->cpu)) |
427 | time_before(timer->expires, base->next_timer)) | 543 | wake_up_nohz_cpu(base->cpu); |
428 | base->next_timer = timer->expires; | 544 | return; |
429 | } | 545 | } |
430 | 546 | ||
431 | /* | 547 | /* |
432 | * Check whether the other CPU is in dynticks mode and needs | 548 | * We might have to IPI the remote CPU if the base is idle and the |
433 | * to be triggered to reevaluate the timer wheel. | 549 | * timer is not deferrable. If the other CPU is on the way to idle |
434 | * We are protected against the other CPU fiddling | 550 | * then it can't set base->is_idle as we hold the base lock: |
435 | * with the timer by holding the timer base lock. This also | ||
436 | * makes sure that a CPU on the way to stop its tick can not | ||
437 | * evaluate the timer wheel. | ||
438 | * | ||
439 | * Spare the IPI for deferrable timers on idle targets though. | ||
440 | * The next busy ticks will take care of it. Except full dynticks | ||
441 | * require special care against races with idle_cpu(), lets deal | ||
442 | * with that later. | ||
443 | */ | 551 | */ |
444 | if (base->nohz_active) { | 552 | if (!base->is_idle) |
445 | if (!(timer->flags & TIMER_DEFERRABLE) || | 553 | return; |
446 | tick_nohz_full_cpu(base->cpu)) | 554 | |
447 | wake_up_nohz_cpu(base->cpu); | 555 | /* Check whether this is the new first expiring timer: */ |
448 | } | 556 | if (time_after_eq(timer->expires, base->next_expiry)) |
557 | return; | ||
558 | |||
559 | /* | ||
560 | * Set the next expiry time and kick the CPU so it can reevaluate the | ||
561 | * wheel: | ||
562 | */ | ||
563 | base->next_expiry = timer->expires; | ||
564 | wake_up_nohz_cpu(base->cpu); | ||
565 | } | ||
566 | |||
567 | static void | ||
568 | internal_add_timer(struct timer_base *base, struct timer_list *timer) | ||
569 | { | ||
570 | __internal_add_timer(base, timer); | ||
571 | trigger_dyntick_cpu(base, timer); | ||
449 | } | 572 | } |
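
Editor's note: the decision tree in trigger_dyntick_cpu() reduces to a handful of boolean tests; the sketch below restates it in plain userspace C so the conditions are easy to see. The function name and parameters are invented for illustration and do not exist in the kernel.

/*
 * Userspace sketch of the remote-wakeup decision in trigger_dyntick_cpu()
 * above, reduced to plain booleans. Purely illustrative; the real code
 * works on struct timer_base under base->lock.
 */
#include <stdbool.h>
#include <stdio.h>

static bool needs_wakeup(bool nohz_active, bool deferrable, bool nohz_full_cpu,
			 bool base_is_idle, unsigned long expires,
			 unsigned long next_expiry)
{
	if (!nohz_active)
		return false;
	if (deferrable)
		return nohz_full_cpu;	/* only NOHZ_FULL CPUs get kicked */
	if (!base_is_idle)
		return false;		/* target tick still runs */
	return expires < next_expiry;	/* only if it becomes the new first timer */
}

int main(void)
{
	printf("%d\n", needs_wakeup(true, false, false, true, 100, 200)); /* 1 */
	printf("%d\n", needs_wakeup(true, false, false, true, 300, 200)); /* 0 */
	return 0;
}
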
450 | 573 | ||
451 | #ifdef CONFIG_TIMER_STATS | 574 | #ifdef CONFIG_TIMER_STATS |
@@ -666,7 +789,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, | |||
666 | { | 789 | { |
667 | timer->entry.pprev = NULL; | 790 | timer->entry.pprev = NULL; |
668 | timer->flags = flags | raw_smp_processor_id(); | 791 | timer->flags = flags | raw_smp_processor_id(); |
669 | timer->slack = -1; | ||
670 | #ifdef CONFIG_TIMER_STATS | 792 | #ifdef CONFIG_TIMER_STATS |
671 | timer->start_site = NULL; | 793 | timer->start_site = NULL; |
672 | timer->start_pid = -1; | 794 | timer->start_pid = -1; |
@@ -706,54 +828,125 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending) | |||
706 | entry->next = LIST_POISON2; | 828 | entry->next = LIST_POISON2; |
707 | } | 829 | } |
708 | 830 | ||
709 | static inline void | 831 | static int detach_if_pending(struct timer_list *timer, struct timer_base *base, |
710 | detach_expired_timer(struct timer_list *timer, struct tvec_base *base) | ||
711 | { | ||
712 | detach_timer(timer, true); | ||
713 | if (!(timer->flags & TIMER_DEFERRABLE)) | ||
714 | base->active_timers--; | ||
715 | base->all_timers--; | ||
716 | } | ||
717 | |||
718 | static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, | ||
719 | bool clear_pending) | 832 | bool clear_pending) |
720 | { | 833 | { |
834 | unsigned idx = timer_get_idx(timer); | ||
835 | |||
721 | if (!timer_pending(timer)) | 836 | if (!timer_pending(timer)) |
722 | return 0; | 837 | return 0; |
723 | 838 | ||
839 | if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) | ||
840 | __clear_bit(idx, base->pending_map); | ||
841 | |||
724 | detach_timer(timer, clear_pending); | 842 | detach_timer(timer, clear_pending); |
725 | if (!(timer->flags & TIMER_DEFERRABLE)) { | ||
726 | base->active_timers--; | ||
727 | if (timer->expires == base->next_timer) | ||
728 | base->next_timer = base->timer_jiffies; | ||
729 | } | ||
730 | /* If this was the last timer, advance base->jiffies */ | ||
731 | if (!--base->all_timers) | ||
732 | base->timer_jiffies = jiffies; | ||
733 | return 1; | 843 | return 1; |
734 | } | 844 | } |
735 | 845 | ||
846 | static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) | ||
847 | { | ||
848 | struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); | ||
849 | |||
850 | /* | ||
851 | * If the timer is deferrable and nohz is active then we need to use | ||
852 | * the deferrable base. | ||
853 | */ | ||
854 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && | ||
855 | (tflags & TIMER_DEFERRABLE)) | ||
856 | base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); | ||
857 | return base; | ||
858 | } | ||
859 | |||
860 | static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) | ||
861 | { | ||
862 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); | ||
863 | |||
864 | /* | ||
865 | * If the timer is deferrable and nohz is active then we need to use | ||
866 | * the deferrable base. | ||
867 | */ | ||
868 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && | ||
869 | (tflags & TIMER_DEFERRABLE)) | ||
870 | base = this_cpu_ptr(&timer_bases[BASE_DEF]); | ||
871 | return base; | ||
872 | } | ||
873 | |||
874 | static inline struct timer_base *get_timer_base(u32 tflags) | ||
875 | { | ||
876 | return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); | ||
877 | } | ||
878 | |||
879 | #ifdef CONFIG_NO_HZ_COMMON | ||
880 | static inline struct timer_base * | ||
881 | __get_target_base(struct timer_base *base, unsigned tflags) | ||
882 | { | ||
883 | #ifdef CONFIG_SMP | ||
884 | if ((tflags & TIMER_PINNED) || !base->migration_enabled) | ||
885 | return get_timer_this_cpu_base(tflags); | ||
886 | return get_timer_cpu_base(tflags, get_nohz_timer_target()); | ||
887 | #else | ||
888 | return get_timer_this_cpu_base(tflags); | ||
889 | #endif | ||
890 | } | ||
891 | |||
892 | static inline void forward_timer_base(struct timer_base *base) | ||
893 | { | ||
894 | /* | ||
895 | * We only forward the base when it's idle and we have a delta between | ||
896 | * base clock and jiffies. | ||
897 | */ | ||
898 | if (!base->is_idle || (long) (jiffies - base->clk) < 2) | ||
899 | return; | ||
900 | |||
901 | /* | ||
902 | * If the next expiry value is > jiffies, then we fast forward to | ||
903 | * jiffies otherwise we forward to the next expiry value. | ||
904 | */ | ||
905 | if (time_after(base->next_expiry, jiffies)) | ||
906 | base->clk = jiffies; | ||
907 | else | ||
908 | base->clk = base->next_expiry; | ||
909 | } | ||
910 | #else | ||
911 | static inline struct timer_base * | ||
912 | __get_target_base(struct timer_base *base, unsigned tflags) | ||
913 | { | ||
914 | return get_timer_this_cpu_base(tflags); | ||
915 | } | ||
916 | |||
917 | static inline void forward_timer_base(struct timer_base *base) { } | ||
918 | #endif | ||
919 | |||
920 | static inline struct timer_base * | ||
921 | get_target_base(struct timer_base *base, unsigned tflags) | ||
922 | { | ||
923 | struct timer_base *target = __get_target_base(base, tflags); | ||
924 | |||
925 | forward_timer_base(target); | ||
926 | return target; | ||
927 | } | ||
928 | |||
736 | /* | 929 | /* |
737 | * We are using hashed locking: holding per_cpu(tvec_bases).lock | 930 | * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means |
738 | * means that all timers which are tied to this base via timer->base are | 931 | * that all timers which are tied to this base are locked, and the base itself |
739 | * locked, and the base itself is locked too. | 932 | * is locked too. |
740 | * | 933 | * |
741 | * So __run_timers/migrate_timers can safely modify all timers which could | 934 | * So __run_timers/migrate_timers can safely modify all timers which could |
742 | * be found on ->tvX lists. | 935 | * be found in the base->vectors array. |
743 | * | 936 | * |
744 | * When the timer's base is locked and removed from the list, the | 937 | * When a timer is migrating then the TIMER_MIGRATING flag is set and we need |
745 | * TIMER_MIGRATING flag is set, FIXME | 938 | * to wait until the migration is done. |
746 | */ | 939 | */ |
747 | static struct tvec_base *lock_timer_base(struct timer_list *timer, | 940 | static struct timer_base *lock_timer_base(struct timer_list *timer, |
748 | unsigned long *flags) | 941 | unsigned long *flags) |
749 | __acquires(timer->base->lock) | 942 | __acquires(timer->base->lock) |
750 | { | 943 | { |
751 | for (;;) { | 944 | for (;;) { |
945 | struct timer_base *base; | ||
752 | u32 tf = timer->flags; | 946 | u32 tf = timer->flags; |
753 | struct tvec_base *base; | ||
754 | 947 | ||
755 | if (!(tf & TIMER_MIGRATING)) { | 948 | if (!(tf & TIMER_MIGRATING)) { |
756 | base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); | 949 | base = get_timer_base(tf); |
757 | spin_lock_irqsave(&base->lock, *flags); | 950 | spin_lock_irqsave(&base->lock, *flags); |
758 | if (timer->flags == tf) | 951 | if (timer->flags == tf) |
759 | return base; | 952 | return base; |
@@ -764,13 +957,41 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, | |||
764 | } | 957 | } |
765 | 958 | ||
766 | static inline int | 959 | static inline int |
767 | __mod_timer(struct timer_list *timer, unsigned long expires, | 960 | __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) |
768 | bool pending_only, int pinned) | ||
769 | { | 961 | { |
770 | struct tvec_base *base, *new_base; | 962 | struct timer_base *base, *new_base; |
771 | unsigned long flags; | 963 | unsigned int idx = UINT_MAX; |
964 | unsigned long clk = 0, flags; | ||
772 | int ret = 0; | 965 | int ret = 0; |
773 | 966 | ||
967 | /* | ||
968 | * This is a common optimization triggered by the networking code - if | ||
969 | * the timer is re-modified to have the same timeout or ends up in the | ||
970 | * same array bucket then just return: | ||
971 | */ | ||
972 | if (timer_pending(timer)) { | ||
973 | if (timer->expires == expires) | ||
974 | return 1; | ||
975 | /* | ||
977 | * Take the current base clock, but without holding | ||
977 | * the lock! | ||
978 | */ | ||
979 | base = get_timer_base(timer->flags); | ||
980 | clk = base->clk; | ||
981 | |||
982 | idx = calc_wheel_index(expires, clk); | ||
983 | |||
984 | /* | ||
985 | * Retrieve and compare the array index of the pending | ||
986 | * timer. If it matches set the expiry to the new value so a | ||
987 | * subsequent call will exit in the expires check above. | ||
988 | */ | ||
989 | if (idx == timer_get_idx(timer)) { | ||
990 | timer->expires = expires; | ||
991 | return 1; | ||
992 | } | ||
993 | } | ||
994 | |||
774 | timer_stats_timer_set_start_info(timer); | 995 | timer_stats_timer_set_start_info(timer); |
775 | BUG_ON(!timer->function); | 996 | BUG_ON(!timer->function); |
776 | 997 | ||
@@ -782,15 +1003,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
782 | 1003 | ||
783 | debug_activate(timer, expires); | 1004 | debug_activate(timer, expires); |
784 | 1005 | ||
785 | new_base = get_target_base(base, pinned); | 1006 | new_base = get_target_base(base, timer->flags); |
786 | 1007 | ||
787 | if (base != new_base) { | 1008 | if (base != new_base) { |
788 | /* | 1009 | /* |
789 | * We are trying to schedule the timer on the local CPU. | 1010 | * We are trying to schedule the timer on the new base. |
790 | * However we can't change timer's base while it is running, | 1011 | * However we can't change timer's base while it is running, |
791 | * otherwise del_timer_sync() can't detect that the timer's | 1012 | * otherwise del_timer_sync() can't detect that the timer's |
792 | * handler yet has not finished. This also guarantees that | 1013 | * handler has not yet finished. This also guarantees that the |
793 | * the timer is serialized wrt itself. | 1014 | * timer is serialized wrt itself. |
794 | */ | 1015 | */ |
795 | if (likely(base->running_timer != timer)) { | 1016 | if (likely(base->running_timer != timer)) { |
796 | /* See the comment in lock_timer_base() */ | 1017 | /* See the comment in lock_timer_base() */ |
@@ -805,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
805 | } | 1026 | } |
806 | 1027 | ||
807 | timer->expires = expires; | 1028 | timer->expires = expires; |
808 | internal_add_timer(base, timer); | 1029 | /* |
1030 | * If 'idx' was calculated above and the base time did not advance | ||
1031 | * between calculating 'idx' and taking the lock, only enqueue_timer() | ||
1032 | * and trigger_dyntick_cpu() are required. Otherwise we need to | ||
1033 | * (re)calculate the wheel index via internal_add_timer(). | ||
1034 | */ | ||
1035 | if (idx != UINT_MAX && clk == base->clk) { | ||
1036 | enqueue_timer(base, timer, idx); | ||
1037 | trigger_dyntick_cpu(base, timer); | ||
1038 | } else { | ||
1039 | internal_add_timer(base, timer); | ||
1040 | } | ||
809 | 1041 | ||
810 | out_unlock: | 1042 | out_unlock: |
811 | spin_unlock_irqrestore(&base->lock, flags); | 1043 | spin_unlock_irqrestore(&base->lock, flags); |
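
Editor's note: the pending-timer fast path above is aimed at callers that keep pushing the same timeout further out, the classic case being per-connection inactivity timers in networking. A hedged sketch of that usage pattern follows; this is hypothetical driver-style code, not taken from this patch. setup_timer()/mod_timer() and the types are the timer API of this kernel generation, while the names and the 30-second timeout are invented.

/*
 * Hypothetical sketch: an inactivity timer pushed out on every packet.
 * With coarse upper-level granularity, most of these mod_timer() calls
 * resolve to the bucket the timer already sits in and return after the
 * unlocked index comparison, without taking the base lock.
 */
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list inactivity_timer;	/* hypothetical */

static void inactivity_timeout(unsigned long data)
{
	/* connection went quiet; tear it down, log, etc. */
}

static void on_packet_received(void)
{
	/* push the deadline 30s out; usually lands in the same bucket */
	mod_timer(&inactivity_timer, jiffies + 30 * HZ);
}

static void conn_init(void)
{
	setup_timer(&inactivity_timer, inactivity_timeout, 0);
	on_packet_received();
}
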
@@ -825,49 +1057,10 @@ out_unlock: | |||
825 | */ | 1057 | */ |
826 | int mod_timer_pending(struct timer_list *timer, unsigned long expires) | 1058 | int mod_timer_pending(struct timer_list *timer, unsigned long expires) |
827 | { | 1059 | { |
828 | return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); | 1060 | return __mod_timer(timer, expires, true); |
829 | } | 1061 | } |
830 | EXPORT_SYMBOL(mod_timer_pending); | 1062 | EXPORT_SYMBOL(mod_timer_pending); |
831 | 1063 | ||
832 | /* | ||
833 | * Decide where to put the timer while taking the slack into account | ||
834 | * | ||
835 | * Algorithm: | ||
836 | * 1) calculate the maximum (absolute) time | ||
837 | * 2) calculate the highest bit where the expires and new max are different | ||
838 | * 3) use this bit to make a mask | ||
839 | * 4) use the bitmask to round down the maximum time, so that all last | ||
840 | * bits are zeros | ||
841 | */ | ||
842 | static inline | ||
843 | unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | ||
844 | { | ||
845 | unsigned long expires_limit, mask; | ||
846 | int bit; | ||
847 | |||
848 | if (timer->slack >= 0) { | ||
849 | expires_limit = expires + timer->slack; | ||
850 | } else { | ||
851 | long delta = expires - jiffies; | ||
852 | |||
853 | if (delta < 256) | ||
854 | return expires; | ||
855 | |||
856 | expires_limit = expires + delta / 256; | ||
857 | } | ||
858 | mask = expires ^ expires_limit; | ||
859 | if (mask == 0) | ||
860 | return expires; | ||
861 | |||
862 | bit = __fls(mask); | ||
863 | |||
864 | mask = (1UL << bit) - 1; | ||
865 | |||
866 | expires_limit = expires_limit & ~(mask); | ||
867 | |||
868 | return expires_limit; | ||
869 | } | ||
870 | |||
871 | /** | 1064 | /** |
872 | * mod_timer - modify a timer's timeout | 1065 | * mod_timer - modify a timer's timeout |
873 | * @timer: the timer to be modified | 1066 | * @timer: the timer to be modified |
@@ -890,49 +1083,11 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | |||
890 | */ | 1083 | */ |
891 | int mod_timer(struct timer_list *timer, unsigned long expires) | 1084 | int mod_timer(struct timer_list *timer, unsigned long expires) |
892 | { | 1085 | { |
893 | expires = apply_slack(timer, expires); | 1086 | return __mod_timer(timer, expires, false); |
894 | |||
895 | /* | ||
896 | * This is a common optimization triggered by the | ||
897 | * networking code - if the timer is re-modified | ||
898 | * to be the same thing then just return: | ||
899 | */ | ||
900 | if (timer_pending(timer) && timer->expires == expires) | ||
901 | return 1; | ||
902 | |||
903 | return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); | ||
904 | } | 1087 | } |
905 | EXPORT_SYMBOL(mod_timer); | 1088 | EXPORT_SYMBOL(mod_timer); |
906 | 1089 | ||
907 | /** | 1090 | /** |
908 | * mod_timer_pinned - modify a timer's timeout | ||
909 | * @timer: the timer to be modified | ||
910 | * @expires: new timeout in jiffies | ||
911 | * | ||
912 | * mod_timer_pinned() is a way to update the expire field of an | ||
913 | * active timer (if the timer is inactive it will be activated) | ||
914 | * and to ensure that the timer is scheduled on the current CPU. | ||
915 | * | ||
916 | * Note that this does not prevent the timer from being migrated | ||
917 | * when the current CPU goes offline. If this is a problem for | ||
918 | * you, use CPU-hotplug notifiers to handle it correctly, for | ||
919 | * example, cancelling the timer when the corresponding CPU goes | ||
920 | * offline. | ||
921 | * | ||
922 | * mod_timer_pinned(timer, expires) is equivalent to: | ||
923 | * | ||
924 | * del_timer(timer); timer->expires = expires; add_timer(timer); | ||
925 | */ | ||
926 | int mod_timer_pinned(struct timer_list *timer, unsigned long expires) | ||
927 | { | ||
928 | if (timer->expires == expires && timer_pending(timer)) | ||
929 | return 1; | ||
930 | |||
931 | return __mod_timer(timer, expires, false, TIMER_PINNED); | ||
932 | } | ||
933 | EXPORT_SYMBOL(mod_timer_pinned); | ||
934 | |||
935 | /** | ||
936 | * add_timer - start a timer | 1091 | * add_timer - start a timer |
937 | * @timer: the timer to be added | 1092 | * @timer: the timer to be added |
938 | * | 1093 | * |
@@ -962,13 +1117,14 @@ EXPORT_SYMBOL(add_timer); | |||
962 | */ | 1117 | */ |
963 | void add_timer_on(struct timer_list *timer, int cpu) | 1118 | void add_timer_on(struct timer_list *timer, int cpu) |
964 | { | 1119 | { |
965 | struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); | 1120 | struct timer_base *new_base, *base; |
966 | struct tvec_base *base; | ||
967 | unsigned long flags; | 1121 | unsigned long flags; |
968 | 1122 | ||
969 | timer_stats_timer_set_start_info(timer); | 1123 | timer_stats_timer_set_start_info(timer); |
970 | BUG_ON(timer_pending(timer) || !timer->function); | 1124 | BUG_ON(timer_pending(timer) || !timer->function); |
971 | 1125 | ||
1126 | new_base = get_timer_cpu_base(timer->flags, cpu); | ||
1127 | |||
972 | /* | 1128 | /* |
973 | * If @timer was on a different CPU, it should be migrated with the | 1129 | * If @timer was on a different CPU, it should be migrated with the |
974 | * old base locked to prevent other operations proceeding with the | 1130 | * old base locked to prevent other operations proceeding with the |
@@ -1004,7 +1160,7 @@ EXPORT_SYMBOL_GPL(add_timer_on); | |||
1004 | */ | 1160 | */ |
1005 | int del_timer(struct timer_list *timer) | 1161 | int del_timer(struct timer_list *timer) |
1006 | { | 1162 | { |
1007 | struct tvec_base *base; | 1163 | struct timer_base *base; |
1008 | unsigned long flags; | 1164 | unsigned long flags; |
1009 | int ret = 0; | 1165 | int ret = 0; |
1010 | 1166 | ||
@@ -1030,7 +1186,7 @@ EXPORT_SYMBOL(del_timer); | |||
1030 | */ | 1186 | */ |
1031 | int try_to_del_timer_sync(struct timer_list *timer) | 1187 | int try_to_del_timer_sync(struct timer_list *timer) |
1032 | { | 1188 | { |
1033 | struct tvec_base *base; | 1189 | struct timer_base *base; |
1034 | unsigned long flags; | 1190 | unsigned long flags; |
1035 | int ret = -1; | 1191 | int ret = -1; |
1036 | 1192 | ||
@@ -1114,27 +1270,6 @@ int del_timer_sync(struct timer_list *timer) | |||
1114 | EXPORT_SYMBOL(del_timer_sync); | 1270 | EXPORT_SYMBOL(del_timer_sync); |
1115 | #endif | 1271 | #endif |
1116 | 1272 | ||
1117 | static int cascade(struct tvec_base *base, struct tvec *tv, int index) | ||
1118 | { | ||
1119 | /* cascade all the timers from tv up one level */ | ||
1120 | struct timer_list *timer; | ||
1121 | struct hlist_node *tmp; | ||
1122 | struct hlist_head tv_list; | ||
1123 | |||
1124 | hlist_move_list(tv->vec + index, &tv_list); | ||
1125 | |||
1126 | /* | ||
1127 | * We are removing _all_ timers from the list, so we | ||
1128 | * don't have to detach them individually. | ||
1129 | */ | ||
1130 | hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { | ||
1131 | /* No accounting, while moving them */ | ||
1132 | __internal_add_timer(base, timer); | ||
1133 | } | ||
1134 | |||
1135 | return index; | ||
1136 | } | ||
1137 | |||
1138 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | 1273 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), |
1139 | unsigned long data) | 1274 | unsigned long data) |
1140 | { | 1275 | { |
@@ -1178,147 +1313,141 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
1178 | } | 1313 | } |
1179 | } | 1314 | } |
1180 | 1315 | ||
1181 | #define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) | 1316 | static void expire_timers(struct timer_base *base, struct hlist_head *head) |
1182 | |||
1183 | /** | ||
1184 | * __run_timers - run all expired timers (if any) on this CPU. | ||
1185 | * @base: the timer vector to be processed. | ||
1186 | * | ||
1187 | * This function cascades all vectors and executes all expired timer | ||
1188 | * vectors. | ||
1189 | */ | ||
1190 | static inline void __run_timers(struct tvec_base *base) | ||
1191 | { | 1317 | { |
1192 | struct timer_list *timer; | 1318 | while (!hlist_empty(head)) { |
1319 | struct timer_list *timer; | ||
1320 | void (*fn)(unsigned long); | ||
1321 | unsigned long data; | ||
1193 | 1322 | ||
1194 | spin_lock_irq(&base->lock); | 1323 | timer = hlist_entry(head->first, struct timer_list, entry); |
1324 | timer_stats_account_timer(timer); | ||
1195 | 1325 | ||
1196 | while (time_after_eq(jiffies, base->timer_jiffies)) { | 1326 | base->running_timer = timer; |
1197 | struct hlist_head work_list; | 1327 | detach_timer(timer, true); |
1198 | struct hlist_head *head = &work_list; | ||
1199 | int index; | ||
1200 | 1328 | ||
1201 | if (!base->all_timers) { | 1329 | fn = timer->function; |
1202 | base->timer_jiffies = jiffies; | 1330 | data = timer->data; |
1203 | break; | 1331 | |
1332 | if (timer->flags & TIMER_IRQSAFE) { | ||
1333 | spin_unlock(&base->lock); | ||
1334 | call_timer_fn(timer, fn, data); | ||
1335 | spin_lock(&base->lock); | ||
1336 | } else { | ||
1337 | spin_unlock_irq(&base->lock); | ||
1338 | call_timer_fn(timer, fn, data); | ||
1339 | spin_lock_irq(&base->lock); | ||
1204 | } | 1340 | } |
1341 | } | ||
1342 | } | ||
1205 | 1343 | ||
1206 | index = base->timer_jiffies & TVR_MASK; | 1344 | static int __collect_expired_timers(struct timer_base *base, |
1345 | struct hlist_head *heads) | ||
1346 | { | ||
1347 | unsigned long clk = base->clk; | ||
1348 | struct hlist_head *vec; | ||
1349 | int i, levels = 0; | ||
1350 | unsigned int idx; | ||
1207 | 1351 | ||
1208 | /* | 1352 | for (i = 0; i < LVL_DEPTH; i++) { |
1209 | * Cascade timers: | 1353 | idx = (clk & LVL_MASK) + i * LVL_SIZE; |
1210 | */ | 1354 | |
1211 | if (!index && | 1355 | if (__test_and_clear_bit(idx, base->pending_map)) { |
1212 | (!cascade(base, &base->tv2, INDEX(0))) && | 1356 | vec = base->vectors + idx; |
1213 | (!cascade(base, &base->tv3, INDEX(1))) && | 1357 | hlist_move_list(vec, heads++); |
1214 | !cascade(base, &base->tv4, INDEX(2))) | 1358 | levels++; |
1215 | cascade(base, &base->tv5, INDEX(3)); | ||
1216 | ++base->timer_jiffies; | ||
1217 | hlist_move_list(base->tv1.vec + index, head); | ||
1218 | while (!hlist_empty(head)) { | ||
1219 | void (*fn)(unsigned long); | ||
1220 | unsigned long data; | ||
1221 | bool irqsafe; | ||
1222 | |||
1223 | timer = hlist_entry(head->first, struct timer_list, entry); | ||
1224 | fn = timer->function; | ||
1225 | data = timer->data; | ||
1226 | irqsafe = timer->flags & TIMER_IRQSAFE; | ||
1227 | |||
1228 | timer_stats_account_timer(timer); | ||
1229 | |||
1230 | base->running_timer = timer; | ||
1231 | detach_expired_timer(timer, base); | ||
1232 | |||
1233 | if (irqsafe) { | ||
1234 | spin_unlock(&base->lock); | ||
1235 | call_timer_fn(timer, fn, data); | ||
1236 | spin_lock(&base->lock); | ||
1237 | } else { | ||
1238 | spin_unlock_irq(&base->lock); | ||
1239 | call_timer_fn(timer, fn, data); | ||
1240 | spin_lock_irq(&base->lock); | ||
1241 | } | ||
1242 | } | 1359 | } |
1360 | /* Is it time to look at the next level? */ | ||
1361 | if (clk & LVL_CLK_MASK) | ||
1362 | break; | ||
1363 | /* Shift clock for the next level granularity */ | ||
1364 | clk >>= LVL_CLK_SHIFT; | ||
1243 | } | 1365 | } |
1244 | base->running_timer = NULL; | 1366 | return levels; |
1245 | spin_unlock_irq(&base->lock); | ||
1246 | } | 1367 | } |
1247 | 1368 | ||
1248 | #ifdef CONFIG_NO_HZ_COMMON | 1369 | #ifdef CONFIG_NO_HZ_COMMON |
1249 | /* | 1370 | /* |
1250 | * Find out when the next timer event is due to happen. This | 1371 | * Find the next pending bucket of a level. Search from level start (@offset) |
1252 | * is used on S/390 to stop all activity when a CPU is idle. | 1373 | * + @clk upwards and, if nothing is found there, search from start of the level |
1252 | * This function needs to be called with interrupts disabled. | 1373 | * (@offset) up to @offset + clk. |
1374 | */ | ||
1375 | static int next_pending_bucket(struct timer_base *base, unsigned offset, | ||
1376 | unsigned clk) | ||
1377 | { | ||
1378 | unsigned pos, start = offset + clk; | ||
1379 | unsigned end = offset + LVL_SIZE; | ||
1380 | |||
1381 | pos = find_next_bit(base->pending_map, end, start); | ||
1382 | if (pos < end) | ||
1383 | return pos - start; | ||
1384 | |||
1385 | pos = find_next_bit(base->pending_map, start, offset); | ||
1386 | return pos < start ? pos + LVL_SIZE - start : -1; | ||
1387 | } | ||
1388 | |||
1389 | /* | ||
1390 | * Search the first expiring timer in the various clock levels. Caller must | ||
1391 | * hold base->lock. | ||
1253 | */ | 1392 | */ |
1254 | static unsigned long __next_timer_interrupt(struct tvec_base *base) | 1393 | static unsigned long __next_timer_interrupt(struct timer_base *base) |
1255 | { | 1394 | { |
1256 | unsigned long timer_jiffies = base->timer_jiffies; | 1395 | unsigned long clk, next, adj; |
1257 | unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; | 1396 | unsigned lvl, offset = 0; |
1258 | int index, slot, array, found = 0; | 1397 | |
1259 | struct timer_list *nte; | 1398 | next = base->clk + NEXT_TIMER_MAX_DELTA; |
1260 | struct tvec *varray[4]; | 1399 | clk = base->clk; |
1261 | 1400 | for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { | |
1262 | /* Look for timer events in tv1. */ | 1401 | int pos = next_pending_bucket(base, offset, clk & LVL_MASK); |
1263 | index = slot = timer_jiffies & TVR_MASK; | 1402 | |
1264 | do { | 1403 | if (pos >= 0) { |
1265 | hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { | 1404 | unsigned long tmp = clk + (unsigned long) pos; |
1266 | if (nte->flags & TIMER_DEFERRABLE) | 1405 | |
1267 | continue; | 1406 | tmp <<= LVL_SHIFT(lvl); |
1268 | 1407 | if (time_before(tmp, next)) | |
1269 | found = 1; | 1408 | next = tmp; |
1270 | expires = nte->expires; | ||
1271 | /* Look at the cascade bucket(s)? */ | ||
1272 | if (!index || slot < index) | ||
1273 | goto cascade; | ||
1274 | return expires; | ||
1275 | } | 1409 | } |
1276 | slot = (slot + 1) & TVR_MASK; | 1410 | /* |
1277 | } while (slot != index); | 1411 | * Clock for the next level. If the current level clock lower |
1278 | 1412 | * bits are zero, we look at the next level as is. If not we | |
1279 | cascade: | 1413 | * need to advance it by one because that's going to be the |
1280 | /* Calculate the next cascade event */ | 1414 | * next expiring bucket in that level. base->clk is the next |
1281 | if (index) | 1415 | * expiring jiffie. So in case of: |
1282 | timer_jiffies += TVR_SIZE - index; | 1416 | * |
1283 | timer_jiffies >>= TVR_BITS; | 1417 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
1284 | 1418 | * 0 0 0 0 0 0 | |
1285 | /* Check tv2-tv5. */ | 1419 | * |
1286 | varray[0] = &base->tv2; | 1420 | * we have to look at all levels @index 0. With |
1287 | varray[1] = &base->tv3; | 1421 | * |
1288 | varray[2] = &base->tv4; | 1422 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
1289 | varray[3] = &base->tv5; | 1423 | * 0 0 0 0 0 2 |
1290 | 1424 | * | |
1291 | for (array = 0; array < 4; array++) { | 1425 | * LVL0 has the next expiring bucket @index 2. The upper |
1292 | struct tvec *varp = varray[array]; | 1426 | * levels have the next expiring bucket @index 1. |
1293 | 1427 | * | |
1294 | index = slot = timer_jiffies & TVN_MASK; | 1428 | * In case that the propagation wraps the next level the same |
1295 | do { | 1429 | * rules apply: |
1296 | hlist_for_each_entry(nte, varp->vec + slot, entry) { | 1430 | * |
1297 | if (nte->flags & TIMER_DEFERRABLE) | 1431 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
1298 | continue; | 1432 | * 0 0 0 0 F 2 |
1299 | 1433 | * | |
1300 | found = 1; | 1434 | * So after looking at LVL0 we get: |
1301 | if (time_before(nte->expires, expires)) | 1435 | * |
1302 | expires = nte->expires; | 1436 | * LVL5 LVL4 LVL3 LVL2 LVL1 |
1303 | } | 1437 | * 0 0 0 1 0 |
1304 | /* | 1438 | * |
1305 | * Do we still search for the first timer or are | 1439 | * So no propagation from LVL1 to LVL2 because that happened |
1306 | * we looking up the cascade buckets ? | 1440 | * with the add already, but then we need to propagate further |
1307 | */ | 1441 | * from LVL2 to LVL3. |
1308 | if (found) { | 1442 | * |
1309 | /* Look at the cascade bucket(s)? */ | 1443 | * So the simple check whether the lower bits of the current |
1310 | if (!index || slot < index) | 1444 | * level are 0 or not is sufficient for all cases. |
1311 | break; | 1445 | */ |
1312 | return expires; | 1446 | adj = clk & LVL_CLK_MASK ? 1 : 0; |
1313 | } | 1447 | clk >>= LVL_CLK_SHIFT; |
1314 | slot = (slot + 1) & TVN_MASK; | 1448 | clk += adj; |
1315 | } while (slot != index); | ||
1316 | |||
1317 | if (index) | ||
1318 | timer_jiffies += TVN_SIZE - index; | ||
1319 | timer_jiffies >>= TVN_BITS; | ||
1320 | } | 1449 | } |
1321 | return expires; | 1450 | return next; |
1322 | } | 1451 | } |
1323 | 1452 | ||
1324 | /* | 1453 | /* |
@@ -1364,7 +1493,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) | |||
1364 | */ | 1493 | */ |
1365 | u64 get_next_timer_interrupt(unsigned long basej, u64 basem) | 1494 | u64 get_next_timer_interrupt(unsigned long basej, u64 basem) |
1366 | { | 1495 | { |
1367 | struct tvec_base *base = this_cpu_ptr(&tvec_bases); | 1496 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); |
1368 | u64 expires = KTIME_MAX; | 1497 | u64 expires = KTIME_MAX; |
1369 | unsigned long nextevt; | 1498 | unsigned long nextevt; |
1370 | 1499 | ||
@@ -1376,19 +1505,80 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) | |||
1376 | return expires; | 1505 | return expires; |
1377 | 1506 | ||
1378 | spin_lock(&base->lock); | 1507 | spin_lock(&base->lock); |
1379 | if (base->active_timers) { | 1508 | nextevt = __next_timer_interrupt(base); |
1380 | if (time_before_eq(base->next_timer, base->timer_jiffies)) | 1509 | base->next_expiry = nextevt; |
1381 | base->next_timer = __next_timer_interrupt(base); | 1510 | /* |
1382 | nextevt = base->next_timer; | 1511 | * We have a fresh next event. Check whether we can forward the base: |
1383 | if (time_before_eq(nextevt, basej)) | 1512 | */ |
1384 | expires = basem; | 1513 | if (time_after(nextevt, jiffies)) |
1385 | else | 1514 | base->clk = jiffies; |
1386 | expires = basem + (nextevt - basej) * TICK_NSEC; | 1515 | else if (time_after(nextevt, base->clk)) |
1516 | base->clk = nextevt; | ||
1517 | |||
1518 | if (time_before_eq(nextevt, basej)) { | ||
1519 | expires = basem; | ||
1520 | base->is_idle = false; | ||
1521 | } else { | ||
1522 | expires = basem + (nextevt - basej) * TICK_NSEC; | ||
1523 | /* | ||
1524 | * If we expect to sleep more than a tick, mark the base idle: | ||
1525 | */ | ||
1526 | if ((expires - basem) > TICK_NSEC) | ||
1527 | base->is_idle = true; | ||
1387 | } | 1528 | } |
1388 | spin_unlock(&base->lock); | 1529 | spin_unlock(&base->lock); |
1389 | 1530 | ||
1390 | return cmp_next_hrtimer_event(basem, expires); | 1531 | return cmp_next_hrtimer_event(basem, expires); |
1391 | } | 1532 | } |
1533 | |||
1534 | /** | ||
1535 | * timer_clear_idle - Clear the idle state of the timer base | ||
1536 | * | ||
1537 | * Called with interrupts disabled | ||
1538 | */ | ||
1539 | void timer_clear_idle(void) | ||
1540 | { | ||
1541 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); | ||
1542 | |||
1543 | /* | ||
1544 | * We do this unlocked. The worst outcome is a remote enqueue sending | ||
1545 | * a pointless IPI, but taking the lock would just make the window for | ||
1546 | * sending the IPI a few instructions smaller for the cost of taking | ||
1547 | * the lock in the exit from idle path. | ||
1548 | */ | ||
1549 | base->is_idle = false; | ||
1550 | } | ||
1551 | |||
1552 | static int collect_expired_timers(struct timer_base *base, | ||
1553 | struct hlist_head *heads) | ||
1554 | { | ||
1555 | /* | ||
1556 | * NOHZ optimization. After a long idle sleep we need to forward the | ||
1557 | * base to current jiffies. Avoid a loop by searching the bitfield for | ||
1558 | * the next expiring timer. | ||
1559 | */ | ||
1560 | if ((long)(jiffies - base->clk) > 2) { | ||
1561 | unsigned long next = __next_timer_interrupt(base); | ||
1562 | |||
1563 | /* | ||
1564 | * If the next timer is ahead of time forward to current | ||
1565 | * jiffies, otherwise forward to the next expiry time: | ||
1566 | */ | ||
1567 | if (time_after(next, jiffies)) { | ||
1568 | /* The call site will increment clock! */ | ||
1569 | base->clk = jiffies - 1; | ||
1570 | return 0; | ||
1571 | } | ||
1572 | base->clk = next; | ||
1573 | } | ||
1574 | return __collect_expired_timers(base, heads); | ||
1575 | } | ||
1576 | #else | ||
1577 | static inline int collect_expired_timers(struct timer_base *base, | ||
1578 | struct hlist_head *heads) | ||
1579 | { | ||
1580 | return __collect_expired_timers(base, heads); | ||
1581 | } | ||
1392 | #endif | 1582 | #endif |
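
Editor's note: after a long idle period base->clk can be thousands of ticks behind jiffies, and stepping it one tick at a time in __run_timers() would be wasteful; the NOHZ variant of collect_expired_timers() above therefore jumps the clock forward first. A userspace sketch of that decision follows; the function name and values are invented, a plain '>' stands in for the kernel's wrap-safe time_after(), and the early "nothing expired" return of the real code is left out.

/*
 * Userspace sketch of the post-idle forwarding decision: instead of
 * stepping the base clock one jiffy at a time, jump it to either the next
 * expiring timer or to "now".
 */
#include <stdio.h>

static unsigned long forward_clk(unsigned long clk, unsigned long jiffies,
				 unsigned long next_expiry)
{
	if ((long)(jiffies - clk) <= 2)
		return clk;			/* nothing worth skipping */
	if (next_expiry > jiffies)
		return jiffies - 1;		/* caller will increment */
	return next_expiry;
}

int main(void)
{
	/* Woke up 100000 ticks after the base was last touched. */
	printf("%lu\n", forward_clk(1000, 101000, 200000)); /* -> 100999 */
	printf("%lu\n", forward_clk(1000, 101000, 50000));  /* -> 50000  */
	return 0;
}
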
1393 | 1583 | ||
1394 | /* | 1584 | /* |
@@ -1411,15 +1601,42 @@ void update_process_times(int user_tick) | |||
1411 | run_posix_cpu_timers(p); | 1601 | run_posix_cpu_timers(p); |
1412 | } | 1602 | } |
1413 | 1603 | ||
1604 | /** | ||
1605 | * __run_timers - run all expired timers (if any) on this CPU. | ||
1606 | * @base: the timer vector to be processed. | ||
1607 | */ | ||
1608 | static inline void __run_timers(struct timer_base *base) | ||
1609 | { | ||
1610 | struct hlist_head heads[LVL_DEPTH]; | ||
1611 | int levels; | ||
1612 | |||
1613 | if (!time_after_eq(jiffies, base->clk)) | ||
1614 | return; | ||
1615 | |||
1616 | spin_lock_irq(&base->lock); | ||
1617 | |||
1618 | while (time_after_eq(jiffies, base->clk)) { | ||
1619 | |||
1620 | levels = collect_expired_timers(base, heads); | ||
1621 | base->clk++; | ||
1622 | |||
1623 | while (levels--) | ||
1624 | expire_timers(base, heads + levels); | ||
1625 | } | ||
1626 | base->running_timer = NULL; | ||
1627 | spin_unlock_irq(&base->lock); | ||
1628 | } | ||
1629 | |||
1414 | /* | 1630 | /* |
1415 | * This function runs timers and the timer-tq in bottom half context. | 1631 | * This function runs timers and the timer-tq in bottom half context. |
1416 | */ | 1632 | */ |
1417 | static void run_timer_softirq(struct softirq_action *h) | 1633 | static void run_timer_softirq(struct softirq_action *h) |
1418 | { | 1634 | { |
1419 | struct tvec_base *base = this_cpu_ptr(&tvec_bases); | 1635 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); |
1420 | 1636 | ||
1421 | if (time_after_eq(jiffies, base->timer_jiffies)) | 1637 | __run_timers(base); |
1422 | __run_timers(base); | 1638 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) |
1639 | __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); | ||
1423 | } | 1640 | } |
1424 | 1641 | ||
1425 | /* | 1642 | /* |
@@ -1427,7 +1644,18 @@ static void run_timer_softirq(struct softirq_action *h) | |||
1427 | */ | 1644 | */ |
1428 | void run_local_timers(void) | 1645 | void run_local_timers(void) |
1429 | { | 1646 | { |
1647 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); | ||
1648 | |||
1430 | hrtimer_run_queues(); | 1649 | hrtimer_run_queues(); |
1650 | /* Raise the softirq only if required. */ | ||
1651 | if (time_before(jiffies, base->clk)) { | ||
1652 | if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) | ||
1653 | return; | ||
1654 | /* CPU is awake, so check the deferrable base. */ | ||
1655 | base++; | ||
1656 | if (time_before(jiffies, base->clk)) | ||
1657 | return; | ||
1658 | } | ||
1431 | raise_softirq(TIMER_SOFTIRQ); | 1659 | raise_softirq(TIMER_SOFTIRQ); |
1432 | } | 1660 | } |
1433 | 1661 | ||
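
For orientation, the consumer side of the wheel rework is unchanged: a driver still arms a timer_list, and its callback is run from the softirq path above once collect_expired_timers()/expire_timers() pick it up. A minimal sketch using the era-appropriate setup_timer()/mod_timer() API; all names are hypothetical and nothing here is introduced by this diff.

#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/printk.h>

static struct timer_list example_timer;		/* hypothetical driver timer */

static void example_timer_fn(unsigned long data)
{
	/* Runs in softirq context once the wheel expires the timer. */
	pr_info("example timer fired, data=%lu\n", data);
}

static void example_arm_timer(void)
{
	setup_timer(&example_timer, example_timer_fn, 0);
	/* Fire roughly one second out; granularity depends on the wheel level. */
	mod_timer(&example_timer, jiffies + HZ);
}

static void example_disarm_timer(void)
{
	del_timer_sync(&example_timer);		/* waits for a running callback */
}
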
@@ -1512,7 +1740,7 @@ signed long __sched schedule_timeout(signed long timeout) | |||
1512 | expire = timeout + jiffies; | 1740 | expire = timeout + jiffies; |
1513 | 1741 | ||
1514 | setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); | 1742 | setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); |
1515 | __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); | 1743 | __mod_timer(&timer, expire, false); |
1516 | schedule(); | 1744 | schedule(); |
1517 | del_singleshot_timer_sync(&timer); | 1745 | del_singleshot_timer_sync(&timer); |
1518 | 1746 | ||
@@ -1563,87 +1791,62 @@ signed long __sched schedule_timeout_idle(signed long timeout) | |||
1563 | EXPORT_SYMBOL(schedule_timeout_idle); | 1791 | EXPORT_SYMBOL(schedule_timeout_idle); |
1564 | 1792 | ||
1565 | #ifdef CONFIG_HOTPLUG_CPU | 1793 | #ifdef CONFIG_HOTPLUG_CPU |
1566 | static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) | 1794 | static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) |
1567 | { | 1795 | { |
1568 | struct timer_list *timer; | 1796 | struct timer_list *timer; |
1569 | int cpu = new_base->cpu; | 1797 | int cpu = new_base->cpu; |
1570 | 1798 | ||
1571 | while (!hlist_empty(head)) { | 1799 | while (!hlist_empty(head)) { |
1572 | timer = hlist_entry(head->first, struct timer_list, entry); | 1800 | timer = hlist_entry(head->first, struct timer_list, entry); |
1573 | /* We ignore the accounting on the dying cpu */ | ||
1574 | detach_timer(timer, false); | 1801 | detach_timer(timer, false); |
1575 | timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; | 1802 | timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; |
1576 | internal_add_timer(new_base, timer); | 1803 | internal_add_timer(new_base, timer); |
1577 | } | 1804 | } |
1578 | } | 1805 | } |
1579 | 1806 | ||
1580 | static void migrate_timers(int cpu) | 1807 | int timers_dead_cpu(unsigned int cpu) |
1581 | { | 1808 | { |
1582 | struct tvec_base *old_base; | 1809 | struct timer_base *old_base; |
1583 | struct tvec_base *new_base; | 1810 | struct timer_base *new_base; |
1584 | int i; | 1811 | int b, i; |
1585 | 1812 | ||
1586 | BUG_ON(cpu_online(cpu)); | 1813 | BUG_ON(cpu_online(cpu)); |
1587 | old_base = per_cpu_ptr(&tvec_bases, cpu); | ||
1588 | new_base = get_cpu_ptr(&tvec_bases); | ||
1589 | /* | ||
1590 | * The caller is globally serialized and nobody else | ||
1591 | * takes two locks at once, deadlock is not possible. | ||
1592 | */ | ||
1593 | spin_lock_irq(&new_base->lock); | ||
1594 | spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); | ||
1595 | |||
1596 | BUG_ON(old_base->running_timer); | ||
1597 | |||
1598 | for (i = 0; i < TVR_SIZE; i++) | ||
1599 | migrate_timer_list(new_base, old_base->tv1.vec + i); | ||
1600 | for (i = 0; i < TVN_SIZE; i++) { | ||
1601 | migrate_timer_list(new_base, old_base->tv2.vec + i); | ||
1602 | migrate_timer_list(new_base, old_base->tv3.vec + i); | ||
1603 | migrate_timer_list(new_base, old_base->tv4.vec + i); | ||
1604 | migrate_timer_list(new_base, old_base->tv5.vec + i); | ||
1605 | } | ||
1606 | 1814 | ||
1607 | old_base->active_timers = 0; | 1815 | for (b = 0; b < NR_BASES; b++) { |
1608 | old_base->all_timers = 0; | 1816 | old_base = per_cpu_ptr(&timer_bases[b], cpu); |
1817 | new_base = get_cpu_ptr(&timer_bases[b]); | ||
1818 | /* | ||
1819 | * The caller is globally serialized and nobody else | ||
1820 | * takes two locks at once, deadlock is not possible. | ||
1821 | */ | ||
1822 | spin_lock_irq(&new_base->lock); | ||
1823 | spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); | ||
1609 | 1824 | ||
1610 | spin_unlock(&old_base->lock); | 1825 | BUG_ON(old_base->running_timer); |
1611 | spin_unlock_irq(&new_base->lock); | ||
1612 | put_cpu_ptr(&tvec_bases); | ||
1613 | } | ||
1614 | 1826 | ||
1615 | static int timer_cpu_notify(struct notifier_block *self, | 1827 | for (i = 0; i < WHEEL_SIZE; i++) |
1616 | unsigned long action, void *hcpu) | 1828 | migrate_timer_list(new_base, old_base->vectors + i); |
1617 | { | ||
1618 | switch (action) { | ||
1619 | case CPU_DEAD: | ||
1620 | case CPU_DEAD_FROZEN: | ||
1621 | migrate_timers((long)hcpu); | ||
1622 | break; | ||
1623 | default: | ||
1624 | break; | ||
1625 | } | ||
1626 | 1829 | ||
1627 | return NOTIFY_OK; | 1830 | spin_unlock(&old_base->lock); |
1831 | spin_unlock_irq(&new_base->lock); | ||
1832 | put_cpu_ptr(&timer_bases); | ||
1833 | } | ||
1834 | return 0; | ||
1628 | } | 1835 | } |
1629 | 1836 | ||
1630 | static inline void timer_register_cpu_notifier(void) | ||
1631 | { | ||
1632 | cpu_notifier(timer_cpu_notify, 0); | ||
1633 | } | ||
1634 | #else | ||
1635 | static inline void timer_register_cpu_notifier(void) { } | ||
1636 | #endif /* CONFIG_HOTPLUG_CPU */ | 1837 | #endif /* CONFIG_HOTPLUG_CPU */ |
1637 | 1838 | ||
1638 | static void __init init_timer_cpu(int cpu) | 1839 | static void __init init_timer_cpu(int cpu) |
1639 | { | 1840 | { |
1640 | struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); | 1841 | struct timer_base *base; |
1641 | 1842 | int i; | |
1642 | base->cpu = cpu; | ||
1643 | spin_lock_init(&base->lock); | ||
1644 | 1843 | ||
1645 | base->timer_jiffies = jiffies; | 1844 | for (i = 0; i < NR_BASES; i++) { |
1646 | base->next_timer = base->timer_jiffies; | 1845 | base = per_cpu_ptr(&timer_bases[i], cpu); |
1846 | base->cpu = cpu; | ||
1847 | spin_lock_init(&base->lock); | ||
1848 | base->clk = jiffies; | ||
1849 | } | ||
1647 | } | 1850 | } |
1648 | 1851 | ||
1649 | static void __init init_timer_cpus(void) | 1852 | static void __init init_timer_cpus(void) |
@@ -1658,7 +1861,6 @@ void __init init_timers(void) | |||
1658 | { | 1861 | { |
1659 | init_timer_cpus(); | 1862 | init_timer_cpus(); |
1660 | init_timer_stats(); | 1863 | init_timer_stats(); |
1661 | timer_register_cpu_notifier(); | ||
1662 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); | 1864 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); |
1663 | } | 1865 | } |
1664 | 1866 | ||
@@ -1702,9 +1904,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) | |||
1702 | } | 1904 | } |
1703 | 1905 | ||
1704 | /** | 1906 | /** |
1705 | * usleep_range - Drop in replacement for udelay where wakeup is flexible | 1907 | * usleep_range - Sleep for an approximate time |
1706 | * @min: Minimum time in usecs to sleep | 1908 | * @min: Minimum time in usecs to sleep |
1707 | * @max: Maximum time in usecs to sleep | 1909 | * @max: Maximum time in usecs to sleep |
1910 | * | ||
1911 | * In non-atomic context where the exact wakeup time is flexible, use | ||
1912 | * usleep_range() instead of udelay(). The sleep improves responsiveness | ||
1913 | * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces | ||
1914 | * power usage by allowing hrtimers to take advantage of an already- | ||
1915 | * scheduled interrupt instead of scheduling a new one just for this sleep. | ||
1708 | */ | 1916 | */ |
1709 | void __sched usleep_range(unsigned long min, unsigned long max) | 1917 | void __sched usleep_range(unsigned long min, unsigned long max) |
1710 | { | 1918 | { |
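
A minimal sketch of the guidance in the new usleep_range() kerneldoc, with a hypothetical device register and an assumed 100-200us settling time; the point is that process context with a flexible deadline should sleep rather than busy-wait with udelay().

#include <linux/delay.h>
#include <linux/io.h>

static void example_device_reset(void __iomem *ctrl_reg)
{
	writel(0x1, ctrl_reg);			/* kick off the (assumed) reset */
	/*
	 * Non-atomic context, exact wakeup time flexible: the range lets the
	 * hrtimer code piggyback on an already-scheduled interrupt instead of
	 * programming a new one just for this sleep.
	 */
	usleep_range(100, 200);
}
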
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1adecb4b87c8..087204c733eb 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr) | |||
279 | 279 | ||
280 | static int tstats_show(struct seq_file *m, void *v) | 280 | static int tstats_show(struct seq_file *m, void *v) |
281 | { | 281 | { |
282 | struct timespec period; | 282 | struct timespec64 period; |
283 | struct entry *entry; | 283 | struct entry *entry; |
284 | unsigned long ms; | 284 | unsigned long ms; |
285 | long events = 0; | 285 | long events = 0; |
@@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v) | |||
295 | 295 | ||
296 | time = ktime_sub(time_stop, time_start); | 296 | time = ktime_sub(time_stop, time_start); |
297 | 297 | ||
298 | period = ktime_to_timespec(time); | 298 | period = ktime_to_timespec64(time); |
299 | ms = period.tv_nsec / 1000000; | 299 | ms = period.tv_nsec / 1000000; |
300 | 300 | ||
301 | seq_puts(m, "Timer Stats Version: v0.3\n"); | 301 | seq_puts(m, "Timer Stats Version: v0.3\n"); |
302 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); | 302 | seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms); |
303 | if (atomic_read(&overflow_count)) | 303 | if (atomic_read(&overflow_count)) |
304 | seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); | 304 | seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); |
305 | seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); | 305 | seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); |
diff --git a/kernel/torture.c b/kernel/torture.c index fa0bdeee17ac..75961b3decfe 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
@@ -82,6 +82,104 @@ static int min_online = -1; | |||
82 | static int max_online; | 82 | static int max_online; |
83 | 83 | ||
84 | /* | 84 | /* |
85 | * Attempt to take a CPU offline. Return false if the CPU is already | ||
86 | * offline or if it is not subject to CPU-hotplug operations. The | ||
87 | * caller can detect other failures by looking at the statistics. | ||
88 | */ | ||
89 | bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, | ||
90 | unsigned long *sum_offl, int *min_offl, int *max_offl) | ||
91 | { | ||
92 | unsigned long delta; | ||
93 | int ret; | ||
94 | unsigned long starttime; | ||
95 | |||
96 | if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) | ||
97 | return false; | ||
98 | |||
99 | if (verbose) | ||
100 | pr_alert("%s" TORTURE_FLAG | ||
101 | "torture_onoff task: offlining %d\n", | ||
102 | torture_type, cpu); | ||
103 | starttime = jiffies; | ||
104 | (*n_offl_attempts)++; | ||
105 | ret = cpu_down(cpu); | ||
106 | if (ret) { | ||
107 | if (verbose) | ||
108 | pr_alert("%s" TORTURE_FLAG | ||
109 | "torture_onoff task: offline %d failed: errno %d\n", | ||
110 | torture_type, cpu, ret); | ||
111 | } else { | ||
112 | if (verbose) | ||
113 | pr_alert("%s" TORTURE_FLAG | ||
114 | "torture_onoff task: offlined %d\n", | ||
115 | torture_type, cpu); | ||
116 | (*n_offl_successes)++; | ||
117 | delta = jiffies - starttime; | ||
118 | *sum_offl += delta; | ||
119 | if (*min_offl < 0) { | ||
120 | *min_offl = delta; | ||
121 | *max_offl = delta; | ||
122 | } | ||
123 | if (*min_offl > delta) | ||
124 | *min_offl = delta; | ||
125 | if (*max_offl < delta) | ||
126 | *max_offl = delta; | ||
127 | } | ||
128 | |||
129 | return true; | ||
130 | } | ||
131 | EXPORT_SYMBOL_GPL(torture_offline); | ||
132 | |||
133 | /* | ||
134 | * Attempt to bring a CPU online. Return false if the CPU is already | ||
135 | * online or if it is not subject to CPU-hotplug operations. The | ||
136 | * caller can detect other failures by looking at the statistics. | ||
137 | */ | ||
138 | bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, | ||
139 | unsigned long *sum_onl, int *min_onl, int *max_onl) | ||
140 | { | ||
141 | unsigned long delta; | ||
142 | int ret; | ||
143 | unsigned long starttime; | ||
144 | |||
145 | if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) | ||
146 | return false; | ||
147 | |||
148 | if (verbose) | ||
149 | pr_alert("%s" TORTURE_FLAG | ||
150 | "torture_onoff task: onlining %d\n", | ||
151 | torture_type, cpu); | ||
152 | starttime = jiffies; | ||
153 | (*n_onl_attempts)++; | ||
154 | ret = cpu_up(cpu); | ||
155 | if (ret) { | ||
156 | if (verbose) | ||
157 | pr_alert("%s" TORTURE_FLAG | ||
158 | "torture_onoff task: online %d failed: errno %d\n", | ||
159 | torture_type, cpu, ret); | ||
160 | } else { | ||
161 | if (verbose) | ||
162 | pr_alert("%s" TORTURE_FLAG | ||
163 | "torture_onoff task: onlined %d\n", | ||
164 | torture_type, cpu); | ||
165 | (*n_onl_successes)++; | ||
166 | delta = jiffies - starttime; | ||
167 | *sum_onl += delta; | ||
168 | if (*min_onl < 0) { | ||
169 | *min_onl = delta; | ||
170 | *max_onl = delta; | ||
171 | } | ||
172 | if (*min_onl > delta) | ||
173 | *min_onl = delta; | ||
174 | if (*max_onl < delta) | ||
175 | *max_onl = delta; | ||
176 | } | ||
177 | |||
178 | return true; | ||
179 | } | ||
180 | EXPORT_SYMBOL_GPL(torture_online); | ||
181 | |||
182 | /* | ||
85 | * Execute random CPU-hotplug operations at the interval specified | 183 | * Execute random CPU-hotplug operations at the interval specified |
86 | * by the onoff_interval. | 184 | * by the onoff_interval. |
87 | */ | 185 | */ |
@@ -89,16 +187,19 @@ static int | |||
89 | torture_onoff(void *arg) | 187 | torture_onoff(void *arg) |
90 | { | 188 | { |
91 | int cpu; | 189 | int cpu; |
92 | unsigned long delta; | ||
93 | int maxcpu = -1; | 190 | int maxcpu = -1; |
94 | DEFINE_TORTURE_RANDOM(rand); | 191 | DEFINE_TORTURE_RANDOM(rand); |
95 | int ret; | ||
96 | unsigned long starttime; | ||
97 | 192 | ||
98 | VERBOSE_TOROUT_STRING("torture_onoff task started"); | 193 | VERBOSE_TOROUT_STRING("torture_onoff task started"); |
99 | for_each_online_cpu(cpu) | 194 | for_each_online_cpu(cpu) |
100 | maxcpu = cpu; | 195 | maxcpu = cpu; |
101 | WARN_ON(maxcpu < 0); | 196 | WARN_ON(maxcpu < 0); |
197 | |||
198 | if (maxcpu == 0) { | ||
199 | VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); | ||
200 | goto stop; | ||
201 | } | ||
202 | |||
102 | if (onoff_holdoff > 0) { | 203 | if (onoff_holdoff > 0) { |
103 | VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); | 204 | VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); |
104 | schedule_timeout_interruptible(onoff_holdoff); | 205 | schedule_timeout_interruptible(onoff_holdoff); |
@@ -106,69 +207,16 @@ torture_onoff(void *arg) | |||
106 | } | 207 | } |
107 | while (!torture_must_stop()) { | 208 | while (!torture_must_stop()) { |
108 | cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); | 209 | cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); |
109 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { | 210 | if (!torture_offline(cpu, |
110 | if (verbose) | 211 | &n_offline_attempts, &n_offline_successes, |
111 | pr_alert("%s" TORTURE_FLAG | 212 | &sum_offline, &min_offline, &max_offline)) |
112 | "torture_onoff task: offlining %d\n", | 213 | torture_online(cpu, |
113 | torture_type, cpu); | 214 | &n_online_attempts, &n_online_successes, |
114 | starttime = jiffies; | 215 | &sum_online, &min_online, &max_online); |
115 | n_offline_attempts++; | ||
116 | ret = cpu_down(cpu); | ||
117 | if (ret) { | ||
118 | if (verbose) | ||
119 | pr_alert("%s" TORTURE_FLAG | ||
120 | "torture_onoff task: offline %d failed: errno %d\n", | ||
121 | torture_type, cpu, ret); | ||
122 | } else { | ||
123 | if (verbose) | ||
124 | pr_alert("%s" TORTURE_FLAG | ||
125 | "torture_onoff task: offlined %d\n", | ||
126 | torture_type, cpu); | ||
127 | n_offline_successes++; | ||
128 | delta = jiffies - starttime; | ||
129 | sum_offline += delta; | ||
130 | if (min_offline < 0) { | ||
131 | min_offline = delta; | ||
132 | max_offline = delta; | ||
133 | } | ||
134 | if (min_offline > delta) | ||
135 | min_offline = delta; | ||
136 | if (max_offline < delta) | ||
137 | max_offline = delta; | ||
138 | } | ||
139 | } else if (cpu_is_hotpluggable(cpu)) { | ||
140 | if (verbose) | ||
141 | pr_alert("%s" TORTURE_FLAG | ||
142 | "torture_onoff task: onlining %d\n", | ||
143 | torture_type, cpu); | ||
144 | starttime = jiffies; | ||
145 | n_online_attempts++; | ||
146 | ret = cpu_up(cpu); | ||
147 | if (ret) { | ||
148 | if (verbose) | ||
149 | pr_alert("%s" TORTURE_FLAG | ||
150 | "torture_onoff task: online %d failed: errno %d\n", | ||
151 | torture_type, cpu, ret); | ||
152 | } else { | ||
153 | if (verbose) | ||
154 | pr_alert("%s" TORTURE_FLAG | ||
155 | "torture_onoff task: onlined %d\n", | ||
156 | torture_type, cpu); | ||
157 | n_online_successes++; | ||
158 | delta = jiffies - starttime; | ||
159 | sum_online += delta; | ||
160 | if (min_online < 0) { | ||
161 | min_online = delta; | ||
162 | max_online = delta; | ||
163 | } | ||
164 | if (min_online > delta) | ||
165 | min_online = delta; | ||
166 | if (max_online < delta) | ||
167 | max_online = delta; | ||
168 | } | ||
169 | } | ||
170 | schedule_timeout_interruptible(onoff_interval); | 216 | schedule_timeout_interruptible(onoff_interval); |
171 | } | 217 | } |
218 | |||
219 | stop: | ||
172 | torture_kthread_stopping("torture_onoff"); | 220 | torture_kthread_stopping("torture_onoff"); |
173 | return 0; | 221 | return 0; |
174 | } | 222 | } |
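
The refactoring above exports the offline/online steps as helpers; a hedged sketch of how another torture test could drive them, mirroring the torture_onoff() loop (the counters and the include are assumptions; the matching declarations are expected to land in include/linux/torture.h):

#include <linux/torture.h>

/* Hypothetical per-caller statistics, mirroring torture_onoff()'s globals. */
static long my_offl_attempts, my_offl_successes;
static long my_onl_attempts, my_onl_successes;
static unsigned long my_sum_offl, my_sum_onl;
static int my_min_offl = -1, my_max_offl;
static int my_min_onl = -1, my_max_onl;

static void my_hotplug_step(int cpu)
{
	/*
	 * torture_offline() returns false when the CPU is already offline or
	 * not hotpluggable, so fall through and try to bring it online.
	 */
	if (!torture_offline(cpu, &my_offl_attempts, &my_offl_successes,
			     &my_sum_offl, &my_min_offl, &my_max_offl))
		torture_online(cpu, &my_onl_attempts, &my_onl_successes,
			       &my_sum_onl, &my_min_onl, &my_max_onl);
}
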
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fafeaf803bd0..f4b86e8ca1e7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -542,6 +542,7 @@ config HIST_TRIGGERS | |||
542 | bool "Histogram triggers" | 542 | bool "Histogram triggers" |
543 | depends on ARCH_HAVE_NMI_SAFE_CMPXCHG | 543 | depends on ARCH_HAVE_NMI_SAFE_CMPXCHG |
544 | select TRACING_MAP | 544 | select TRACING_MAP |
545 | select TRACING | ||
545 | default n | 546 | default n |
546 | help | 547 | help |
547 | Hist triggers allow one or more arbitrary trace event fields | 548 | Hist triggers allow one or more arbitrary trace event fields |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9aef8654e90d..fb345cd11883 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk) | |||
127 | 127 | ||
128 | static void trace_note_time(struct blk_trace *bt) | 128 | static void trace_note_time(struct blk_trace *bt) |
129 | { | 129 | { |
130 | struct timespec now; | 130 | struct timespec64 now; |
131 | unsigned long flags; | 131 | unsigned long flags; |
132 | u32 words[2]; | 132 | u32 words[2]; |
133 | 133 | ||
134 | getnstimeofday(&now); | 134 | /* need to check user space to see if this breaks in y2038 or y2106 */ |
135 | words[0] = now.tv_sec; | 135 | ktime_get_real_ts64(&now); |
136 | words[0] = (u32)now.tv_sec; | ||
136 | words[1] = now.tv_nsec; | 137 | words[1] = now.tv_nsec; |
137 | 138 | ||
138 | local_irq_save(flags); | 139 | local_irq_save(flags); |
@@ -189,6 +190,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), | |||
189 | BLK_TC_ACT(BLK_TC_WRITE) }; | 190 | BLK_TC_ACT(BLK_TC_WRITE) }; |
190 | 191 | ||
191 | #define BLK_TC_RAHEAD BLK_TC_AHEAD | 192 | #define BLK_TC_RAHEAD BLK_TC_AHEAD |
193 | #define BLK_TC_PREFLUSH BLK_TC_FLUSH | ||
192 | 194 | ||
193 | /* The ilog2() calls fall out because they're constant */ | 195 | /* The ilog2() calls fall out because they're constant */ |
194 | #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ | 196 | #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ |
@@ -199,7 +201,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), | |||
199 | * blk_io_trace structure and places it in a per-cpu subbuffer. | 201 | * blk_io_trace structure and places it in a per-cpu subbuffer. |
200 | */ | 202 | */ |
201 | static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | 203 | static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, |
202 | int rw, u32 what, int error, int pdu_len, void *pdu_data) | 204 | int op, int op_flags, u32 what, int error, int pdu_len, |
205 | void *pdu_data) | ||
203 | { | 206 | { |
204 | struct task_struct *tsk = current; | 207 | struct task_struct *tsk = current; |
205 | struct ring_buffer_event *event = NULL; | 208 | struct ring_buffer_event *event = NULL; |
@@ -214,13 +217,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
214 | if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) | 217 | if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) |
215 | return; | 218 | return; |
216 | 219 | ||
217 | what |= ddir_act[rw & WRITE]; | 220 | what |= ddir_act[op_is_write(op) ? WRITE : READ]; |
218 | what |= MASK_TC_BIT(rw, SYNC); | 221 | what |= MASK_TC_BIT(op_flags, SYNC); |
219 | what |= MASK_TC_BIT(rw, RAHEAD); | 222 | what |= MASK_TC_BIT(op_flags, RAHEAD); |
220 | what |= MASK_TC_BIT(rw, META); | 223 | what |= MASK_TC_BIT(op_flags, META); |
221 | what |= MASK_TC_BIT(rw, DISCARD); | 224 | what |= MASK_TC_BIT(op_flags, PREFLUSH); |
222 | what |= MASK_TC_BIT(rw, FLUSH); | 225 | what |= MASK_TC_BIT(op_flags, FUA); |
223 | what |= MASK_TC_BIT(rw, FUA); | 226 | if (op == REQ_OP_DISCARD) |
227 | what |= BLK_TC_ACT(BLK_TC_DISCARD); | ||
228 | if (op == REQ_OP_FLUSH) | ||
229 | what |= BLK_TC_ACT(BLK_TC_FLUSH); | ||
224 | 230 | ||
225 | pid = tsk->pid; | 231 | pid = tsk->pid; |
226 | if (act_log_check(bt, what, sector, pid)) | 232 | if (act_log_check(bt, what, sector, pid)) |
@@ -708,11 +714,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, | |||
708 | 714 | ||
709 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { | 715 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { |
710 | what |= BLK_TC_ACT(BLK_TC_PC); | 716 | what |= BLK_TC_ACT(BLK_TC_PC); |
711 | __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, | 717 | __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags, |
712 | what, rq->errors, rq->cmd_len, rq->cmd); | 718 | what, rq->errors, rq->cmd_len, rq->cmd); |
713 | } else { | 719 | } else { |
714 | what |= BLK_TC_ACT(BLK_TC_FS); | 720 | what |= BLK_TC_ACT(BLK_TC_FS); |
715 | __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, | 721 | __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq), |
716 | rq->cmd_flags, what, rq->errors, 0, NULL); | 722 | rq->cmd_flags, what, rq->errors, 0, NULL); |
717 | } | 723 | } |
718 | } | 724 | } |
@@ -770,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, | |||
770 | return; | 776 | return; |
771 | 777 | ||
772 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, | 778 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, |
773 | bio->bi_rw, what, error, 0, NULL); | 779 | bio_op(bio), bio->bi_rw, what, error, 0, NULL); |
774 | } | 780 | } |
775 | 781 | ||
776 | static void blk_add_trace_bio_bounce(void *ignore, | 782 | static void blk_add_trace_bio_bounce(void *ignore, |
@@ -818,7 +824,8 @@ static void blk_add_trace_getrq(void *ignore, | |||
818 | struct blk_trace *bt = q->blk_trace; | 824 | struct blk_trace *bt = q->blk_trace; |
819 | 825 | ||
820 | if (bt) | 826 | if (bt) |
821 | __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); | 827 | __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, |
828 | NULL); | ||
822 | } | 829 | } |
823 | } | 830 | } |
824 | 831 | ||
@@ -833,7 +840,7 @@ static void blk_add_trace_sleeprq(void *ignore, | |||
833 | struct blk_trace *bt = q->blk_trace; | 840 | struct blk_trace *bt = q->blk_trace; |
834 | 841 | ||
835 | if (bt) | 842 | if (bt) |
836 | __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, | 843 | __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, |
837 | 0, 0, NULL); | 844 | 0, 0, NULL); |
838 | } | 845 | } |
839 | } | 846 | } |
@@ -843,7 +850,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) | |||
843 | struct blk_trace *bt = q->blk_trace; | 850 | struct blk_trace *bt = q->blk_trace; |
844 | 851 | ||
845 | if (bt) | 852 | if (bt) |
846 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); | 853 | __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); |
847 | } | 854 | } |
848 | 855 | ||
849 | static void blk_add_trace_unplug(void *ignore, struct request_queue *q, | 856 | static void blk_add_trace_unplug(void *ignore, struct request_queue *q, |
@@ -860,7 +867,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, | |||
860 | else | 867 | else |
861 | what = BLK_TA_UNPLUG_TIMER; | 868 | what = BLK_TA_UNPLUG_TIMER; |
862 | 869 | ||
863 | __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); | 870 | __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); |
864 | } | 871 | } |
865 | } | 872 | } |
866 | 873 | ||
@@ -874,8 +881,9 @@ static void blk_add_trace_split(void *ignore, | |||
874 | __be64 rpdu = cpu_to_be64(pdu); | 881 | __be64 rpdu = cpu_to_be64(pdu); |
875 | 882 | ||
876 | __blk_add_trace(bt, bio->bi_iter.bi_sector, | 883 | __blk_add_trace(bt, bio->bi_iter.bi_sector, |
877 | bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, | 884 | bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw, |
878 | bio->bi_error, sizeof(rpdu), &rpdu); | 885 | BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), |
886 | &rpdu); | ||
879 | } | 887 | } |
880 | } | 888 | } |
881 | 889 | ||
@@ -907,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore, | |||
907 | r.sector_from = cpu_to_be64(from); | 915 | r.sector_from = cpu_to_be64(from); |
908 | 916 | ||
909 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, | 917 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, |
910 | bio->bi_rw, BLK_TA_REMAP, bio->bi_error, | 918 | bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error, |
911 | sizeof(r), &r); | 919 | sizeof(r), &r); |
912 | } | 920 | } |
913 | 921 | ||
@@ -940,7 +948,7 @@ static void blk_add_trace_rq_remap(void *ignore, | |||
940 | r.sector_from = cpu_to_be64(from); | 948 | r.sector_from = cpu_to_be64(from); |
941 | 949 | ||
942 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), | 950 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), |
943 | rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, | 951 | rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, |
944 | sizeof(r), &r); | 952 | sizeof(r), &r); |
945 | } | 953 | } |
946 | 954 | ||
@@ -965,10 +973,10 @@ void blk_add_driver_data(struct request_queue *q, | |||
965 | return; | 973 | return; |
966 | 974 | ||
967 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) | 975 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) |
968 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, | 976 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0, |
969 | BLK_TA_DRV_DATA, rq->errors, len, data); | 977 | BLK_TA_DRV_DATA, rq->errors, len, data); |
970 | else | 978 | else |
971 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, | 979 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0, |
972 | BLK_TA_DRV_DATA, rq->errors, len, data); | 980 | BLK_TA_DRV_DATA, rq->errors, len, data); |
973 | } | 981 | } |
974 | EXPORT_SYMBOL_GPL(blk_add_driver_data); | 982 | EXPORT_SYMBOL_GPL(blk_add_driver_data); |
@@ -1769,21 +1777,34 @@ void blk_dump_cmd(char *buf, struct request *rq) | |||
1769 | } | 1777 | } |
1770 | } | 1778 | } |
1771 | 1779 | ||
1772 | void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | 1780 | void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) |
1773 | { | 1781 | { |
1774 | int i = 0; | 1782 | int i = 0; |
1775 | 1783 | ||
1776 | if (rw & REQ_FLUSH) | 1784 | if (rw & REQ_PREFLUSH) |
1777 | rwbs[i++] = 'F'; | 1785 | rwbs[i++] = 'F'; |
1778 | 1786 | ||
1779 | if (rw & WRITE) | 1787 | switch (op) { |
1788 | case REQ_OP_WRITE: | ||
1789 | case REQ_OP_WRITE_SAME: | ||
1780 | rwbs[i++] = 'W'; | 1790 | rwbs[i++] = 'W'; |
1781 | else if (rw & REQ_DISCARD) | 1791 | break; |
1792 | case REQ_OP_DISCARD: | ||
1793 | rwbs[i++] = 'D'; | ||
1794 | break; | ||
1795 | case REQ_OP_SECURE_ERASE: | ||
1782 | rwbs[i++] = 'D'; | 1796 | rwbs[i++] = 'D'; |
1783 | else if (bytes) | 1797 | rwbs[i++] = 'E'; |
1798 | break; | ||
1799 | case REQ_OP_FLUSH: | ||
1800 | rwbs[i++] = 'F'; | ||
1801 | break; | ||
1802 | case REQ_OP_READ: | ||
1784 | rwbs[i++] = 'R'; | 1803 | rwbs[i++] = 'R'; |
1785 | else | 1804 | break; |
1805 | default: | ||
1786 | rwbs[i++] = 'N'; | 1806 | rwbs[i++] = 'N'; |
1807 | } | ||
1787 | 1808 | ||
1788 | if (rw & REQ_FUA) | 1809 | if (rw & REQ_FUA) |
1789 | rwbs[i++] = 'F'; | 1810 | rwbs[i++] = 'F'; |
@@ -1793,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
1793 | rwbs[i++] = 'S'; | 1814 | rwbs[i++] = 'S'; |
1794 | if (rw & REQ_META) | 1815 | if (rw & REQ_META) |
1795 | rwbs[i++] = 'M'; | 1816 | rwbs[i++] = 'M'; |
1796 | if (rw & REQ_SECURE) | ||
1797 | rwbs[i++] = 'E'; | ||
1798 | 1817 | ||
1799 | rwbs[i] = '\0'; | 1818 | rwbs[i] = '\0'; |
1800 | } | 1819 | } |
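
To make the new op/flags split concrete, a short illustration of what the reworked blk_fill_rwbs() produces; the buffer size is an assumption, and the prototype change is expected to land in include/linux/blktrace_api.h alongside this diff.

#include <linux/blktrace_api.h>
#include <linux/blk_types.h>

static void example_decode_rwbs(void)
{
	char rwbs[8];		/* callers pass a small fixed-size buffer */

	/* A synchronous metadata write: the op selects 'W', flags add 'S' and 'M'. */
	blk_fill_rwbs(rwbs, REQ_OP_WRITE, REQ_SYNC | REQ_META, 4096);
	/* rwbs now reads "WSM" */

	/* Discards are now identified by the op, not by a request flag. */
	blk_fill_rwbs(rwbs, REQ_OP_DISCARD, 0, 4096);
	/* rwbs now reads "D" */
}
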
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 780bcbe1d4de..b20438fdb029 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = { | |||
81 | .arg3_type = ARG_ANYTHING, | 81 | .arg3_type = ARG_ANYTHING, |
82 | }; | 82 | }; |
83 | 83 | ||
84 | static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
85 | { | ||
86 | void *unsafe_ptr = (void *) (long) r1; | ||
87 | void *src = (void *) (long) r2; | ||
88 | int size = (int) r3; | ||
89 | |||
90 | /* | ||
91 | * Ensure we're in user context which is safe for the helper to | ||
92 | * run. This helper has no business in a kthread. | ||
93 | * | ||
94 | * access_ok() should prevent writing to non-user memory, but in | ||
95 | * some situations (nommu, temporary switch, etc) access_ok() does | ||
96 | * not provide enough validation, hence the check on KERNEL_DS. | ||
97 | */ | ||
98 | |||
99 | if (unlikely(in_interrupt() || | ||
100 | current->flags & (PF_KTHREAD | PF_EXITING))) | ||
101 | return -EPERM; | ||
102 | if (unlikely(segment_eq(get_fs(), KERNEL_DS))) | ||
103 | return -EPERM; | ||
104 | if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) | ||
105 | return -EPERM; | ||
106 | |||
107 | return probe_kernel_write(unsafe_ptr, src, size); | ||
108 | } | ||
109 | |||
110 | static const struct bpf_func_proto bpf_probe_write_user_proto = { | ||
111 | .func = bpf_probe_write_user, | ||
112 | .gpl_only = true, | ||
113 | .ret_type = RET_INTEGER, | ||
114 | .arg1_type = ARG_ANYTHING, | ||
115 | .arg2_type = ARG_PTR_TO_STACK, | ||
116 | .arg3_type = ARG_CONST_STACK_SIZE, | ||
117 | }; | ||
118 | |||
119 | static const struct bpf_func_proto *bpf_get_probe_write_proto(void) | ||
120 | { | ||
121 | pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", | ||
122 | current->comm, task_pid_nr(current)); | ||
123 | |||
124 | return &bpf_probe_write_user_proto; | ||
125 | } | ||
126 | |||
84 | /* | 127 | /* |
85 | * limited trace_printk() | 128 | * limited trace_printk() |
86 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed | 129 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed |
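
For orientation, a hedged sketch of a BPF-C program exercising the new helper. Everything here is an assumption rather than part of this diff: the attach point, the register-argument macro, and the expectation that samples/bpf's bpf_helpers.h declares SEC(), PT_REGS_PARM2() and a bpf_probe_write_user() stub.

#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"	/* assumed to provide SEC(), PT_REGS_PARM2(), helper stubs */

SEC("kprobe/sys_write")		/* hypothetical attach point */
int poke_user_buf(struct pt_regs *ctx)
{
	char tag[] = "bpf";
	void *ubuf = (void *)PT_REGS_PARM2(ctx);	/* the user buffer argument */

	/* Returns -EPERM via the checks above when not called in user context. */
	bpf_probe_write_user(ubuf, tag, sizeof(tag));
	return 0;
}

char _license[] SEC("license") = "GPL";	/* the helper is gpl_only */
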
@@ -188,25 +231,33 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) | |||
188 | return &bpf_trace_printk_proto; | 231 | return &bpf_trace_printk_proto; |
189 | } | 232 | } |
190 | 233 | ||
191 | static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) | 234 | static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) |
192 | { | 235 | { |
193 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | 236 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; |
194 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 237 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
238 | unsigned int cpu = smp_processor_id(); | ||
239 | u64 index = flags & BPF_F_INDEX_MASK; | ||
240 | struct bpf_event_entry *ee; | ||
195 | struct perf_event *event; | 241 | struct perf_event *event; |
196 | struct file *file; | ||
197 | 242 | ||
243 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) | ||
244 | return -EINVAL; | ||
245 | if (index == BPF_F_CURRENT_CPU) | ||
246 | index = cpu; | ||
198 | if (unlikely(index >= array->map.max_entries)) | 247 | if (unlikely(index >= array->map.max_entries)) |
199 | return -E2BIG; | 248 | return -E2BIG; |
200 | 249 | ||
201 | file = (struct file *)array->ptrs[index]; | 250 | ee = READ_ONCE(array->ptrs[index]); |
202 | if (unlikely(!file)) | 251 | if (!ee) |
203 | return -ENOENT; | 252 | return -ENOENT; |
204 | 253 | ||
205 | event = file->private_data; | 254 | event = ee->event; |
255 | if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && | ||
256 | event->attr.type != PERF_TYPE_RAW)) | ||
257 | return -EINVAL; | ||
206 | 258 | ||
207 | /* make sure event is local and doesn't have pmu::count */ | 259 | /* make sure event is local and doesn't have pmu::count */ |
208 | if (event->oncpu != smp_processor_id() || | 260 | if (unlikely(event->oncpu != cpu || event->pmu->count)) |
209 | event->pmu->count) | ||
210 | return -EINVAL; | 261 | return -EINVAL; |
211 | 262 | ||
212 | /* | 263 | /* |
@@ -225,47 +276,58 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { | |||
225 | .arg2_type = ARG_ANYTHING, | 276 | .arg2_type = ARG_ANYTHING, |
226 | }; | 277 | }; |
227 | 278 | ||
228 | static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) | 279 | static __always_inline u64 |
280 | __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | ||
281 | u64 flags, struct perf_raw_record *raw) | ||
229 | { | 282 | { |
230 | struct pt_regs *regs = (struct pt_regs *) (long) r1; | ||
231 | struct bpf_map *map = (struct bpf_map *) (long) r2; | ||
232 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 283 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
284 | unsigned int cpu = smp_processor_id(); | ||
233 | u64 index = flags & BPF_F_INDEX_MASK; | 285 | u64 index = flags & BPF_F_INDEX_MASK; |
234 | void *data = (void *) (long) r4; | ||
235 | struct perf_sample_data sample_data; | 286 | struct perf_sample_data sample_data; |
287 | struct bpf_event_entry *ee; | ||
236 | struct perf_event *event; | 288 | struct perf_event *event; |
237 | struct file *file; | ||
238 | struct perf_raw_record raw = { | ||
239 | .size = size, | ||
240 | .data = data, | ||
241 | }; | ||
242 | 289 | ||
243 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) | ||
244 | return -EINVAL; | ||
245 | if (index == BPF_F_CURRENT_CPU) | 290 | if (index == BPF_F_CURRENT_CPU) |
246 | index = raw_smp_processor_id(); | 291 | index = cpu; |
247 | if (unlikely(index >= array->map.max_entries)) | 292 | if (unlikely(index >= array->map.max_entries)) |
248 | return -E2BIG; | 293 | return -E2BIG; |
249 | 294 | ||
250 | file = (struct file *)array->ptrs[index]; | 295 | ee = READ_ONCE(array->ptrs[index]); |
251 | if (unlikely(!file)) | 296 | if (!ee) |
252 | return -ENOENT; | 297 | return -ENOENT; |
253 | 298 | ||
254 | event = file->private_data; | 299 | event = ee->event; |
255 | |||
256 | if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || | 300 | if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || |
257 | event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) | 301 | event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) |
258 | return -EINVAL; | 302 | return -EINVAL; |
259 | 303 | ||
260 | if (unlikely(event->oncpu != smp_processor_id())) | 304 | if (unlikely(event->oncpu != cpu)) |
261 | return -EOPNOTSUPP; | 305 | return -EOPNOTSUPP; |
262 | 306 | ||
263 | perf_sample_data_init(&sample_data, 0, 0); | 307 | perf_sample_data_init(&sample_data, 0, 0); |
264 | sample_data.raw = &raw; | 308 | sample_data.raw = raw; |
265 | perf_event_output(event, &sample_data, regs); | 309 | perf_event_output(event, &sample_data, regs); |
266 | return 0; | 310 | return 0; |
267 | } | 311 | } |
268 | 312 | ||
313 | static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) | ||
314 | { | ||
315 | struct pt_regs *regs = (struct pt_regs *)(long) r1; | ||
316 | struct bpf_map *map = (struct bpf_map *)(long) r2; | ||
317 | void *data = (void *)(long) r4; | ||
318 | struct perf_raw_record raw = { | ||
319 | .frag = { | ||
320 | .size = size, | ||
321 | .data = data, | ||
322 | }, | ||
323 | }; | ||
324 | |||
325 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) | ||
326 | return -EINVAL; | ||
327 | |||
328 | return __bpf_perf_event_output(regs, map, flags, &raw); | ||
329 | } | ||
330 | |||
269 | static const struct bpf_func_proto bpf_perf_event_output_proto = { | 331 | static const struct bpf_func_proto bpf_perf_event_output_proto = { |
270 | .func = bpf_perf_event_output, | 332 | .func = bpf_perf_event_output, |
271 | .gpl_only = true, | 333 | .gpl_only = true, |
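
As a usage note for the flags handling above, a hedged BPF-C fragment that emits a sample into the current CPU's slot of a perf event array via BPF_F_CURRENT_CPU; the map, attach point and helper declarations follow samples/bpf conventions and are assumptions, not part of this diff.

#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"	/* assumed to provide SEC(), bpf_map_def, helper stubs */

struct bpf_map_def SEC("maps") events = {
	.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(__u32),
	.max_entries	= 64,		/* assumed to cover all possible CPUs */
};

SEC("kprobe/sys_clone")			/* hypothetical attach point */
int emit_sample(struct pt_regs *ctx)
{
	__u64 ts = bpf_ktime_get_ns();

	/* BPF_F_CURRENT_CPU makes the helper pick this CPU's array slot. */
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &ts, sizeof(ts));
	return 0;
}

char _license[] SEC("license") = "GPL";
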
@@ -279,31 +341,41 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { | |||
279 | 341 | ||
280 | static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); | 342 | static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); |
281 | 343 | ||
282 | static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) | 344 | u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, |
345 | void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) | ||
283 | { | 346 | { |
284 | struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); | 347 | struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); |
348 | struct perf_raw_frag frag = { | ||
349 | .copy = ctx_copy, | ||
350 | .size = ctx_size, | ||
351 | .data = ctx, | ||
352 | }; | ||
353 | struct perf_raw_record raw = { | ||
354 | .frag = { | ||
355 | { | ||
356 | .next = ctx_size ? &frag : NULL, | ||
357 | }, | ||
358 | .size = meta_size, | ||
359 | .data = meta, | ||
360 | }, | ||
361 | }; | ||
285 | 362 | ||
286 | perf_fetch_caller_regs(regs); | 363 | perf_fetch_caller_regs(regs); |
287 | 364 | ||
288 | return bpf_perf_event_output((long)regs, r2, flags, r4, size); | 365 | return __bpf_perf_event_output(regs, map, flags, &raw); |
289 | } | 366 | } |
290 | 367 | ||
291 | static const struct bpf_func_proto bpf_event_output_proto = { | 368 | static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) |
292 | .func = bpf_event_output, | 369 | { |
370 | return (long) current; | ||
371 | } | ||
372 | |||
373 | static const struct bpf_func_proto bpf_get_current_task_proto = { | ||
374 | .func = bpf_get_current_task, | ||
293 | .gpl_only = true, | 375 | .gpl_only = true, |
294 | .ret_type = RET_INTEGER, | 376 | .ret_type = RET_INTEGER, |
295 | .arg1_type = ARG_PTR_TO_CTX, | ||
296 | .arg2_type = ARG_CONST_MAP_PTR, | ||
297 | .arg3_type = ARG_ANYTHING, | ||
298 | .arg4_type = ARG_PTR_TO_STACK, | ||
299 | .arg5_type = ARG_CONST_STACK_SIZE, | ||
300 | }; | 377 | }; |
301 | 378 | ||
302 | const struct bpf_func_proto *bpf_get_event_output_proto(void) | ||
303 | { | ||
304 | return &bpf_event_output_proto; | ||
305 | } | ||
306 | |||
307 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | 379 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) |
308 | { | 380 | { |
309 | switch (func_id) { | 381 | switch (func_id) { |
@@ -321,6 +393,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | |||
321 | return &bpf_tail_call_proto; | 393 | return &bpf_tail_call_proto; |
322 | case BPF_FUNC_get_current_pid_tgid: | 394 | case BPF_FUNC_get_current_pid_tgid: |
323 | return &bpf_get_current_pid_tgid_proto; | 395 | return &bpf_get_current_pid_tgid_proto; |
396 | case BPF_FUNC_get_current_task: | ||
397 | return &bpf_get_current_task_proto; | ||
324 | case BPF_FUNC_get_current_uid_gid: | 398 | case BPF_FUNC_get_current_uid_gid: |
325 | return &bpf_get_current_uid_gid_proto; | 399 | return &bpf_get_current_uid_gid_proto; |
326 | case BPF_FUNC_get_current_comm: | 400 | case BPF_FUNC_get_current_comm: |
@@ -331,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | |||
331 | return &bpf_get_smp_processor_id_proto; | 405 | return &bpf_get_smp_processor_id_proto; |
332 | case BPF_FUNC_perf_event_read: | 406 | case BPF_FUNC_perf_event_read: |
333 | return &bpf_perf_event_read_proto; | 407 | return &bpf_perf_event_read_proto; |
408 | case BPF_FUNC_probe_write_user: | ||
409 | return bpf_get_probe_write_proto(); | ||
334 | default: | 410 | default: |
335 | return NULL; | 411 | return NULL; |
336 | } | 412 | } |
@@ -349,20 +425,15 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func | |||
349 | } | 425 | } |
350 | 426 | ||
351 | /* bpf+kprobe programs can access fields of 'struct pt_regs' */ | 427 | /* bpf+kprobe programs can access fields of 'struct pt_regs' */ |
352 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) | 428 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
429 | enum bpf_reg_type *reg_type) | ||
353 | { | 430 | { |
354 | /* check bounds */ | ||
355 | if (off < 0 || off >= sizeof(struct pt_regs)) | 431 | if (off < 0 || off >= sizeof(struct pt_regs)) |
356 | return false; | 432 | return false; |
357 | |||
358 | /* only read is allowed */ | ||
359 | if (type != BPF_READ) | 433 | if (type != BPF_READ) |
360 | return false; | 434 | return false; |
361 | |||
362 | /* disallow misaligned access */ | ||
363 | if (off % size != 0) | 435 | if (off % size != 0) |
364 | return false; | 436 | return false; |
365 | |||
366 | return true; | 437 | return true; |
367 | } | 438 | } |
368 | 439 | ||
@@ -427,7 +498,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) | |||
427 | } | 498 | } |
428 | } | 499 | } |
429 | 500 | ||
430 | static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type) | 501 | static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
502 | enum bpf_reg_type *reg_type) | ||
431 | { | 503 | { |
432 | if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) | 504 | if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) |
433 | return false; | 505 | return false; |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 900dbb1efff2..84752c8e28b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; | |||
89 | /* What to set function_trace_op to */ | 89 | /* What to set function_trace_op to */ |
90 | static struct ftrace_ops *set_function_trace_op; | 90 | static struct ftrace_ops *set_function_trace_op; |
91 | 91 | ||
92 | /* List for set_ftrace_pid's pids. */ | 92 | static bool ftrace_pids_enabled(struct ftrace_ops *ops) |
93 | LIST_HEAD(ftrace_pids); | ||
94 | struct ftrace_pid { | ||
95 | struct list_head list; | ||
96 | struct pid *pid; | ||
97 | }; | ||
98 | |||
99 | static bool ftrace_pids_enabled(void) | ||
100 | { | 93 | { |
101 | return !list_empty(&ftrace_pids); | 94 | struct trace_array *tr; |
95 | |||
96 | if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private) | ||
97 | return false; | ||
98 | |||
99 | tr = ops->private; | ||
100 | |||
101 | return tr->function_pids != NULL; | ||
102 | } | 102 | } |
103 | 103 | ||
104 | static void ftrace_update_trampoline(struct ftrace_ops *ops); | 104 | static void ftrace_update_trampoline(struct ftrace_ops *ops); |
@@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void) | |||
179 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, | 179 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, |
180 | struct ftrace_ops *op, struct pt_regs *regs) | 180 | struct ftrace_ops *op, struct pt_regs *regs) |
181 | { | 181 | { |
182 | if (!test_tsk_trace_trace(current)) | 182 | struct trace_array *tr = op->private; |
183 | |||
184 | if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid)) | ||
183 | return; | 185 | return; |
184 | 186 | ||
185 | op->saved_func(ip, parent_ip, op, regs); | 187 | op->saved_func(ip, parent_ip, op, regs); |
@@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
417 | /* Always save the function, and reset at unregistering */ | 419 | /* Always save the function, and reset at unregistering */ |
418 | ops->saved_func = ops->func; | 420 | ops->saved_func = ops->func; |
419 | 421 | ||
420 | if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) | 422 | if (ftrace_pids_enabled(ops)) |
421 | ops->func = ftrace_pid_func; | 423 | ops->func = ftrace_pid_func; |
422 | 424 | ||
423 | ftrace_update_trampoline(ops); | 425 | ftrace_update_trampoline(ops); |
@@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
450 | 452 | ||
451 | static void ftrace_update_pid_func(void) | 453 | static void ftrace_update_pid_func(void) |
452 | { | 454 | { |
453 | bool enabled = ftrace_pids_enabled(); | ||
454 | struct ftrace_ops *op; | 455 | struct ftrace_ops *op; |
455 | 456 | ||
456 | /* Only do something if we are tracing something */ | 457 | /* Only do something if we are tracing something */ |
@@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void) | |||
459 | 460 | ||
460 | do_for_each_ftrace_op(op, ftrace_ops_list) { | 461 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
461 | if (op->flags & FTRACE_OPS_FL_PID) { | 462 | if (op->flags & FTRACE_OPS_FL_PID) { |
462 | op->func = enabled ? ftrace_pid_func : | 463 | op->func = ftrace_pids_enabled(op) ? |
463 | op->saved_func; | 464 | ftrace_pid_func : op->saved_func; |
464 | ftrace_update_trampoline(op); | 465 | ftrace_update_trampoline(op); |
465 | } | 466 | } |
466 | } while_for_each_ftrace_op(op); | 467 | } while_for_each_ftrace_op(op); |
@@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | |||
5324 | return ops->func; | 5325 | return ops->func; |
5325 | } | 5326 | } |
5326 | 5327 | ||
5327 | static void clear_ftrace_swapper(void) | 5328 | static void |
5329 | ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, | ||
5330 | struct task_struct *prev, struct task_struct *next) | ||
5328 | { | 5331 | { |
5329 | struct task_struct *p; | 5332 | struct trace_array *tr = data; |
5330 | int cpu; | 5333 | struct trace_pid_list *pid_list; |
5331 | 5334 | ||
5332 | get_online_cpus(); | 5335 | pid_list = rcu_dereference_sched(tr->function_pids); |
5333 | for_each_online_cpu(cpu) { | ||
5334 | p = idle_task(cpu); | ||
5335 | clear_tsk_trace_trace(p); | ||
5336 | } | ||
5337 | put_online_cpus(); | ||
5338 | } | ||
5339 | |||
5340 | static void set_ftrace_swapper(void) | ||
5341 | { | ||
5342 | struct task_struct *p; | ||
5343 | int cpu; | ||
5344 | 5336 | ||
5345 | get_online_cpus(); | 5337 | this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, |
5346 | for_each_online_cpu(cpu) { | 5338 | trace_ignore_this_task(pid_list, next)); |
5347 | p = idle_task(cpu); | ||
5348 | set_tsk_trace_trace(p); | ||
5349 | } | ||
5350 | put_online_cpus(); | ||
5351 | } | 5339 | } |
5352 | 5340 | ||
5353 | static void clear_ftrace_pid(struct pid *pid) | 5341 | static void clear_ftrace_pids(struct trace_array *tr) |
5354 | { | 5342 | { |
5355 | struct task_struct *p; | 5343 | struct trace_pid_list *pid_list; |
5344 | int cpu; | ||
5356 | 5345 | ||
5357 | rcu_read_lock(); | 5346 | pid_list = rcu_dereference_protected(tr->function_pids, |
5358 | do_each_pid_task(pid, PIDTYPE_PID, p) { | 5347 | lockdep_is_held(&ftrace_lock)); |
5359 | clear_tsk_trace_trace(p); | 5348 | if (!pid_list) |
5360 | } while_each_pid_task(pid, PIDTYPE_PID, p); | 5349 | return; |
5361 | rcu_read_unlock(); | ||
5362 | 5350 | ||
5363 | put_pid(pid); | 5351 | unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); |
5364 | } | ||
5365 | 5352 | ||
5366 | static void set_ftrace_pid(struct pid *pid) | 5353 | for_each_possible_cpu(cpu) |
5367 | { | 5354 | per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false; |
5368 | struct task_struct *p; | ||
5369 | 5355 | ||
5370 | rcu_read_lock(); | 5356 | rcu_assign_pointer(tr->function_pids, NULL); |
5371 | do_each_pid_task(pid, PIDTYPE_PID, p) { | ||
5372 | set_tsk_trace_trace(p); | ||
5373 | } while_each_pid_task(pid, PIDTYPE_PID, p); | ||
5374 | rcu_read_unlock(); | ||
5375 | } | ||
5376 | 5357 | ||
5377 | static void clear_ftrace_pid_task(struct pid *pid) | 5358 | /* Wait till all users are no longer using pid filtering */ |
5378 | { | 5359 | synchronize_sched(); |
5379 | if (pid == ftrace_swapper_pid) | ||
5380 | clear_ftrace_swapper(); | ||
5381 | else | ||
5382 | clear_ftrace_pid(pid); | ||
5383 | } | ||
5384 | 5360 | ||
5385 | static void set_ftrace_pid_task(struct pid *pid) | 5361 | trace_free_pid_list(pid_list); |
5386 | { | ||
5387 | if (pid == ftrace_swapper_pid) | ||
5388 | set_ftrace_swapper(); | ||
5389 | else | ||
5390 | set_ftrace_pid(pid); | ||
5391 | } | 5362 | } |
5392 | 5363 | ||
5393 | static int ftrace_pid_add(int p) | 5364 | static void ftrace_pid_reset(struct trace_array *tr) |
5394 | { | 5365 | { |
5395 | struct pid *pid; | ||
5396 | struct ftrace_pid *fpid; | ||
5397 | int ret = -EINVAL; | ||
5398 | |||
5399 | mutex_lock(&ftrace_lock); | 5366 | mutex_lock(&ftrace_lock); |
5400 | 5367 | clear_ftrace_pids(tr); | |
5401 | if (!p) | ||
5402 | pid = ftrace_swapper_pid; | ||
5403 | else | ||
5404 | pid = find_get_pid(p); | ||
5405 | |||
5406 | if (!pid) | ||
5407 | goto out; | ||
5408 | |||
5409 | ret = 0; | ||
5410 | |||
5411 | list_for_each_entry(fpid, &ftrace_pids, list) | ||
5412 | if (fpid->pid == pid) | ||
5413 | goto out_put; | ||
5414 | |||
5415 | ret = -ENOMEM; | ||
5416 | |||
5417 | fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); | ||
5418 | if (!fpid) | ||
5419 | goto out_put; | ||
5420 | |||
5421 | list_add(&fpid->list, &ftrace_pids); | ||
5422 | fpid->pid = pid; | ||
5423 | |||
5424 | set_ftrace_pid_task(pid); | ||
5425 | 5368 | ||
5426 | ftrace_update_pid_func(); | 5369 | ftrace_update_pid_func(); |
5427 | |||
5428 | ftrace_startup_all(0); | 5370 | ftrace_startup_all(0); |
5429 | 5371 | ||
5430 | mutex_unlock(&ftrace_lock); | 5372 | mutex_unlock(&ftrace_lock); |
5431 | return 0; | ||
5432 | |||
5433 | out_put: | ||
5434 | if (pid != ftrace_swapper_pid) | ||
5435 | put_pid(pid); | ||
5436 | |||
5437 | out: | ||
5438 | mutex_unlock(&ftrace_lock); | ||
5439 | return ret; | ||
5440 | } | 5373 | } |
5441 | 5374 | ||
5442 | static void ftrace_pid_reset(void) | 5375 | /* Greater than any max PID */ |
5443 | { | 5376 | #define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1) |
5444 | struct ftrace_pid *fpid, *safe; | ||
5445 | |||
5446 | mutex_lock(&ftrace_lock); | ||
5447 | list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { | ||
5448 | struct pid *pid = fpid->pid; | ||
5449 | |||
5450 | clear_ftrace_pid_task(pid); | ||
5451 | |||
5452 | list_del(&fpid->list); | ||
5453 | kfree(fpid); | ||
5454 | } | ||
5455 | |||
5456 | ftrace_update_pid_func(); | ||
5457 | ftrace_startup_all(0); | ||
5458 | |||
5459 | mutex_unlock(&ftrace_lock); | ||
5460 | } | ||
5461 | 5377 | ||
5462 | static void *fpid_start(struct seq_file *m, loff_t *pos) | 5378 | static void *fpid_start(struct seq_file *m, loff_t *pos) |
5379 | __acquires(RCU) | ||
5463 | { | 5380 | { |
5381 | struct trace_pid_list *pid_list; | ||
5382 | struct trace_array *tr = m->private; | ||
5383 | |||
5464 | mutex_lock(&ftrace_lock); | 5384 | mutex_lock(&ftrace_lock); |
5385 | rcu_read_lock_sched(); | ||
5465 | 5386 | ||
5466 | if (!ftrace_pids_enabled() && (!*pos)) | 5387 | pid_list = rcu_dereference_sched(tr->function_pids); |
5467 | return (void *) 1; | ||
5468 | 5388 | ||
5469 | return seq_list_start(&ftrace_pids, *pos); | 5389 | if (!pid_list) |
5390 | return !(*pos) ? FTRACE_NO_PIDS : NULL; | ||
5391 | |||
5392 | return trace_pid_start(pid_list, pos); | ||
5470 | } | 5393 | } |
5471 | 5394 | ||
5472 | static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) | 5395 | static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) |
5473 | { | 5396 | { |
5474 | if (v == (void *)1) | 5397 | struct trace_array *tr = m->private; |
5398 | struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids); | ||
5399 | |||
5400 | if (v == FTRACE_NO_PIDS) | ||
5475 | return NULL; | 5401 | return NULL; |
5476 | 5402 | ||
5477 | return seq_list_next(v, &ftrace_pids, pos); | 5403 | return trace_pid_next(pid_list, v, pos); |
5478 | } | 5404 | } |
5479 | 5405 | ||
5480 | static void fpid_stop(struct seq_file *m, void *p) | 5406 | static void fpid_stop(struct seq_file *m, void *p) |
5407 | __releases(RCU) | ||
5481 | { | 5408 | { |
5409 | rcu_read_unlock_sched(); | ||
5482 | mutex_unlock(&ftrace_lock); | 5410 | mutex_unlock(&ftrace_lock); |
5483 | } | 5411 | } |
5484 | 5412 | ||
5485 | static int fpid_show(struct seq_file *m, void *v) | 5413 | static int fpid_show(struct seq_file *m, void *v) |
5486 | { | 5414 | { |
5487 | const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); | 5415 | if (v == FTRACE_NO_PIDS) { |
5488 | |||
5489 | if (v == (void *)1) { | ||
5490 | seq_puts(m, "no pid\n"); | 5416 | seq_puts(m, "no pid\n"); |
5491 | return 0; | 5417 | return 0; |
5492 | } | 5418 | } |
5493 | 5419 | ||
5494 | if (fpid->pid == ftrace_swapper_pid) | 5420 | return trace_pid_show(m, v); |
5495 | seq_puts(m, "swapper tasks\n"); | ||
5496 | else | ||
5497 | seq_printf(m, "%u\n", pid_vnr(fpid->pid)); | ||
5498 | |||
5499 | return 0; | ||
5500 | } | 5421 | } |
5501 | 5422 | ||
5502 | static const struct seq_operations ftrace_pid_sops = { | 5423 | static const struct seq_operations ftrace_pid_sops = { |
@@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = { | |||
5509 | static int | 5430 | static int |
5510 | ftrace_pid_open(struct inode *inode, struct file *file) | 5431 | ftrace_pid_open(struct inode *inode, struct file *file) |
5511 | { | 5432 | { |
5433 | struct trace_array *tr = inode->i_private; | ||
5434 | struct seq_file *m; | ||
5512 | int ret = 0; | 5435 | int ret = 0; |
5513 | 5436 | ||
5437 | if (trace_array_get(tr) < 0) | ||
5438 | return -ENODEV; | ||
5439 | |||
5514 | if ((file->f_mode & FMODE_WRITE) && | 5440 | if ((file->f_mode & FMODE_WRITE) && |
5515 | (file->f_flags & O_TRUNC)) | 5441 | (file->f_flags & O_TRUNC)) |
5516 | ftrace_pid_reset(); | 5442 | ftrace_pid_reset(tr); |
5517 | 5443 | ||
5518 | if (file->f_mode & FMODE_READ) | 5444 | ret = seq_open(file, &ftrace_pid_sops); |
5519 | ret = seq_open(file, &ftrace_pid_sops); | 5445 | if (ret < 0) { |
5446 | trace_array_put(tr); | ||
5447 | } else { | ||
5448 | m = file->private_data; | ||
5449 | /* copy tr over to seq ops */ | ||
5450 | m->private = tr; | ||
5451 | } | ||
5520 | 5452 | ||
5521 | return ret; | 5453 | return ret; |
5522 | } | 5454 | } |
5523 | 5455 | ||
5456 | static void ignore_task_cpu(void *data) | ||
5457 | { | ||
5458 | struct trace_array *tr = data; | ||
5459 | struct trace_pid_list *pid_list; | ||
5460 | |||
5461 | /* | ||
5462 | * This function is called by on_each_cpu() while the | ||
5463 | * ftrace_lock is held. | ||
5464 | */ | ||
5465 | pid_list = rcu_dereference_protected(tr->function_pids, | ||
5466 | mutex_is_locked(&ftrace_lock)); | ||
5467 | |||
5468 | this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, | ||
5469 | trace_ignore_this_task(pid_list, current)); | ||
5470 | } | ||
5471 | |||
5524 | static ssize_t | 5472 | static ssize_t |
5525 | ftrace_pid_write(struct file *filp, const char __user *ubuf, | 5473 | ftrace_pid_write(struct file *filp, const char __user *ubuf, |
5526 | size_t cnt, loff_t *ppos) | 5474 | size_t cnt, loff_t *ppos) |
5527 | { | 5475 | { |
5528 | char buf[64], *tmp; | 5476 | struct seq_file *m = filp->private_data; |
5529 | long val; | 5477 | struct trace_array *tr = m->private; |
5530 | int ret; | 5478 | struct trace_pid_list *filtered_pids = NULL; |
5479 | struct trace_pid_list *pid_list; | ||
5480 | ssize_t ret; | ||
5531 | 5481 | ||
5532 | if (cnt >= sizeof(buf)) | 5482 | if (!cnt) |
5533 | return -EINVAL; | 5483 | return 0; |
5484 | |||
5485 | mutex_lock(&ftrace_lock); | ||
5486 | |||
5487 | filtered_pids = rcu_dereference_protected(tr->function_pids, | ||
5488 | lockdep_is_held(&ftrace_lock)); | ||
5489 | |||
5490 | ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); | ||
5491 | if (ret < 0) | ||
5492 | goto out; | ||
5534 | 5493 | ||
5535 | if (copy_from_user(&buf, ubuf, cnt)) | 5494 | rcu_assign_pointer(tr->function_pids, pid_list); |
5536 | return -EFAULT; | ||
5537 | 5495 | ||
5538 | buf[cnt] = 0; | 5496 | if (filtered_pids) { |
5497 | synchronize_sched(); | ||
5498 | trace_free_pid_list(filtered_pids); | ||
5499 | } else if (pid_list) { | ||
5500 | /* Register a probe to set whether to ignore the tracing of a task */ | ||
5501 | register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); | ||
5502 | } | ||
5539 | 5503 | ||
5540 | /* | 5504 | /* |
5541 | * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" | 5505 | * Ignoring of pids is done at task switch. But we have to |
5542 | * to clean the filter quietly. | 5506 | * check for those tasks that are currently running. |
5507 | * Always do this in case a pid was appended or removed. | ||
5543 | */ | 5508 | */ |
5544 | tmp = strstrip(buf); | 5509 | on_each_cpu(ignore_task_cpu, tr, 1); |
5545 | if (strlen(tmp) == 0) | ||
5546 | return 1; | ||
5547 | 5510 | ||
5548 | ret = kstrtol(tmp, 10, &val); | 5511 | ftrace_update_pid_func(); |
5549 | if (ret < 0) | 5512 | ftrace_startup_all(0); |
5550 | return ret; | 5513 | out: |
5514 | mutex_unlock(&ftrace_lock); | ||
5551 | 5515 | ||
5552 | ret = ftrace_pid_add(val); | 5516 | if (ret > 0) |
5517 | *ppos += ret; | ||
5553 | 5518 | ||
5554 | return ret ? ret : cnt; | 5519 | return ret; |
5555 | } | 5520 | } |
5556 | 5521 | ||
5557 | static int | 5522 | static int |
5558 | ftrace_pid_release(struct inode *inode, struct file *file) | 5523 | ftrace_pid_release(struct inode *inode, struct file *file) |
5559 | { | 5524 | { |
5560 | if (file->f_mode & FMODE_READ) | 5525 | struct trace_array *tr = inode->i_private; |
5561 | seq_release(inode, file); | ||
5562 | 5526 | ||
5563 | return 0; | 5527 | trace_array_put(tr); |
5528 | |||
5529 | return seq_release(inode, file); | ||
5564 | } | 5530 | } |
5565 | 5531 | ||
5566 | static const struct file_operations ftrace_pid_fops = { | 5532 | static const struct file_operations ftrace_pid_fops = { |
@@ -5571,24 +5537,21 @@ static const struct file_operations ftrace_pid_fops = { | |||
5571 | .release = ftrace_pid_release, | 5537 | .release = ftrace_pid_release, |
5572 | }; | 5538 | }; |
5573 | 5539 | ||
5574 | static __init int ftrace_init_tracefs(void) | 5540 | void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) |
5575 | { | 5541 | { |
5576 | struct dentry *d_tracer; | 5542 | trace_create_file("set_ftrace_pid", 0644, d_tracer, |
5543 | tr, &ftrace_pid_fops); | ||
5544 | } | ||
5577 | 5545 | ||
5578 | d_tracer = tracing_init_dentry(); | 5546 | void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, |
5579 | if (IS_ERR(d_tracer)) | 5547 | struct dentry *d_tracer) |
5580 | return 0; | 5548 | { |
5549 | /* Only the top level directory has the dyn_tracefs and profile */ | ||
5550 | WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); | ||
5581 | 5551 | ||
5582 | ftrace_init_dyn_tracefs(d_tracer); | 5552 | ftrace_init_dyn_tracefs(d_tracer); |
5583 | |||
5584 | trace_create_file("set_ftrace_pid", 0644, d_tracer, | ||
5585 | NULL, &ftrace_pid_fops); | ||
5586 | |||
5587 | ftrace_profile_tracefs(d_tracer); | 5553 | ftrace_profile_tracefs(d_tracer); |
5588 | |||
5589 | return 0; | ||
5590 | } | 5554 | } |
5591 | fs_initcall(ftrace_init_tracefs); | ||
5592 | 5555 | ||
5593 | /** | 5556 | /** |
5594 | * ftrace_kill - kill ftrace | 5557 | * ftrace_kill - kill ftrace |
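For orientation, the net effect of the set_ftrace_pid rework above is that the file is created per trace instance (via ftrace_init_tracefs()) and is backed by the shared trace_pid_list helpers. Below is a minimal user-space sketch of driving the new interface; it is illustrative only — the tracefs mount point and the instance name "foo" are assumptions, not something this diff creates.

/* Minimal sketch: apply a per-instance function-tracer PID filter.
 * Assumes tracefs is mounted at /sys/kernel/tracing and that an
 * instance named "foo" already exists (mkdir instances/foo). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    const char *path = "/sys/kernel/tracing/instances/foo/set_ftrace_pid";
    char buf[32];
    int fd, len;

    fd = open(path, O_WRONLY);
    if (fd < 0) {
        perror("open set_ftrace_pid");
        return 1;
    }

    /* Filter this instance's function tracer to the current process. */
    len = snprintf(buf, sizeof(buf), "%d\n", getpid());
    if (write(fd, buf, len) != len)
        perror("write set_ftrace_pid");

    close(fd);
    return 0;
}

An appending write extends the existing filter, while opening the file with O_TRUNC (a plain shell redirection, for example) takes the ftrace_pid_reset() path shown above and clears it.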
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a4bd6b68a0b..dade4c9559cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/hardirq.h> | 25 | #include <linux/hardirq.h> |
26 | #include <linux/linkage.h> | 26 | #include <linux/linkage.h> |
27 | #include <linux/uaccess.h> | 27 | #include <linux/uaccess.h> |
28 | #include <linux/kprobes.h> | 28 | #include <linux/vmalloc.h> |
29 | #include <linux/ftrace.h> | 29 | #include <linux/ftrace.h> |
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/percpu.h> | 31 | #include <linux/percpu.h> |
@@ -319,6 +319,258 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, | |||
319 | return 0; | 319 | return 0; |
320 | } | 320 | } |
321 | 321 | ||
322 | void trace_free_pid_list(struct trace_pid_list *pid_list) | ||
323 | { | ||
324 | vfree(pid_list->pids); | ||
325 | kfree(pid_list); | ||
326 | } | ||
327 | |||
328 | /** | ||
329 | * trace_find_filtered_pid - check if a pid exists in a filtered_pid list | ||
330 | * @filtered_pids: The list of pids to check | ||
331 | * @search_pid: The PID to find in @filtered_pids | ||
332 | * | ||
333 | * Returns true if @search_pid is found in @filtered_pids, and false otherwise. | ||
334 | */ | ||
335 | bool | ||
336 | trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) | ||
337 | { | ||
338 | /* | ||
339 | * If pid_max changed after filtered_pids was created, we | ||
340 | * by default ignore all pids greater than the previous pid_max. | ||
341 | */ | ||
342 | if (search_pid >= filtered_pids->pid_max) | ||
343 | return false; | ||
344 | |||
345 | return test_bit(search_pid, filtered_pids->pids); | ||
346 | } | ||
347 | |||
348 | /** | ||
349 | * trace_ignore_this_task - should a task be ignored for tracing | ||
350 | * @filtered_pids: The list of pids to check | ||
351 | * @task: The task that should be ignored if not filtered | ||
352 | * | ||
353 | * Checks if @task should be traced or not from @filtered_pids. | ||
354 | * Returns true if @task should *NOT* be traced. | ||
355 | * Returns false if @task should be traced. | ||
356 | */ | ||
357 | bool | ||
358 | trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) | ||
359 | { | ||
360 | /* | ||
361 | * Return false, because if filtered_pids does not exist, | ||
362 | * all pids are good to trace. | ||
363 | */ | ||
364 | if (!filtered_pids) | ||
365 | return false; | ||
366 | |||
367 | return !trace_find_filtered_pid(filtered_pids, task->pid); | ||
368 | } | ||
369 | |||
370 | /** | ||
371 | * trace_filter_add_remove_task - Add or remove a task from a pid_list | ||
372 | * @pid_list: The list to modify | ||
373 | * @self: The current task for fork or NULL for exit | ||
374 | * @task: The task to add or remove | ||
375 | * | ||
376 | * When adding a task, if @self is defined, the task is only added if @self | ||
377 | * is also included in @pid_list. This happens on fork and tasks should | ||
378 | * only be added when the parent is listed. If @self is NULL, then the | ||
379 | * @task pid will be removed from the list, which would happen on exit | ||
380 | * of a task. | ||
381 | */ | ||
382 | void trace_filter_add_remove_task(struct trace_pid_list *pid_list, | ||
383 | struct task_struct *self, | ||
384 | struct task_struct *task) | ||
385 | { | ||
386 | if (!pid_list) | ||
387 | return; | ||
388 | |||
389 | /* For forks, we only add if the forking task is listed */ | ||
390 | if (self) { | ||
391 | if (!trace_find_filtered_pid(pid_list, self->pid)) | ||
392 | return; | ||
393 | } | ||
394 | |||
395 | /* Sorry, but we don't support pid_max changing after setting */ | ||
396 | if (task->pid >= pid_list->pid_max) | ||
397 | return; | ||
398 | |||
399 | /* "self" is set for forks, and NULL for exits */ | ||
400 | if (self) | ||
401 | set_bit(task->pid, pid_list->pids); | ||
402 | else | ||
403 | clear_bit(task->pid, pid_list->pids); | ||
404 | } | ||
405 | |||
406 | /** | ||
407 | * trace_pid_next - Used for seq_file to get to the next pid of a pid_list | ||
408 | * @pid_list: The pid list to show | ||
409 | * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) | ||
410 | * @pos: The position of the file | ||
411 | * | ||
412 | * This is used by the seq_file "next" operation to iterate the pids | ||
413 | * listed in a trace_pid_list structure. | ||
414 | * | ||
415 | * Returns the pid+1 as we want to display pid of zero, but NULL would | ||
416 | * stop the iteration. | ||
417 | */ | ||
418 | void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) | ||
419 | { | ||
420 | unsigned long pid = (unsigned long)v; | ||
421 | |||
422 | (*pos)++; | ||
423 | |||
424 | /* pid already is +1 of the actual previous bit */ | ||
425 | pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); | ||
426 | |||
427 | /* Return pid + 1 to allow zero to be represented */ | ||
428 | if (pid < pid_list->pid_max) | ||
429 | return (void *)(pid + 1); | ||
430 | |||
431 | return NULL; | ||
432 | } | ||
433 | |||
434 | /** | ||
435 | * trace_pid_start - Used for seq_file to start reading pid lists | ||
436 | * @pid_list: The pid list to show | ||
437 | * @pos: The position of the file | ||
438 | * | ||
439 | * This is used by seq_file "start" operation to start the iteration | ||
440 | * of listing pids. | ||
441 | * | ||
442 | * Returns the pid+1 as we want to display pid of zero, but NULL would | ||
443 | * stop the iteration. | ||
444 | */ | ||
445 | void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) | ||
446 | { | ||
447 | unsigned long pid; | ||
448 | loff_t l = 0; | ||
449 | |||
450 | pid = find_first_bit(pid_list->pids, pid_list->pid_max); | ||
451 | if (pid >= pid_list->pid_max) | ||
452 | return NULL; | ||
453 | |||
454 | /* Return pid + 1 so that zero can be the exit value */ | ||
455 | for (pid++; pid && l < *pos; | ||
456 | pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) | ||
457 | ; | ||
458 | return (void *)pid; | ||
459 | } | ||
460 | |||
461 | /** | ||
462 | * trace_pid_show - show the current pid in seq_file processing | ||
463 | * @m: The seq_file structure to write into | ||
464 | * @v: A void pointer of the pid (+1) value to display | ||
465 | * | ||
466 | * Can be directly used by seq_file operations to display the current | ||
467 | * pid value. | ||
468 | */ | ||
469 | int trace_pid_show(struct seq_file *m, void *v) | ||
470 | { | ||
471 | unsigned long pid = (unsigned long)v - 1; | ||
472 | |||
473 | seq_printf(m, "%lu\n", pid); | ||
474 | return 0; | ||
475 | } | ||
476 | |||
477 | /* 128 should be much more than enough */ | ||
478 | #define PID_BUF_SIZE 127 | ||
479 | |||
480 | int trace_pid_write(struct trace_pid_list *filtered_pids, | ||
481 | struct trace_pid_list **new_pid_list, | ||
482 | const char __user *ubuf, size_t cnt) | ||
483 | { | ||
484 | struct trace_pid_list *pid_list; | ||
485 | struct trace_parser parser; | ||
486 | unsigned long val; | ||
487 | int nr_pids = 0; | ||
488 | ssize_t read = 0; | ||
489 | ssize_t ret = 0; | ||
490 | loff_t pos; | ||
491 | pid_t pid; | ||
492 | |||
493 | if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) | ||
494 | return -ENOMEM; | ||
495 | |||
496 | /* | ||
497 | * Always recreate a new array. The write is an all or nothing | ||
498 | * operation. Always create a new array when adding new pids by | ||
499 | * the user. If the operation fails, then the current list is | ||
500 | * not modified. | ||
501 | */ | ||
502 | pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); | ||
503 | if (!pid_list) | ||
504 | return -ENOMEM; | ||
505 | |||
506 | pid_list->pid_max = READ_ONCE(pid_max); | ||
507 | |||
508 | /* Only truncating will shrink pid_max */ | ||
509 | if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) | ||
510 | pid_list->pid_max = filtered_pids->pid_max; | ||
511 | |||
512 | pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); | ||
513 | if (!pid_list->pids) { | ||
514 | kfree(pid_list); | ||
515 | return -ENOMEM; | ||
516 | } | ||
517 | |||
518 | if (filtered_pids) { | ||
519 | /* copy the current bits to the new max */ | ||
520 | for_each_set_bit(pid, filtered_pids->pids, | ||
521 | filtered_pids->pid_max) { | ||
522 | set_bit(pid, pid_list->pids); | ||
523 | nr_pids++; | ||
524 | } | ||
525 | } | ||
526 | |||
527 | while (cnt > 0) { | ||
528 | |||
529 | pos = 0; | ||
530 | |||
531 | ret = trace_get_user(&parser, ubuf, cnt, &pos); | ||
532 | if (ret < 0 || !trace_parser_loaded(&parser)) | ||
533 | break; | ||
534 | |||
535 | read += ret; | ||
536 | ubuf += ret; | ||
537 | cnt -= ret; | ||
538 | |||
539 | parser.buffer[parser.idx] = 0; | ||
540 | |||
541 | ret = -EINVAL; | ||
542 | if (kstrtoul(parser.buffer, 0, &val)) | ||
543 | break; | ||
544 | if (val >= pid_list->pid_max) | ||
545 | break; | ||
546 | |||
547 | pid = (pid_t)val; | ||
548 | |||
549 | set_bit(pid, pid_list->pids); | ||
550 | nr_pids++; | ||
551 | |||
552 | trace_parser_clear(&parser); | ||
553 | ret = 0; | ||
554 | } | ||
555 | trace_parser_put(&parser); | ||
556 | |||
557 | if (ret < 0) { | ||
558 | trace_free_pid_list(pid_list); | ||
559 | return ret; | ||
560 | } | ||
561 | |||
562 | if (!nr_pids) { | ||
563 | /* Cleared the list of pids */ | ||
564 | trace_free_pid_list(pid_list); | ||
565 | read = ret; | ||
566 | pid_list = NULL; | ||
567 | } | ||
568 | |||
569 | *new_pid_list = pid_list; | ||
570 | |||
571 | return read; | ||
572 | } | ||
573 | |||
322 | static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) | 574 | static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) |
323 | { | 575 | { |
324 | u64 ts; | 576 | u64 ts; |
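One detail of the trace_pid_start()/trace_pid_next() helpers above that is easy to misread is the "pid + 1" encoding: seq_file treats NULL as end of iteration, so the cursor is always the PID plus one, which lets a set bit for PID 0 still be displayed. A stripped-down user-space sketch of the same scheme follows; the bitmap size and helper names are invented for illustration.

#include <stdio.h>

#define PID_MAX 64
static unsigned char bits[PID_MAX / 8];

static void set_pid(int pid)  { bits[pid / 8] |= 1u << (pid % 8); }
static int  test_pid(int pid) { return bits[pid / 8] & (1u << (pid % 8)); }

/* Return pid + 1 for the first set bit at or after 'from', or 0 (the
 * stand-in for NULL) when there are no more. */
static unsigned long pid_next(unsigned long from)
{
    for (unsigned long pid = from; pid < PID_MAX; pid++)
        if (test_pid(pid))
            return pid + 1;   /* +1 so a hit on PID 0 is not "no more" */
    return 0;
}

int main(void)
{
    set_pid(0);
    set_pid(3);
    set_pid(42);

    /* Iterate the way the seq_file ops do: the cursor is always pid + 1. */
    for (unsigned long v = pid_next(0); v; v = pid_next(v))
        printf("%lu\n", v - 1);   /* prints 0, 3, 42 */
    return 0;
}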
@@ -1862,7 +2114,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, | |||
1862 | { | 2114 | { |
1863 | __buffer_unlock_commit(buffer, event); | 2115 | __buffer_unlock_commit(buffer, event); |
1864 | 2116 | ||
1865 | ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); | 2117 | /* |
2118 | * If regs is not set, then skip the following callers: | ||
2119 | * trace_buffer_unlock_commit_regs | ||
2120 | * event_trigger_unlock_commit | ||
2121 | * trace_event_buffer_commit | ||
2122 | * trace_event_raw_event_sched_switch | ||
2123 | * Note, we can still get here via blktrace, wakeup tracer | ||
2124 | * and mmiotrace, but that's ok if they lose a function or | ||
2125 | * two. They are that meaningful. | ||
2126 | */ | ||
2127 | ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); | ||
1866 | ftrace_trace_userstack(buffer, flags, pc); | 2128 | ftrace_trace_userstack(buffer, flags, pc); |
1867 | } | 2129 | } |
1868 | 2130 | ||
@@ -1913,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
1913 | trace.skip = skip; | 2175 | trace.skip = skip; |
1914 | 2176 | ||
1915 | /* | 2177 | /* |
2178 | * Add two, for this function and the call to save_stack_trace() | ||
2179 | * If regs is set, then these functions will not be in the way. | ||
2180 | */ | ||
2181 | if (!regs) | ||
2182 | trace.skip += 2; | ||
2183 | |||
2184 | /* | ||
1916 | * Since events can happen in NMIs there's no safe way to | 2185 | * Since events can happen in NMIs there's no safe way to |
1917 | * use the per cpu ftrace_stacks. We reserve it and if an interrupt | 2186 | * use the per cpu ftrace_stacks. We reserve it and if an interrupt |
1918 | * or NMI comes in, it will just have to use the default | 2187 | * or NMI comes in, it will just have to use the default |
@@ -2083,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) | |||
2083 | 2352 | ||
2084 | /* created for use with alloc_percpu */ | 2353 | /* created for use with alloc_percpu */ |
2085 | struct trace_buffer_struct { | 2354 | struct trace_buffer_struct { |
2086 | char buffer[TRACE_BUF_SIZE]; | 2355 | int nesting; |
2356 | char buffer[4][TRACE_BUF_SIZE]; | ||
2087 | }; | 2357 | }; |
2088 | 2358 | ||
2089 | static struct trace_buffer_struct *trace_percpu_buffer; | 2359 | static struct trace_buffer_struct *trace_percpu_buffer; |
2090 | static struct trace_buffer_struct *trace_percpu_sirq_buffer; | ||
2091 | static struct trace_buffer_struct *trace_percpu_irq_buffer; | ||
2092 | static struct trace_buffer_struct *trace_percpu_nmi_buffer; | ||
2093 | 2360 | ||
2094 | /* | 2361 | /* |
2095 | * The buffer used is dependent on the context. There is a per cpu | 2362 | * This allows for lockless recording. If we're nested too deeply, then |
2096 | * buffer for normal context, softirq context, hard irq context and | 2363 | * this returns NULL. |
2097 | * for NMI context. This allows for lockless recording. | ||
2098 | * | ||
2099 | * Note, if the buffers failed to be allocated, then this returns NULL | ||
2100 | */ | 2364 | */ |
2101 | static char *get_trace_buf(void) | 2365 | static char *get_trace_buf(void) |
2102 | { | 2366 | { |
2103 | struct trace_buffer_struct *percpu_buffer; | 2367 | struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); |
2104 | |||
2105 | /* | ||
2106 | * If we have allocated per cpu buffers, then we do not | ||
2107 | * need to do any locking. | ||
2108 | */ | ||
2109 | if (in_nmi()) | ||
2110 | percpu_buffer = trace_percpu_nmi_buffer; | ||
2111 | else if (in_irq()) | ||
2112 | percpu_buffer = trace_percpu_irq_buffer; | ||
2113 | else if (in_softirq()) | ||
2114 | percpu_buffer = trace_percpu_sirq_buffer; | ||
2115 | else | ||
2116 | percpu_buffer = trace_percpu_buffer; | ||
2117 | 2368 | ||
2118 | if (!percpu_buffer) | 2369 | if (!buffer || buffer->nesting >= 4) |
2119 | return NULL; | 2370 | return NULL; |
2120 | 2371 | ||
2121 | return this_cpu_ptr(&percpu_buffer->buffer[0]); | 2372 | return &buffer->buffer[buffer->nesting++][0]; |
2373 | } | ||
2374 | |||
2375 | static void put_trace_buf(void) | ||
2376 | { | ||
2377 | this_cpu_dec(trace_percpu_buffer->nesting); | ||
2122 | } | 2378 | } |
2123 | 2379 | ||
2124 | static int alloc_percpu_trace_buffer(void) | 2380 | static int alloc_percpu_trace_buffer(void) |
2125 | { | 2381 | { |
2126 | struct trace_buffer_struct *buffers; | 2382 | struct trace_buffer_struct *buffers; |
2127 | struct trace_buffer_struct *sirq_buffers; | ||
2128 | struct trace_buffer_struct *irq_buffers; | ||
2129 | struct trace_buffer_struct *nmi_buffers; | ||
2130 | 2383 | ||
2131 | buffers = alloc_percpu(struct trace_buffer_struct); | 2384 | buffers = alloc_percpu(struct trace_buffer_struct); |
2132 | if (!buffers) | 2385 | if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) |
2133 | goto err_warn; | 2386 | return -ENOMEM; |
2134 | |||
2135 | sirq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
2136 | if (!sirq_buffers) | ||
2137 | goto err_sirq; | ||
2138 | |||
2139 | irq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
2140 | if (!irq_buffers) | ||
2141 | goto err_irq; | ||
2142 | |||
2143 | nmi_buffers = alloc_percpu(struct trace_buffer_struct); | ||
2144 | if (!nmi_buffers) | ||
2145 | goto err_nmi; | ||
2146 | 2387 | ||
2147 | trace_percpu_buffer = buffers; | 2388 | trace_percpu_buffer = buffers; |
2148 | trace_percpu_sirq_buffer = sirq_buffers; | ||
2149 | trace_percpu_irq_buffer = irq_buffers; | ||
2150 | trace_percpu_nmi_buffer = nmi_buffers; | ||
2151 | |||
2152 | return 0; | 2389 | return 0; |
2153 | |||
2154 | err_nmi: | ||
2155 | free_percpu(irq_buffers); | ||
2156 | err_irq: | ||
2157 | free_percpu(sirq_buffers); | ||
2158 | err_sirq: | ||
2159 | free_percpu(buffers); | ||
2160 | err_warn: | ||
2161 | WARN(1, "Could not allocate percpu trace_printk buffer"); | ||
2162 | return -ENOMEM; | ||
2163 | } | 2390 | } |
2164 | 2391 | ||
2165 | static int buffers_allocated; | 2392 | static int buffers_allocated; |
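The hunk above collapses the four per-context trace_printk buffers into one per-CPU stack of buffers guarded by a nesting counter; get_trace_buf() simply returns NULL once four levels are in use. Here is a single-"CPU" user-space sketch of the same get/put pattern — everything outside the two helpers is invented for illustration.

#include <stdio.h>

#define TRACE_BUF_SIZE 128

/* Mirrors struct trace_buffer_struct: a small stack of buffers per CPU. */
struct trace_buf {
    int nesting;
    char buffer[4][TRACE_BUF_SIZE];
};

static struct trace_buf one_cpu;   /* stands in for the per-CPU data */

static char *get_trace_buf(void)
{
    struct trace_buf *b = &one_cpu;

    if (b->nesting >= 4)           /* nested too deeply: give up */
        return NULL;
    return &b->buffer[b->nesting++][0];
}

static void put_trace_buf(void)
{
    one_cpu.nesting--;
}

int main(void)
{
    /* An outer "event" grabs a buffer... */
    char *outer = get_trace_buf();
    snprintf(outer, TRACE_BUF_SIZE, "outer");

    /* ...and a nested one (say, from an interrupt) gets its own slot,
     * so the outer buffer is not scribbled over. */
    char *inner = get_trace_buf();
    snprintf(inner, TRACE_BUF_SIZE, "inner");

    printf("%s / %s\n", outer, inner);   /* prints "outer / inner" */

    put_trace_buf();
    put_trace_buf();
    return 0;
}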
@@ -2250,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
2250 | tbuffer = get_trace_buf(); | 2477 | tbuffer = get_trace_buf(); |
2251 | if (!tbuffer) { | 2478 | if (!tbuffer) { |
2252 | len = 0; | 2479 | len = 0; |
2253 | goto out; | 2480 | goto out_nobuffer; |
2254 | } | 2481 | } |
2255 | 2482 | ||
2256 | len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); | 2483 | len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); |
@@ -2276,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
2276 | } | 2503 | } |
2277 | 2504 | ||
2278 | out: | 2505 | out: |
2506 | put_trace_buf(); | ||
2507 | |||
2508 | out_nobuffer: | ||
2279 | preempt_enable_notrace(); | 2509 | preempt_enable_notrace(); |
2280 | unpause_graph_tracing(); | 2510 | unpause_graph_tracing(); |
2281 | 2511 | ||
@@ -2307,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, | |||
2307 | tbuffer = get_trace_buf(); | 2537 | tbuffer = get_trace_buf(); |
2308 | if (!tbuffer) { | 2538 | if (!tbuffer) { |
2309 | len = 0; | 2539 | len = 0; |
2310 | goto out; | 2540 | goto out_nobuffer; |
2311 | } | 2541 | } |
2312 | 2542 | ||
2313 | len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); | 2543 | len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); |
@@ -2326,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer, | |||
2326 | __buffer_unlock_commit(buffer, event); | 2556 | __buffer_unlock_commit(buffer, event); |
2327 | ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); | 2557 | ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); |
2328 | } | 2558 | } |
2329 | out: | 2559 | |
2560 | out: | ||
2561 | put_trace_buf(); | ||
2562 | |||
2563 | out_nobuffer: | ||
2330 | preempt_enable_notrace(); | 2564 | preempt_enable_notrace(); |
2331 | unpause_graph_tracing(); | 2565 | unpause_graph_tracing(); |
2332 | 2566 | ||
@@ -6977,6 +7211,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) | |||
6977 | for_each_tracing_cpu(cpu) | 7211 | for_each_tracing_cpu(cpu) |
6978 | tracing_init_tracefs_percpu(tr, cpu); | 7212 | tracing_init_tracefs_percpu(tr, cpu); |
6979 | 7213 | ||
7214 | ftrace_init_tracefs(tr, d_tracer); | ||
6980 | } | 7215 | } |
6981 | 7216 | ||
6982 | static struct vfsmount *trace_automount(void *ingore) | 7217 | static struct vfsmount *trace_automount(void *ingore) |
@@ -7130,6 +7365,7 @@ static __init int tracer_init_tracefs(void) | |||
7130 | return 0; | 7365 | return 0; |
7131 | 7366 | ||
7132 | init_tracer_tracefs(&global_trace, d_tracer); | 7367 | init_tracer_tracefs(&global_trace, d_tracer); |
7368 | ftrace_init_tracefs_toplevel(&global_trace, d_tracer); | ||
7133 | 7369 | ||
7134 | trace_create_file("tracing_thresh", 0644, d_tracer, | 7370 | trace_create_file("tracing_thresh", 0644, d_tracer, |
7135 | &global_trace, &tracing_thresh_fops); | 7371 | &global_trace, &tracing_thresh_fops); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5167c366d6b7..f783df416726 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -80,6 +80,12 @@ enum trace_type { | |||
80 | FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ | 80 | FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ |
81 | filter) | 81 | filter) |
82 | 82 | ||
83 | #undef FTRACE_ENTRY_PACKED | ||
84 | #define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \ | ||
85 | filter) \ | ||
86 | FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ | ||
87 | filter) __packed | ||
88 | |||
83 | #include "trace_entries.h" | 89 | #include "trace_entries.h" |
84 | 90 | ||
85 | /* | 91 | /* |
@@ -156,6 +162,9 @@ struct trace_array_cpu { | |||
156 | char comm[TASK_COMM_LEN]; | 162 | char comm[TASK_COMM_LEN]; |
157 | 163 | ||
158 | bool ignore_pid; | 164 | bool ignore_pid; |
165 | #ifdef CONFIG_FUNCTION_TRACER | ||
166 | bool ftrace_ignore_pid; | ||
167 | #endif | ||
159 | }; | 168 | }; |
160 | 169 | ||
161 | struct tracer; | 170 | struct tracer; |
@@ -247,6 +256,7 @@ struct trace_array { | |||
247 | int ref; | 256 | int ref; |
248 | #ifdef CONFIG_FUNCTION_TRACER | 257 | #ifdef CONFIG_FUNCTION_TRACER |
249 | struct ftrace_ops *ops; | 258 | struct ftrace_ops *ops; |
259 | struct trace_pid_list __rcu *function_pids; | ||
250 | /* function tracing enabled */ | 260 | /* function tracing enabled */ |
251 | int function_enabled; | 261 | int function_enabled; |
252 | #endif | 262 | #endif |
@@ -628,6 +638,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); | |||
628 | 638 | ||
629 | extern unsigned long tracing_thresh; | 639 | extern unsigned long tracing_thresh; |
630 | 640 | ||
641 | /* PID filtering */ | ||
642 | |||
643 | extern int pid_max; | ||
644 | |||
645 | bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, | ||
646 | pid_t search_pid); | ||
647 | bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, | ||
648 | struct task_struct *task); | ||
649 | void trace_filter_add_remove_task(struct trace_pid_list *pid_list, | ||
650 | struct task_struct *self, | ||
651 | struct task_struct *task); | ||
652 | void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); | ||
653 | void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); | ||
654 | int trace_pid_show(struct seq_file *m, void *v); | ||
655 | void trace_free_pid_list(struct trace_pid_list *pid_list); | ||
656 | int trace_pid_write(struct trace_pid_list *filtered_pids, | ||
657 | struct trace_pid_list **new_pid_list, | ||
658 | const char __user *ubuf, size_t cnt); | ||
659 | |||
631 | #ifdef CONFIG_TRACER_MAX_TRACE | 660 | #ifdef CONFIG_TRACER_MAX_TRACE |
632 | void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); | 661 | void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); |
633 | void update_max_tr_single(struct trace_array *tr, | 662 | void update_max_tr_single(struct trace_array *tr, |
@@ -821,12 +850,9 @@ extern struct list_head ftrace_pids; | |||
821 | 850 | ||
822 | #ifdef CONFIG_FUNCTION_TRACER | 851 | #ifdef CONFIG_FUNCTION_TRACER |
823 | extern bool ftrace_filter_param __initdata; | 852 | extern bool ftrace_filter_param __initdata; |
824 | static inline int ftrace_trace_task(struct task_struct *task) | 853 | static inline int ftrace_trace_task(struct trace_array *tr) |
825 | { | 854 | { |
826 | if (list_empty(&ftrace_pids)) | 855 | return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid); |
827 | return 1; | ||
828 | |||
829 | return test_tsk_trace_trace(task); | ||
830 | } | 856 | } |
831 | extern int ftrace_is_dead(void); | 857 | extern int ftrace_is_dead(void); |
832 | int ftrace_create_function_files(struct trace_array *tr, | 858 | int ftrace_create_function_files(struct trace_array *tr, |
@@ -836,8 +862,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr); | |||
836 | void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); | 862 | void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); |
837 | void ftrace_reset_array_ops(struct trace_array *tr); | 863 | void ftrace_reset_array_ops(struct trace_array *tr); |
838 | int using_ftrace_ops_list_func(void); | 864 | int using_ftrace_ops_list_func(void); |
865 | void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); | ||
866 | void ftrace_init_tracefs_toplevel(struct trace_array *tr, | ||
867 | struct dentry *d_tracer); | ||
839 | #else | 868 | #else |
840 | static inline int ftrace_trace_task(struct task_struct *task) | 869 | static inline int ftrace_trace_task(struct trace_array *tr) |
841 | { | 870 | { |
842 | return 1; | 871 | return 1; |
843 | } | 872 | } |
@@ -852,6 +881,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { } | |||
852 | static inline __init void | 881 | static inline __init void |
853 | ftrace_init_global_array_ops(struct trace_array *tr) { } | 882 | ftrace_init_global_array_ops(struct trace_array *tr) { } |
854 | static inline void ftrace_reset_array_ops(struct trace_array *tr) { } | 883 | static inline void ftrace_reset_array_ops(struct trace_array *tr) { } |
884 | static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } | ||
885 | static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } | ||
855 | /* ftrace_func_t type is not defined, use macro instead of static inline */ | 886 | /* ftrace_func_t type is not defined, use macro instead of static inline */ |
856 | #define ftrace_init_array_ops(tr, func) do { } while (0) | 887 | #define ftrace_init_array_ops(tr, func) do { } while (0) |
857 | #endif /* CONFIG_FUNCTION_TRACER */ | 888 | #endif /* CONFIG_FUNCTION_TRACER */ |
@@ -1600,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); | |||
1600 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ | 1631 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ |
1601 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ | 1632 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ |
1602 | filter) | 1633 | filter) |
1634 | #undef FTRACE_ENTRY_PACKED | ||
1635 | #define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \ | ||
1636 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ | ||
1637 | filter) | ||
1638 | |||
1603 | #include "trace_entries.h" | 1639 | #include "trace_entries.h" |
1604 | 1640 | ||
1605 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) | 1641 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee7b94a4810a..5c30efcda5e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry, | |||
72 | ); | 72 | ); |
73 | 73 | ||
74 | /* Function call entry */ | 74 | /* Function call entry */ |
75 | FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, | 75 | FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, |
76 | 76 | ||
77 | TRACE_GRAPH_ENT, | 77 | TRACE_GRAPH_ENT, |
78 | 78 | ||
@@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, | |||
88 | ); | 88 | ); |
89 | 89 | ||
90 | /* Function return entry */ | 90 | /* Function return entry */ |
91 | FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, | 91 | FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, |
92 | 92 | ||
93 | TRACE_GRAPH_RET, | 93 | TRACE_GRAPH_RET, |
94 | 94 | ||
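FTRACE_ENTRY_PACKED only differs from FTRACE_ENTRY by appending __packed to the generated structure, which drops the alignment padding the compiler would otherwise insert and so shrinks each funcgraph entry stored in the ring buffer. A standalone illustration of the effect is below; the field layout is hypothetical, not the real ftrace_graph_ent_entry.

#include <stdio.h>

/* Hypothetical entry: an 8-byte field after a 4-byte one normally forces
 * padding so the struct is sized and aligned to 8 bytes. */
struct entry_padded {
    unsigned int type;
    unsigned long long calltime;
    unsigned int depth;
};

/* Same fields with the equivalent of the kernel's __packed macro. */
struct entry_packed {
    unsigned int type;
    unsigned long long calltime;
    unsigned int depth;
} __attribute__((packed));

int main(void)
{
    printf("padded: %zu bytes\n", sizeof(struct entry_padded));  /* typically 24 */
    printf("packed: %zu bytes\n", sizeof(struct entry_packed));  /* 16 */
    return 0;
}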
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3d4155892a1e..03c0a48c3ac4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/kthread.h> | 15 | #include <linux/kthread.h> |
16 | #include <linux/tracefs.h> | 16 | #include <linux/tracefs.h> |
17 | #include <linux/uaccess.h> | 17 | #include <linux/uaccess.h> |
18 | #include <linux/vmalloc.h> | ||
19 | #include <linux/module.h> | 18 | #include <linux/module.h> |
20 | #include <linux/ctype.h> | 19 | #include <linux/ctype.h> |
21 | #include <linux/sort.h> | 20 | #include <linux/sort.h> |
@@ -262,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, | |||
262 | 261 | ||
263 | local_save_flags(fbuffer->flags); | 262 | local_save_flags(fbuffer->flags); |
264 | fbuffer->pc = preempt_count(); | 263 | fbuffer->pc = preempt_count(); |
264 | /* | ||
265 | * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables | ||
266 | * preemption (adding one to the preempt_count). Since we are | ||
267 | * interested in the preempt_count at the time the tracepoint was | ||
268 | * hit, we need to subtract one to offset the increment. | ||
269 | */ | ||
270 | if (IS_ENABLED(CONFIG_PREEMPT)) | ||
271 | fbuffer->pc--; | ||
265 | fbuffer->trace_file = trace_file; | 272 | fbuffer->trace_file = trace_file; |
266 | 273 | ||
267 | fbuffer->event = | 274 | fbuffer->event = |
@@ -499,60 +506,6 @@ static void ftrace_clear_events(struct trace_array *tr) | |||
499 | mutex_unlock(&event_mutex); | 506 | mutex_unlock(&event_mutex); |
500 | } | 507 | } |
501 | 508 | ||
502 | /* Shouldn't this be in a header? */ | ||
503 | extern int pid_max; | ||
504 | |||
505 | /* Returns true if found in filter */ | ||
506 | static bool | ||
507 | find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) | ||
508 | { | ||
509 | /* | ||
510 | * If pid_max changed after filtered_pids was created, we | ||
511 | * by default ignore all pids greater than the previous pid_max. | ||
512 | */ | ||
513 | if (search_pid >= filtered_pids->pid_max) | ||
514 | return false; | ||
515 | |||
516 | return test_bit(search_pid, filtered_pids->pids); | ||
517 | } | ||
518 | |||
519 | static bool | ||
520 | ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) | ||
521 | { | ||
522 | /* | ||
523 | * Return false, because if filtered_pids does not exist, | ||
524 | * all pids are good to trace. | ||
525 | */ | ||
526 | if (!filtered_pids) | ||
527 | return false; | ||
528 | |||
529 | return !find_filtered_pid(filtered_pids, task->pid); | ||
530 | } | ||
531 | |||
532 | static void filter_add_remove_task(struct trace_pid_list *pid_list, | ||
533 | struct task_struct *self, | ||
534 | struct task_struct *task) | ||
535 | { | ||
536 | if (!pid_list) | ||
537 | return; | ||
538 | |||
539 | /* For forks, we only add if the forking task is listed */ | ||
540 | if (self) { | ||
541 | if (!find_filtered_pid(pid_list, self->pid)) | ||
542 | return; | ||
543 | } | ||
544 | |||
545 | /* Sorry, but we don't support pid_max changing after setting */ | ||
546 | if (task->pid >= pid_list->pid_max) | ||
547 | return; | ||
548 | |||
549 | /* "self" is set for forks, and NULL for exits */ | ||
550 | if (self) | ||
551 | set_bit(task->pid, pid_list->pids); | ||
552 | else | ||
553 | clear_bit(task->pid, pid_list->pids); | ||
554 | } | ||
555 | |||
556 | static void | 509 | static void |
557 | event_filter_pid_sched_process_exit(void *data, struct task_struct *task) | 510 | event_filter_pid_sched_process_exit(void *data, struct task_struct *task) |
558 | { | 511 | { |
@@ -560,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) | |||
560 | struct trace_array *tr = data; | 513 | struct trace_array *tr = data; |
561 | 514 | ||
562 | pid_list = rcu_dereference_sched(tr->filtered_pids); | 515 | pid_list = rcu_dereference_sched(tr->filtered_pids); |
563 | filter_add_remove_task(pid_list, NULL, task); | 516 | trace_filter_add_remove_task(pid_list, NULL, task); |
564 | } | 517 | } |
565 | 518 | ||
566 | static void | 519 | static void |
@@ -572,7 +525,7 @@ event_filter_pid_sched_process_fork(void *data, | |||
572 | struct trace_array *tr = data; | 525 | struct trace_array *tr = data; |
573 | 526 | ||
574 | pid_list = rcu_dereference_sched(tr->filtered_pids); | 527 | pid_list = rcu_dereference_sched(tr->filtered_pids); |
575 | filter_add_remove_task(pid_list, self, task); | 528 | trace_filter_add_remove_task(pid_list, self, task); |
576 | } | 529 | } |
577 | 530 | ||
578 | void trace_event_follow_fork(struct trace_array *tr, bool enable) | 531 | void trace_event_follow_fork(struct trace_array *tr, bool enable) |
@@ -600,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, | |||
600 | pid_list = rcu_dereference_sched(tr->filtered_pids); | 553 | pid_list = rcu_dereference_sched(tr->filtered_pids); |
601 | 554 | ||
602 | this_cpu_write(tr->trace_buffer.data->ignore_pid, | 555 | this_cpu_write(tr->trace_buffer.data->ignore_pid, |
603 | ignore_this_task(pid_list, prev) && | 556 | trace_ignore_this_task(pid_list, prev) && |
604 | ignore_this_task(pid_list, next)); | 557 | trace_ignore_this_task(pid_list, next)); |
605 | } | 558 | } |
606 | 559 | ||
607 | static void | 560 | static void |
@@ -614,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, | |||
614 | pid_list = rcu_dereference_sched(tr->filtered_pids); | 567 | pid_list = rcu_dereference_sched(tr->filtered_pids); |
615 | 568 | ||
616 | this_cpu_write(tr->trace_buffer.data->ignore_pid, | 569 | this_cpu_write(tr->trace_buffer.data->ignore_pid, |
617 | ignore_this_task(pid_list, next)); | 570 | trace_ignore_this_task(pid_list, next)); |
618 | } | 571 | } |
619 | 572 | ||
620 | static void | 573 | static void |
@@ -630,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) | |||
630 | pid_list = rcu_dereference_sched(tr->filtered_pids); | 583 | pid_list = rcu_dereference_sched(tr->filtered_pids); |
631 | 584 | ||
632 | this_cpu_write(tr->trace_buffer.data->ignore_pid, | 585 | this_cpu_write(tr->trace_buffer.data->ignore_pid, |
633 | ignore_this_task(pid_list, task)); | 586 | trace_ignore_this_task(pid_list, task)); |
634 | } | 587 | } |
635 | 588 | ||
636 | static void | 589 | static void |
@@ -647,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) | |||
647 | 600 | ||
648 | /* Set tracing if current is enabled */ | 601 | /* Set tracing if current is enabled */ |
649 | this_cpu_write(tr->trace_buffer.data->ignore_pid, | 602 | this_cpu_write(tr->trace_buffer.data->ignore_pid, |
650 | ignore_this_task(pid_list, current)); | 603 | trace_ignore_this_task(pid_list, current)); |
651 | } | 604 | } |
652 | 605 | ||
653 | static void __ftrace_clear_event_pids(struct trace_array *tr) | 606 | static void __ftrace_clear_event_pids(struct trace_array *tr) |
@@ -685,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) | |||
685 | /* Wait till all users are no longer using pid filtering */ | 638 | /* Wait till all users are no longer using pid filtering */ |
686 | synchronize_sched(); | 639 | synchronize_sched(); |
687 | 640 | ||
688 | vfree(pid_list->pids); | 641 | trace_free_pid_list(pid_list); |
689 | kfree(pid_list); | ||
690 | } | 642 | } |
691 | 643 | ||
692 | static void ftrace_clear_event_pids(struct trace_array *tr) | 644 | static void ftrace_clear_event_pids(struct trace_array *tr) |
@@ -1034,18 +986,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos) | |||
1034 | { | 986 | { |
1035 | struct trace_array *tr = m->private; | 987 | struct trace_array *tr = m->private; |
1036 | struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); | 988 | struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); |
1037 | unsigned long pid = (unsigned long)v; | ||
1038 | |||
1039 | (*pos)++; | ||
1040 | |||
1041 | /* pid already is +1 of the actual prevous bit */ | ||
1042 | pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); | ||
1043 | 989 | ||
1044 | /* Return pid + 1 to allow zero to be represented */ | 990 | return trace_pid_next(pid_list, v, pos); |
1045 | if (pid < pid_list->pid_max) | ||
1046 | return (void *)(pid + 1); | ||
1047 | |||
1048 | return NULL; | ||
1049 | } | 991 | } |
1050 | 992 | ||
1051 | static void *p_start(struct seq_file *m, loff_t *pos) | 993 | static void *p_start(struct seq_file *m, loff_t *pos) |
@@ -1053,8 +995,6 @@ static void *p_start(struct seq_file *m, loff_t *pos) | |||
1053 | { | 995 | { |
1054 | struct trace_pid_list *pid_list; | 996 | struct trace_pid_list *pid_list; |
1055 | struct trace_array *tr = m->private; | 997 | struct trace_array *tr = m->private; |
1056 | unsigned long pid; | ||
1057 | loff_t l = 0; | ||
1058 | 998 | ||
1059 | /* | 999 | /* |
1060 | * Grab the mutex, to keep calls to p_next() having the same | 1000 | * Grab the mutex, to keep calls to p_next() having the same |
@@ -1070,15 +1010,7 @@ static void *p_start(struct seq_file *m, loff_t *pos) | |||
1070 | if (!pid_list) | 1010 | if (!pid_list) |
1071 | return NULL; | 1011 | return NULL; |
1072 | 1012 | ||
1073 | pid = find_first_bit(pid_list->pids, pid_list->pid_max); | 1013 | return trace_pid_start(pid_list, pos); |
1074 | if (pid >= pid_list->pid_max) | ||
1075 | return NULL; | ||
1076 | |||
1077 | /* Return pid + 1 so that zero can be the exit value */ | ||
1078 | for (pid++; pid && l < *pos; | ||
1079 | pid = (unsigned long)p_next(m, (void *)pid, &l)) | ||
1080 | ; | ||
1081 | return (void *)pid; | ||
1082 | } | 1014 | } |
1083 | 1015 | ||
1084 | static void p_stop(struct seq_file *m, void *p) | 1016 | static void p_stop(struct seq_file *m, void *p) |
@@ -1088,14 +1020,6 @@ static void p_stop(struct seq_file *m, void *p) | |||
1088 | mutex_unlock(&event_mutex); | 1020 | mutex_unlock(&event_mutex); |
1089 | } | 1021 | } |
1090 | 1022 | ||
1091 | static int p_show(struct seq_file *m, void *v) | ||
1092 | { | ||
1093 | unsigned long pid = (unsigned long)v - 1; | ||
1094 | |||
1095 | seq_printf(m, "%lu\n", pid); | ||
1096 | return 0; | ||
1097 | } | ||
1098 | |||
1099 | static ssize_t | 1023 | static ssize_t |
1100 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | 1024 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, |
1101 | loff_t *ppos) | 1025 | loff_t *ppos) |
@@ -1654,7 +1578,7 @@ static void ignore_task_cpu(void *data) | |||
1654 | mutex_is_locked(&event_mutex)); | 1578 | mutex_is_locked(&event_mutex)); |
1655 | 1579 | ||
1656 | this_cpu_write(tr->trace_buffer.data->ignore_pid, | 1580 | this_cpu_write(tr->trace_buffer.data->ignore_pid, |
1657 | ignore_this_task(pid_list, current)); | 1581 | trace_ignore_this_task(pid_list, current)); |
1658 | } | 1582 | } |
1659 | 1583 | ||
1660 | static ssize_t | 1584 | static ssize_t |
@@ -1666,13 +1590,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, | |||
1666 | struct trace_pid_list *filtered_pids = NULL; | 1590 | struct trace_pid_list *filtered_pids = NULL; |
1667 | struct trace_pid_list *pid_list; | 1591 | struct trace_pid_list *pid_list; |
1668 | struct trace_event_file *file; | 1592 | struct trace_event_file *file; |
1669 | struct trace_parser parser; | 1593 | ssize_t ret; |
1670 | unsigned long val; | ||
1671 | loff_t this_pos; | ||
1672 | ssize_t read = 0; | ||
1673 | ssize_t ret = 0; | ||
1674 | pid_t pid; | ||
1675 | int nr_pids = 0; | ||
1676 | 1594 | ||
1677 | if (!cnt) | 1595 | if (!cnt) |
1678 | return 0; | 1596 | return 0; |
@@ -1681,93 +1599,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, | |||
1681 | if (ret < 0) | 1599 | if (ret < 0) |
1682 | return ret; | 1600 | return ret; |
1683 | 1601 | ||
1684 | if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) | ||
1685 | return -ENOMEM; | ||
1686 | |||
1687 | mutex_lock(&event_mutex); | 1602 | mutex_lock(&event_mutex); |
1603 | |||
1688 | filtered_pids = rcu_dereference_protected(tr->filtered_pids, | 1604 | filtered_pids = rcu_dereference_protected(tr->filtered_pids, |
1689 | lockdep_is_held(&event_mutex)); | 1605 | lockdep_is_held(&event_mutex)); |
1690 | 1606 | ||
1691 | /* | 1607 | ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); |
1692 | * Always recreate a new array. The write is an all or nothing | 1608 | if (ret < 0) |
1693 | * operation. Always create a new array when adding new pids by | ||
1694 | * the user. If the operation fails, then the current list is | ||
1695 | * not modified. | ||
1696 | */ | ||
1697 | pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); | ||
1698 | if (!pid_list) { | ||
1699 | read = -ENOMEM; | ||
1700 | goto out; | ||
1701 | } | ||
1702 | pid_list->pid_max = READ_ONCE(pid_max); | ||
1703 | /* Only truncating will shrink pid_max */ | ||
1704 | if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) | ||
1705 | pid_list->pid_max = filtered_pids->pid_max; | ||
1706 | pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); | ||
1707 | if (!pid_list->pids) { | ||
1708 | kfree(pid_list); | ||
1709 | read = -ENOMEM; | ||
1710 | goto out; | ||
1711 | } | ||
1712 | if (filtered_pids) { | ||
1713 | /* copy the current bits to the new max */ | ||
1714 | pid = find_first_bit(filtered_pids->pids, | ||
1715 | filtered_pids->pid_max); | ||
1716 | while (pid < filtered_pids->pid_max) { | ||
1717 | set_bit(pid, pid_list->pids); | ||
1718 | pid = find_next_bit(filtered_pids->pids, | ||
1719 | filtered_pids->pid_max, | ||
1720 | pid + 1); | ||
1721 | nr_pids++; | ||
1722 | } | ||
1723 | } | ||
1724 | |||
1725 | while (cnt > 0) { | ||
1726 | |||
1727 | this_pos = 0; | ||
1728 | |||
1729 | ret = trace_get_user(&parser, ubuf, cnt, &this_pos); | ||
1730 | if (ret < 0 || !trace_parser_loaded(&parser)) | ||
1731 | break; | ||
1732 | |||
1733 | read += ret; | ||
1734 | ubuf += ret; | ||
1735 | cnt -= ret; | ||
1736 | |||
1737 | parser.buffer[parser.idx] = 0; | ||
1738 | |||
1739 | ret = -EINVAL; | ||
1740 | if (kstrtoul(parser.buffer, 0, &val)) | ||
1741 | break; | ||
1742 | if (val >= pid_list->pid_max) | ||
1743 | break; | ||
1744 | |||
1745 | pid = (pid_t)val; | ||
1746 | |||
1747 | set_bit(pid, pid_list->pids); | ||
1748 | nr_pids++; | ||
1749 | |||
1750 | trace_parser_clear(&parser); | ||
1751 | ret = 0; | ||
1752 | } | ||
1753 | trace_parser_put(&parser); | ||
1754 | |||
1755 | if (ret < 0) { | ||
1756 | vfree(pid_list->pids); | ||
1757 | kfree(pid_list); | ||
1758 | read = ret; | ||
1759 | goto out; | 1609 | goto out; |
1760 | } | ||
1761 | 1610 | ||
1762 | if (!nr_pids) { | ||
1763 | /* Cleared the list of pids */ | ||
1764 | vfree(pid_list->pids); | ||
1765 | kfree(pid_list); | ||
1766 | read = ret; | ||
1767 | if (!filtered_pids) | ||
1768 | goto out; | ||
1769 | pid_list = NULL; | ||
1770 | } | ||
1771 | rcu_assign_pointer(tr->filtered_pids, pid_list); | 1611 | rcu_assign_pointer(tr->filtered_pids, pid_list); |
1772 | 1612 | ||
1773 | list_for_each_entry(file, &tr->events, list) { | 1613 | list_for_each_entry(file, &tr->events, list) { |
@@ -1776,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, | |||
1776 | 1616 | ||
1777 | if (filtered_pids) { | 1617 | if (filtered_pids) { |
1778 | synchronize_sched(); | 1618 | synchronize_sched(); |
1779 | 1619 | trace_free_pid_list(filtered_pids); | |
1780 | vfree(filtered_pids->pids); | 1620 | } else if (pid_list) { |
1781 | kfree(filtered_pids); | ||
1782 | } else { | ||
1783 | /* | 1621 | /* |
1784 | * Register a probe that is called before all other probes | 1622 | * Register a probe that is called before all other probes |
1785 | * to set ignore_pid if next or prev do not match. | 1623 | * to set ignore_pid if next or prev do not match. |
@@ -1817,9 +1655,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, | |||
1817 | out: | 1655 | out: |
1818 | mutex_unlock(&event_mutex); | 1656 | mutex_unlock(&event_mutex); |
1819 | 1657 | ||
1820 | ret = read; | 1658 | if (ret > 0) |
1821 | if (read > 0) | 1659 | *ppos += ret; |
1822 | *ppos += read; | ||
1823 | 1660 | ||
1824 | return ret; | 1661 | return ret; |
1825 | } | 1662 | } |
@@ -1846,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = { | |||
1846 | static const struct seq_operations show_set_pid_seq_ops = { | 1683 | static const struct seq_operations show_set_pid_seq_ops = { |
1847 | .start = p_start, | 1684 | .start = p_start, |
1848 | .next = p_next, | 1685 | .next = p_next, |
1849 | .show = p_show, | 1686 | .show = trace_pid_show, |
1850 | .stop = p_stop, | 1687 | .stop = p_stop, |
1851 | }; | 1688 | }; |
1852 | 1689 | ||
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5a095c2e4b69..0efa00d80623 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr) | |||
43 | 43 | ||
44 | /* Currently only the non-stack version is supported */ | 44 | /* Currently only the non-stack version is supported */ |
45 | ops->func = function_trace_call; | 45 | ops->func = function_trace_call; |
46 | ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; | 46 | ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; |
47 | 47 | ||
48 | tr->ops = ops; | 48 | tr->ops = ops; |
49 | ops->private = tr; | 49 | ops->private = tr; |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3a0244ff7ea8..7363ccf79512 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
319 | int cpu; | 319 | int cpu; |
320 | int pc; | 320 | int pc; |
321 | 321 | ||
322 | if (!ftrace_trace_task(current)) | 322 | if (!ftrace_trace_task(tr)) |
323 | return 0; | 323 | return 0; |
324 | 324 | ||
325 | /* trace it when it is-nested-in or is a function enabled. */ | 325 | /* trace it when it is-nested-in or is a function enabled. */ |
@@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
338 | if (ftrace_graph_notrace_addr(trace->func)) | 338 | if (ftrace_graph_notrace_addr(trace->func)) |
339 | return 1; | 339 | return 1; |
340 | 340 | ||
341 | /* | ||
342 | * Stop here if tracing_threshold is set. We only write function return | ||
343 | * events to the ring buffer. | ||
344 | */ | ||
345 | if (tracing_thresh) | ||
346 | return 1; | ||
347 | |||
341 | local_irq_save(flags); | 348 | local_irq_save(flags); |
342 | cpu = raw_smp_processor_id(); | 349 | cpu = raw_smp_processor_id(); |
343 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); | 350 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
@@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
355 | return ret; | 362 | return ret; |
356 | } | 363 | } |
357 | 364 | ||
358 | static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) | ||
359 | { | ||
360 | if (tracing_thresh) | ||
361 | return 1; | ||
362 | else | ||
363 | return trace_graph_entry(trace); | ||
364 | } | ||
365 | |||
366 | static void | 365 | static void |
367 | __trace_graph_function(struct trace_array *tr, | 366 | __trace_graph_function(struct trace_array *tr, |
368 | unsigned long ip, unsigned long flags, int pc) | 367 | unsigned long ip, unsigned long flags, int pc) |
@@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr) | |||
457 | set_graph_array(tr); | 456 | set_graph_array(tr); |
458 | if (tracing_thresh) | 457 | if (tracing_thresh) |
459 | ret = register_ftrace_graph(&trace_graph_thresh_return, | 458 | ret = register_ftrace_graph(&trace_graph_thresh_return, |
460 | &trace_graph_thresh_entry); | 459 | &trace_graph_entry); |
461 | else | 460 | else |
462 | ret = register_ftrace_graph(&trace_graph_return, | 461 | ret = register_ftrace_graph(&trace_graph_return, |
463 | &trace_graph_entry); | 462 | &trace_graph_entry); |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5546eec0505f..9aedb0b06683 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv) | |||
587 | * $retval : fetch return value | 587 | * $retval : fetch return value |
588 | * $stack : fetch stack address | 588 | * $stack : fetch stack address |
589 | * $stackN : fetch Nth of stack (N:0-) | 589 | * $stackN : fetch Nth of stack (N:0-) |
590 | * $comm : fetch current task comm | ||
590 | * @ADDR : fetch memory at ADDR (ADDR should be in kernel) | 591 | * @ADDR : fetch memory at ADDR (ADDR should be in kernel) |
591 | * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) | 592 | * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) |
592 | * %REG : fetch register REG | 593 | * %REG : fetch register REG |
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 68f376ca6d3f..cd7480d0a201 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c | |||
@@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) | |||
68 | trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", | 68 | trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", |
69 | dev->bus->number, dev->devfn, | 69 | dev->bus->number, dev->devfn, |
70 | dev->vendor, dev->device, dev->irq); | 70 | dev->vendor, dev->device, dev->irq); |
71 | /* | ||
72 | * XXX: is pci_resource_to_user() appropriate, since we are | ||
73 | * supposed to interpret the __ioremap() phys_addr argument based on | ||
74 | * these printed values? | ||
75 | */ | ||
76 | for (i = 0; i < 7; i++) { | 71 | for (i = 0; i < 7; i++) { |
77 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); | 72 | start = dev->resource[i].start; |
78 | trace_seq_printf(s, " %llx", | 73 | trace_seq_printf(s, " %llx", |
79 | (unsigned long long)(start | | 74 | (unsigned long long)(start | |
80 | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); | 75 | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); |
81 | } | 76 | } |
82 | for (i = 0; i < 7; i++) { | 77 | for (i = 0; i < 7; i++) { |
83 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); | 78 | start = dev->resource[i].start; |
79 | end = dev->resource[i].end; | ||
84 | trace_seq_printf(s, " %llx", | 80 | trace_seq_printf(s, " %llx", |
85 | dev->resource[i].start < dev->resource[i].end ? | 81 | dev->resource[i].start < dev->resource[i].end ? |
86 | (unsigned long long)(end - start) + 1 : 0); | 82 | (unsigned long long)(end - start) + 1 : 0); |
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index f96f0383f6c6..ad1d6164e946 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -36,6 +36,10 @@ struct trace_bprintk_fmt { | |||
36 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) | 36 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) |
37 | { | 37 | { |
38 | struct trace_bprintk_fmt *pos; | 38 | struct trace_bprintk_fmt *pos; |
39 | |||
40 | if (!fmt) | ||
41 | return ERR_PTR(-EINVAL); | ||
42 | |||
39 | list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { | 43 | list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { |
40 | if (!strcmp(pos->fmt, fmt)) | 44 | if (!strcmp(pos->fmt, fmt)) |
41 | return pos; | 45 | return pos; |
@@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
57 | for (iter = start; iter < end; iter++) { | 61 | for (iter = start; iter < end; iter++) { |
58 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); | 62 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); |
59 | if (tb_fmt) { | 63 | if (tb_fmt) { |
60 | *iter = tb_fmt->fmt; | 64 | if (!IS_ERR(tb_fmt)) |
65 | *iter = tb_fmt->fmt; | ||
61 | continue; | 66 | continue; |
62 | } | 67 | } |
63 | 68 | ||
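The trace_printk fix above leans on the kernel's ERR_PTR()/IS_ERR() convention: a NULL format string is reported as an encoded error pointer, so hold_module_trace_bprintk_format() can skip it without mistaking it for "format not cached yet". For readers less used to the idiom, here is a stand-alone approximation; the helpers merely mimic include/linux/err.h and the lookup body is a stub.

#include <stdio.h>

/* User-space approximations of the kernel's err.h helpers. */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
    return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
    return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static const char *lookup_format(const char *fmt)
{
    if (!fmt)
        return ERR_PTR(-22);   /* -EINVAL: reject, do not report "not found" */
    /* ... real list walk elided; pretend nothing matched ... */
    return NULL;
}

int main(void)
{
    const char *hit = lookup_format(NULL);

    if (IS_ERR(hit))
        printf("rejected: error %ld\n", PTR_ERR(hit));
    else if (!hit)
        printf("not found, would add a new entry\n");
    return 0;
}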
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1d372fa6fefb..74e80a582c28 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
@@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) | |||
218 | kfree(data); | 218 | kfree(data); |
219 | } | 219 | } |
220 | 220 | ||
221 | void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, | ||
222 | void *data, void *dest) | ||
223 | { | ||
224 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
225 | u8 *dst = get_rloc_data(dest); | ||
226 | long ret; | ||
227 | |||
228 | if (!maxlen) | ||
229 | return; | ||
230 | |||
231 | ret = strlcpy(dst, current->comm, maxlen); | ||
232 | *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); | ||
233 | } | ||
234 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); | ||
235 | |||
236 | void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, | ||
237 | void *data, void *dest) | ||
238 | { | ||
239 | *(u32 *)dest = strlen(current->comm) + 1; | ||
240 | } | ||
241 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); | ||
242 | |||
221 | static const struct fetch_type *find_fetch_type(const char *type, | 243 | static const struct fetch_type *find_fetch_type(const char *type, |
222 | const struct fetch_type *ftbl) | 244 | const struct fetch_type *ftbl) |
223 | { | 245 | { |
@@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
348 | } | 370 | } |
349 | } else | 371 | } else |
350 | ret = -EINVAL; | 372 | ret = -EINVAL; |
373 | } else if (strcmp(arg, "comm") == 0) { | ||
374 | if (strcmp(t->name, "string") != 0 && | ||
375 | strcmp(t->name, "string_size") != 0) | ||
376 | return -EINVAL; | ||
377 | f->fn = t->fetch[FETCH_MTD_comm]; | ||
351 | } else | 378 | } else |
352 | ret = -EINVAL; | 379 | ret = -EINVAL; |
353 | 380 | ||
@@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | |||
522 | arg[t - parg->comm] = '\0'; | 549 | arg[t - parg->comm] = '\0'; |
523 | t++; | 550 | t++; |
524 | } | 551 | } |
552 | /* | ||
553 | * The default type of $comm should be "string", and it can't be | ||
554 | * dereferenced. | ||
555 | */ | ||
556 | if (!t && strcmp(arg, "$comm") == 0) | ||
557 | t = "string"; | ||
525 | parg->type = find_fetch_type(t, ftbl); | 558 | parg->type = find_fetch_type(t, ftbl); |
526 | if (!parg->type) { | 559 | if (!parg->type) { |
527 | pr_info("Unsupported type: %s\n", t); | 560 | pr_info("Unsupported type: %s\n", t); |
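The trace_probe.c changes add $comm as a fetch argument for probe events, defaulting to the string type. A hedged usage sketch from user space is below — the probe name "myopen", the symbol do_sys_open and the tracefs path are assumptions, not mandated by the diff.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* Record the executing task's comm at each do_sys_open() hit. */
    const char *def = "p:myopen do_sys_open name=$comm\n";
    int fd = open("/sys/kernel/tracing/kprobe_events",
                  O_WRONLY | O_APPEND);

    if (fd < 0) {
        perror("open kprobe_events");
        return 1;
    }
    if (write(fd, def, strlen(def)) < 0)
        perror("define probe");
    close(fd);
    return 0;
}

Once events/kprobes/myopen/enable is set, each hit records name="<comm of the task that ran the probe>"; per the parse_probe_vars() hunk above, $comm is only accepted with a string type.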
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f6398db09114..45400ca5ded1 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
@@ -102,6 +102,7 @@ enum { | |||
102 | FETCH_MTD_reg = 0, | 102 | FETCH_MTD_reg = 0, |
103 | FETCH_MTD_stack, | 103 | FETCH_MTD_stack, |
104 | FETCH_MTD_retval, | 104 | FETCH_MTD_retval, |
105 | FETCH_MTD_comm, | ||
105 | FETCH_MTD_memory, | 106 | FETCH_MTD_memory, |
106 | FETCH_MTD_symbol, | 107 | FETCH_MTD_symbol, |
107 | FETCH_MTD_deref, | 108 | FETCH_MTD_deref, |
@@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield); | |||
183 | #define fetch_bitfield_string NULL | 184 | #define fetch_bitfield_string NULL |
184 | #define fetch_bitfield_string_size NULL | 185 | #define fetch_bitfield_string_size NULL |
185 | 186 | ||
187 | /* comm only makes sense as a string */ | ||
188 | #define fetch_comm_u8 NULL | ||
189 | #define fetch_comm_u16 NULL | ||
190 | #define fetch_comm_u32 NULL | ||
191 | #define fetch_comm_u64 NULL | ||
192 | DECLARE_FETCH_FUNC(comm, string); | ||
193 | DECLARE_FETCH_FUNC(comm, string_size); | ||
194 | |||
186 | /* | 195 | /* |
187 | * Define macro for basic types - we don't need to define s* types, because | 196 | * Define macro for basic types - we don't need to define s* types, because |
188 | * we have to care only about bitwidth at recording time. | 197 | * we have to care only about bitwidth at recording time. |
@@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64) | |||
213 | ASSIGN_FETCH_FUNC(reg, ftype), \ | 222 | ASSIGN_FETCH_FUNC(reg, ftype), \ |
214 | ASSIGN_FETCH_FUNC(stack, ftype), \ | 223 | ASSIGN_FETCH_FUNC(stack, ftype), \ |
215 | ASSIGN_FETCH_FUNC(retval, ftype), \ | 224 | ASSIGN_FETCH_FUNC(retval, ftype), \ |
225 | ASSIGN_FETCH_FUNC(comm, ftype), \ | ||
216 | ASSIGN_FETCH_FUNC(memory, ftype), \ | 226 | ASSIGN_FETCH_FUNC(memory, ftype), \ |
217 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | 227 | ASSIGN_FETCH_FUNC(symbol, ftype), \ |
218 | ASSIGN_FETCH_FUNC(deref, ftype), \ | 228 | ASSIGN_FETCH_FUNC(deref, ftype), \ |
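The header side stubs the integer fetchers for comm to NULL and adds FETCH_MTD_comm to the per-type function table, so only "string"/"string_size" resolve to a real handler. A minimal user-space sketch of that table pattern (illustrative only, not kernel code):

/* Illustrative sketch of the fetch-table pattern: one handler per
 * (type, method) pair, with NULL marking unsupported combinations,
 * which is what the fetch_comm_u* = NULL stubs express above. */
#include <stdio.h>
#include <string.h>

enum fetch_method { MTD_reg, MTD_comm, MTD_max };

typedef void (*fetch_fn)(void *dest);

static void fetch_comm_string(void *dest)
{
	strcpy(dest, "bash");		/* stand-in for current->comm */
}

struct fetch_type {
	const char *name;
	fetch_fn fn[MTD_max];
};

static const struct fetch_type ftbl[] = {
	{ .name = "u32",    .fn = { [MTD_comm] = NULL } },
	{ .name = "string", .fn = { [MTD_comm] = fetch_comm_string } },
};

int main(void)
{
	char buf[32] = "";
	size_t i;

	for (i = 0; i < sizeof(ftbl) / sizeof(ftbl[0]); i++) {
		if (!ftbl[i].fn[MTD_comm]) {
			printf("%-6s: $comm rejected (-EINVAL in the kernel)\n",
			       ftbl[i].name);
			continue;
		}
		ftbl[i].fn[MTD_comm](buf);
		printf("%-6s: $comm = %s\n", ftbl[i].name, buf);
	}
	return 0;
}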
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9bafc211930c..68f594212759 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns) | |||
938 | return allowed; | 938 | return allowed; |
939 | } | 939 | } |
940 | 940 | ||
941 | /* | ||
942 | * Returns true if @ns is the same namespace as or a descendant of | ||
943 | * @target_ns. | ||
944 | */ | ||
945 | bool current_in_userns(const struct user_namespace *target_ns) | ||
946 | { | ||
947 | struct user_namespace *ns; | ||
948 | for (ns = current_user_ns(); ns; ns = ns->parent) { | ||
949 | if (ns == target_ns) | ||
950 | return true; | ||
951 | } | ||
952 | return false; | ||
953 | } | ||
954 | |||
941 | static inline struct user_namespace *to_user_ns(struct ns_common *ns) | 955 | static inline struct user_namespace *to_user_ns(struct ns_common *ns) |
942 | { | 956 | { |
943 | return container_of(ns, struct user_namespace, ns); | 957 | return container_of(ns, struct user_namespace, ns); |
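current_in_userns() walks from current's user namespace up through its parents and reports whether target_ns is reached, i.e. whether the caller sits in that namespace or one of its descendants. A hedged kernel-style sketch of a caller follows; the struct and field names are invented for illustration and are not from this patch.

/* Hypothetical caller: refuse an operation unless the task runs in
 * (or below) the user namespace that owns the object. */
#include <linux/errno.h>
#include <linux/user_namespace.h>

struct foo_object {
	struct user_namespace *owner_ns;	/* invented owner field */
};

static int foo_set_attr(struct foo_object *obj)
{
	if (!current_in_userns(obj->owner_ns))
		return -EPERM;
	/* caller's userns is owner_ns or a descendant of it */
	return 0;
}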
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e1c0e996b5ae..ef071ca73fc3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -4369,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq) | |||
4369 | /** | 4369 | /** |
4370 | * show_workqueue_state - dump workqueue state | 4370 | * show_workqueue_state - dump workqueue state |
4371 | * | 4371 | * |
4372 | * Called from a sysrq handler and prints out all busy workqueues and | 4372 | * Called from a sysrq handler or try_to_freeze_tasks() and prints out |
4373 | * pools. | 4373 | * all busy workqueues and pools. |
4374 | */ | 4374 | */ |
4375 | void show_workqueue_state(void) | 4375 | void show_workqueue_state(void) |
4376 | { | 4376 | { |
@@ -4600,95 +4600,72 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) | |||
4600 | if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) | 4600 | if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) |
4601 | return; | 4601 | return; |
4602 | 4602 | ||
4603 | /* is @cpu the only online CPU? */ | ||
4604 | cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); | 4603 | cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); |
4605 | if (cpumask_weight(&cpumask) != 1) | ||
4606 | return; | ||
4607 | 4604 | ||
4608 | /* as we're called from CPU_ONLINE, the following shouldn't fail */ | 4605 | /* as we're called from CPU_ONLINE, the following shouldn't fail */ |
4609 | for_each_pool_worker(worker, pool) | 4606 | for_each_pool_worker(worker, pool) |
4610 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, | 4607 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); |
4611 | pool->attrs->cpumask) < 0); | ||
4612 | } | 4608 | } |
4613 | 4609 | ||
4614 | /* | 4610 | int workqueue_prepare_cpu(unsigned int cpu) |
4615 | * Workqueues should be brought up before normal priority CPU notifiers. | 4611 | { |
4616 | * This will be registered high priority CPU notifier. | 4612 | struct worker_pool *pool; |
4617 | */ | 4613 | |
4618 | static int workqueue_cpu_up_callback(struct notifier_block *nfb, | 4614 | for_each_cpu_worker_pool(pool, cpu) { |
4619 | unsigned long action, | 4615 | if (pool->nr_workers) |
4620 | void *hcpu) | 4616 | continue; |
4617 | if (!create_worker(pool)) | ||
4618 | return -ENOMEM; | ||
4619 | } | ||
4620 | return 0; | ||
4621 | } | ||
4622 | |||
4623 | int workqueue_online_cpu(unsigned int cpu) | ||
4621 | { | 4624 | { |
4622 | int cpu = (unsigned long)hcpu; | ||
4623 | struct worker_pool *pool; | 4625 | struct worker_pool *pool; |
4624 | struct workqueue_struct *wq; | 4626 | struct workqueue_struct *wq; |
4625 | int pi; | 4627 | int pi; |
4626 | 4628 | ||
4627 | switch (action & ~CPU_TASKS_FROZEN) { | 4629 | mutex_lock(&wq_pool_mutex); |
4628 | case CPU_UP_PREPARE: | ||
4629 | for_each_cpu_worker_pool(pool, cpu) { | ||
4630 | if (pool->nr_workers) | ||
4631 | continue; | ||
4632 | if (!create_worker(pool)) | ||
4633 | return NOTIFY_BAD; | ||
4634 | } | ||
4635 | break; | ||
4636 | |||
4637 | case CPU_DOWN_FAILED: | ||
4638 | case CPU_ONLINE: | ||
4639 | mutex_lock(&wq_pool_mutex); | ||
4640 | 4630 | ||
4641 | for_each_pool(pool, pi) { | 4631 | for_each_pool(pool, pi) { |
4642 | mutex_lock(&pool->attach_mutex); | 4632 | mutex_lock(&pool->attach_mutex); |
4643 | 4633 | ||
4644 | if (pool->cpu == cpu) | 4634 | if (pool->cpu == cpu) |
4645 | rebind_workers(pool); | 4635 | rebind_workers(pool); |
4646 | else if (pool->cpu < 0) | 4636 | else if (pool->cpu < 0) |
4647 | restore_unbound_workers_cpumask(pool, cpu); | 4637 | restore_unbound_workers_cpumask(pool, cpu); |
4648 | 4638 | ||
4649 | mutex_unlock(&pool->attach_mutex); | 4639 | mutex_unlock(&pool->attach_mutex); |
4650 | } | 4640 | } |
4651 | 4641 | ||
4652 | /* update NUMA affinity of unbound workqueues */ | 4642 | /* update NUMA affinity of unbound workqueues */ |
4653 | list_for_each_entry(wq, &workqueues, list) | 4643 | list_for_each_entry(wq, &workqueues, list) |
4654 | wq_update_unbound_numa(wq, cpu, true); | 4644 | wq_update_unbound_numa(wq, cpu, true); |
4655 | 4645 | ||
4656 | mutex_unlock(&wq_pool_mutex); | 4646 | mutex_unlock(&wq_pool_mutex); |
4657 | break; | 4647 | return 0; |
4658 | } | ||
4659 | return NOTIFY_OK; | ||
4660 | } | 4648 | } |
4661 | 4649 | ||
4662 | /* | 4650 | int workqueue_offline_cpu(unsigned int cpu) |
4663 | * Workqueues should be brought down after normal priority CPU notifiers. | ||
4664 | * This will be registered as low priority CPU notifier. | ||
4665 | */ | ||
4666 | static int workqueue_cpu_down_callback(struct notifier_block *nfb, | ||
4667 | unsigned long action, | ||
4668 | void *hcpu) | ||
4669 | { | 4651 | { |
4670 | int cpu = (unsigned long)hcpu; | ||
4671 | struct work_struct unbind_work; | 4652 | struct work_struct unbind_work; |
4672 | struct workqueue_struct *wq; | 4653 | struct workqueue_struct *wq; |
4673 | 4654 | ||
4674 | switch (action & ~CPU_TASKS_FROZEN) { | 4655 | /* unbinding per-cpu workers should happen on the local CPU */ |
4675 | case CPU_DOWN_PREPARE: | 4656 | INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); |
4676 | /* unbinding per-cpu workers should happen on the local CPU */ | 4657 | queue_work_on(cpu, system_highpri_wq, &unbind_work); |
4677 | INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); | 4658 | |
4678 | queue_work_on(cpu, system_highpri_wq, &unbind_work); | 4659 | /* update NUMA affinity of unbound workqueues */ |
4679 | 4660 | mutex_lock(&wq_pool_mutex); | |
4680 | /* update NUMA affinity of unbound workqueues */ | 4661 | list_for_each_entry(wq, &workqueues, list) |
4681 | mutex_lock(&wq_pool_mutex); | 4662 | wq_update_unbound_numa(wq, cpu, false); |
4682 | list_for_each_entry(wq, &workqueues, list) | 4663 | mutex_unlock(&wq_pool_mutex); |
4683 | wq_update_unbound_numa(wq, cpu, false); | 4664 | |
4684 | mutex_unlock(&wq_pool_mutex); | 4665 | /* wait for per-cpu unbinding to finish */ |
4685 | 4666 | flush_work(&unbind_work); | |
4686 | /* wait for per-cpu unbinding to finish */ | 4667 | destroy_work_on_stack(&unbind_work); |
4687 | flush_work(&unbind_work); | 4668 | return 0; |
4688 | destroy_work_on_stack(&unbind_work); | ||
4689 | break; | ||
4690 | } | ||
4691 | return NOTIFY_OK; | ||
4692 | } | 4669 | } |
4693 | 4670 | ||
4694 | #ifdef CONFIG_SMP | 4671 | #ifdef CONFIG_SMP |
@@ -5490,9 +5467,6 @@ static int __init init_workqueues(void) | |||
5490 | 5467 | ||
5491 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); | 5468 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |
5492 | 5469 | ||
5493 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); | ||
5494 | hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); | ||
5495 | |||
5496 | wq_numa_init(); | 5470 | wq_numa_init(); |
5497 | 5471 | ||
5498 | /* initialize CPU pools */ | 5472 | /* initialize CPU pools */ |
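The workqueue.c conversion above replaces the two CPU notifiers with plain int (*)(unsigned int cpu) callbacks (workqueue_prepare_cpu, workqueue_online_cpu, workqueue_offline_cpu) and drops their registration from init_workqueues(); they are instead meant to be driven by the CPU hotplug state machine, whose wiring into the core state table is not part of this diff. For reference, a hedged sketch of how a subsystem would register callbacks of this shape dynamically; the names and messages are illustrative only.

/* Hedged sketch: dynamic registration with the hotplug state machine.
 * Callbacks return 0 on success; a negative value rolls the operation
 * back, replacing the old NOTIFY_BAD convention. */
#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/printk.h>

static int example_online_cpu(unsigned int cpu)
{
	pr_info("example: CPU %u came online\n", cpu);
	return 0;
}

static int example_offline_cpu(unsigned int cpu)
{
	pr_info("example: CPU %u is going down\n", cpu);
	return 0;
}

static int __init example_init(void)
{
	int ret;

	/* CPUHP_AP_ONLINE_DYN allocates a dynamic state in the online section */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
				example_online_cpu, example_offline_cpu);
	return ret < 0 ? ret : 0;
}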