path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 17
-rw-r--r--  kernel/audit.h | 4
-rw-r--r--  kernel/auditsc.c | 8
-rw-r--r--  kernel/bpf/arraymap.c | 163
-rw-r--r--  kernel/bpf/core.c | 9
-rw-r--r--  kernel/bpf/helpers.c | 2
-rw-r--r--  kernel/bpf/inode.c | 4
-rw-r--r--  kernel/bpf/stackmap.c | 2
-rw-r--r--  kernel/bpf/syscall.c | 66
-rw-r--r--  kernel/bpf/verifier.c | 67
-rw-r--r--  kernel/cgroup.c | 214
-rw-r--r--  kernel/cgroup_pids.c | 34
-rw-r--r--  kernel/cpu.c | 68
-rw-r--r--  kernel/cpuset.c | 9
-rw-r--r--  kernel/cred.c | 2
-rw-r--r--  kernel/events/callchain.c | 14
-rw-r--r--  kernel/events/core.c | 309
-rw-r--r--  kernel/events/internal.h | 25
-rw-r--r--  kernel/exit.c | 84
-rw-r--r--  kernel/fork.c | 70
-rw-r--r--  kernel/freezer.c | 2
-rw-r--r--  kernel/futex.c | 14
-rw-r--r--  kernel/gcov/gcc_4_7.c | 2
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/affinity.c | 61
-rw-r--r--  kernel/irq/chip.c | 83
-rw-r--r--  kernel/irq/handle.c | 18
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/ipi.c | 4
-rw-r--r--  kernel/irq/irqdesc.c | 63
-rw-r--r--  kernel/irq/irqdomain.c | 94
-rw-r--r--  kernel/irq/manage.c | 73
-rw-r--r--  kernel/irq/msi.c | 12
-rw-r--r--  kernel/irq/proc.c | 11
-rw-r--r--  kernel/jump_label.c | 38
-rw-r--r--  kernel/kcov.c | 7
-rw-r--r--  kernel/locking/lockdep.c | 13
-rw-r--r--  kernel/locking/mutex-debug.c | 12
-rw-r--r--  kernel/locking/mutex-debug.h | 8
-rw-r--r--  kernel/locking/mutex.c | 15
-rw-r--r--  kernel/locking/mutex.h | 12
-rw-r--r--  kernel/locking/qrwlock.c | 2
-rw-r--r--  kernel/locking/qspinlock.c | 146
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 4
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rwsem-xadd.c | 194
-rw-r--r--  kernel/locking/rwsem.c | 8
-rw-r--r--  kernel/locking/rwsem.h | 52
-rw-r--r--  kernel/memremap.c | 14
-rw-r--r--  kernel/power/Makefile | 2
-rw-r--r--  kernel/power/console.c | 8
-rw-r--r--  kernel/power/hibernate.c | 107
-rw-r--r--  kernel/power/main.c | 11
-rw-r--r--  kernel/power/power.h | 11
-rw-r--r--  kernel/power/process.c | 15
-rw-r--r--  kernel/power/snapshot.c | 950
-rw-r--r--  kernel/power/suspend.c | 10
-rw-r--r--  kernel/power/swap.c | 39
-rw-r--r--  kernel/power/user.c | 14
-rw-r--r--  kernel/printk/printk.c | 5
-rw-r--r--  kernel/profile.c | 181
-rw-r--r--  kernel/rcu/rcuperf.c | 25
-rw-r--r--  kernel/rcu/rcutorture.c | 9
-rw-r--r--  kernel/rcu/tree.c | 691
-rw-r--r--  kernel/rcu/tree.h | 15
-rw-r--r--  kernel/rcu/tree_exp.h | 655
-rw-r--r--  kernel/rcu/tree_plugin.h | 95
-rw-r--r--  kernel/rcu/update.c | 7
-rw-r--r--  kernel/relay.c | 1
-rw-r--r--  kernel/sched/core.c | 170
-rw-r--r--  kernel/sched/cpuacct.c | 114
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 74
-rw-r--r--  kernel/sched/cputime.c | 181
-rw-r--r--  kernel/sched/debug.c | 17
-rw-r--r--  kernel/sched/fair.c | 319
-rw-r--r--  kernel/sched/idle.c | 6
-rw-r--r--  kernel/sched/loadavg.c | 8
-rw-r--r--  kernel/sched/sched.h | 23
-rw-r--r--  kernel/sched/stats.h | 3
-rw-r--r--  kernel/signal.c | 24
-rw-r--r--  kernel/smp.c | 81
-rw-r--r--  kernel/sysctl.c | 15
-rw-r--r--  kernel/task_work.c | 1
-rw-r--r--  kernel/time/alarmtimer.c | 1
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/clocksource.c | 8
-rw-r--r--  kernel/time/hrtimer.c | 42
-rw-r--r--  kernel/time/posix-cpu-timers.c | 1
-rw-r--r--  kernel/time/test_udelay.c | 16
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c | 1
-rw-r--r--  kernel/time/tick-internal.h | 1
-rw-r--r--  kernel/time/tick-sched.c | 98
-rw-r--r--  kernel/time/timeconv.c | 11
-rw-r--r--  kernel/time/timekeeping.c | 11
-rw-r--r--  kernel/time/timer.c | 1132
-rw-r--r--  kernel/time/timer_stats.c | 6
-rw-r--r--  kernel/torture.c | 176
-rw-r--r--  kernel/trace/Kconfig | 1
-rw-r--r--  kernel/trace/blktrace.c | 83
-rw-r--r--  kernel/trace/bpf_trace.c | 166
-rw-r--r--  kernel/trace/ftrace.c | 313
-rw-r--r--  kernel/trace/trace.c | 358
-rw-r--r--  kernel/trace/trace.h | 48
-rw-r--r--  kernel/trace/trace_entries.h | 4
-rw-r--r--  kernel/trace/trace_events.c | 219
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 19
-rw-r--r--  kernel/trace/trace_kprobe.c | 1
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 10
-rw-r--r--  kernel/trace/trace_printk.c | 7
-rw-r--r--  kernel/trace/trace_probe.c | 33
-rw-r--r--  kernel/trace/trace_probe.h | 10
-rw-r--r--  kernel/user_namespace.c | 14
-rw-r--r--  kernel/workqueue.c | 118
114 files changed, 5349 insertions, 3590 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 22bb4f24f071..8d528f9930da 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1883,6 +1883,23 @@ out_null:
 	audit_log_format(ab, " exe=(null)");
 }
 
+struct tty_struct *audit_get_tty(struct task_struct *tsk)
+{
+	struct tty_struct *tty = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->sighand->siglock, flags);
+	if (tsk->signal)
+		tty = tty_kref_get(tsk->signal->tty);
+	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+	return tty;
+}
+
+void audit_put_tty(struct tty_struct *tty)
+{
+	tty_kref_put(tty);
+}
+
 void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 {
 	const struct cred *cred;
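The new audit_get_tty()/audit_put_tty() pair centralizes the kref-safe tty lookup that used to be open-coded in auditsc.c. A minimal sketch of the intended get/put pairing (hypothetical caller, for illustration only; tty_name() and the NULL-tolerant tty_kref_put() are existing tty-layer helpers):

/* Hypothetical caller: take the tty kref, log its name, drop the kref. */
static void example_log_tty(struct audit_buffer *ab, struct task_struct *tsk)
{
        struct tty_struct *tty = audit_get_tty(tsk);    /* kref or NULL */

        audit_log_format(ab, " tty=%s", tty ? tty_name(tty) : "(none)");
        audit_put_tty(tty);     /* tty_kref_put() accepts NULL, no check needed */
}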
diff --git a/kernel/audit.h b/kernel/audit.h
index cbbe6bb6496e..a492f4c4e710 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -23,6 +23,7 @@
 #include <linux/audit.h>
 #include <linux/skbuff.h>
 #include <uapi/linux/mqueue.h>
+#include <linux/tty.h>
 
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname(). If we get more names we will allocate
@@ -262,6 +263,9 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
 extern void audit_log_d_path_exe(struct audit_buffer *ab,
 				 struct mm_struct *mm);
 
+extern struct tty_struct *audit_get_tty(struct task_struct *tsk);
+extern void audit_put_tty(struct tty_struct *tty);
+
 /* audit watch functions */
 #ifdef CONFIG_AUDIT_WATCH
 extern void audit_put_watch(struct audit_watch *watch);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 62ab53d7619c..2672d105cffc 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -63,7 +63,6 @@
 #include <asm/unistd.h>
 #include <linux/security.h>
 #include <linux/list.h>
-#include <linux/tty.h>
 #include <linux/binfmts.h>
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
@@ -1985,14 +1984,15 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 	if (!audit_enabled)
 		return;
 
+	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+	if (!ab)
+		return;
+
 	uid = from_kuid(&init_user_ns, task_uid(current));
 	oldloginuid = from_kuid(&init_user_ns, koldloginuid);
 	loginuid = from_kuid(&init_user_ns, kloginuid),
 	tty = audit_get_tty(current);
 
-	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
-	if (!ab)
-		return;
 	audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
 	audit_log_task_context(ab);
 	audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
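The reordering in audit_log_set_loginuid() exists so that the early return on a failed audit_log_start() happens before audit_get_tty() takes its tty reference. Condensed, the fixed flow looks like the following (the matching audit_put_tty() sits later in the function, outside this hunk):

        ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
        if (!ab)
                return;                 /* nothing acquired yet, nothing to drop */

        tty = audit_get_tty(current);   /* take the tty kref only once logging is certain */
        /* ... format and emit the record ... */
        audit_put_tty(tty);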
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 76d5a794e426..633a650d7aeb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 }
 
 /* only called from syscall */
-static int fd_array_map_update_elem(struct bpf_map *map, void *key,
-				    void *value, u64 map_flags)
+int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
+				 void *key, void *value, u64 map_flags)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	void *new_ptr, *old_ptr;
@@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key,
 		return -E2BIG;
 
 	ufd = *(u32 *)value;
-	new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
 	if (IS_ERR(new_ptr))
 		return PTR_ERR(new_ptr);
 
@@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	}
 }
 
-static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void *prog_fd_array_get_ptr(struct bpf_map *map,
+				   struct file *map_file, int fd)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	struct bpf_prog *prog = bpf_prog_get(fd);
+
 	if (IS_ERR(prog))
 		return prog;
 
@@ -382,14 +384,13 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
 		bpf_prog_put(prog);
 		return ERR_PTR(-EINVAL);
 	}
+
 	return prog;
 }
 
 static void prog_fd_array_put_ptr(void *ptr)
 {
-	struct bpf_prog *prog = ptr;
-
-	bpf_prog_put_rcu(prog);
+	bpf_prog_put(ptr);
 }
 
 /* decrement refcnt of all bpf_progs that are stored in this map */
@@ -407,7 +408,6 @@ static const struct bpf_map_ops prog_array_ops = {
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
-	.map_update_elem = fd_array_map_update_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
@@ -425,59 +425,105 @@ static int __init register_prog_array_map(void)
 }
 late_initcall(register_prog_array_map);
 
-static void perf_event_array_map_free(struct bpf_map *map)
+static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
+						   struct file *map_file)
 {
-	bpf_fd_array_map_clear(map);
-	fd_array_map_free(map);
+	struct bpf_event_entry *ee;
+
+	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+	if (ee) {
+		ee->event = perf_file->private_data;
+		ee->perf_file = perf_file;
+		ee->map_file = map_file;
+	}
+
+	return ee;
 }
 
-static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void __bpf_event_entry_free(struct rcu_head *rcu)
 {
-	struct perf_event *event;
-	const struct perf_event_attr *attr;
-	struct file *file;
+	struct bpf_event_entry *ee;
 
-	file = perf_event_get(fd);
-	if (IS_ERR(file))
-		return file;
+	ee = container_of(rcu, struct bpf_event_entry, rcu);
+	fput(ee->perf_file);
+	kfree(ee);
+}
 
-	event = file->private_data;
+static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
+{
+	call_rcu(&ee->rcu, __bpf_event_entry_free);
+}
 
-	attr = perf_event_attrs(event);
-	if (IS_ERR(attr))
-		goto err;
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
+					 struct file *map_file, int fd)
+{
+	const struct perf_event_attr *attr;
+	struct bpf_event_entry *ee;
+	struct perf_event *event;
+	struct file *perf_file;
 
-	if (attr->inherit)
-		goto err;
+	perf_file = perf_event_get(fd);
+	if (IS_ERR(perf_file))
+		return perf_file;
 
-	if (attr->type == PERF_TYPE_RAW)
-		return file;
+	event = perf_file->private_data;
+	ee = ERR_PTR(-EINVAL);
 
-	if (attr->type == PERF_TYPE_HARDWARE)
-		return file;
+	attr = perf_event_attrs(event);
+	if (IS_ERR(attr) || attr->inherit)
+		goto err_out;
+
+	switch (attr->type) {
+	case PERF_TYPE_SOFTWARE:
+		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
+			goto err_out;
+		/* fall-through */
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+		ee = bpf_event_entry_gen(perf_file, map_file);
+		if (ee)
+			return ee;
+		ee = ERR_PTR(-ENOMEM);
+		/* fall-through */
+	default:
+		break;
+	}
 
-	if (attr->type == PERF_TYPE_SOFTWARE &&
-	    attr->config == PERF_COUNT_SW_BPF_OUTPUT)
-		return file;
-err:
-	fput(file);
-	return ERR_PTR(-EINVAL);
+err_out:
+	fput(perf_file);
+	return ee;
 }
 
 static void perf_event_fd_array_put_ptr(void *ptr)
 {
-	fput((struct file *)ptr);
+	bpf_event_entry_free_rcu(ptr);
+}
+
+static void perf_event_fd_array_release(struct bpf_map *map,
+					struct file *map_file)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_event_entry *ee;
+	int i;
+
+	rcu_read_lock();
+	for (i = 0; i < array->map.max_entries; i++) {
+		ee = READ_ONCE(array->ptrs[i]);
+		if (ee && ee->map_file == map_file)
+			fd_array_map_delete_elem(map, &i);
+	}
+	rcu_read_unlock();
 }
 
 static const struct bpf_map_ops perf_event_array_ops = {
 	.map_alloc = fd_array_map_alloc,
-	.map_free = perf_event_array_map_free,
+	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
-	.map_update_elem = fd_array_map_update_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
 	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+	.map_release = perf_event_fd_array_release,
 };
 
 static struct bpf_map_type_list perf_event_array_type __read_mostly = {
@@ -491,3 +537,46 @@ static int __init register_perf_event_array_map(void)
 	return 0;
 }
 late_initcall(register_perf_event_array_map);
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
+				     struct file *map_file /* not used */,
+				     int fd)
+{
+	return cgroup_get_from_fd(fd);
+}
+
+static void cgroup_fd_array_put_ptr(void *ptr)
+{
+	/* cgroup_put free cgrp after a rcu grace period */
+	cgroup_put(ptr);
+}
+
+static void cgroup_fd_array_free(struct bpf_map *map)
+{
+	bpf_fd_array_map_clear(map);
+	fd_array_map_free(map);
+}
+
+static const struct bpf_map_ops cgroup_array_ops = {
+	.map_alloc = fd_array_map_alloc,
+	.map_free = cgroup_fd_array_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
+	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+	.ops = &cgroup_array_ops,
+	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
+};
+
+static int __init register_cgroup_array_map(void)
+{
+	bpf_register_map_type(&cgroup_array_type);
+	return 0;
+}
+late_initcall(register_cgroup_array_map);
+#endif
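The bpf_event_entry type used throughout the perf-event-array rework above is declared in include/linux/bpf.h, which is outside this kernel/-only diffstat. As a reading aid (not part of this diff), it is roughly:

struct bpf_event_entry {
        struct perf_event *event;       /* perf_file->private_data, cached for fast access */
        struct file *perf_file;         /* holds the reference dropped in __bpf_event_entry_free() */
        struct file *map_file;          /* records which map fd installed the entry */
        struct rcu_head rcu;            /* defers the fput() past RCU readers of array->ptrs[] */
};

Recording map_file is what allows perf_event_fd_array_release() to flush only the entries installed through the map fd that is being closed.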
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b94a36550591..03fd23d4d587 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -719,14 +719,13 @@ select_insn:
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
-
 		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
 			goto out;
 
 		tail_call_cnt++;
 
 		prog = READ_ONCE(array->ptrs[index]);
-		if (unlikely(!prog))
+		if (!prog)
 			goto out;
 
 		/* ARG1 at this point is guaranteed to point to CTX from
@@ -1055,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 	return NULL;
 }
 
-const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+u64 __weak
+bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+		 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
-	return NULL;
+	return -ENOTSUPP;
 }
 
 /* Always built-in helper functions. */
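bpf_event_output() replaces the old bpf_get_event_output_proto() hook, and the __weak stub keeps the BPF core buildable when the tracing side that supplies the real implementation (kernel/trace/bpf_trace.c, also touched in this series) is not compiled in. A standalone sketch of the weak-symbol override pattern being relied on here (plain C, not kernel code):

/* fallback.c: weak default, used only when no strong definition is linked in */
long __attribute__((weak)) backend_op(void)
{
        return -1;      /* "operation not supported" */
}

/* backend.c: strong definition; if this object is linked, it replaces the weak one */
long backend_op(void)
{
        return 42;
}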
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ad7a0573f71b..1ea3afba1a4f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
 
 static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
-	return raw_smp_processor_id();
+	return smp_processor_id();
 }
 
 const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
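The switch to smp_processor_id() is safe because BPF programs execute with preemption disabled; with CONFIG_DEBUG_PREEMPT the checked variant will now also warn if that assumption is ever violated. Nothing changes on the program side; a hypothetical, libbpf-style user of the helper might look like:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("socket")
int count_cpu(struct __sk_buff *skb)
{
        __u32 cpu = bpf_get_smp_processor_id();   /* backed by the kernel helper above */

        bpf_printk("packet seen on cpu %u", cpu);
        return 0;
}

char _license[] SEC("license") = "GPL";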
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 318858edb1cd..5967b870a895 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -11,7 +11,7 @@
  * version 2 as published by the Free Software Foundation.
  */
 
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/magic.h>
 #include <linux/major.h>
 #include <linux/mount.h>
@@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = {
 	.kill_sb = kill_litter_super,
 };
 
-MODULE_ALIAS_FS("bpf");
-
 static int __init bpf_init(void)
 {
 	int ret;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 080a2dfb5800..bf4495fcd25d 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_smap;
 
-	err = get_callchain_buffers();
+	err = get_callchain_buffers(sysctl_perf_event_max_stack);
 	if (err)
 		goto free_smap;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 46ecce4b79ed..228f962447a5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map)
124 124
125static int bpf_map_release(struct inode *inode, struct file *filp) 125static int bpf_map_release(struct inode *inode, struct file *filp)
126{ 126{
127 bpf_map_put_with_uref(filp->private_data); 127 struct bpf_map *map = filp->private_data;
128
129 if (map->ops->map_release)
130 map->ops->map_release(map, filp);
131
132 bpf_map_put_with_uref(map);
128 return 0; 133 return 0;
129} 134}
130 135
@@ -387,6 +392,13 @@ static int map_update_elem(union bpf_attr *attr)
387 err = bpf_percpu_hash_update(map, key, value, attr->flags); 392 err = bpf_percpu_hash_update(map, key, value, attr->flags);
388 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 393 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
389 err = bpf_percpu_array_update(map, key, value, attr->flags); 394 err = bpf_percpu_array_update(map, key, value, attr->flags);
395 } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
396 map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
397 map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
398 rcu_read_lock();
399 err = bpf_fd_array_map_update_elem(map, f.file, key, value,
400 attr->flags);
401 rcu_read_unlock();
390 } else { 402 } else {
391 rcu_read_lock(); 403 rcu_read_lock();
392 err = map->ops->map_update_elem(map, key, value, attr->flags); 404 err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -612,7 +624,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
612 free_uid(user); 624 free_uid(user);
613} 625}
614 626
615static void __prog_put_common(struct rcu_head *rcu) 627static void __bpf_prog_put_rcu(struct rcu_head *rcu)
616{ 628{
617 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); 629 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
618 630
@@ -621,17 +633,10 @@ static void __prog_put_common(struct rcu_head *rcu)
621 bpf_prog_free(aux->prog); 633 bpf_prog_free(aux->prog);
622} 634}
623 635
624/* version of bpf_prog_put() that is called after a grace period */
625void bpf_prog_put_rcu(struct bpf_prog *prog)
626{
627 if (atomic_dec_and_test(&prog->aux->refcnt))
628 call_rcu(&prog->aux->rcu, __prog_put_common);
629}
630
631void bpf_prog_put(struct bpf_prog *prog) 636void bpf_prog_put(struct bpf_prog *prog)
632{ 637{
633 if (atomic_dec_and_test(&prog->aux->refcnt)) 638 if (atomic_dec_and_test(&prog->aux->refcnt))
634 __prog_put_common(&prog->aux->rcu); 639 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
635} 640}
636EXPORT_SYMBOL_GPL(bpf_prog_put); 641EXPORT_SYMBOL_GPL(bpf_prog_put);
637 642
@@ -639,7 +644,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
639{ 644{
640 struct bpf_prog *prog = filp->private_data; 645 struct bpf_prog *prog = filp->private_data;
641 646
642 bpf_prog_put_rcu(prog); 647 bpf_prog_put(prog);
643 return 0; 648 return 0;
644} 649}
645 650
@@ -653,7 +658,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
653 O_RDWR | O_CLOEXEC); 658 O_RDWR | O_CLOEXEC);
654} 659}
655 660
656static struct bpf_prog *__bpf_prog_get(struct fd f) 661static struct bpf_prog *____bpf_prog_get(struct fd f)
657{ 662{
658 if (!f.file) 663 if (!f.file)
659 return ERR_PTR(-EBADF); 664 return ERR_PTR(-EBADF);
@@ -665,33 +670,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
665 return f.file->private_data; 670 return f.file->private_data;
666} 671}
667 672
668struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) 673struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
669{ 674{
670 if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { 675 if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
671 atomic_dec(&prog->aux->refcnt); 676 atomic_sub(i, &prog->aux->refcnt);
672 return ERR_PTR(-EBUSY); 677 return ERR_PTR(-EBUSY);
673 } 678 }
674 return prog; 679 return prog;
675} 680}
681EXPORT_SYMBOL_GPL(bpf_prog_add);
676 682
677/* called by sockets/tracing/seccomp before attaching program to an event 683struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
678 * pairs with bpf_prog_put() 684{
679 */ 685 return bpf_prog_add(prog, 1);
680struct bpf_prog *bpf_prog_get(u32 ufd) 686}
687
688static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
681{ 689{
682 struct fd f = fdget(ufd); 690 struct fd f = fdget(ufd);
683 struct bpf_prog *prog; 691 struct bpf_prog *prog;
684 692
685 prog = __bpf_prog_get(f); 693 prog = ____bpf_prog_get(f);
686 if (IS_ERR(prog)) 694 if (IS_ERR(prog))
687 return prog; 695 return prog;
696 if (type && prog->type != *type) {
697 prog = ERR_PTR(-EINVAL);
698 goto out;
699 }
688 700
689 prog = bpf_prog_inc(prog); 701 prog = bpf_prog_inc(prog);
702out:
690 fdput(f); 703 fdput(f);
691
692 return prog; 704 return prog;
693} 705}
694EXPORT_SYMBOL_GPL(bpf_prog_get); 706
707struct bpf_prog *bpf_prog_get(u32 ufd)
708{
709 return __bpf_prog_get(ufd, NULL);
710}
711
712struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
713{
714 return __bpf_prog_get(ufd, &type);
715}
716EXPORT_SYMBOL_GPL(bpf_prog_get_type);
695 717
696/* last field in 'union bpf_attr' used by this command */ 718/* last field in 'union bpf_attr' used by this command */
697#define BPF_PROG_LOAD_LAST_FIELD kern_version 719#define BPF_PROG_LOAD_LAST_FIELD kern_version
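bpf_prog_get_type() folds the "fetch the fd, then check the program type" pattern into one call, and bpf_prog_add() lets a caller take several references at once. A hypothetical consumer of the new helpers (for example, a driver attaching one program across several hardware queues; not part of this patch) might look like:

/* Hypothetical consumer, illustrating the intended reference handling. */
static struct bpf_prog *example_get_prog(u32 ufd, int nr_queues)
{
        struct bpf_prog *prog;

        /* rejects programs of the wrong type with -EINVAL */
        prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_XDP);
        if (IS_ERR(prog))
                return prog;

        /* one reference is already held; take nr_queues - 1 more, one per queue */
        if (nr_queues > 1 && IS_ERR(bpf_prog_add(prog, nr_queues - 1))) {
                bpf_prog_put(prog);     /* drop the reference from bpf_prog_get_type() */
                return ERR_PTR(-EBUSY);
        }
        return prog;
}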
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 668e07903c8f..f72f23b8fdab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -126,31 +126,6 @@
126 * are set to NOT_INIT to indicate that they are no longer readable. 126 * are set to NOT_INIT to indicate that they are no longer readable.
127 */ 127 */
128 128
129/* types of values stored in eBPF registers */
130enum bpf_reg_type {
131 NOT_INIT = 0, /* nothing was written into register */
132 UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
133 PTR_TO_CTX, /* reg points to bpf_context */
134 CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
135 PTR_TO_MAP_VALUE, /* reg points to map element value */
136 PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
137 FRAME_PTR, /* reg == frame_pointer */
138 PTR_TO_STACK, /* reg == frame_pointer + imm */
139 CONST_IMM, /* constant integer value */
140
141 /* PTR_TO_PACKET represents:
142 * skb->data
143 * skb->data + imm
144 * skb->data + (u16) var
145 * skb->data + (u16) var + imm
146 * if (range > 0) then [ptr, ptr + range - off) is safe to access
147 * if (id > 0) means that some 'var' was added
148 * if (off > 0) menas that 'imm' was added
149 */
150 PTR_TO_PACKET,
151 PTR_TO_PACKET_END, /* skb->data + headlen */
152};
153
154struct reg_state { 129struct reg_state {
155 enum bpf_reg_type type; 130 enum bpf_reg_type type;
156 union { 131 union {
@@ -678,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
678 653
679#define MAX_PACKET_OFF 0xffff 654#define MAX_PACKET_OFF 0xffff
680 655
656static bool may_write_pkt_data(enum bpf_prog_type type)
657{
658 switch (type) {
659 case BPF_PROG_TYPE_XDP:
660 return true;
661 default:
662 return false;
663 }
664}
665
681static int check_packet_access(struct verifier_env *env, u32 regno, int off, 666static int check_packet_access(struct verifier_env *env, u32 regno, int off,
682 int size) 667 int size)
683{ 668{
@@ -695,10 +680,10 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,
695 680
696/* check access to 'struct bpf_context' fields */ 681/* check access to 'struct bpf_context' fields */
697static int check_ctx_access(struct verifier_env *env, int off, int size, 682static int check_ctx_access(struct verifier_env *env, int off, int size,
698 enum bpf_access_type t) 683 enum bpf_access_type t, enum bpf_reg_type *reg_type)
699{ 684{
700 if (env->prog->aux->ops->is_valid_access && 685 if (env->prog->aux->ops->is_valid_access &&
701 env->prog->aux->ops->is_valid_access(off, size, t)) { 686 env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
702 /* remember the offset of last byte accessed in ctx */ 687 /* remember the offset of last byte accessed in ctx */
703 if (env->prog->aux->max_ctx_offset < off + size) 688 if (env->prog->aux->max_ctx_offset < off + size)
704 env->prog->aux->max_ctx_offset = off + size; 689 env->prog->aux->max_ctx_offset = off + size;
@@ -738,6 +723,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
738 switch (env->prog->type) { 723 switch (env->prog->type) {
739 case BPF_PROG_TYPE_SCHED_CLS: 724 case BPF_PROG_TYPE_SCHED_CLS:
740 case BPF_PROG_TYPE_SCHED_ACT: 725 case BPF_PROG_TYPE_SCHED_ACT:
726 case BPF_PROG_TYPE_XDP:
741 break; 727 break;
742 default: 728 default:
743 verbose("verifier is misconfigured\n"); 729 verbose("verifier is misconfigured\n");
@@ -798,21 +784,19 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
798 mark_reg_unknown_value(state->regs, value_regno); 784 mark_reg_unknown_value(state->regs, value_regno);
799 785
800 } else if (reg->type == PTR_TO_CTX) { 786 } else if (reg->type == PTR_TO_CTX) {
787 enum bpf_reg_type reg_type = UNKNOWN_VALUE;
788
801 if (t == BPF_WRITE && value_regno >= 0 && 789 if (t == BPF_WRITE && value_regno >= 0 &&
802 is_pointer_value(env, value_regno)) { 790 is_pointer_value(env, value_regno)) {
803 verbose("R%d leaks addr into ctx\n", value_regno); 791 verbose("R%d leaks addr into ctx\n", value_regno);
804 return -EACCES; 792 return -EACCES;
805 } 793 }
806 err = check_ctx_access(env, off, size, t); 794 err = check_ctx_access(env, off, size, t, &reg_type);
807 if (!err && t == BPF_READ && value_regno >= 0) { 795 if (!err && t == BPF_READ && value_regno >= 0) {
808 mark_reg_unknown_value(state->regs, value_regno); 796 mark_reg_unknown_value(state->regs, value_regno);
809 if (off == offsetof(struct __sk_buff, data) && 797 if (env->allow_ptr_leaks)
810 env->allow_ptr_leaks)
811 /* note that reg.[id|off|range] == 0 */ 798 /* note that reg.[id|off|range] == 0 */
812 state->regs[value_regno].type = PTR_TO_PACKET; 799 state->regs[value_regno].type = reg_type;
813 else if (off == offsetof(struct __sk_buff, data_end) &&
814 env->allow_ptr_leaks)
815 state->regs[value_regno].type = PTR_TO_PACKET_END;
816 } 800 }
817 801
818 } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { 802 } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
@@ -832,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
832 err = check_stack_read(state, off, size, value_regno); 816 err = check_stack_read(state, off, size, value_regno);
833 } 817 }
834 } else if (state->regs[regno].type == PTR_TO_PACKET) { 818 } else if (state->regs[regno].type == PTR_TO_PACKET) {
835 if (t == BPF_WRITE) { 819 if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
836 verbose("cannot write into packet\n"); 820 verbose("cannot write into packet\n");
837 return -EACCES; 821 return -EACCES;
838 } 822 }
823 if (t == BPF_WRITE && value_regno >= 0 &&
824 is_pointer_value(env, value_regno)) {
825 verbose("R%d leaks addr into packet\n", value_regno);
826 return -EACCES;
827 }
839 err = check_packet_access(env, regno, off, size); 828 err = check_packet_access(env, regno, off, size);
840 if (!err && t == BPF_READ && value_regno >= 0) 829 if (!err && t == BPF_READ && value_regno >= 0)
841 mark_reg_unknown_value(state->regs, value_regno); 830 mark_reg_unknown_value(state->regs, value_regno);
@@ -1062,6 +1051,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
1062 if (func_id != BPF_FUNC_get_stackid) 1051 if (func_id != BPF_FUNC_get_stackid)
1063 goto error; 1052 goto error;
1064 break; 1053 break;
1054 case BPF_MAP_TYPE_CGROUP_ARRAY:
1055 if (func_id != BPF_FUNC_skb_in_cgroup)
1056 goto error;
1057 break;
1065 default: 1058 default:
1066 break; 1059 break;
1067 } 1060 }
@@ -1081,6 +1074,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
1081 if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) 1074 if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
1082 goto error; 1075 goto error;
1083 break; 1076 break;
1077 case BPF_FUNC_skb_in_cgroup:
1078 if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
1079 goto error;
1080 break;
1084 default: 1081 default:
1085 break; 1082 break;
1086 } 1083 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 86cb5c6e8932..d1c51b7f5221 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -61,7 +61,7 @@
61#include <linux/cpuset.h> 61#include <linux/cpuset.h>
62#include <linux/proc_ns.h> 62#include <linux/proc_ns.h>
63#include <linux/nsproxy.h> 63#include <linux/nsproxy.h>
64#include <linux/proc_ns.h> 64#include <linux/file.h>
65#include <net/sock.h> 65#include <net/sock.h>
66 66
67/* 67/*
@@ -837,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset)
837 837
838static void put_css_set(struct css_set *cset) 838static void put_css_set(struct css_set *cset)
839{ 839{
840 unsigned long flags;
841
840 /* 842 /*
841 * Ensure that the refcount doesn't hit zero while any readers 843 * Ensure that the refcount doesn't hit zero while any readers
842 * can see it. Similar to atomic_dec_and_lock(), but for an 844 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -845,9 +847,9 @@ static void put_css_set(struct css_set *cset)
845 if (atomic_add_unless(&cset->refcount, -1, 1)) 847 if (atomic_add_unless(&cset->refcount, -1, 1))
846 return; 848 return;
847 849
848 spin_lock_bh(&css_set_lock); 850 spin_lock_irqsave(&css_set_lock, flags);
849 put_css_set_locked(cset); 851 put_css_set_locked(cset);
850 spin_unlock_bh(&css_set_lock); 852 spin_unlock_irqrestore(&css_set_lock, flags);
851} 853}
852 854
853/* 855/*
@@ -1070,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
1070 1072
1071 /* First see if we already have a cgroup group that matches 1073 /* First see if we already have a cgroup group that matches
1072 * the desired set */ 1074 * the desired set */
1073 spin_lock_bh(&css_set_lock); 1075 spin_lock_irq(&css_set_lock);
1074 cset = find_existing_css_set(old_cset, cgrp, template); 1076 cset = find_existing_css_set(old_cset, cgrp, template);
1075 if (cset) 1077 if (cset)
1076 get_css_set(cset); 1078 get_css_set(cset);
1077 spin_unlock_bh(&css_set_lock); 1079 spin_unlock_irq(&css_set_lock);
1078 1080
1079 if (cset) 1081 if (cset)
1080 return cset; 1082 return cset;
@@ -1102,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
1102 * find_existing_css_set() */ 1104 * find_existing_css_set() */
1103 memcpy(cset->subsys, template, sizeof(cset->subsys)); 1105 memcpy(cset->subsys, template, sizeof(cset->subsys));
1104 1106
1105 spin_lock_bh(&css_set_lock); 1107 spin_lock_irq(&css_set_lock);
1106 /* Add reference counts and links from the new css_set. */ 1108 /* Add reference counts and links from the new css_set. */
1107 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 1109 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1108 struct cgroup *c = link->cgrp; 1110 struct cgroup *c = link->cgrp;
@@ -1128,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
1128 css_get(css); 1130 css_get(css);
1129 } 1131 }
1130 1132
1131 spin_unlock_bh(&css_set_lock); 1133 spin_unlock_irq(&css_set_lock);
1132 1134
1133 return cset; 1135 return cset;
1134} 1136}
@@ -1158,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
1158{ 1160{
1159 lockdep_assert_held(&cgroup_mutex); 1161 lockdep_assert_held(&cgroup_mutex);
1160 1162
1161 if (root->hierarchy_id) { 1163 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1162 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1163 root->hierarchy_id = 0;
1164 }
1165} 1164}
1166 1165
1167static void cgroup_free_root(struct cgroup_root *root) 1166static void cgroup_free_root(struct cgroup_root *root)
1168{ 1167{
1169 if (root) { 1168 if (root) {
1170 /* hierarchy ID should already have been released */
1171 WARN_ON_ONCE(root->hierarchy_id);
1172
1173 idr_destroy(&root->cgroup_idr); 1169 idr_destroy(&root->cgroup_idr);
1174 kfree(root); 1170 kfree(root);
1175 } 1171 }
@@ -1192,7 +1188,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
1192 * Release all the links from cset_links to this hierarchy's 1188 * Release all the links from cset_links to this hierarchy's
1193 * root cgroup 1189 * root cgroup
1194 */ 1190 */
1195 spin_lock_bh(&css_set_lock); 1191 spin_lock_irq(&css_set_lock);
1196 1192
1197 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 1193 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1198 list_del(&link->cset_link); 1194 list_del(&link->cset_link);
@@ -1200,7 +1196,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
1200 kfree(link); 1196 kfree(link);
1201 } 1197 }
1202 1198
1203 spin_unlock_bh(&css_set_lock); 1199 spin_unlock_irq(&css_set_lock);
1204 1200
1205 if (!list_empty(&root->root_list)) { 1201 if (!list_empty(&root->root_list)) {
1206 list_del(&root->root_list); 1202 list_del(&root->root_list);
@@ -1600,11 +1596,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
1600 ss->root = dst_root; 1596 ss->root = dst_root;
1601 css->cgroup = dcgrp; 1597 css->cgroup = dcgrp;
1602 1598
1603 spin_lock_bh(&css_set_lock); 1599 spin_lock_irq(&css_set_lock);
1604 hash_for_each(css_set_table, i, cset, hlist) 1600 hash_for_each(css_set_table, i, cset, hlist)
1605 list_move_tail(&cset->e_cset_node[ss->id], 1601 list_move_tail(&cset->e_cset_node[ss->id],
1606 &dcgrp->e_csets[ss->id]); 1602 &dcgrp->e_csets[ss->id]);
1607 spin_unlock_bh(&css_set_lock); 1603 spin_unlock_irq(&css_set_lock);
1608 1604
1609 /* default hierarchy doesn't enable controllers by default */ 1605 /* default hierarchy doesn't enable controllers by default */
1610 dst_root->subsys_mask |= 1 << ssid; 1606 dst_root->subsys_mask |= 1 << ssid;
@@ -1640,10 +1636,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1640 if (!buf) 1636 if (!buf)
1641 return -ENOMEM; 1637 return -ENOMEM;
1642 1638
1643 spin_lock_bh(&css_set_lock); 1639 spin_lock_irq(&css_set_lock);
1644 ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); 1640 ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1645 len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); 1641 len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
1646 spin_unlock_bh(&css_set_lock); 1642 spin_unlock_irq(&css_set_lock);
1647 1643
1648 if (len >= PATH_MAX) 1644 if (len >= PATH_MAX)
1649 len = -ERANGE; 1645 len = -ERANGE;
@@ -1897,7 +1893,7 @@ static void cgroup_enable_task_cg_lists(void)
1897{ 1893{
1898 struct task_struct *p, *g; 1894 struct task_struct *p, *g;
1899 1895
1900 spin_lock_bh(&css_set_lock); 1896 spin_lock_irq(&css_set_lock);
1901 1897
1902 if (use_task_css_set_links) 1898 if (use_task_css_set_links)
1903 goto out_unlock; 1899 goto out_unlock;
@@ -1922,8 +1918,12 @@ static void cgroup_enable_task_cg_lists(void)
1922 * entry won't be deleted though the process has exited. 1918 * entry won't be deleted though the process has exited.
1923 * Do it while holding siglock so that we don't end up 1919 * Do it while holding siglock so that we don't end up
1924 * racing against cgroup_exit(). 1920 * racing against cgroup_exit().
1921 *
1922 * Interrupts were already disabled while acquiring
1923 * the css_set_lock, so we do not need to disable it
1924 * again when acquiring the sighand->siglock here.
1925 */ 1925 */
1926 spin_lock_irq(&p->sighand->siglock); 1926 spin_lock(&p->sighand->siglock);
1927 if (!(p->flags & PF_EXITING)) { 1927 if (!(p->flags & PF_EXITING)) {
1928 struct css_set *cset = task_css_set(p); 1928 struct css_set *cset = task_css_set(p);
1929 1929
@@ -1932,11 +1932,11 @@ static void cgroup_enable_task_cg_lists(void)
1932 list_add_tail(&p->cg_list, &cset->tasks); 1932 list_add_tail(&p->cg_list, &cset->tasks);
1933 get_css_set(cset); 1933 get_css_set(cset);
1934 } 1934 }
1935 spin_unlock_irq(&p->sighand->siglock); 1935 spin_unlock(&p->sighand->siglock);
1936 } while_each_thread(g, p); 1936 } while_each_thread(g, p);
1937 read_unlock(&tasklist_lock); 1937 read_unlock(&tasklist_lock);
1938out_unlock: 1938out_unlock:
1939 spin_unlock_bh(&css_set_lock); 1939 spin_unlock_irq(&css_set_lock);
1940} 1940}
1941 1941
1942static void init_cgroup_housekeeping(struct cgroup *cgrp) 1942static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -2043,13 +2043,13 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
2043 * Link the root cgroup in this hierarchy into all the css_set 2043 * Link the root cgroup in this hierarchy into all the css_set
2044 * objects. 2044 * objects.
2045 */ 2045 */
2046 spin_lock_bh(&css_set_lock); 2046 spin_lock_irq(&css_set_lock);
2047 hash_for_each(css_set_table, i, cset, hlist) { 2047 hash_for_each(css_set_table, i, cset, hlist) {
2048 link_css_set(&tmp_links, cset, root_cgrp); 2048 link_css_set(&tmp_links, cset, root_cgrp);
2049 if (css_set_populated(cset)) 2049 if (css_set_populated(cset))
2050 cgroup_update_populated(root_cgrp, true); 2050 cgroup_update_populated(root_cgrp, true);
2051 } 2051 }
2052 spin_unlock_bh(&css_set_lock); 2052 spin_unlock_irq(&css_set_lock);
2053 2053
2054 BUG_ON(!list_empty(&root_cgrp->self.children)); 2054 BUG_ON(!list_empty(&root_cgrp->self.children));
2055 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 2055 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2209,12 +2209,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
2209 goto out_unlock; 2209 goto out_unlock;
2210 } 2210 }
2211 2211
2212 /* 2212 /* Hierarchies may only be created in the initial cgroup namespace. */
2213 * We know this subsystem has not yet been bound. Users in a non-init 2213 if (ns != &init_cgroup_ns) {
2214 * user namespace may only mount hierarchies with no bound subsystems,
2215 * i.e. 'none,name=user1'
2216 */
2217 if (!opts.none && !capable(CAP_SYS_ADMIN)) {
2218 ret = -EPERM; 2214 ret = -EPERM;
2219 goto out_unlock; 2215 goto out_unlock;
2220 } 2216 }
@@ -2256,11 +2252,11 @@ out_mount:
2256 struct cgroup *cgrp; 2252 struct cgroup *cgrp;
2257 2253
2258 mutex_lock(&cgroup_mutex); 2254 mutex_lock(&cgroup_mutex);
2259 spin_lock_bh(&css_set_lock); 2255 spin_lock_irq(&css_set_lock);
2260 2256
2261 cgrp = cset_cgroup_from_root(ns->root_cset, root); 2257 cgrp = cset_cgroup_from_root(ns->root_cset, root);
2262 2258
2263 spin_unlock_bh(&css_set_lock); 2259 spin_unlock_irq(&css_set_lock);
2264 mutex_unlock(&cgroup_mutex); 2260 mutex_unlock(&cgroup_mutex);
2265 2261
2266 nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); 2262 nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
@@ -2337,11 +2333,11 @@ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2337 char *ret; 2333 char *ret;
2338 2334
2339 mutex_lock(&cgroup_mutex); 2335 mutex_lock(&cgroup_mutex);
2340 spin_lock_bh(&css_set_lock); 2336 spin_lock_irq(&css_set_lock);
2341 2337
2342 ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); 2338 ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
2343 2339
2344 spin_unlock_bh(&css_set_lock); 2340 spin_unlock_irq(&css_set_lock);
2345 mutex_unlock(&cgroup_mutex); 2341 mutex_unlock(&cgroup_mutex);
2346 2342
2347 return ret; 2343 return ret;
@@ -2369,7 +2365,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2369 char *path = NULL; 2365 char *path = NULL;
2370 2366
2371 mutex_lock(&cgroup_mutex); 2367 mutex_lock(&cgroup_mutex);
2372 spin_lock_bh(&css_set_lock); 2368 spin_lock_irq(&css_set_lock);
2373 2369
2374 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 2370 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
2375 2371
@@ -2382,7 +2378,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2382 path = buf; 2378 path = buf;
2383 } 2379 }
2384 2380
2385 spin_unlock_bh(&css_set_lock); 2381 spin_unlock_irq(&css_set_lock);
2386 mutex_unlock(&cgroup_mutex); 2382 mutex_unlock(&cgroup_mutex);
2387 return path; 2383 return path;
2388} 2384}
@@ -2557,7 +2553,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2557 * the new cgroup. There are no failure cases after here, so this 2553 * the new cgroup. There are no failure cases after here, so this
2558 * is the commit point. 2554 * is the commit point.
2559 */ 2555 */
2560 spin_lock_bh(&css_set_lock); 2556 spin_lock_irq(&css_set_lock);
2561 list_for_each_entry(cset, &tset->src_csets, mg_node) { 2557 list_for_each_entry(cset, &tset->src_csets, mg_node) {
2562 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { 2558 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2563 struct css_set *from_cset = task_css_set(task); 2559 struct css_set *from_cset = task_css_set(task);
@@ -2568,7 +2564,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2568 put_css_set_locked(from_cset); 2564 put_css_set_locked(from_cset);
2569 } 2565 }
2570 } 2566 }
2571 spin_unlock_bh(&css_set_lock); 2567 spin_unlock_irq(&css_set_lock);
2572 2568
2573 /* 2569 /*
2574 * Migration is committed, all target tasks are now on dst_csets. 2570 * Migration is committed, all target tasks are now on dst_csets.
@@ -2597,13 +2593,13 @@ out_cancel_attach:
2597 } 2593 }
2598 } while_each_subsys_mask(); 2594 } while_each_subsys_mask();
2599out_release_tset: 2595out_release_tset:
2600 spin_lock_bh(&css_set_lock); 2596 spin_lock_irq(&css_set_lock);
2601 list_splice_init(&tset->dst_csets, &tset->src_csets); 2597 list_splice_init(&tset->dst_csets, &tset->src_csets);
2602 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { 2598 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2603 list_splice_tail_init(&cset->mg_tasks, &cset->tasks); 2599 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2604 list_del_init(&cset->mg_node); 2600 list_del_init(&cset->mg_node);
2605 } 2601 }
2606 spin_unlock_bh(&css_set_lock); 2602 spin_unlock_irq(&css_set_lock);
2607 return ret; 2603 return ret;
2608} 2604}
2609 2605
@@ -2634,7 +2630,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2634 2630
2635 lockdep_assert_held(&cgroup_mutex); 2631 lockdep_assert_held(&cgroup_mutex);
2636 2632
2637 spin_lock_bh(&css_set_lock); 2633 spin_lock_irq(&css_set_lock);
2638 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { 2634 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
2639 cset->mg_src_cgrp = NULL; 2635 cset->mg_src_cgrp = NULL;
2640 cset->mg_dst_cgrp = NULL; 2636 cset->mg_dst_cgrp = NULL;
@@ -2642,7 +2638,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2642 list_del_init(&cset->mg_preload_node); 2638 list_del_init(&cset->mg_preload_node);
2643 put_css_set_locked(cset); 2639 put_css_set_locked(cset);
2644 } 2640 }
2645 spin_unlock_bh(&css_set_lock); 2641 spin_unlock_irq(&css_set_lock);
2646} 2642}
2647 2643
2648/** 2644/**
@@ -2783,7 +2779,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2783 * already PF_EXITING could be freed from underneath us unless we 2779 * already PF_EXITING could be freed from underneath us unless we
2784 * take an rcu_read_lock. 2780 * take an rcu_read_lock.
2785 */ 2781 */
2786 spin_lock_bh(&css_set_lock); 2782 spin_lock_irq(&css_set_lock);
2787 rcu_read_lock(); 2783 rcu_read_lock();
2788 task = leader; 2784 task = leader;
2789 do { 2785 do {
@@ -2792,7 +2788,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2792 break; 2788 break;
2793 } while_each_thread(leader, task); 2789 } while_each_thread(leader, task);
2794 rcu_read_unlock(); 2790 rcu_read_unlock();
2795 spin_unlock_bh(&css_set_lock); 2791 spin_unlock_irq(&css_set_lock);
2796 2792
2797 return cgroup_taskset_migrate(&tset, root); 2793 return cgroup_taskset_migrate(&tset, root);
2798} 2794}
@@ -2816,7 +2812,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2816 return -EBUSY; 2812 return -EBUSY;
2817 2813
2818 /* look up all src csets */ 2814 /* look up all src csets */
2819 spin_lock_bh(&css_set_lock); 2815 spin_lock_irq(&css_set_lock);
2820 rcu_read_lock(); 2816 rcu_read_lock();
2821 task = leader; 2817 task = leader;
2822 do { 2818 do {
@@ -2826,7 +2822,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2826 break; 2822 break;
2827 } while_each_thread(leader, task); 2823 } while_each_thread(leader, task);
2828 rcu_read_unlock(); 2824 rcu_read_unlock();
2829 spin_unlock_bh(&css_set_lock); 2825 spin_unlock_irq(&css_set_lock);
2830 2826
2831 /* prepare dst csets and commit */ 2827 /* prepare dst csets and commit */
2832 ret = cgroup_migrate_prepare_dst(&preloaded_csets); 2828 ret = cgroup_migrate_prepare_dst(&preloaded_csets);
@@ -2859,9 +2855,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2859 struct cgroup *cgrp; 2855 struct cgroup *cgrp;
2860 struct inode *inode; 2856 struct inode *inode;
2861 2857
2862 spin_lock_bh(&css_set_lock); 2858 spin_lock_irq(&css_set_lock);
2863 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); 2859 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2864 spin_unlock_bh(&css_set_lock); 2860 spin_unlock_irq(&css_set_lock);
2865 2861
2866 while (!cgroup_is_descendant(dst_cgrp, cgrp)) 2862 while (!cgroup_is_descendant(dst_cgrp, cgrp))
2867 cgrp = cgroup_parent(cgrp); 2863 cgrp = cgroup_parent(cgrp);
@@ -2956,20 +2952,22 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2956 int retval = 0; 2952 int retval = 0;
2957 2953
2958 mutex_lock(&cgroup_mutex); 2954 mutex_lock(&cgroup_mutex);
2955 percpu_down_write(&cgroup_threadgroup_rwsem);
2959 for_each_root(root) { 2956 for_each_root(root) {
2960 struct cgroup *from_cgrp; 2957 struct cgroup *from_cgrp;
2961 2958
2962 if (root == &cgrp_dfl_root) 2959 if (root == &cgrp_dfl_root)
2963 continue; 2960 continue;
2964 2961
2965 spin_lock_bh(&css_set_lock); 2962 spin_lock_irq(&css_set_lock);
2966 from_cgrp = task_cgroup_from_root(from, root); 2963 from_cgrp = task_cgroup_from_root(from, root);
2967 spin_unlock_bh(&css_set_lock); 2964 spin_unlock_irq(&css_set_lock);
2968 2965
2969 retval = cgroup_attach_task(from_cgrp, tsk, false); 2966 retval = cgroup_attach_task(from_cgrp, tsk, false);
2970 if (retval) 2967 if (retval)
2971 break; 2968 break;
2972 } 2969 }
2970 percpu_up_write(&cgroup_threadgroup_rwsem);
2973 mutex_unlock(&cgroup_mutex); 2971 mutex_unlock(&cgroup_mutex);
2974 2972
2975 return retval; 2973 return retval;
@@ -3080,7 +3078,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
3080 percpu_down_write(&cgroup_threadgroup_rwsem); 3078 percpu_down_write(&cgroup_threadgroup_rwsem);
3081 3079
3082 /* look up all csses currently attached to @cgrp's subtree */ 3080 /* look up all csses currently attached to @cgrp's subtree */
3083 spin_lock_bh(&css_set_lock); 3081 spin_lock_irq(&css_set_lock);
3084 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { 3082 cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3085 struct cgrp_cset_link *link; 3083 struct cgrp_cset_link *link;
3086 3084
@@ -3088,14 +3086,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
3088 cgroup_migrate_add_src(link->cset, dsct, 3086 cgroup_migrate_add_src(link->cset, dsct,
3089 &preloaded_csets); 3087 &preloaded_csets);
3090 } 3088 }
3091 spin_unlock_bh(&css_set_lock); 3089 spin_unlock_irq(&css_set_lock);
3092 3090
3093 /* NULL dst indicates self on default hierarchy */ 3091 /* NULL dst indicates self on default hierarchy */
3094 ret = cgroup_migrate_prepare_dst(&preloaded_csets); 3092 ret = cgroup_migrate_prepare_dst(&preloaded_csets);
3095 if (ret) 3093 if (ret)
3096 goto out_finish; 3094 goto out_finish;
3097 3095
3098 spin_lock_bh(&css_set_lock); 3096 spin_lock_irq(&css_set_lock);
3099 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { 3097 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
3100 struct task_struct *task, *ntask; 3098 struct task_struct *task, *ntask;
3101 3099
@@ -3107,7 +3105,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
3107 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) 3105 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
3108 cgroup_taskset_add(task, &tset); 3106 cgroup_taskset_add(task, &tset);
3109 } 3107 }
3110 spin_unlock_bh(&css_set_lock); 3108 spin_unlock_irq(&css_set_lock);
3111 3109
3112 ret = cgroup_taskset_migrate(&tset, cgrp->root); 3110 ret = cgroup_taskset_migrate(&tset, cgrp->root);
3113out_finish: 3111out_finish:
@@ -3908,10 +3906,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
3908 int count = 0; 3906 int count = 0;
3909 struct cgrp_cset_link *link; 3907 struct cgrp_cset_link *link;
3910 3908
3911 spin_lock_bh(&css_set_lock); 3909 spin_lock_irq(&css_set_lock);
3912 list_for_each_entry(link, &cgrp->cset_links, cset_link) 3910 list_for_each_entry(link, &cgrp->cset_links, cset_link)
3913 count += atomic_read(&link->cset->refcount); 3911 count += atomic_read(&link->cset->refcount);
3914 spin_unlock_bh(&css_set_lock); 3912 spin_unlock_irq(&css_set_lock);
3915 return count; 3913 return count;
3916} 3914}
3917 3915
@@ -4249,7 +4247,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
4249 4247
4250 memset(it, 0, sizeof(*it)); 4248 memset(it, 0, sizeof(*it));
4251 4249
4252 spin_lock_bh(&css_set_lock); 4250 spin_lock_irq(&css_set_lock);
4253 4251
4254 it->ss = css->ss; 4252 it->ss = css->ss;
4255 4253
@@ -4262,7 +4260,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
4262 4260
4263 css_task_iter_advance_css_set(it); 4261 css_task_iter_advance_css_set(it);
4264 4262
4265 spin_unlock_bh(&css_set_lock); 4263 spin_unlock_irq(&css_set_lock);
4266} 4264}
4267 4265
4268/** 4266/**
@@ -4280,7 +4278,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
4280 it->cur_task = NULL; 4278 it->cur_task = NULL;
4281 } 4279 }
4282 4280
4283 spin_lock_bh(&css_set_lock); 4281 spin_lock_irq(&css_set_lock);
4284 4282
4285 if (it->task_pos) { 4283 if (it->task_pos) {
4286 it->cur_task = list_entry(it->task_pos, struct task_struct, 4284 it->cur_task = list_entry(it->task_pos, struct task_struct,
@@ -4289,7 +4287,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
4289 css_task_iter_advance(it); 4287 css_task_iter_advance(it);
4290 } 4288 }
4291 4289
4292 spin_unlock_bh(&css_set_lock); 4290 spin_unlock_irq(&css_set_lock);
4293 4291
4294 return it->cur_task; 4292 return it->cur_task;
4295} 4293}
@@ -4303,10 +4301,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
4303void css_task_iter_end(struct css_task_iter *it) 4301void css_task_iter_end(struct css_task_iter *it)
4304{ 4302{
4305 if (it->cur_cset) { 4303 if (it->cur_cset) {
4306 spin_lock_bh(&css_set_lock); 4304 spin_lock_irq(&css_set_lock);
4307 list_del(&it->iters_node); 4305 list_del(&it->iters_node);
4308 put_css_set_locked(it->cur_cset); 4306 put_css_set_locked(it->cur_cset);
4309 spin_unlock_bh(&css_set_lock); 4307 spin_unlock_irq(&css_set_lock);
4310 } 4308 }
4311 4309
4312 if (it->cur_task) 4310 if (it->cur_task)
@@ -4337,11 +4335,13 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
4337 4335
4338 mutex_lock(&cgroup_mutex); 4336 mutex_lock(&cgroup_mutex);
4339 4337
4338 percpu_down_write(&cgroup_threadgroup_rwsem);
4339
4340 /* all tasks in @from are being moved, all csets are source */ 4340 /* all tasks in @from are being moved, all csets are source */
4341 spin_lock_bh(&css_set_lock); 4341 spin_lock_irq(&css_set_lock);
4342 list_for_each_entry(link, &from->cset_links, cset_link) 4342 list_for_each_entry(link, &from->cset_links, cset_link)
4343 cgroup_migrate_add_src(link->cset, to, &preloaded_csets); 4343 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
4344 spin_unlock_bh(&css_set_lock); 4344 spin_unlock_irq(&css_set_lock);
4345 4345
4346 ret = cgroup_migrate_prepare_dst(&preloaded_csets); 4346 ret = cgroup_migrate_prepare_dst(&preloaded_csets);
4347 if (ret) 4347 if (ret)
@@ -4365,6 +4365,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
4365 } while (task && !ret); 4365 } while (task && !ret);
4366out_err: 4366out_err:
4367 cgroup_migrate_finish(&preloaded_csets); 4367 cgroup_migrate_finish(&preloaded_csets);
4368 percpu_up_write(&cgroup_threadgroup_rwsem);
4368 mutex_unlock(&cgroup_mutex); 4369 mutex_unlock(&cgroup_mutex);
4369 return ret; 4370 return ret;
4370} 4371}
@@ -5063,6 +5064,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
5063 memset(css, 0, sizeof(*css)); 5064 memset(css, 0, sizeof(*css));
5064 css->cgroup = cgrp; 5065 css->cgroup = cgrp;
5065 css->ss = ss; 5066 css->ss = ss;
5067 css->id = -1;
5066 INIT_LIST_HEAD(&css->sibling); 5068 INIT_LIST_HEAD(&css->sibling);
5067 INIT_LIST_HEAD(&css->children); 5069 INIT_LIST_HEAD(&css->children);
5068 css->serial_nr = css_serial_nr_next++; 5070 css->serial_nr = css_serial_nr_next++;
@@ -5139,6 +5141,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5139 lockdep_assert_held(&cgroup_mutex); 5141 lockdep_assert_held(&cgroup_mutex);
5140 5142
5141 css = ss->css_alloc(parent_css); 5143 css = ss->css_alloc(parent_css);
5144 if (!css)
5145 css = ERR_PTR(-ENOMEM);
5142 if (IS_ERR(css)) 5146 if (IS_ERR(css))
5143 return css; 5147 return css;
5144 5148
@@ -5150,7 +5154,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5150 5154
5151 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); 5155 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
5152 if (err < 0) 5156 if (err < 0)
5153 goto err_free_percpu_ref; 5157 goto err_free_css;
5154 css->id = err; 5158 css->id = err;
5155 5159
5156 /* @css is ready to be brought online now, make it visible */ 5160 /* @css is ready to be brought online now, make it visible */
@@ -5174,9 +5178,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5174 5178
5175err_list_del: 5179err_list_del:
5176 list_del_rcu(&css->sibling); 5180 list_del_rcu(&css->sibling);
5177 cgroup_idr_remove(&ss->css_idr, css->id);
5178err_free_percpu_ref:
5179 percpu_ref_exit(&css->refcnt);
5180err_free_css: 5181err_free_css:
5181 call_rcu(&css->rcu_head, css_free_rcu_fn); 5182 call_rcu(&css->rcu_head, css_free_rcu_fn);
5182 return ERR_PTR(err); 5183 return ERR_PTR(err);
@@ -5451,10 +5452,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
5451 */ 5452 */
5452 cgrp->self.flags &= ~CSS_ONLINE; 5453 cgrp->self.flags &= ~CSS_ONLINE;
5453 5454
5454 spin_lock_bh(&css_set_lock); 5455 spin_lock_irq(&css_set_lock);
5455 list_for_each_entry(link, &cgrp->cset_links, cset_link) 5456 list_for_each_entry(link, &cgrp->cset_links, cset_link)
5456 link->cset->dead = true; 5457 link->cset->dead = true;
5457 spin_unlock_bh(&css_set_lock); 5458 spin_unlock_irq(&css_set_lock);
5458 5459
5459 /* initiate massacre of all css's */ 5460 /* initiate massacre of all css's */
5460 for_each_css(css, ssid, cgrp) 5461 for_each_css(css, ssid, cgrp)
@@ -5725,7 +5726,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5725 goto out; 5726 goto out;
5726 5727
5727 mutex_lock(&cgroup_mutex); 5728 mutex_lock(&cgroup_mutex);
5728 spin_lock_bh(&css_set_lock); 5729 spin_lock_irq(&css_set_lock);
5729 5730
5730 for_each_root(root) { 5731 for_each_root(root) {
5731 struct cgroup_subsys *ss; 5732 struct cgroup_subsys *ss;
@@ -5778,7 +5779,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5778 5779
5779 retval = 0; 5780 retval = 0;
5780out_unlock: 5781out_unlock:
5781 spin_unlock_bh(&css_set_lock); 5782 spin_unlock_irq(&css_set_lock);
5782 mutex_unlock(&cgroup_mutex); 5783 mutex_unlock(&cgroup_mutex);
5783 kfree(buf); 5784 kfree(buf);
5784out: 5785out:
@@ -5923,13 +5924,13 @@ void cgroup_post_fork(struct task_struct *child)
5923 if (use_task_css_set_links) { 5924 if (use_task_css_set_links) {
5924 struct css_set *cset; 5925 struct css_set *cset;
5925 5926
5926 spin_lock_bh(&css_set_lock); 5927 spin_lock_irq(&css_set_lock);
5927 cset = task_css_set(current); 5928 cset = task_css_set(current);
5928 if (list_empty(&child->cg_list)) { 5929 if (list_empty(&child->cg_list)) {
5929 get_css_set(cset); 5930 get_css_set(cset);
5930 css_set_move_task(child, NULL, cset, false); 5931 css_set_move_task(child, NULL, cset, false);
5931 } 5932 }
5932 spin_unlock_bh(&css_set_lock); 5933 spin_unlock_irq(&css_set_lock);
5933 } 5934 }
5934 5935
5935 /* 5936 /*
@@ -5974,9 +5975,9 @@ void cgroup_exit(struct task_struct *tsk)
5974 cset = task_css_set(tsk); 5975 cset = task_css_set(tsk);
5975 5976
5976 if (!list_empty(&tsk->cg_list)) { 5977 if (!list_empty(&tsk->cg_list)) {
5977 spin_lock_bh(&css_set_lock); 5978 spin_lock_irq(&css_set_lock);
5978 css_set_move_task(tsk, cset, NULL, false); 5979 css_set_move_task(tsk, cset, NULL, false);
5979 spin_unlock_bh(&css_set_lock); 5980 spin_unlock_irq(&css_set_lock);
5980 } else { 5981 } else {
5981 get_css_set(cset); 5982 get_css_set(cset);
5982 } 5983 }
@@ -6044,9 +6045,9 @@ static void cgroup_release_agent(struct work_struct *work)
6044 if (!pathbuf || !agentbuf) 6045 if (!pathbuf || !agentbuf)
6045 goto out; 6046 goto out;
6046 6047
6047 spin_lock_bh(&css_set_lock); 6048 spin_lock_irq(&css_set_lock);
6048 path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); 6049 path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
6049 spin_unlock_bh(&css_set_lock); 6050 spin_unlock_irq(&css_set_lock);
6050 if (!path) 6051 if (!path)
6051 goto out; 6052 goto out;
6052 6053
@@ -6168,7 +6169,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
6168struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 6169struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
6169{ 6170{
6170 WARN_ON_ONCE(!rcu_read_lock_held()); 6171 WARN_ON_ONCE(!rcu_read_lock_held());
6171 return id > 0 ? idr_find(&ss->css_idr, id) : NULL; 6172 return idr_find(&ss->css_idr, id);
6172} 6173}
6173 6174
6174/** 6175/**
@@ -6205,6 +6206,40 @@ struct cgroup *cgroup_get_from_path(const char *path)
6205} 6206}
6206EXPORT_SYMBOL_GPL(cgroup_get_from_path); 6207EXPORT_SYMBOL_GPL(cgroup_get_from_path);
6207 6208
6209/**
6210 * cgroup_get_from_fd - get a cgroup pointer from a fd
6211 * @fd: fd obtained by open(cgroup2_dir)
6212 *
6213 * Find the cgroup from a fd which should be obtained
6214 * by opening a cgroup directory. Returns a pointer to the
6215 * cgroup on success. ERR_PTR is returned if the cgroup
6216 * cannot be found.
6217 */
6218struct cgroup *cgroup_get_from_fd(int fd)
6219{
6220 struct cgroup_subsys_state *css;
6221 struct cgroup *cgrp;
6222 struct file *f;
6223
6224 f = fget_raw(fd);
6225 if (!f)
6226 return ERR_PTR(-EBADF);
6227
6228 css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6229 fput(f);
6230 if (IS_ERR(css))
6231 return ERR_CAST(css);
6232
6233 cgrp = css->cgroup;
6234 if (!cgroup_on_dfl(cgrp)) {
6235 cgroup_put(cgrp);
6236 return ERR_PTR(-EBADF);
6237 }
6238
6239 return cgrp;
6240}
6241EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
6242
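A minimal caller sketch (hypothetical, not from this patch) for the new helper; the reference it returns must be dropped with cgroup_put():

static int use_cgroup_fd(int fd)
{
	struct cgroup *cgrp;

	cgrp = cgroup_get_from_fd(fd);	/* fd from open() on a cgroup2 dir */
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	/* ... use cgrp ... */

	cgroup_put(cgrp);		/* drop the reference taken above */
	return 0;
}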
6208/* 6243/*
6209 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data 6244 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
6210 * definition in cgroup-defs.h. 6245 * definition in cgroup-defs.h.
@@ -6305,14 +6340,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
6305 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 6340 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
6306 return ERR_PTR(-EPERM); 6341 return ERR_PTR(-EPERM);
6307 6342
6308 mutex_lock(&cgroup_mutex); 6343 /* It is not safe to take cgroup_mutex here */
6309 spin_lock_bh(&css_set_lock); 6344 spin_lock_irq(&css_set_lock);
6310
6311 cset = task_css_set(current); 6345 cset = task_css_set(current);
6312 get_css_set(cset); 6346 get_css_set(cset);
6313 6347 spin_unlock_irq(&css_set_lock);
6314 spin_unlock_bh(&css_set_lock);
6315 mutex_unlock(&cgroup_mutex);
6316 6348
6317 new_ns = alloc_cgroup_ns(); 6349 new_ns = alloc_cgroup_ns();
6318 if (IS_ERR(new_ns)) { 6350 if (IS_ERR(new_ns)) {
@@ -6435,7 +6467,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
6435 if (!name_buf) 6467 if (!name_buf)
6436 return -ENOMEM; 6468 return -ENOMEM;
6437 6469
6438 spin_lock_bh(&css_set_lock); 6470 spin_lock_irq(&css_set_lock);
6439 rcu_read_lock(); 6471 rcu_read_lock();
6440 cset = rcu_dereference(current->cgroups); 6472 cset = rcu_dereference(current->cgroups);
6441 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 6473 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -6446,7 +6478,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
6446 c->root->hierarchy_id, name_buf); 6478 c->root->hierarchy_id, name_buf);
6447 } 6479 }
6448 rcu_read_unlock(); 6480 rcu_read_unlock();
6449 spin_unlock_bh(&css_set_lock); 6481 spin_unlock_irq(&css_set_lock);
6450 kfree(name_buf); 6482 kfree(name_buf);
6451 return 0; 6483 return 0;
6452} 6484}
@@ -6457,7 +6489,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
6457 struct cgroup_subsys_state *css = seq_css(seq); 6489 struct cgroup_subsys_state *css = seq_css(seq);
6458 struct cgrp_cset_link *link; 6490 struct cgrp_cset_link *link;
6459 6491
6460 spin_lock_bh(&css_set_lock); 6492 spin_lock_irq(&css_set_lock);
6461 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 6493 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
6462 struct css_set *cset = link->cset; 6494 struct css_set *cset = link->cset;
6463 struct task_struct *task; 6495 struct task_struct *task;
@@ -6480,7 +6512,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
6480 overflow: 6512 overflow:
6481 seq_puts(seq, " ...\n"); 6513 seq_puts(seq, " ...\n");
6482 } 6514 }
6483 spin_unlock_bh(&css_set_lock); 6515 spin_unlock_irq(&css_set_lock);
6484 return 0; 6516 return 0;
6485} 6517}
6486 6518
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 303097b37429..2bd673783f1a 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -49,6 +49,12 @@ struct pids_cgroup {
49 */ 49 */
50 atomic64_t counter; 50 atomic64_t counter;
51 int64_t limit; 51 int64_t limit;
52
53 /* Handle for "pids.events" */
54 struct cgroup_file events_file;
55
56 /* Number of times fork failed because limit was hit. */
57 atomic64_t events_limit;
52}; 58};
53 59
54static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) 60static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
72 78
73 pids->limit = PIDS_MAX; 79 pids->limit = PIDS_MAX;
74 atomic64_set(&pids->counter, 0); 80 atomic64_set(&pids->counter, 0);
81 atomic64_set(&pids->events_limit, 0);
75 return &pids->css; 82 return &pids->css;
76} 83}
77 84
@@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task)
213{ 220{
214 struct cgroup_subsys_state *css; 221 struct cgroup_subsys_state *css;
215 struct pids_cgroup *pids; 222 struct pids_cgroup *pids;
223 int err;
216 224
217 css = task_css_check(current, pids_cgrp_id, true); 225 css = task_css_check(current, pids_cgrp_id, true);
218 pids = css_pids(css); 226 pids = css_pids(css);
219 return pids_try_charge(pids, 1); 227 err = pids_try_charge(pids, 1);
228 if (err) {
229 /* Only log the first time events_limit is incremented. */
230 if (atomic64_inc_return(&pids->events_limit) == 1) {
231 pr_info("cgroup: fork rejected by pids controller in ");
232 pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id));
233 pr_cont("\n");
234 }
235 cgroup_file_notify(&pids->events_file);
236 }
237 return err;
220} 238}
221 239
222static void pids_cancel_fork(struct task_struct *task) 240static void pids_cancel_fork(struct task_struct *task)
@@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
288 return atomic64_read(&pids->counter); 306 return atomic64_read(&pids->counter);
289} 307}
290 308
309static int pids_events_show(struct seq_file *sf, void *v)
310{
311 struct pids_cgroup *pids = css_pids(seq_css(sf));
312
313 seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
314 return 0;
315}
316
291static struct cftype pids_files[] = { 317static struct cftype pids_files[] = {
292 { 318 {
293 .name = "max", 319 .name = "max",
@@ -300,6 +326,12 @@ static struct cftype pids_files[] = {
300 .read_s64 = pids_current_read, 326 .read_s64 = pids_current_read,
301 .flags = CFTYPE_NOT_ON_ROOT, 327 .flags = CFTYPE_NOT_ON_ROOT,
302 }, 328 },
329 {
330 .name = "events",
331 .seq_show = pids_events_show,
332 .file_offset = offsetof(struct pids_cgroup, events_file),
333 .flags = CFTYPE_NOT_ON_ROOT,
334 },
303 { } /* terminate */ 335 { } /* terminate */
304}; 336};
305 337
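As a rough userspace illustration (assuming a cgroup2 mount at /sys/fs/cgroup and a child group named "mygroup"), the new pids.events file reports how often fork() was rejected because the limit was hit:

#include <stdio.h>

int main(void)
{
	char line[64];
	FILE *f = fopen("/sys/fs/cgroup/mygroup/pids.events", "r");

	if (!f)
		return 1;
	if (fgets(line, sizeof(line), f))
		printf("%s", line);	/* e.g. "max 3" */
	fclose(f);
	return 0;
}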
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d948e44c471e..341bf80f80bd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
517 if (!cpu_online(cpu)) 517 if (!cpu_online(cpu))
518 return 0; 518 return 0;
519 519
520 /*
521 * If we are up and running, use the hotplug thread. For early calls
522 * we invoke the thread function directly.
523 */
524 if (!st->thread)
525 return cpuhp_invoke_callback(cpu, state, cb);
526
520 st->cb_state = state; 527 st->cb_state = state;
521 st->cb = cb; 528 st->cb = cb;
522 /* 529 /*
@@ -1173,6 +1180,31 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1173 .teardown = NULL, 1180 .teardown = NULL,
1174 .cant_stop = true, 1181 .cant_stop = true,
1175 }, 1182 },
1183 [CPUHP_PERF_PREPARE] = {
1184 .name = "perf prepare",
1185 .startup = perf_event_init_cpu,
1186 .teardown = perf_event_exit_cpu,
1187 },
1188 [CPUHP_WORKQUEUE_PREP] = {
1189 .name = "workqueue prepare",
1190 .startup = workqueue_prepare_cpu,
1191 .teardown = NULL,
1192 },
1193 [CPUHP_HRTIMERS_PREPARE] = {
1194 .name = "hrtimers prepare",
1195 .startup = hrtimers_prepare_cpu,
1196 .teardown = hrtimers_dead_cpu,
1197 },
1198 [CPUHP_SMPCFD_PREPARE] = {
1199 .name = "SMPCFD prepare",
1200 .startup = smpcfd_prepare_cpu,
1201 .teardown = smpcfd_dead_cpu,
1202 },
1203 [CPUHP_RCUTREE_PREP] = {
1204 .name = "RCU-tree prepare",
1205 .startup = rcutree_prepare_cpu,
1206 .teardown = rcutree_dead_cpu,
1207 },
1176 /* 1208 /*
1177 * Preparatory and dead notifiers. Will be replaced once the notifiers 1209 * Preparatory and dead notifiers. Will be replaced once the notifiers
1178 * are converted to states. 1210 * are converted to states.
@@ -1184,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1184 .skip_onerr = true, 1216 .skip_onerr = true,
1185 .cant_stop = true, 1217 .cant_stop = true,
1186 }, 1218 },
1219 /*
1220 * On the tear-down path, timers_dead_cpu() must be invoked
1221 * before blk_mq_queue_reinit_notify() from notify_dead(),
1222 * otherwise a RCU stall occurs.
1223 */
1224 [CPUHP_TIMERS_DEAD] = {
1225 .name = "timers dead",
1226 .startup = NULL,
1227 .teardown = timers_dead_cpu,
1228 },
1187 /* Kicks the plugged cpu into life */ 1229 /* Kicks the plugged cpu into life */
1188 [CPUHP_BRINGUP_CPU] = { 1230 [CPUHP_BRINGUP_CPU] = {
1189 .name = "cpu:bringup", 1231 .name = "cpu:bringup",
@@ -1191,6 +1233,10 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1191 .teardown = NULL, 1233 .teardown = NULL,
1192 .cant_stop = true, 1234 .cant_stop = true,
1193 }, 1235 },
1236 [CPUHP_AP_SMPCFD_DYING] = {
1237 .startup = NULL,
1238 .teardown = smpcfd_dying_cpu,
1239 },
1194 /* 1240 /*
1195 * Handled on control processor until the plugged processor manages 1241
1196 * this itself. 1242 * this itself.
@@ -1201,6 +1247,8 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1201 .teardown = takedown_cpu, 1247 .teardown = takedown_cpu,
1202 .cant_stop = true, 1248 .cant_stop = true,
1203 }, 1249 },
1250#else
1251 [CPUHP_BRINGUP_CPU] = { },
1204#endif 1252#endif
1205}; 1253};
1206 1254
@@ -1225,6 +1273,10 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1225 .startup = sched_cpu_starting, 1273 .startup = sched_cpu_starting,
1226 .teardown = sched_cpu_dying, 1274 .teardown = sched_cpu_dying,
1227 }, 1275 },
1276 [CPUHP_AP_RCUTREE_DYING] = {
1277 .startup = NULL,
1278 .teardown = rcutree_dying_cpu,
1279 },
1228 /* 1280 /*
1229 * Low level startup/teardown notifiers. Run with interrupts 1281 * Low level startup/teardown notifiers. Run with interrupts
1230 * disabled. Will be removed once the notifiers are converted to 1282 * disabled. Will be removed once the notifiers are converted to
@@ -1248,6 +1300,22 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1248 .startup = smpboot_unpark_threads, 1300 .startup = smpboot_unpark_threads,
1249 .teardown = NULL, 1301 .teardown = NULL,
1250 }, 1302 },
1303 [CPUHP_AP_PERF_ONLINE] = {
1304 .name = "perf online",
1305 .startup = perf_event_init_cpu,
1306 .teardown = perf_event_exit_cpu,
1307 },
1308 [CPUHP_AP_WORKQUEUE_ONLINE] = {
1309 .name = "workqueue online",
1310 .startup = workqueue_online_cpu,
1311 .teardown = workqueue_offline_cpu,
1312 },
1313 [CPUHP_AP_RCUTREE_ONLINE] = {
1314 .name = "RCU-tree online",
1315 .startup = rcutree_online_cpu,
1316 .teardown = rcutree_offline_cpu,
1317 },
1318
1251 /* 1319 /*
1252 * Online/down_prepare notifiers. Will be removed once the notifiers 1320 * Online/down_prepare notifiers. Will be removed once the notifiers
1253 * are converted to states. 1321 * are converted to states.
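For comparison, subsystems that do not need a fixed slot in these tables can register a dynamic hotplug state instead; a sketch under the assumption of hypothetical foo_* callbacks:

static int foo_online_cpu(unsigned int cpu)
{
	/* bring up per-cpu resources for @cpu */
	return 0;
}

static int foo_offline_cpu(unsigned int cpu)
{
	/* tear down per-cpu resources of @cpu */
	return 0;
}

static int __init foo_init(void)
{
	int ret;

	/* dynamic states return the allocated state number on success */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "foo:online",
				foo_online_cpu, foo_offline_cpu);
	return ret < 0 ? ret : 0;
}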
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 73e93e53884d..c7fd2778ed50 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1034{ 1034{
1035 bool need_loop; 1035 bool need_loop;
1036 1036
1037 /*
1038 * Allow tasks that have access to memory reserves because they have
1039 * been OOM killed to get memory anywhere.
1040 */
1041 if (unlikely(test_thread_flag(TIF_MEMDIE)))
1042 return;
1043 if (current->flags & PF_EXITING) /* Let dying task have memory */
1044 return;
1045
1046 task_lock(tsk); 1037 task_lock(tsk);
1047 /* 1038 /*
1048 * Determine if a loop is necessary if another thread is doing 1039 * Determine if a loop is necessary if another thread is doing
diff --git a/kernel/cred.c b/kernel/cred.c
index 0c0cd8a62285..5f264fb5737d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx);
689 */ 689 */
690int set_create_files_as(struct cred *new, struct inode *inode) 690int set_create_files_as(struct cred *new, struct inode *inode)
691{ 691{
692 if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
693 return -EINVAL;
692 new->fsuid = inode->i_uid; 694 new->fsuid = inode->i_uid;
693 new->fsgid = inode->i_gid; 695 new->fsgid = inode->i_gid;
694 return security_kernel_create_files_as(new, inode); 696 return security_kernel_create_files_as(new, inode);
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 179ef4640964..e9fdb5203de5 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -104,7 +104,7 @@ fail:
104 return -ENOMEM; 104 return -ENOMEM;
105} 105}
106 106
107int get_callchain_buffers(void) 107int get_callchain_buffers(int event_max_stack)
108{ 108{
109 int err = 0; 109 int err = 0;
110 int count; 110 int count;
@@ -121,6 +121,15 @@ int get_callchain_buffers(void)
121 /* If the allocation failed, give up */ 121 /* If the allocation failed, give up */
122 if (!callchain_cpus_entries) 122 if (!callchain_cpus_entries)
123 err = -ENOMEM; 123 err = -ENOMEM;
124 /*
125 * If requesting per event more than the global cap,
126 * return a different error to help userspace figure
127 * this out.
128 *
129 * And also do it here so that we have &callchain_mutex held.
130 */
131 if (event_max_stack > sysctl_perf_event_max_stack)
132 err = -EOVERFLOW;
124 goto exit; 133 goto exit;
125 } 134 }
126 135
@@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
174 bool user = !event->attr.exclude_callchain_user; 183 bool user = !event->attr.exclude_callchain_user;
175 /* Disallow cross-task user callchains. */ 184 /* Disallow cross-task user callchains. */
176 bool crosstask = event->ctx->task && event->ctx->task != current; 185 bool crosstask = event->ctx->task && event->ctx->task != current;
186 const u32 max_stack = event->attr.sample_max_stack;
177 187
178 if (!kernel && !user) 188 if (!kernel && !user)
179 return NULL; 189 return NULL;
180 190
181 return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); 191 return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
182} 192}
183 193
184struct perf_callchain_entry * 194struct perf_callchain_entry *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 274450efea90..356a6c7cb52a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -335,6 +335,7 @@ static atomic_t perf_sched_count;
335 335
336static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 336static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
337static DEFINE_PER_CPU(int, perf_sched_cb_usages); 337static DEFINE_PER_CPU(int, perf_sched_cb_usages);
338static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
338 339
339static atomic_t nr_mmap_events __read_mostly; 340static atomic_t nr_mmap_events __read_mostly;
340static atomic_t nr_comm_events __read_mostly; 341static atomic_t nr_comm_events __read_mostly;
@@ -396,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
396 if (ret || !write) 397 if (ret || !write)
397 return ret; 398 return ret;
398 399
400 /*
401 * If throttling is disabled don't allow the write:
402 */
403 if (sysctl_perf_cpu_time_max_percent == 100 ||
404 sysctl_perf_cpu_time_max_percent == 0)
405 return -EINVAL;
406
399 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); 407 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
400 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 408 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
401 update_perf_cpu_limits(); 409 update_perf_cpu_limits();
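A small userspace probe of the new guard (using the standard procfs sysctl path): with perf_cpu_time_max_percent set to 0 or 100, writing the sample rate is now refused with EINVAL instead of being silently accepted:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/perf_event_max_sample_rate", "w");

	if (!f)
		return 1;
	if (fprintf(f, "50000\n") < 0 || fclose(f) != 0)
		perror("write");	/* expect EINVAL if throttling is disabled */
	return 0;
}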
@@ -1678,12 +1686,33 @@ static bool is_orphaned_event(struct perf_event *event)
1678 return event->state == PERF_EVENT_STATE_DEAD; 1686 return event->state == PERF_EVENT_STATE_DEAD;
1679} 1687}
1680 1688
1681static inline int pmu_filter_match(struct perf_event *event) 1689static inline int __pmu_filter_match(struct perf_event *event)
1682{ 1690{
1683 struct pmu *pmu = event->pmu; 1691 struct pmu *pmu = event->pmu;
1684 return pmu->filter_match ? pmu->filter_match(event) : 1; 1692 return pmu->filter_match ? pmu->filter_match(event) : 1;
1685} 1693}
1686 1694
1695/*
1696 * Check whether we should attempt to schedule an event group based on
1697 * PMU-specific filtering. An event group can consist of HW and SW events,
1698 * potentially with a SW leader, so we must check all the filters, to
1699 * determine whether a group is schedulable:
1700 */
1701static inline int pmu_filter_match(struct perf_event *event)
1702{
1703 struct perf_event *child;
1704
1705 if (!__pmu_filter_match(event))
1706 return 0;
1707
1708 list_for_each_entry(child, &event->sibling_list, group_entry) {
1709 if (!__pmu_filter_match(child))
1710 return 0;
1711 }
1712
1713 return 1;
1714}
1715
1687static inline int 1716static inline int
1688event_filter_match(struct perf_event *event) 1717event_filter_match(struct perf_event *event)
1689{ 1718{
@@ -3665,6 +3694,39 @@ static void free_event_rcu(struct rcu_head *head)
3665static void ring_buffer_attach(struct perf_event *event, 3694static void ring_buffer_attach(struct perf_event *event,
3666 struct ring_buffer *rb); 3695 struct ring_buffer *rb);
3667 3696
3697static void detach_sb_event(struct perf_event *event)
3698{
3699 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
3700
3701 raw_spin_lock(&pel->lock);
3702 list_del_rcu(&event->sb_list);
3703 raw_spin_unlock(&pel->lock);
3704}
3705
3706static bool is_sb_event(struct perf_event *event)
3707{
3708 struct perf_event_attr *attr = &event->attr;
3709
3710 if (event->parent)
3711 return false;
3712
3713 if (event->attach_state & PERF_ATTACH_TASK)
3714 return false;
3715
3716 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
3717 attr->comm || attr->comm_exec ||
3718 attr->task ||
3719 attr->context_switch)
3720 return true;
3721 return false;
3722}
3723
3724static void unaccount_pmu_sb_event(struct perf_event *event)
3725{
3726 if (is_sb_event(event))
3727 detach_sb_event(event);
3728}
3729
3668static void unaccount_event_cpu(struct perf_event *event, int cpu) 3730static void unaccount_event_cpu(struct perf_event *event, int cpu)
3669{ 3731{
3670 if (event->parent) 3732 if (event->parent)
@@ -3728,6 +3790,8 @@ static void unaccount_event(struct perf_event *event)
3728 } 3790 }
3729 3791
3730 unaccount_event_cpu(event, event->cpu); 3792 unaccount_event_cpu(event, event->cpu);
3793
3794 unaccount_pmu_sb_event(event);
3731} 3795}
3732 3796
3733static void perf_sched_delayed(struct work_struct *work) 3797static void perf_sched_delayed(struct work_struct *work)
@@ -3862,10 +3926,8 @@ static void _free_event(struct perf_event *event)
3862 if (event->ctx) 3926 if (event->ctx)
3863 put_ctx(event->ctx); 3927 put_ctx(event->ctx);
3864 3928
3865 if (event->pmu) { 3929 exclusive_event_destroy(event);
3866 exclusive_event_destroy(event); 3930 module_put(event->pmu->module);
3867 module_put(event->pmu->module);
3868 }
3869 3931
3870 call_rcu(&event->rcu_head, free_event_rcu); 3932 call_rcu(&event->rcu_head, free_event_rcu);
3871} 3933}
@@ -5555,16 +5617,26 @@ void perf_output_sample(struct perf_output_handle *handle,
5555 } 5617 }
5556 5618
5557 if (sample_type & PERF_SAMPLE_RAW) { 5619 if (sample_type & PERF_SAMPLE_RAW) {
5558 if (data->raw) { 5620 struct perf_raw_record *raw = data->raw;
5559 u32 raw_size = data->raw->size; 5621
5560 u32 real_size = round_up(raw_size + sizeof(u32), 5622 if (raw) {
5561 sizeof(u64)) - sizeof(u32); 5623 struct perf_raw_frag *frag = &raw->frag;
5562 u64 zero = 0; 5624
5563 5625 perf_output_put(handle, raw->size);
5564 perf_output_put(handle, real_size); 5626 do {
5565 __output_copy(handle, data->raw->data, raw_size); 5627 if (frag->copy) {
5566 if (real_size - raw_size) 5628 __output_custom(handle, frag->copy,
5567 __output_copy(handle, &zero, real_size - raw_size); 5629 frag->data, frag->size);
5630 } else {
5631 __output_copy(handle, frag->data,
5632 frag->size);
5633 }
5634 if (perf_raw_frag_last(frag))
5635 break;
5636 frag = frag->next;
5637 } while (1);
5638 if (frag->pad)
5639 __output_skip(handle, NULL, frag->pad);
5568 } else { 5640 } else {
5569 struct { 5641 struct {
5570 u32 size; 5642 u32 size;
@@ -5689,14 +5761,28 @@ void perf_prepare_sample(struct perf_event_header *header,
5689 } 5761 }
5690 5762
5691 if (sample_type & PERF_SAMPLE_RAW) { 5763 if (sample_type & PERF_SAMPLE_RAW) {
5692 int size = sizeof(u32); 5764 struct perf_raw_record *raw = data->raw;
5693 5765 int size;
5694 if (data->raw) 5766
5695 size += data->raw->size; 5767 if (raw) {
5696 else 5768 struct perf_raw_frag *frag = &raw->frag;
5697 size += sizeof(u32); 5769 u32 sum = 0;
5770
5771 do {
5772 sum += frag->size;
5773 if (perf_raw_frag_last(frag))
5774 break;
5775 frag = frag->next;
5776 } while (1);
5777
5778 size = round_up(sum + sizeof(u32), sizeof(u64));
5779 raw->size = size - sizeof(u32);
5780 frag->pad = raw->size - sum;
5781 } else {
5782 size = sizeof(u64);
5783 }
5698 5784
5699 header->size += round_up(size, sizeof(u64)); 5785 header->size += size;
5700 } 5786 }
5701 5787
5702 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 5788 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -5856,11 +5942,11 @@ perf_event_read_event(struct perf_event *event,
5856 perf_output_end(&handle); 5942 perf_output_end(&handle);
5857} 5943}
5858 5944
5859typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); 5945typedef void (perf_iterate_f)(struct perf_event *event, void *data);
5860 5946
5861static void 5947static void
5862perf_event_aux_ctx(struct perf_event_context *ctx, 5948perf_iterate_ctx(struct perf_event_context *ctx,
5863 perf_event_aux_output_cb output, 5949 perf_iterate_f output,
5864 void *data, bool all) 5950 void *data, bool all)
5865{ 5951{
5866 struct perf_event *event; 5952 struct perf_event *event;
@@ -5877,52 +5963,55 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
5877 } 5963 }
5878} 5964}
5879 5965
5880static void 5966static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
5881perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
5882 struct perf_event_context *task_ctx)
5883{ 5967{
5884 rcu_read_lock(); 5968 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
5885 preempt_disable(); 5969 struct perf_event *event;
5886 perf_event_aux_ctx(task_ctx, output, data, false); 5970
5887 preempt_enable(); 5971 list_for_each_entry_rcu(event, &pel->list, sb_list) {
5888 rcu_read_unlock(); 5972 if (event->state < PERF_EVENT_STATE_INACTIVE)
5973 continue;
5974 if (!event_filter_match(event))
5975 continue;
5976 output(event, data);
5977 }
5889} 5978}
5890 5979
5980/*
5981 * Iterate all events that need to receive side-band events.
5982 *
 5983 * For new callers, ensure that account_pmu_sb_event() includes
5984 * your event, otherwise it might not get delivered.
5985 */
5891static void 5986static void
5892perf_event_aux(perf_event_aux_output_cb output, void *data, 5987perf_iterate_sb(perf_iterate_f output, void *data,
5893 struct perf_event_context *task_ctx) 5988 struct perf_event_context *task_ctx)
5894{ 5989{
5895 struct perf_cpu_context *cpuctx;
5896 struct perf_event_context *ctx; 5990 struct perf_event_context *ctx;
5897 struct pmu *pmu;
5898 int ctxn; 5991 int ctxn;
5899 5992
5993 rcu_read_lock();
5994 preempt_disable();
5995
5900 /* 5996 /*
5901 * If we have task_ctx != NULL we only notify 5997 * If we have task_ctx != NULL we only notify the task context itself.
5902 * the task context itself. The task_ctx is set 5998 * The task_ctx is set only for EXIT events before releasing task
5903 * only for EXIT events before releasing task
5904 * context. 5999 * context.
5905 */ 6000 */
5906 if (task_ctx) { 6001 if (task_ctx) {
5907 perf_event_aux_task_ctx(output, data, task_ctx); 6002 perf_iterate_ctx(task_ctx, output, data, false);
5908 return; 6003 goto done;
5909 } 6004 }
5910 6005
5911 rcu_read_lock(); 6006 perf_iterate_sb_cpu(output, data);
5912 list_for_each_entry_rcu(pmu, &pmus, entry) { 6007
5913 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 6008 for_each_task_context_nr(ctxn) {
5914 if (cpuctx->unique_pmu != pmu)
5915 goto next;
5916 perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
5917 ctxn = pmu->task_ctx_nr;
5918 if (ctxn < 0)
5919 goto next;
5920 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 6009 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5921 if (ctx) 6010 if (ctx)
5922 perf_event_aux_ctx(ctx, output, data, false); 6011 perf_iterate_ctx(ctx, output, data, false);
5923next:
5924 put_cpu_ptr(pmu->pmu_cpu_context);
5925 } 6012 }
6013done:
6014 preempt_enable();
5926 rcu_read_unlock(); 6015 rcu_read_unlock();
5927} 6016}
5928 6017
@@ -5971,7 +6060,7 @@ void perf_event_exec(void)
5971 6060
5972 perf_event_enable_on_exec(ctxn); 6061 perf_event_enable_on_exec(ctxn);
5973 6062
5974 perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, 6063 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
5975 true); 6064 true);
5976 } 6065 }
5977 rcu_read_unlock(); 6066 rcu_read_unlock();
@@ -6015,9 +6104,9 @@ static int __perf_pmu_output_stop(void *info)
6015 }; 6104 };
6016 6105
6017 rcu_read_lock(); 6106 rcu_read_lock();
6018 perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); 6107 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6019 if (cpuctx->task_ctx) 6108 if (cpuctx->task_ctx)
6020 perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, 6109 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6021 &ro, false); 6110 &ro, false);
6022 rcu_read_unlock(); 6111 rcu_read_unlock();
6023 6112
@@ -6146,7 +6235,7 @@ static void perf_event_task(struct task_struct *task,
6146 }, 6235 },
6147 }; 6236 };
6148 6237
6149 perf_event_aux(perf_event_task_output, 6238 perf_iterate_sb(perf_event_task_output,
6150 &task_event, 6239 &task_event,
6151 task_ctx); 6240 task_ctx);
6152} 6241}
@@ -6225,7 +6314,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
6225 6314
6226 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 6315 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
6227 6316
6228 perf_event_aux(perf_event_comm_output, 6317 perf_iterate_sb(perf_event_comm_output,
6229 comm_event, 6318 comm_event,
6230 NULL); 6319 NULL);
6231} 6320}
@@ -6456,7 +6545,7 @@ got_name:
6456 6545
6457 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 6546 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
6458 6547
6459 perf_event_aux(perf_event_mmap_output, 6548 perf_iterate_sb(perf_event_mmap_output,
6460 mmap_event, 6549 mmap_event,
6461 NULL); 6550 NULL);
6462 6551
@@ -6539,7 +6628,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
6539 if (!ctx) 6628 if (!ctx)
6540 continue; 6629 continue;
6541 6630
6542 perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); 6631 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
6543 } 6632 }
6544 rcu_read_unlock(); 6633 rcu_read_unlock();
6545} 6634}
@@ -6726,7 +6815,7 @@ static void perf_event_switch(struct task_struct *task,
6726 }, 6815 },
6727 }; 6816 };
6728 6817
6729 perf_event_aux(perf_event_switch_output, 6818 perf_iterate_sb(perf_event_switch_output,
6730 &switch_event, 6819 &switch_event,
6731 NULL); 6820 NULL);
6732} 6821}
@@ -7333,7 +7422,7 @@ static struct pmu perf_swevent = {
7333static int perf_tp_filter_match(struct perf_event *event, 7422static int perf_tp_filter_match(struct perf_event *event,
7334 struct perf_sample_data *data) 7423 struct perf_sample_data *data)
7335{ 7424{
7336 void *record = data->raw->data; 7425 void *record = data->raw->frag.data;
7337 7426
7338 /* only top level events have filters set */ 7427 /* only top level events have filters set */
7339 if (event->parent) 7428 if (event->parent)
@@ -7389,8 +7478,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
7389 struct perf_event *event; 7478 struct perf_event *event;
7390 7479
7391 struct perf_raw_record raw = { 7480 struct perf_raw_record raw = {
7392 .size = entry_size, 7481 .frag = {
7393 .data = record, 7482 .size = entry_size,
7483 .data = record,
7484 },
7394 }; 7485 };
7395 7486
7396 perf_sample_data_init(&data, 0, 0); 7487 perf_sample_data_init(&data, 0, 0);
@@ -8648,6 +8739,28 @@ unlock:
8648 return pmu; 8739 return pmu;
8649} 8740}
8650 8741
8742static void attach_sb_event(struct perf_event *event)
8743{
8744 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
8745
8746 raw_spin_lock(&pel->lock);
8747 list_add_rcu(&event->sb_list, &pel->list);
8748 raw_spin_unlock(&pel->lock);
8749}
8750
8751/*
8752 * We keep a list of all !task (and therefore per-cpu) events
8753 * that need to receive side-band records.
8754 *
8755 * This avoids having to scan all the various PMU per-cpu contexts
8756 * looking for them.
8757 */
8758static void account_pmu_sb_event(struct perf_event *event)
8759{
8760 if (is_sb_event(event))
8761 attach_sb_event(event);
8762}
8763
8651static void account_event_cpu(struct perf_event *event, int cpu) 8764static void account_event_cpu(struct perf_event *event, int cpu)
8652{ 8765{
8653 if (event->parent) 8766 if (event->parent)
@@ -8728,6 +8841,8 @@ static void account_event(struct perf_event *event)
8728enabled: 8841enabled:
8729 8842
8730 account_event_cpu(event, event->cpu); 8843 account_event_cpu(event, event->cpu);
8844
8845 account_pmu_sb_event(event);
8731} 8846}
8732 8847
8733/* 8848/*
@@ -8876,7 +8991,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
8876 8991
8877 if (!event->parent) { 8992 if (!event->parent) {
8878 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 8993 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
8879 err = get_callchain_buffers(); 8994 err = get_callchain_buffers(attr->sample_max_stack);
8880 if (err) 8995 if (err)
8881 goto err_addr_filters; 8996 goto err_addr_filters;
8882 } 8997 }
@@ -9198,6 +9313,9 @@ SYSCALL_DEFINE5(perf_event_open,
9198 return -EINVAL; 9313 return -EINVAL;
9199 } 9314 }
9200 9315
9316 if (!attr.sample_max_stack)
9317 attr.sample_max_stack = sysctl_perf_event_max_stack;
9318
9201 /* 9319 /*
9202 * In cgroup mode, the pid argument is used to pass the fd 9320 * In cgroup mode, the pid argument is used to pass the fd
9203 * opened to the cgroup directory in cgroupfs. The cpu argument 9321 * opened to the cgroup directory in cgroupfs. The cpu argument
@@ -9271,7 +9389,7 @@ SYSCALL_DEFINE5(perf_event_open,
9271 9389
9272 if (is_sampling_event(event)) { 9390 if (is_sampling_event(event)) {
9273 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { 9391 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
9274 err = -ENOTSUPP; 9392 err = -EOPNOTSUPP;
9275 goto err_alloc; 9393 goto err_alloc;
9276 } 9394 }
9277 } 9395 }
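To illustrate the userspace side (a sketch assuming the updated uapi perf_event_attr with the new sample_max_stack field), a sampling event can now cap its own callchain depth, with 0 meaning "use the sysctl default":

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int open_cycles_with_callchain(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.size = sizeof(attr);
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_CALLCHAIN;
	attr.sample_max_stack = 32;	/* per-event cap; 0 falls back to the sysctl */

	/* monitor the calling thread on any CPU */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}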
@@ -10233,10 +10351,13 @@ static void __init perf_event_init_all_cpus(void)
10233 swhash = &per_cpu(swevent_htable, cpu); 10351 swhash = &per_cpu(swevent_htable, cpu);
10234 mutex_init(&swhash->hlist_mutex); 10352 mutex_init(&swhash->hlist_mutex);
10235 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); 10353 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
10354
10355 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
10356 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
10236 } 10357 }
10237} 10358}
10238 10359
10239static void perf_event_init_cpu(int cpu) 10360int perf_event_init_cpu(unsigned int cpu)
10240{ 10361{
10241 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 10362 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
10242 10363
@@ -10249,6 +10370,7 @@ static void perf_event_init_cpu(int cpu)
10249 rcu_assign_pointer(swhash->swevent_hlist, hlist); 10370 rcu_assign_pointer(swhash->swevent_hlist, hlist);
10250 } 10371 }
10251 mutex_unlock(&swhash->hlist_mutex); 10372 mutex_unlock(&swhash->hlist_mutex);
10373 return 0;
10252} 10374}
10253 10375
10254#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE 10376#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10280,14 +10402,17 @@ static void perf_event_exit_cpu_context(int cpu)
10280 } 10402 }
10281 srcu_read_unlock(&pmus_srcu, idx); 10403 srcu_read_unlock(&pmus_srcu, idx);
10282} 10404}
10405#else
10406
10407static void perf_event_exit_cpu_context(int cpu) { }
10283 10408
10284static void perf_event_exit_cpu(int cpu) 10409#endif
10410
10411int perf_event_exit_cpu(unsigned int cpu)
10285{ 10412{
10286 perf_event_exit_cpu_context(cpu); 10413 perf_event_exit_cpu_context(cpu);
10414 return 0;
10287} 10415}
10288#else
10289static inline void perf_event_exit_cpu(int cpu) { }
10290#endif
10291 10416
10292static int 10417static int
10293perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) 10418perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
@@ -10309,46 +10434,6 @@ static struct notifier_block perf_reboot_notifier = {
10309 .priority = INT_MIN, 10434 .priority = INT_MIN,
10310}; 10435};
10311 10436
10312static int
10313perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
10314{
10315 unsigned int cpu = (long)hcpu;
10316
10317 switch (action & ~CPU_TASKS_FROZEN) {
10318
10319 case CPU_UP_PREPARE:
10320 /*
10321 * This must be done before the CPU comes alive, because the
10322 * moment we can run tasks we can encounter (software) events.
10323 *
10324 * Specifically, someone can have inherited events on kthreadd
10325 * or a pre-existing worker thread that gets re-bound.
10326 */
10327 perf_event_init_cpu(cpu);
10328 break;
10329
10330 case CPU_DOWN_PREPARE:
10331 /*
10332 * This must be done before the CPU dies because after that an
10333 * active event might want to IPI the CPU and that'll not work
10334 * so great for dead CPUs.
10335 *
10336 * XXX smp_call_function_single() return -ENXIO without a warn
10337 * so we could possibly deal with this.
10338 *
10339 * This is safe against new events arriving because
10340 * sys_perf_event_open() serializes against hotplug using
10341 * get_online_cpus().
10342 */
10343 perf_event_exit_cpu(cpu);
10344 break;
10345 default:
10346 break;
10347 }
10348
10349 return NOTIFY_OK;
10350}
10351
10352void __init perf_event_init(void) 10437void __init perf_event_init(void)
10353{ 10438{
10354 int ret; 10439 int ret;
@@ -10361,7 +10446,7 @@ void __init perf_event_init(void)
10361 perf_pmu_register(&perf_cpu_clock, NULL, -1); 10446 perf_pmu_register(&perf_cpu_clock, NULL, -1);
10362 perf_pmu_register(&perf_task_clock, NULL, -1); 10447 perf_pmu_register(&perf_task_clock, NULL, -1);
10363 perf_tp_register(); 10448 perf_tp_register();
10364 perf_cpu_notifier(perf_cpu_notify); 10449 perf_event_init_cpu(smp_processor_id());
10365 register_reboot_notifier(&perf_reboot_notifier); 10450 register_reboot_notifier(&perf_reboot_notifier);
10366 10451
10367 ret = init_hw_breakpoint(); 10452 ret = init_hw_breakpoint();
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 05f9f6d626df..486fd78eb8d5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
123 return rb->aux_nr_pages << PAGE_SHIFT; 123 return rb->aux_nr_pages << PAGE_SHIFT;
124} 124}
125 125
126#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ 126#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \
127static inline unsigned long \
128func_name(struct perf_output_handle *handle, \
129 const void *buf, unsigned long len) \
130{ \ 127{ \
131 unsigned long size, written; \ 128 unsigned long size, written; \
132 \ 129 \
133 do { \ 130 do { \
134 size = min(handle->size, len); \ 131 size = min(handle->size, len); \
135 written = memcpy_func(handle->addr, buf, size); \ 132 written = memcpy_func(__VA_ARGS__); \
136 written = size - written; \ 133 written = size - written; \
137 \ 134 \
138 len -= written; \ 135 len -= written; \
139 handle->addr += written; \ 136 handle->addr += written; \
140 buf += written; \ 137 if (advance_buf) \
138 buf += written; \
141 handle->size -= written; \ 139 handle->size -= written; \
142 if (!handle->size) { \ 140 if (!handle->size) { \
143 struct ring_buffer *rb = handle->rb; \ 141 struct ring_buffer *rb = handle->rb; \
@@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle, \
152 return len; \ 150 return len; \
153} 151}
154 152
153#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
154static inline unsigned long \
155func_name(struct perf_output_handle *handle, \
156 const void *buf, unsigned long len) \
157__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)
158
159static inline unsigned long
160__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
161 const void *buf, unsigned long len)
162{
163 unsigned long orig_len = len;
164 __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
165 orig_len - len, size)
166}
167
155static inline unsigned long 168static inline unsigned long
156memcpy_common(void *dst, const void *src, unsigned long n) 169memcpy_common(void *dst, const void *src, unsigned long n)
157{ 170{
diff --git a/kernel/exit.c b/kernel/exit.c
index 9e6e1356e6bb..84ae830234f8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -211,6 +211,82 @@ repeat:
211} 211}
212 212
213/* 213/*
214 * Note that if this function returns a valid task_struct pointer (!NULL)
215 * task->usage must remain >0 for the duration of the RCU critical section.
216 */
217struct task_struct *task_rcu_dereference(struct task_struct **ptask)
218{
219 struct sighand_struct *sighand;
220 struct task_struct *task;
221
222 /*
223 * We need to verify that release_task() was not called and thus
224 * delayed_put_task_struct() can't run and drop the last reference
225 * before rcu_read_unlock(). We check task->sighand != NULL,
226 * but we can read the already freed and reused memory.
227 */
228retry:
229 task = rcu_dereference(*ptask);
230 if (!task)
231 return NULL;
232
233 probe_kernel_address(&task->sighand, sighand);
234
235 /*
236 * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
237 * was already freed we can not miss the preceding update of this
238 * pointer.
239 */
240 smp_rmb();
241 if (unlikely(task != READ_ONCE(*ptask)))
242 goto retry;
243
244 /*
245 * We've re-checked that "task == *ptask", now we have two different
246 * cases:
247 *
248 * 1. This is actually the same task/task_struct. In this case
249 * sighand != NULL tells us it is still alive.
250 *
251 * 2. This is another task which got the same memory for task_struct.
252 * We can't know this of course, and we can not trust
253 * sighand != NULL.
254 *
255 * In this case we actually return a random value, but this is
256 * correct.
257 *
258 * If we return NULL - we can pretend that we actually noticed that
259 * *ptask was updated when the previous task has exited. Or pretend
 260 * that probe_kernel_address(&sighand) reads NULL.
261 *
262 * If we return the new task (because sighand is not NULL for any
263 * reason) - this is fine too. This (new) task can't go away before
264 * another gp pass.
265 *
266 * And note: We could even eliminate the false positive if re-read
267 * task->sighand once again to avoid the falsely NULL. But this case
268 * is very unlikely so we don't care.
269 */
270 if (!sighand)
271 return NULL;
272
273 return task;
274}
275
276struct task_struct *try_get_task_struct(struct task_struct **ptask)
277{
278 struct task_struct *task;
279
280 rcu_read_lock();
281 task = task_rcu_dereference(ptask);
282 if (task)
283 get_task_struct(task);
284 rcu_read_unlock();
285
286 return task;
287}
288
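A minimal usage sketch for the new helper (owner_slot is a hypothetical, stable "struct task_struct *" location, not from this patch); the caller either gets a counted reference or NULL, and never dereferences freed memory:

	struct task_struct *p;

	p = try_get_task_struct(&owner_slot);
	if (p) {
		/* counted reference held: p is safe to use outside rcu_read_lock() */
		put_task_struct(p);
	}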
289/*
214 * Determine if a process group is "orphaned", according to the POSIX 290 * Determine if a process group is "orphaned", according to the POSIX
215 * definition in 2.2.2.52. Orphaned process groups are not to be affected 291 * definition in 2.2.2.52. Orphaned process groups are not to be affected
216 * by terminal-generated stop signals. Newly orphaned process groups are 292 * by terminal-generated stop signals. Newly orphaned process groups are
@@ -700,10 +776,14 @@ void do_exit(long code)
700 776
701 exit_signals(tsk); /* sets PF_EXITING */ 777 exit_signals(tsk); /* sets PF_EXITING */
702 /* 778 /*
703 * tsk->flags are checked in the futex code to protect against 779 * Ensure that all new tsk->pi_lock acquisitions must observe
704 * an exiting task cleaning up the robust pi futexes. 780 * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
705 */ 781 */
706 smp_mb(); 782 smp_mb();
783 /*
784 * Ensure that we must observe the pi_state in exit_mm() ->
785 * mm_release() -> exit_pi_state_list().
786 */
707 raw_spin_unlock_wait(&tsk->pi_lock); 787 raw_spin_unlock_wait(&tsk->pi_lock);
708 788
709 if (unlikely(in_atomic())) { 789 if (unlikely(in_atomic())) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 5c2c355aa97f..52e725d4a866 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -148,57 +148,49 @@ static inline void free_task_struct(struct task_struct *tsk)
148} 148}
149#endif 149#endif
150 150
151void __weak arch_release_thread_info(struct thread_info *ti) 151void __weak arch_release_thread_stack(unsigned long *stack)
152{ 152{
153} 153}
154 154
155#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR 155#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
156 156
157/* 157/*
158 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a 158 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
159 * kmemcache based allocator. 159 * kmemcache based allocator.
160 */ 160 */
161# if THREAD_SIZE >= PAGE_SIZE 161# if THREAD_SIZE >= PAGE_SIZE
162static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 162static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
163 int node) 163 int node)
164{ 164{
165 struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, 165 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
166 THREAD_SIZE_ORDER); 166 THREAD_SIZE_ORDER);
167
168 if (page)
169 memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
170 1 << THREAD_SIZE_ORDER);
171 167
172 return page ? page_address(page) : NULL; 168 return page ? page_address(page) : NULL;
173} 169}
174 170
175static inline void free_thread_info(struct thread_info *ti) 171static inline void free_thread_stack(unsigned long *stack)
176{ 172{
177 struct page *page = virt_to_page(ti); 173 __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
178
179 memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
180 -(1 << THREAD_SIZE_ORDER));
181 __free_kmem_pages(page, THREAD_SIZE_ORDER);
182} 174}
183# else 175# else
184static struct kmem_cache *thread_info_cache; 176static struct kmem_cache *thread_stack_cache;
185 177
186static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 178static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
187 int node) 179 int node)
188{ 180{
189 return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); 181 return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
190} 182}
191 183
192static void free_thread_info(struct thread_info *ti) 184static void free_thread_stack(unsigned long *stack)
193{ 185{
194 kmem_cache_free(thread_info_cache, ti); 186 kmem_cache_free(thread_stack_cache, stack);
195} 187}
196 188
197void thread_info_cache_init(void) 189void thread_stack_cache_init(void)
198{ 190{
199 thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, 191 thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
200 THREAD_SIZE, 0, NULL); 192 THREAD_SIZE, 0, NULL);
201 BUG_ON(thread_info_cache == NULL); 193 BUG_ON(thread_stack_cache == NULL);
202} 194}
203# endif 195# endif
204#endif 196#endif
@@ -221,18 +213,24 @@ struct kmem_cache *vm_area_cachep;
221/* SLAB cache for mm_struct structures (tsk->mm) */ 213/* SLAB cache for mm_struct structures (tsk->mm) */
222static struct kmem_cache *mm_cachep; 214static struct kmem_cache *mm_cachep;
223 215
224static void account_kernel_stack(struct thread_info *ti, int account) 216static void account_kernel_stack(unsigned long *stack, int account)
225{ 217{
226 struct zone *zone = page_zone(virt_to_page(ti)); 218 /* All stack pages are in the same zone and belong to the same memcg. */
219 struct page *first_page = virt_to_page(stack);
220
221 mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
222 THREAD_SIZE / 1024 * account);
227 223
228 mod_zone_page_state(zone, NR_KERNEL_STACK, account); 224 memcg_kmem_update_page_stat(
225 first_page, MEMCG_KERNEL_STACK_KB,
226 account * (THREAD_SIZE / 1024));
229} 227}
230 228
231void free_task(struct task_struct *tsk) 229void free_task(struct task_struct *tsk)
232{ 230{
233 account_kernel_stack(tsk->stack, -1); 231 account_kernel_stack(tsk->stack, -1);
234 arch_release_thread_info(tsk->stack); 232 arch_release_thread_stack(tsk->stack);
235 free_thread_info(tsk->stack); 233 free_thread_stack(tsk->stack);
236 rt_mutex_debug_task_free(tsk); 234 rt_mutex_debug_task_free(tsk);
237 ftrace_graph_exit_task(tsk); 235 ftrace_graph_exit_task(tsk);
238 put_seccomp_filter(tsk); 236 put_seccomp_filter(tsk);
@@ -343,7 +341,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
343static struct task_struct *dup_task_struct(struct task_struct *orig, int node) 341static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
344{ 342{
345 struct task_struct *tsk; 343 struct task_struct *tsk;
346 struct thread_info *ti; 344 unsigned long *stack;
347 int err; 345 int err;
348 346
349 if (node == NUMA_NO_NODE) 347 if (node == NUMA_NO_NODE)
@@ -352,15 +350,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
352 if (!tsk) 350 if (!tsk)
353 return NULL; 351 return NULL;
354 352
355 ti = alloc_thread_info_node(tsk, node); 353 stack = alloc_thread_stack_node(tsk, node);
356 if (!ti) 354 if (!stack)
357 goto free_tsk; 355 goto free_tsk;
358 356
359 err = arch_dup_task_struct(tsk, orig); 357 err = arch_dup_task_struct(tsk, orig);
360 if (err) 358 if (err)
361 goto free_ti; 359 goto free_stack;
362 360
363 tsk->stack = ti; 361 tsk->stack = stack;
364#ifdef CONFIG_SECCOMP 362#ifdef CONFIG_SECCOMP
365 /* 363 /*
366 * We must handle setting up seccomp filters once we're under 364 * We must handle setting up seccomp filters once we're under
@@ -392,14 +390,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
392 tsk->task_frag.page = NULL; 390 tsk->task_frag.page = NULL;
393 tsk->wake_q.next = NULL; 391 tsk->wake_q.next = NULL;
394 392
395 account_kernel_stack(ti, 1); 393 account_kernel_stack(stack, 1);
396 394
397 kcov_task_init(tsk); 395 kcov_task_init(tsk);
398 396
399 return tsk; 397 return tsk;
400 398
401free_ti: 399free_stack:
402 free_thread_info(ti); 400 free_thread_stack(stack);
403free_tsk: 401free_tsk:
404 free_task_struct(tsk); 402 free_task_struct(tsk);
405 return NULL; 403 return NULL;
diff --git a/kernel/freezer.c b/kernel/freezer.c
index a8900a3bc27a..6f56a9e219fa 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)
42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) 42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
43 return false; 43 return false;
44 44
45 if (test_thread_flag(TIF_MEMDIE)) 45 if (test_tsk_thread_flag(p, TIF_MEMDIE))
46 return false; 46 return false;
47 47
48 if (pm_nosig_freezing || cgroup_freezing(p)) 48 if (pm_nosig_freezing || cgroup_freezing(p))
diff --git a/kernel/futex.c b/kernel/futex.c
index ee25f5ba4aca..33664f70e2d2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
469{ 469{
470 unsigned long address = (unsigned long)uaddr; 470 unsigned long address = (unsigned long)uaddr;
471 struct mm_struct *mm = current->mm; 471 struct mm_struct *mm = current->mm;
472 struct page *page; 472 struct page *page, *tail;
473 struct address_space *mapping; 473 struct address_space *mapping;
474 int err, ro = 0; 474 int err, ro = 0;
475 475
@@ -530,7 +530,15 @@ again:
530 * considered here and page lock forces unnecessary serialization 530
531 * From this point on, mapping will be re-verified if necessary and 531 * From this point on, mapping will be re-verified if necessary and
532 * page lock will be acquired only if it is unavoidable 532 * page lock will be acquired only if it is unavoidable
533 */ 533 *
534 * Mapping checks require the head page for any compound page so the
 535 * head page and mapping are looked up now. For anonymous pages, it
536 * does not matter if the page splits in the future as the key is
537 * based on the address. For filesystem-backed pages, the tail is
538 * required as the index of the page determines the key. For
539 * base pages, there is no tail page and tail == page.
540 */
541 tail = page;
534 page = compound_head(page); 542 page = compound_head(page);
535 mapping = READ_ONCE(page->mapping); 543 mapping = READ_ONCE(page->mapping);
536 544
@@ -654,7 +662,7 @@ again:
654 662
655 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 663 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
656 key->shared.inode = inode; 664 key->shared.inode = inode;
657 key->shared.pgoff = basepage_index(page); 665 key->shared.pgoff = basepage_index(tail);
658 rcu_read_unlock(); 666 rcu_read_unlock();
659 } 667 }
660 668
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index e25e92fb44fa..6a5c239c7669 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,7 @@
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include "gcov.h" 19#include "gcov.h"
20 20
21#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 21#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
22#define GCOV_COUNTERS 10 22#define GCOV_COUNTERS 10
23#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 23#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
24#define GCOV_COUNTERS 9 24#define GCOV_COUNTERS 9
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2ee42e95a3ce..1d3ee3169202 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
9obj-$(CONFIG_PM_SLEEP) += pm.o 9obj-$(CONFIG_PM_SLEEP) += pm.o
10obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o 10obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
11obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o 11obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
12obj-$(CONFIG_SMP) += affinity.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
new file mode 100644
index 000000000000..f68959341c0f
--- /dev/null
+++ b/kernel/irq/affinity.c
@@ -0,0 +1,61 @@
1
2#include <linux/interrupt.h>
3#include <linux/kernel.h>
4#include <linux/slab.h>
5#include <linux/cpu.h>
6
7static int get_first_sibling(unsigned int cpu)
8{
9 unsigned int ret;
10
11 ret = cpumask_first(topology_sibling_cpumask(cpu));
12 if (ret < nr_cpu_ids)
13 return ret;
14 return cpu;
15}
16
17/*
18 * Take a map of online CPUs and the number of available interrupt vectors
19 * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
 20 * so that they are distributed as well as possible around the CPUs. If
21 * more vectors than CPUs are available we'll map one to each CPU,
22 * otherwise we map one to the first sibling of each socket.
23 *
24 * If there are more vectors than CPUs we will still only have one bit
25 * set per CPU, but interrupt code will keep on assigning the vectors from
26 * the start of the bitmap until we run out of vectors.
27 */
28struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
29{
30 struct cpumask *affinity_mask;
31 unsigned int max_vecs = *nr_vecs;
32
33 if (max_vecs == 1)
34 return NULL;
35
36 affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
37 if (!affinity_mask) {
38 *nr_vecs = 1;
39 return NULL;
40 }
41
42 if (max_vecs >= num_online_cpus()) {
43 cpumask_copy(affinity_mask, cpu_online_mask);
44 *nr_vecs = num_online_cpus();
45 } else {
46 unsigned int vecs = 0, cpu;
47
48 for_each_online_cpu(cpu) {
49 if (cpu == get_first_sibling(cpu)) {
50 cpumask_set_cpu(cpu, affinity_mask);
51 vecs++;
52 }
53
54 if (--max_vecs == 0)
55 break;
56 }
57 *nr_vecs = vecs;
58 }
59
60 return affinity_mask;
61}
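A small userspace sketch of the spreading policy implemented above, using a plain bitmask in place of struct cpumask and a made-up sibling table (all names here are hypothetical): with enough vectors every CPU gets a bit; otherwise only the first sibling of each group is marked and *nr_vecs is trimmed accordingly.

#include <stdio.h>

#define NCPUS 8

/* Hypothetical topology: CPUs 0/1, 2/3, 4/5, 6/7 are sibling pairs. */
static const unsigned int first_sibling[NCPUS] = { 0, 0, 2, 2, 4, 4, 6, 6 };

static unsigned long create_affinity_mask(unsigned int *nr_vecs)
{
	unsigned int max_vecs = *nr_vecs, vecs = 0, cpu;
	unsigned long mask = 0;

	if (max_vecs == 1)
		return 0;			/* nothing to spread */

	if (max_vecs >= NCPUS) {
		*nr_vecs = NCPUS;		/* one vector per CPU */
		return (1UL << NCPUS) - 1;
	}

	for (cpu = 0; cpu < NCPUS; cpu++) {
		if (cpu == first_sibling[cpu]) {
			mask |= 1UL << cpu;	/* first sibling only */
			vecs++;
		}
		if (--max_vecs == 0)
			break;			/* ran out of vectors */
	}
	*nr_vecs = vecs;
	return mask;
}

int main(void)
{
	unsigned int nvecs = 3;
	unsigned long mask = create_affinity_mask(&nvecs);

	printf("mask=%#lx, vectors used=%u\n", mask, nvecs);	/* 0x5, 2 */
	return 0;
}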
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2f9f2b0e79f2..b4c1bc7c9ca2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -426,6 +426,49 @@ out_unlock:
426} 426}
427EXPORT_SYMBOL_GPL(handle_simple_irq); 427EXPORT_SYMBOL_GPL(handle_simple_irq);
428 428
429/**
430 * handle_untracked_irq - Simple and software-decoded IRQs.
431 * @desc: the interrupt description structure for this irq
432 *
433 * Untracked interrupts are sent from a demultiplexing interrupt
434 * handler when the demultiplexer does not know which device in its
435 * multiplexed irq domain generated the interrupt. IRQs handled
436 * through here are not subjected to stats tracking, randomness, or
437 * spurious interrupt detection.
438 *
439 * Note: Like handle_simple_irq, the caller is expected to handle
440 * the ack, clear, mask and unmask issues if necessary.
441 */
442void handle_untracked_irq(struct irq_desc *desc)
443{
444 unsigned int flags = 0;
445
446 raw_spin_lock(&desc->lock);
447
448 if (!irq_may_run(desc))
449 goto out_unlock;
450
451 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
452
453 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
454 desc->istate |= IRQS_PENDING;
455 goto out_unlock;
456 }
457
458 desc->istate &= ~IRQS_PENDING;
459 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
460 raw_spin_unlock(&desc->lock);
461
462 __handle_irq_event_percpu(desc, &flags);
463
464 raw_spin_lock(&desc->lock);
465 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
466
467out_unlock:
468 raw_spin_unlock(&desc->lock);
469}
470EXPORT_SYMBOL_GPL(handle_untracked_irq);
471
429/* 472/*
430 * Called unconditionally from handle_level_irq() and only for oneshot 473 * Called unconditionally from handle_level_irq() and only for oneshot
431 * interrupts from handle_fasteoi_irq() 474 * interrupts from handle_fasteoi_irq()
@@ -1093,3 +1136,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1093 1136
1094 return 0; 1137 return 0;
1095} 1138}
1139
1140/**
1141 * irq_chip_pm_get - Enable power for an IRQ chip
1142 * @data: Pointer to interrupt specific data
1143 *
1144 * Enable the power to the IRQ chip referenced by the interrupt data
1145 * structure.
1146 */
1147int irq_chip_pm_get(struct irq_data *data)
1148{
1149 int retval;
1150
1151 if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
1152 retval = pm_runtime_get_sync(data->chip->parent_device);
1153 if (retval < 0) {
1154 pm_runtime_put_noidle(data->chip->parent_device);
1155 return retval;
1156 }
1157 }
1158
1159 return 0;
1160}
1161
1162/**
1163 * irq_chip_pm_put - Disable power for an IRQ chip
1164 * @data: Pointer to interrupt specific data
1165 *
1166 * Disable the power to the IRQ chip referenced by the interrupt data
1167 * structure. Note that power will only be disabled once this
1168 * function has been called for all IRQs that have called irq_chip_pm_get().
1169 */
1170int irq_chip_pm_put(struct irq_data *data)
1171{
1172 int retval = 0;
1173
1174 if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
1175 retval = pm_runtime_put(data->chip->parent_device);
1176
1177 return (retval < 0) ? retval : 0;
1178}
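The get/put pair above leans on runtime-PM reference counting: power is switched on by the first get and switched off again only when every get has been matched by a put. A toy model of that counting behaviour (illustrative only, nothing here is the real runtime-PM API):

#include <stdio.h>

static int pm_refcount;		/* how many users currently need power */

static void chip_pm_get(void)
{
	if (pm_refcount++ == 0)
		printf("power on\n");	/* first user powers the chip up */
}

static void chip_pm_put(void)
{
	if (--pm_refcount == 0)
		printf("power off\n");	/* last user powers it down */
}

int main(void)
{
	chip_pm_get();		/* request_irq() for IRQ A */
	chip_pm_get();		/* request_irq() for IRQ B */
	chip_pm_put();		/* free_irq() for IRQ A: still powered */
	chip_pm_put();		/* free_irq() for IRQ B: now powered off */
	return 0;
}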
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a15b5485b446..d3f24905852c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
132 wake_up_process(action->thread); 132 wake_up_process(action->thread);
133} 133}
134 134
135irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) 135irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
136{ 136{
137 irqreturn_t retval = IRQ_NONE; 137 irqreturn_t retval = IRQ_NONE;
138 unsigned int flags = 0, irq = desc->irq_data.irq; 138 unsigned int irq = desc->irq_data.irq;
139 struct irqaction *action; 139 struct irqaction *action;
140 140
141 for_each_action_of_desc(desc, action) { 141 for_each_action_of_desc(desc, action) {
@@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
164 164
165 /* Fall through to add to randomness */ 165 /* Fall through to add to randomness */
166 case IRQ_HANDLED: 166 case IRQ_HANDLED:
167 flags |= action->flags; 167 *flags |= action->flags;
168 break; 168 break;
169 169
170 default: 170 default:
@@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
174 retval |= res; 174 retval |= res;
175 } 175 }
176 176
177 add_interrupt_randomness(irq, flags); 177 return retval;
178}
179
180irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
181{
182 irqreturn_t retval;
183 unsigned int flags = 0;
184
185 retval = __handle_irq_event_percpu(desc, &flags);
186
187 add_interrupt_randomness(desc->irq_data.irq, flags);
178 188
179 if (!noirqdebug) 189 if (!noirqdebug)
180 note_interrupt(desc, retval); 190 note_interrupt(desc, retval);
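The refactoring above follows a common pattern: the core loop reports the accumulated action flags through an out-parameter so that a new caller (handle_untracked_irq) can skip the bookkeeping the original wrapper performs. A stripped-down illustration of the same shape, with hypothetical names:

#include <stdio.h>

/* Core worker: does the real work and accumulates flags for the caller. */
static int do_event_core(unsigned int *flags)
{
	*flags |= 0x2;			/* an action flag, value made up */
	return 1;			/* handled */
}

/* Tracked path: adds the statistics/randomness style bookkeeping. */
static int do_event_tracked(void)
{
	unsigned int flags = 0;
	int ret = do_event_core(&flags);

	printf("bookkeeping with flags=%#x\n", flags);
	return ret;
}

/* Untracked path: calls the core directly, no bookkeeping. */
static int do_event_untracked(void)
{
	unsigned int flags = 0;

	return do_event_core(&flags);
}

int main(void)
{
	printf("tracked=%d untracked=%d\n", do_event_tracked(), do_event_untracked());
	return 0;
}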
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 09be2c903c6d..bc226e783bd2 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -7,6 +7,7 @@
7 */ 7 */
8#include <linux/irqdesc.h> 8#include <linux/irqdesc.h>
9#include <linux/kernel_stat.h> 9#include <linux/kernel_stat.h>
10#include <linux/pm_runtime.h>
10 11
11#ifdef CONFIG_SPARSE_IRQ 12#ifdef CONFIG_SPARSE_IRQ
12# define IRQ_BITMAP_BITS (NR_IRQS + 8196) 13# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
@@ -83,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq);
83 84
84extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 85extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
85 86
87irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
86irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); 88irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
87irqreturn_t handle_irq_event(struct irq_desc *desc); 89irqreturn_t handle_irq_event(struct irq_desc *desc);
88 90
@@ -105,6 +107,8 @@ static inline void unregister_handler_proc(unsigned int irq,
105 struct irqaction *action) { } 107 struct irqaction *action) { }
106#endif 108#endif
107 109
110extern bool irq_can_set_affinity_usr(unsigned int irq);
111
108extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); 112extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
109 113
110extern void irq_set_thread_affinity(struct irq_desc *desc); 114extern void irq_set_thread_affinity(struct irq_desc *desc);
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 89b49f6773f0..1a9abc1c8ea0 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -76,14 +76,14 @@ int irq_reserve_ipi(struct irq_domain *domain,
76 } 76 }
77 } 77 }
78 78
79 virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); 79 virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL);
80 if (virq <= 0) { 80 if (virq <= 0) {
81 pr_warn("Can't reserve IPI, failed to alloc descs\n"); 81 pr_warn("Can't reserve IPI, failed to alloc descs\n");
82 return -ENOMEM; 82 return -ENOMEM;
83 } 83 }
84 84
85 virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, 85 virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
86 (void *) dest, true); 86 (void *) dest, true, NULL);
87 87
88 if (virq <= 0) { 88 if (virq <= 0) {
89 pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); 89 pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 8731e1c5d1e7..a623b44f2d4b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
68 return 0; 68 return 0;
69} 69}
70 70
71static void desc_smp_init(struct irq_desc *desc, int node) 71static void desc_smp_init(struct irq_desc *desc, int node,
72 const struct cpumask *affinity)
72{ 73{
73 cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); 74 if (!affinity)
75 affinity = irq_default_affinity;
76 cpumask_copy(desc->irq_common_data.affinity, affinity);
77
74#ifdef CONFIG_GENERIC_PENDING_IRQ 78#ifdef CONFIG_GENERIC_PENDING_IRQ
75 cpumask_clear(desc->pending_mask); 79 cpumask_clear(desc->pending_mask);
76#endif 80#endif
@@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node)
82#else 86#else
83static inline int 87static inline int
84alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } 88alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
85static inline void desc_smp_init(struct irq_desc *desc, int node) { } 89static inline void
90desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }
86#endif 91#endif
87 92
88static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, 93static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
89 struct module *owner) 94 const struct cpumask *affinity, struct module *owner)
90{ 95{
91 int cpu; 96 int cpu;
92 97
@@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
107 desc->owner = owner; 112 desc->owner = owner;
108 for_each_possible_cpu(cpu) 113 for_each_possible_cpu(cpu)
109 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; 114 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
110 desc_smp_init(desc, node); 115 desc_smp_init(desc, node, affinity);
111} 116}
112 117
113int nr_irqs = NR_IRQS; 118int nr_irqs = NR_IRQS;
@@ -158,7 +163,9 @@ void irq_unlock_sparse(void)
158 mutex_unlock(&sparse_irq_lock); 163 mutex_unlock(&sparse_irq_lock);
159} 164}
160 165
161static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) 166static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
167 const struct cpumask *affinity,
168 struct module *owner)
162{ 169{
163 struct irq_desc *desc; 170 struct irq_desc *desc;
164 gfp_t gfp = GFP_KERNEL; 171 gfp_t gfp = GFP_KERNEL;
@@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
178 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 185 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
179 init_rcu_head(&desc->rcu); 186 init_rcu_head(&desc->rcu);
180 187
181 desc_set_defaults(irq, desc, node, owner); 188 desc_set_defaults(irq, desc, node, affinity, owner);
189 irqd_set(&desc->irq_data, flags);
182 190
183 return desc; 191 return desc;
184 192
@@ -223,13 +231,32 @@ static void free_desc(unsigned int irq)
223} 231}
224 232
225static int alloc_descs(unsigned int start, unsigned int cnt, int node, 233static int alloc_descs(unsigned int start, unsigned int cnt, int node,
226 struct module *owner) 234 const struct cpumask *affinity, struct module *owner)
227{ 235{
236 const struct cpumask *mask = NULL;
228 struct irq_desc *desc; 237 struct irq_desc *desc;
229 int i; 238 unsigned int flags;
239 int i, cpu = -1;
240
241 if (affinity && cpumask_empty(affinity))
242 return -EINVAL;
243
244 flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
230 245
231 for (i = 0; i < cnt; i++) { 246 for (i = 0; i < cnt; i++) {
232 desc = alloc_desc(start + i, node, owner); 247 if (affinity) {
248 cpu = cpumask_next(cpu, affinity);
249 if (cpu >= nr_cpu_ids)
250 cpu = cpumask_first(affinity);
251 node = cpu_to_node(cpu);
252
253 /*
254 * For single allocations we use the caller provided
255 * mask otherwise we use the mask of the target cpu
256 */
257 mask = cnt == 1 ? affinity : cpumask_of(cpu);
258 }
259 desc = alloc_desc(start + i, node, flags, mask, owner);
233 if (!desc) 260 if (!desc)
234 goto err; 261 goto err;
235 mutex_lock(&sparse_irq_lock); 262 mutex_lock(&sparse_irq_lock);
@@ -277,7 +304,7 @@ int __init early_irq_init(void)
277 nr_irqs = initcnt; 304 nr_irqs = initcnt;
278 305
279 for (i = 0; i < initcnt; i++) { 306 for (i = 0; i < initcnt; i++) {
280 desc = alloc_desc(i, node, NULL); 307 desc = alloc_desc(i, node, 0, NULL, NULL);
281 set_bit(i, allocated_irqs); 308 set_bit(i, allocated_irqs);
282 irq_insert_desc(i, desc); 309 irq_insert_desc(i, desc);
283 } 310 }
@@ -311,7 +338,7 @@ int __init early_irq_init(void)
311 alloc_masks(&desc[i], GFP_KERNEL, node); 338 alloc_masks(&desc[i], GFP_KERNEL, node);
312 raw_spin_lock_init(&desc[i].lock); 339 raw_spin_lock_init(&desc[i].lock);
313 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 340 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
314 desc_set_defaults(i, &desc[i], node, NULL); 341 desc_set_defaults(i, &desc[i], node, NULL, NULL);
315 } 342 }
316 return arch_early_irq_init(); 343 return arch_early_irq_init();
317} 344}
@@ -328,11 +355,12 @@ static void free_desc(unsigned int irq)
328 unsigned long flags; 355 unsigned long flags;
329 356
330 raw_spin_lock_irqsave(&desc->lock, flags); 357 raw_spin_lock_irqsave(&desc->lock, flags);
331 desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL); 358 desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
332 raw_spin_unlock_irqrestore(&desc->lock, flags); 359 raw_spin_unlock_irqrestore(&desc->lock, flags);
333} 360}
334 361
335static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, 362static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
363 const struct cpumask *affinity,
336 struct module *owner) 364 struct module *owner)
337{ 365{
338 u32 i; 366 u32 i;
@@ -453,12 +481,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
453 * @cnt: Number of consecutive irqs to allocate. 481 * @cnt: Number of consecutive irqs to allocate.
454 * @node: Preferred node on which the irq descriptor should be allocated 482 * @node: Preferred node on which the irq descriptor should be allocated
455 * @owner: Owning module (can be NULL) 483 * @owner: Owning module (can be NULL)
484 * @affinity: Optional pointer to an affinity mask which hints where the
485 * irq descriptors should be allocated and which default
486 * affinities to use
456 * 487 *
457 * Returns the first irq number or error code 488 * Returns the first irq number or error code
458 */ 489 */
459int __ref 490int __ref
460__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, 491__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
461 struct module *owner) 492 struct module *owner, const struct cpumask *affinity)
462{ 493{
463 int start, ret; 494 int start, ret;
464 495
@@ -494,7 +525,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
494 525
495 bitmap_set(allocated_irqs, start, cnt); 526 bitmap_set(allocated_irqs, start, cnt);
496 mutex_unlock(&sparse_irq_lock); 527 mutex_unlock(&sparse_irq_lock);
497 return alloc_descs(start, cnt, node, owner); 528 return alloc_descs(start, cnt, node, affinity, owner);
498 529
499err: 530err:
500 mutex_unlock(&sparse_irq_lock); 531 mutex_unlock(&sparse_irq_lock);
@@ -512,7 +543,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs);
512 */ 543 */
513unsigned int irq_alloc_hwirqs(int cnt, int node) 544unsigned int irq_alloc_hwirqs(int cnt, int node)
514{ 545{
515 int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); 546 int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL);
516 547
517 if (irq < 0) 548 if (irq < 0)
518 return 0; 549 return 0;
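In the alloc_descs() change above, descriptors of a managed multi-vector allocation are spread across the CPUs of the affinity mask, and each descriptor's node follows the CPU it landed on. A simplified userspace walk of that selection, with a bitmask in place of struct cpumask and a made-up cpu_to_node table:

#include <stdio.h>

#define NCPUS 8

/* Hypothetical NUMA layout: CPUs 0-3 on node 0, CPUs 4-7 on node 1. */
static int cpu_to_node(unsigned int cpu) { return cpu < 4 ? 0 : 1; }

int main(void)
{
	unsigned long affinity = 0x32;		/* CPUs 1, 4, 5 */
	unsigned int cnt = 5, i, cpu = NCPUS - 1;

	for (i = 0; i < cnt; i++) {
		/* cpumask_next()-style search, wrapping around the mask */
		do {
			cpu = (cpu + 1) % NCPUS;
		} while (!(affinity & (1UL << cpu)));

		printf("desc %u -> cpu %u, node %d\n", i, cpu, cpu_to_node(cpu));
	}
	return 0;
}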
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8798b6c9e945..4752b43662e0 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -481,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
481 } 481 }
482 482
483 /* Allocate a virtual interrupt number */ 483 /* Allocate a virtual interrupt number */
484 virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); 484 virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL);
485 if (virq <= 0) { 485 if (virq <= 0) {
486 pr_debug("-> virq allocation failed\n"); 486 pr_debug("-> virq allocation failed\n");
487 return 0; 487 return 0;
@@ -567,6 +567,7 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
567unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) 567unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
568{ 568{
569 struct irq_domain *domain; 569 struct irq_domain *domain;
570 struct irq_data *irq_data;
570 irq_hw_number_t hwirq; 571 irq_hw_number_t hwirq;
571 unsigned int type = IRQ_TYPE_NONE; 572 unsigned int type = IRQ_TYPE_NONE;
572 int virq; 573 int virq;
@@ -588,15 +589,46 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
588 if (irq_domain_translate(domain, fwspec, &hwirq, &type)) 589 if (irq_domain_translate(domain, fwspec, &hwirq, &type))
589 return 0; 590 return 0;
590 591
591 if (irq_domain_is_hierarchy(domain)) { 592 /*
593 * WARN if the irqchip returns a type with bits
594 * outside the sense mask set and clear these bits.
595 */
596 if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
597 type &= IRQ_TYPE_SENSE_MASK;
598
599 /*
600 * If we've already configured this interrupt,
601 * don't do it again, or hell will break loose.
602 */
603 virq = irq_find_mapping(domain, hwirq);
604 if (virq) {
605 /*
606 * If the trigger type is not specified or matches the
607 * current trigger type then we are done so return the
608 * interrupt number.
609 */
610 if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
611 return virq;
612
592 /* 613 /*
593 * If we've already configured this interrupt, 614 * If the trigger type has not been set yet, then set
594 * don't do it again, or hell will break loose. 615 * it now and return the interrupt number.
595 */ 616 */
596 virq = irq_find_mapping(domain, hwirq); 617 if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
597 if (virq) 618 irq_data = irq_get_irq_data(virq);
619 if (!irq_data)
620 return 0;
621
622 irqd_set_trigger_type(irq_data, type);
598 return virq; 623 return virq;
624 }
599 625
626 pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
627 hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
628 return 0;
629 }
630
631 if (irq_domain_is_hierarchy(domain)) {
600 virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); 632 virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
601 if (virq <= 0) 633 if (virq <= 0)
602 return 0; 634 return 0;
@@ -607,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
607 return virq; 639 return virq;
608 } 640 }
609 641
610 /* Set type if specified and different than the current one */ 642 irq_data = irq_get_irq_data(virq);
611 if (type != IRQ_TYPE_NONE && 643 if (!irq_data) {
612 type != irq_get_trigger_type(virq)) 644 if (irq_domain_is_hierarchy(domain))
613 irq_set_irq_type(virq, type); 645 irq_domain_free_irqs(virq, 1);
646 else
647 irq_dispose_mapping(virq);
648 return 0;
649 }
650
651 /* Store trigger type */
652 irqd_set_trigger_type(irq_data, type);
653
614 return virq; 654 return virq;
615} 655}
616EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); 656EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping);
@@ -640,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq)
640 if (WARN_ON(domain == NULL)) 680 if (WARN_ON(domain == NULL))
641 return; 681 return;
642 682
643 irq_domain_disassociate(domain, virq); 683 if (irq_domain_is_hierarchy(domain)) {
644 irq_free_desc(virq); 684 irq_domain_free_irqs(virq, 1);
685 } else {
686 irq_domain_disassociate(domain, virq);
687 irq_free_desc(virq);
688 }
645} 689}
646EXPORT_SYMBOL_GPL(irq_dispose_mapping); 690EXPORT_SYMBOL_GPL(irq_dispose_mapping);
647 691
@@ -835,19 +879,23 @@ const struct irq_domain_ops irq_domain_simple_ops = {
835EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 879EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
836 880
837int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, 881int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
838 int node) 882 int node, const struct cpumask *affinity)
839{ 883{
840 unsigned int hint; 884 unsigned int hint;
841 885
842 if (virq >= 0) { 886 if (virq >= 0) {
843 virq = irq_alloc_descs(virq, virq, cnt, node); 887 virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE,
888 affinity);
844 } else { 889 } else {
845 hint = hwirq % nr_irqs; 890 hint = hwirq % nr_irqs;
846 if (hint == 0) 891 if (hint == 0)
847 hint++; 892 hint++;
848 virq = irq_alloc_descs_from(hint, cnt, node); 893 virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE,
849 if (virq <= 0 && hint > 1) 894 affinity);
850 virq = irq_alloc_descs_from(1, cnt, node); 895 if (virq <= 0 && hint > 1) {
896 virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE,
897 affinity);
898 }
851 } 899 }
852 900
853 return virq; 901 return virq;
@@ -1144,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
1144 if (recursive) 1192 if (recursive)
1145 ret = irq_domain_alloc_irqs_recursive(parent, irq_base, 1193 ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
1146 nr_irqs, arg); 1194 nr_irqs, arg);
1147 if (ret >= 0) 1195 if (ret < 0)
1148 ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); 1196 return ret;
1197
1198 ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
1149 if (ret < 0 && recursive) 1199 if (ret < 0 && recursive)
1150 irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); 1200 irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
1151 1201
@@ -1160,6 +1210,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
1160 * @node: NUMA node id for memory allocation 1210 * @node: NUMA node id for memory allocation
1161 * @arg: domain specific argument 1211 * @arg: domain specific argument
1162 * @realloc: IRQ descriptors have already been allocated if true 1212 * @realloc: IRQ descriptors have already been allocated if true
1213 * @affinity: Optional irq affinity mask for multiqueue devices
1163 * 1214 *
1164 * Allocate IRQ numbers and initialize all data structures to support 1215 * Allocate IRQ numbers and initialize all data structures to support
1165 * hierarchy IRQ domains. 1216 * hierarchy IRQ domains.
@@ -1175,7 +1226,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
1175 */ 1226 */
1176int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, 1227int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
1177 unsigned int nr_irqs, int node, void *arg, 1228 unsigned int nr_irqs, int node, void *arg,
1178 bool realloc) 1229 bool realloc, const struct cpumask *affinity)
1179{ 1230{
1180 int i, ret, virq; 1231 int i, ret, virq;
1181 1232
@@ -1193,7 +1244,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
1193 if (realloc && irq_base >= 0) { 1244 if (realloc && irq_base >= 0) {
1194 virq = irq_base; 1245 virq = irq_base;
1195 } else { 1246 } else {
1196 virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); 1247 virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node,
1248 affinity);
1197 if (virq < 0) { 1249 if (virq < 0) {
1198 pr_debug("cannot allocate IRQ(base %d, count %d)\n", 1250 pr_debug("cannot allocate IRQ(base %d, count %d)\n",
1199 irq_base, nr_irqs); 1251 irq_base, nr_irqs);
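The new early-return block in irq_create_fwspec_mapping() boils down to a small decision table for an already-mapped hwirq: reuse the mapping when the requested trigger matches (or none is requested), adopt the requested trigger when none is set yet, and refuse on a mismatch. A compact sketch of just that decision, with hypothetical constants:

#include <stdio.h>

enum { TYPE_NONE = 0, TYPE_EDGE = 1, TYPE_LEVEL = 4 };	/* illustrative values */

/* Returns the virq to reuse, or 0 to reject the mapping request. */
static int reuse_mapping(int virq, int current_type, int requested_type)
{
	if (requested_type == TYPE_NONE || requested_type == current_type)
		return virq;			/* nothing to change */
	if (current_type == TYPE_NONE)
		return virq;			/* adopt the requested trigger */
	return 0;				/* type mismatch: refuse */
}

int main(void)
{
	printf("%d\n", reuse_mapping(10, TYPE_EDGE, TYPE_NONE));	/* 10 */
	printf("%d\n", reuse_mapping(10, TYPE_NONE, TYPE_LEVEL));	/* 10, trigger adopted */
	printf("%d\n", reuse_mapping(10, TYPE_EDGE, TYPE_LEVEL));	/* 0, mismatch */
	return 0;
}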
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ef0bc02c3a70..73a2b786b5e9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq);
115#ifdef CONFIG_SMP 115#ifdef CONFIG_SMP
116cpumask_var_t irq_default_affinity; 116cpumask_var_t irq_default_affinity;
117 117
118static int __irq_can_set_affinity(struct irq_desc *desc) 118static bool __irq_can_set_affinity(struct irq_desc *desc)
119{ 119{
120 if (!desc || !irqd_can_balance(&desc->irq_data) || 120 if (!desc || !irqd_can_balance(&desc->irq_data) ||
121 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) 121 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
122 return 0; 122 return false;
123 return 1; 123 return true;
124} 124}
125 125
126/** 126/**
@@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq)
134} 134}
135 135
136/** 136/**
137 * irq_can_set_affinity_usr - Check if affinity of an irq can be set from user space
138 * @irq: Interrupt to check
139 *
140 * Like irq_can_set_affinity() above, but additionally checks for the
141 * AFFINITY_MANAGED flag.
142 */
143bool irq_can_set_affinity_usr(unsigned int irq)
144{
145 struct irq_desc *desc = irq_to_desc(irq);
146
147 return __irq_can_set_affinity(desc) &&
148 !irqd_affinity_is_managed(&desc->irq_data);
149}
150
151/**
137 * irq_set_thread_affinity - Notify irq threads to adjust affinity 152 * irq_set_thread_affinity - Notify irq threads to adjust affinity
138 * @desc: irq descriptor which has affinity changed 153 * @desc: irq descriptor which has affinity changed
139 * 154 *
@@ -338,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
338 return 0; 353 return 0;
339 354
340 /* 355 /*
341 * Preserve a userspace affinity setup, but make sure that 356 * Preserve the managed affinity setting and a userspace affinity
342 * one of the targets is online. 357 * setup, but make sure that one of the targets is online.
343 */ 358 */
344 if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { 359 if (irqd_affinity_is_managed(&desc->irq_data) ||
360 irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
345 if (cpumask_intersects(desc->irq_common_data.affinity, 361 if (cpumask_intersects(desc->irq_common_data.affinity,
346 cpu_online_mask)) 362 cpu_online_mask))
347 set = desc->irq_common_data.affinity; 363 set = desc->irq_common_data.affinity;
@@ -1117,6 +1133,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1117 new->irq = irq; 1133 new->irq = irq;
1118 1134
1119 /* 1135 /*
1136 * If the trigger type is not specified by the caller,
1137 * then use the default for this interrupt.
1138 */
1139 if (!(new->flags & IRQF_TRIGGER_MASK))
1140 new->flags |= irqd_get_trigger_type(&desc->irq_data);
1141
1142 /*
1120 * Check whether the interrupt nests into another interrupt 1143 * Check whether the interrupt nests into another interrupt
1121 * thread. 1144 * thread.
1122 */ 1145 */
@@ -1409,10 +1432,18 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1409 1432
1410 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) 1433 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1411 return -EINVAL; 1434 return -EINVAL;
1435
1436 retval = irq_chip_pm_get(&desc->irq_data);
1437 if (retval < 0)
1438 return retval;
1439
1412 chip_bus_lock(desc); 1440 chip_bus_lock(desc);
1413 retval = __setup_irq(irq, desc, act); 1441 retval = __setup_irq(irq, desc, act);
1414 chip_bus_sync_unlock(desc); 1442 chip_bus_sync_unlock(desc);
1415 1443
1444 if (retval)
1445 irq_chip_pm_put(&desc->irq_data);
1446
1416 return retval; 1447 return retval;
1417} 1448}
1418EXPORT_SYMBOL_GPL(setup_irq); 1449EXPORT_SYMBOL_GPL(setup_irq);
@@ -1506,6 +1537,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1506 } 1537 }
1507 } 1538 }
1508 1539
1540 irq_chip_pm_put(&desc->irq_data);
1509 module_put(desc->owner); 1541 module_put(desc->owner);
1510 kfree(action->secondary); 1542 kfree(action->secondary);
1511 return action; 1543 return action;
@@ -1648,11 +1680,16 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1648 action->name = devname; 1680 action->name = devname;
1649 action->dev_id = dev_id; 1681 action->dev_id = dev_id;
1650 1682
1683 retval = irq_chip_pm_get(&desc->irq_data);
1684 if (retval < 0)
1685 return retval;
1686
1651 chip_bus_lock(desc); 1687 chip_bus_lock(desc);
1652 retval = __setup_irq(irq, desc, action); 1688 retval = __setup_irq(irq, desc, action);
1653 chip_bus_sync_unlock(desc); 1689 chip_bus_sync_unlock(desc);
1654 1690
1655 if (retval) { 1691 if (retval) {
1692 irq_chip_pm_put(&desc->irq_data);
1656 kfree(action->secondary); 1693 kfree(action->secondary);
1657 kfree(action); 1694 kfree(action);
1658 } 1695 }
@@ -1730,7 +1767,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
1730 if (!desc) 1767 if (!desc)
1731 return; 1768 return;
1732 1769
1770 /*
1771 * If the trigger type is not specified by the caller, then
1772 * use the default for this interrupt.
1773 */
1733 type &= IRQ_TYPE_SENSE_MASK; 1774 type &= IRQ_TYPE_SENSE_MASK;
1775 if (type == IRQ_TYPE_NONE)
1776 type = irqd_get_trigger_type(&desc->irq_data);
1777
1734 if (type != IRQ_TYPE_NONE) { 1778 if (type != IRQ_TYPE_NONE) {
1735 int ret; 1779 int ret;
1736 1780
@@ -1822,6 +1866,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
1822 1866
1823 unregister_handler_proc(irq, action); 1867 unregister_handler_proc(irq, action);
1824 1868
1869 irq_chip_pm_put(&desc->irq_data);
1825 module_put(desc->owner); 1870 module_put(desc->owner);
1826 return action; 1871 return action;
1827 1872
@@ -1884,10 +1929,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1884 1929
1885 if (!desc || !irq_settings_is_per_cpu_devid(desc)) 1930 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1886 return -EINVAL; 1931 return -EINVAL;
1932
1933 retval = irq_chip_pm_get(&desc->irq_data);
1934 if (retval < 0)
1935 return retval;
1936
1887 chip_bus_lock(desc); 1937 chip_bus_lock(desc);
1888 retval = __setup_irq(irq, desc, act); 1938 retval = __setup_irq(irq, desc, act);
1889 chip_bus_sync_unlock(desc); 1939 chip_bus_sync_unlock(desc);
1890 1940
1941 if (retval)
1942 irq_chip_pm_put(&desc->irq_data);
1943
1891 return retval; 1944 return retval;
1892} 1945}
1893 1946
@@ -1931,12 +1984,18 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1931 action->name = devname; 1984 action->name = devname;
1932 action->percpu_dev_id = dev_id; 1985 action->percpu_dev_id = dev_id;
1933 1986
1987 retval = irq_chip_pm_get(&desc->irq_data);
1988 if (retval < 0)
1989 return retval;
1990
1934 chip_bus_lock(desc); 1991 chip_bus_lock(desc);
1935 retval = __setup_irq(irq, desc, action); 1992 retval = __setup_irq(irq, desc, action);
1936 chip_bus_sync_unlock(desc); 1993 chip_bus_sync_unlock(desc);
1937 1994
1938 if (retval) 1995 if (retval) {
1996 irq_chip_pm_put(&desc->irq_data);
1939 kfree(action); 1997 kfree(action);
1998 }
1940 1999
1941 return retval; 2000 return retval;
1942} 2001}
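Two of the hunks above implement the same fallback rule: if a caller does not pass a trigger type, inherit whatever the descriptor already carries (typically set from the firmware mapping). A tiny sketch of the rule, with made-up flag values:

#include <stdio.h>

#define TRIGGER_MASK	0x0f		/* stand-in for IRQF_TRIGGER_MASK */

/* Pick the effective trigger: caller's request, or the stored default. */
static unsigned int effective_trigger(unsigned int requested, unsigned int stored)
{
	if (!(requested & TRIGGER_MASK))
		requested |= stored & TRIGGER_MASK;
	return requested & TRIGGER_MASK;
}

int main(void)
{
	printf("%#x\n", effective_trigger(0x0, 0x8));	/* falls back to 0x8 */
	printf("%#x\n", effective_trigger(0x2, 0x8));	/* caller wins: 0x2 */
	return 0;
}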
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 38e89ce7b071..54999350162c 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
324 struct msi_domain_ops *ops = info->ops; 324 struct msi_domain_ops *ops = info->ops;
325 msi_alloc_info_t arg; 325 msi_alloc_info_t arg;
326 struct msi_desc *desc; 326 struct msi_desc *desc;
327 int i, ret, virq = -1; 327 int i, ret, virq;
328 328
329 ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); 329 ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
330 if (ret) 330 if (ret)
@@ -332,13 +332,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
332 332
333 for_each_msi_entry(desc, dev) { 333 for_each_msi_entry(desc, dev) {
334 ops->set_desc(&arg, desc); 334 ops->set_desc(&arg, desc);
335 if (info->flags & MSI_FLAG_IDENTITY_MAP)
336 virq = (int)ops->get_hwirq(info, &arg);
337 else
338 virq = -1;
339 335
340 virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, 336 virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
341 dev_to_node(dev), &arg, false); 337 dev_to_node(dev), &arg, false,
338 desc->affinity);
342 if (virq < 0) { 339 if (virq < 0) {
343 ret = -ENOSPC; 340 ret = -ENOSPC;
344 if (ops->handle_error) 341 if (ops->handle_error)
@@ -356,6 +353,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
356 ops->msi_finish(&arg, 0); 353 ops->msi_finish(&arg, 0);
357 354
358 for_each_msi_entry(desc, dev) { 355 for_each_msi_entry(desc, dev) {
356 virq = desc->irq;
359 if (desc->nvec_used == 1) 357 if (desc->nvec_used == 1)
360 dev_dbg(dev, "irq %d for MSI\n", virq); 358 dev_dbg(dev, "irq %d for MSI\n", virq);
361 else 359 else
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4e1b94726818..feaa813b84a9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file,
96 cpumask_var_t new_value; 96 cpumask_var_t new_value;
97 int err; 97 int err;
98 98
99 if (!irq_can_set_affinity(irq) || no_irq_affinity) 99 if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
100 return -EIO; 100 return -EIO;
101 101
102 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 102 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
311 !name_unique(irq, action)) 311 !name_unique(irq, action))
312 return; 312 return;
313 313
314 memset(name, 0, MAX_NAMELEN);
315 snprintf(name, MAX_NAMELEN, "%s", action->name); 314 snprintf(name, MAX_NAMELEN, "%s", action->name);
316 315
317 /* create /proc/irq/1234/handler/ */ 316 /* create /proc/irq/1234/handler/ */
@@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
340 if (desc->dir) 339 if (desc->dir)
341 goto out_unlock; 340 goto out_unlock;
342 341
343 memset(name, 0, MAX_NAMELEN);
344 sprintf(name, "%d", irq); 342 sprintf(name, "%d", irq);
345 343
346 /* create /proc/irq/1234 */ 344 /* create /proc/irq/1234 */
@@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
386#endif 384#endif
387 remove_proc_entry("spurious", desc->dir); 385 remove_proc_entry("spurious", desc->dir);
388 386
389 memset(name, 0, MAX_NAMELEN);
390 sprintf(name, "%u", irq); 387 sprintf(name, "%u", irq);
391 remove_proc_entry(name, root_irq_dir); 388 remove_proc_entry(name, root_irq_dir);
392} 389}
@@ -421,12 +418,8 @@ void init_irq_proc(void)
421 /* 418 /*
422 * Create entries for all existing IRQs. 419 * Create entries for all existing IRQs.
423 */ 420 */
424 for_each_irq_desc(irq, desc) { 421 for_each_irq_desc(irq, desc)
425 if (!desc)
426 continue;
427
428 register_irq_proc(irq, desc); 422 register_irq_proc(irq, desc);
429 }
430} 423}
431 424
432#ifdef CONFIG_GENERIC_IRQ_SHOW 425#ifdef CONFIG_GENERIC_IRQ_SHOW
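The removed memset() calls above were redundant: snprintf() and sprintf() always NUL-terminate the buffer they write (snprintf even when it truncates), so pre-zeroing the array buys nothing. A quick userspace demonstration:

#include <stdio.h>

int main(void)
{
	char name[8];

	/* No memset needed: snprintf terminates within the given size. */
	snprintf(name, sizeof(name), "%s", "a-rather-long-action-name");
	printf("'%s' (length capped at %zu)\n", name, sizeof(name) - 1);

	sprintf(name, "%d", 1234);	/* sprintf terminates as well */
	printf("'%s'\n", name);
	return 0;
}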
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 05254eeb4b4e..0dbea887d625 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key);
58 58
59void static_key_slow_inc(struct static_key *key) 59void static_key_slow_inc(struct static_key *key)
60{ 60{
61 int v, v1;
62
61 STATIC_KEY_CHECK_USE(); 63 STATIC_KEY_CHECK_USE();
62 if (atomic_inc_not_zero(&key->enabled)) 64
63 return; 65 /*
66 * Careful if we get concurrent static_key_slow_inc() calls;
67 * later calls must wait for the first one to _finish_ the
68 * jump_label_update() process. At the same time, however,
69 * the jump_label_update() call below wants to see
70 * static_key_enabled(&key) for jumps to be updated properly.
71 *
72 * So give a special meaning to negative key->enabled: it sends
73 * static_key_slow_inc() down the slow path, and it is non-zero
74 * so it counts as "enabled" in jump_label_update(). Note that
75 * atomic_inc_unless_negative() checks >= 0, so roll our own.
76 */
77 for (v = atomic_read(&key->enabled); v > 0; v = v1) {
78 v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
79 if (likely(v1 == v))
80 return;
81 }
64 82
65 jump_label_lock(); 83 jump_label_lock();
66 if (atomic_inc_return(&key->enabled) == 1) 84 if (atomic_read(&key->enabled) == 0) {
85 atomic_set(&key->enabled, -1);
67 jump_label_update(key); 86 jump_label_update(key);
87 atomic_set(&key->enabled, 1);
88 } else {
89 atomic_inc(&key->enabled);
90 }
68 jump_label_unlock(); 91 jump_label_unlock();
69} 92}
70EXPORT_SYMBOL_GPL(static_key_slow_inc); 93EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
72static void __static_key_slow_dec(struct static_key *key, 95static void __static_key_slow_dec(struct static_key *key,
73 unsigned long rate_limit, struct delayed_work *work) 96 unsigned long rate_limit, struct delayed_work *work)
74{ 97{
98 /*
99 * The negative count check is valid even when a negative
100 * key->enabled is in use by static_key_slow_inc(); a
101 * __static_key_slow_dec() before the first static_key_slow_inc()
102 * returns is unbalanced, because all other static_key_slow_inc()
103 * instances block while the update is in progress.
104 */
75 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { 105 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
76 WARN(atomic_read(&key->enabled) < 0, 106 WARN(atomic_read(&key->enabled) < 0,
77 "jump label: negative count!\n"); 107 "jump label: negative count!\n");
@@ -422,7 +452,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
422 return notifier_from_errno(ret); 452 return notifier_from_errno(ret);
423} 453}
424 454
425struct notifier_block jump_label_module_nb = { 455static struct notifier_block jump_label_module_nb = {
426 .notifier_call = jump_label_module_notify, 456 .notifier_call = jump_label_module_notify,
427 .priority = 1, /* higher than tracepoints */ 457 .priority = 1, /* higher than tracepoints */
428}; 458};
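A userspace model of the static_key_slow_inc() protocol above, using C11 atomics in place of the kernel's atomic_t and a pthread mutex in place of jump_label_lock(); the code-patching step is just a printf here. The point is the same: the fast path only increments strictly positive counts, and the first enabler parks the count at -1 so that concurrent incrementers fall into the locked slow path until the update is complete.

#include <stdio.h>
#include <stdatomic.h>
#include <pthread.h>

static atomic_int enabled;			/* the kernel's key->enabled */
static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;

static void key_slow_inc(void)
{
	int v, v1;

	/* Fast path: only bump counts that are already strictly positive. */
	for (v = atomic_load(&enabled); v > 0; v = v1) {
		v1 = v;
		if (atomic_compare_exchange_strong(&enabled, &v1, v + 1))
			return;
		/* v1 now holds the value we raced with; retry with it */
	}

	pthread_mutex_lock(&update_lock);
	if (atomic_load(&enabled) == 0) {
		atomic_store(&enabled, -1);	/* send others to the slow path */
		printf("patching code for the key\n");
		atomic_store(&enabled, 1);	/* update finished */
	} else {
		atomic_fetch_add(&enabled, 1);
	}
	pthread_mutex_unlock(&update_lock);
}

int main(void)
{
	key_slow_inc();		/* does the "patching" */
	key_slow_inc();		/* plain increment on the fast path */
	printf("enabled = %d\n", atomic_load(&enabled));	/* 2 */
	return 0;
}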
diff --git a/kernel/kcov.c b/kernel/kcov.c
index a02f2dddd1d7..8d44b3fea9d0 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = {
264 264
265static int __init kcov_init(void) 265static int __init kcov_init(void)
266{ 266{
267 if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { 267 /*
268 * The kcov debugfs file won't ever get removed and thus,
269 * there is no need to protect it against removal races. The
270 * use of debugfs_create_file_unsafe() is actually safe here.
271 */
272 if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
268 pr_err("failed to create kcov in debugfs\n"); 273 pr_err("failed to create kcov in debugfs\n");
269 return -ENOMEM; 274 return -ENOMEM;
270 } 275 }
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 81f1a7107c0e..589d763a49b3 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -46,6 +46,7 @@
46#include <linux/gfp.h> 46#include <linux/gfp.h>
47#include <linux/kmemcheck.h> 47#include <linux/kmemcheck.h>
48#include <linux/random.h> 48#include <linux/random.h>
49#include <linux/jhash.h>
49 50
50#include <asm/sections.h> 51#include <asm/sections.h>
51 52
@@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE];
309 * It's a 64-bit hash, because it's important for the keys to be 310 * It's a 64-bit hash, because it's important for the keys to be
310 * unique. 311 * unique.
311 */ 312 */
312#define iterate_chain_key(key1, key2) \ 313static inline u64 iterate_chain_key(u64 key, u32 idx)
313 (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ 314{
314 ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ 315 u32 k0 = key, k1 = key >> 32;
315 (key2)) 316
317 __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */
318
319 return k0 | (u64)k1 << 32;
320}
316 321
317void lockdep_off(void) 322void lockdep_off(void)
318{ 323{
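The chain-key change above swaps an ad-hoc shift/xor for a hash-style mixing step so that different lock sequences are far less likely to collide on the same 64-bit key. A userspace sketch of the same shape, splitting the key into two 32-bit halves and mixing in the class index; the mix function here is a simple stand-in, not the kernel's __jhash_mix:

#include <stdio.h>
#include <stdint.h>

static uint32_t rol32(uint32_t w, unsigned int s)
{
	return (w << s) | (w >> (32 - s));
}

/* Stand-in mixing step: stirs idx into both halves of the key. */
static uint64_t iterate_chain_key(uint64_t key, uint32_t idx)
{
	uint32_t k0 = (uint32_t)key, k1 = (uint32_t)(key >> 32);

	k0 ^= rol32(idx, 7) + k1;
	k1 ^= rol32(k0, 13) + idx;
	k0 += rol32(k1, 19);

	return (uint64_t)k0 | ((uint64_t)k1 << 32);
}

int main(void)
{
	uint64_t key = 0;
	uint32_t classes[] = { 3, 5, 7 };	/* lock class indices taken in order */

	for (unsigned int i = 0; i < 3; i++) {
		key = iterate_chain_key(key, classes[i]);
		printf("after class %u: key=%#llx\n", classes[i],
		       (unsigned long long)key);
	}
	return 0;
}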
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 3ef3736002d8..9c951fade415 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
49} 49}
50 50
51void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, 51void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
52 struct thread_info *ti) 52 struct task_struct *task)
53{ 53{
54 SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); 54 SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
55 55
56 /* Mark the current thread as blocked on the lock: */ 56 /* Mark the current thread as blocked on the lock: */
57 ti->task->blocked_on = waiter; 57 task->blocked_on = waiter;
58} 58}
59 59
60void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 60void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
61 struct thread_info *ti) 61 struct task_struct *task)
62{ 62{
63 DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); 63 DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
64 DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); 64 DEBUG_LOCKS_WARN_ON(waiter->task != task);
65 DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); 65 DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
66 ti->task->blocked_on = NULL; 66 task->blocked_on = NULL;
67 67
68 list_del_init(&waiter->list); 68 list_del_init(&waiter->list);
69 waiter->task = NULL; 69 waiter->task = NULL;
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index 0799fd3e4cfa..57a871ae3c81 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -20,21 +20,21 @@ extern void debug_mutex_wake_waiter(struct mutex *lock,
20extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); 20extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
21extern void debug_mutex_add_waiter(struct mutex *lock, 21extern void debug_mutex_add_waiter(struct mutex *lock,
22 struct mutex_waiter *waiter, 22 struct mutex_waiter *waiter,
23 struct thread_info *ti); 23 struct task_struct *task);
24extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 24extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
25 struct thread_info *ti); 25 struct task_struct *task);
26extern void debug_mutex_unlock(struct mutex *lock); 26extern void debug_mutex_unlock(struct mutex *lock);
27extern void debug_mutex_init(struct mutex *lock, const char *name, 27extern void debug_mutex_init(struct mutex *lock, const char *name,
28 struct lock_class_key *key); 28 struct lock_class_key *key);
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current; 32 WRITE_ONCE(lock->owner, current);
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
36{ 36{
37 lock->owner = NULL; 37 WRITE_ONCE(lock->owner, NULL);
38} 38}
39 39
40#define spin_lock_mutex(lock, flags) \ 40#define spin_lock_mutex(lock, flags) \
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index e364b424b019..a70b90db3909 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
486 if (!hold_ctx) 486 if (!hold_ctx)
487 return 0; 487 return 0;
488 488
489 if (unlikely(ctx == hold_ctx))
490 return -EALREADY;
491
492 if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && 489 if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
493 (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { 490 (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
494#ifdef CONFIG_DEBUG_MUTEXES 491#ifdef CONFIG_DEBUG_MUTEXES
@@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
514 unsigned long flags; 511 unsigned long flags;
515 int ret; 512 int ret;
516 513
514 if (use_ww_ctx) {
515 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
516 if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
517 return -EALREADY;
518 }
519
517 preempt_disable(); 520 preempt_disable();
518 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); 521 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
519 522
@@ -534,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
534 goto skip_wait; 537 goto skip_wait;
535 538
536 debug_mutex_lock_common(lock, &waiter); 539 debug_mutex_lock_common(lock, &waiter);
537 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 540 debug_mutex_add_waiter(lock, &waiter, task);
538 541
539 /* add waiting tasks to the end of the waitqueue (FIFO): */ 542 /* add waiting tasks to the end of the waitqueue (FIFO): */
540 list_add_tail(&waiter.list, &lock->wait_list); 543 list_add_tail(&waiter.list, &lock->wait_list);
@@ -581,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
581 } 584 }
582 __set_task_state(task, TASK_RUNNING); 585 __set_task_state(task, TASK_RUNNING);
583 586
584 mutex_remove_waiter(lock, &waiter, current_thread_info()); 587 mutex_remove_waiter(lock, &waiter, task);
585 /* set it to 0 if there are no waiters left: */ 588 /* set it to 0 if there are no waiters left: */
586 if (likely(list_empty(&lock->wait_list))) 589 if (likely(list_empty(&lock->wait_list)))
587 atomic_set(&lock->count, 0); 590 atomic_set(&lock->count, 0);
@@ -602,7 +605,7 @@ skip_wait:
602 return 0; 605 return 0;
603 606
604err: 607err:
605 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 608 mutex_remove_waiter(lock, &waiter, task);
606 spin_unlock_mutex(&lock->wait_lock, flags); 609 spin_unlock_mutex(&lock->wait_lock, flags);
607 debug_mutex_free_waiter(&waiter); 610 debug_mutex_free_waiter(&waiter);
608 mutex_release(&lock->dep_map, 1, ip); 611 mutex_release(&lock->dep_map, 1, ip);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 5cda397607f2..6cd6b8e9efd7 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -13,18 +13,24 @@
13 do { spin_lock(lock); (void)(flags); } while (0) 13 do { spin_lock(lock); (void)(flags); } while (0)
14#define spin_unlock_mutex(lock, flags) \ 14#define spin_unlock_mutex(lock, flags) \
15 do { spin_unlock(lock); (void)(flags); } while (0) 15 do { spin_unlock(lock); (void)(flags); } while (0)
16#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, task) \
17 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
18 18
19#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 19#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
20/*
21 * The mutex owner can get read and written to locklessly.
22 * We should use WRITE_ONCE when writing the owner value to
23 * avoid store tearing, otherwise, a thread could potentially
24 * read a partially written and incomplete owner value.
25 */
20static inline void mutex_set_owner(struct mutex *lock) 26static inline void mutex_set_owner(struct mutex *lock)
21{ 27{
22 lock->owner = current; 28 WRITE_ONCE(lock->owner, current);
23} 29}
24 30
25static inline void mutex_clear_owner(struct mutex *lock) 31static inline void mutex_clear_owner(struct mutex *lock)
26{ 32{
27 lock->owner = NULL; 33 WRITE_ONCE(lock->owner, NULL);
28} 34}
29#else 35#else
30static inline void mutex_set_owner(struct mutex *lock) 36static inline void mutex_set_owner(struct mutex *lock)
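The mutex owner is read locklessly by the optimistic-spinning code, which is why the stores above go through WRITE_ONCE(). A userspace stand-in for that pattern; these macros are simplified approximations of the kernel's READ_ONCE/WRITE_ONCE, not the real definitions:

#include <stdio.h>

/* Simplified stand-ins: a volatile access forces one full-width store or
 * load and keeps the compiler from tearing or caching the value. */
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))

struct task { const char *name; };
struct fake_mutex { struct task *owner; };

static void set_owner(struct fake_mutex *lock, struct task *task)
{
	WRITE_ONCE(lock->owner, task);		/* single, untorn store */
}

int main(void)
{
	struct task me = { "me" };
	struct fake_mutex lock = { 0 };

	set_owner(&lock, &me);
	printf("owner: %s\n", READ_ONCE(lock.owner)->name);
	return 0;
}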
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fec082338668..19248ddf37ce 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
93 * that accesses can't leak upwards out of our subsequent critical 93 * that accesses can't leak upwards out of our subsequent critical
94 * section in the case that the lock is currently held for write. 94 * section in the case that the lock is currently held for write.
95 */ 95 */
96 cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; 96 cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
97 rspin_until_writer_unlock(lock, cnts); 97 rspin_until_writer_unlock(lock, cnts);
98 98
99 /* 99 /*
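The qrwlock change above replaces an add-and-return-new-value primitive (followed by subtracting the bias back out) with a fetch-and-add that returns the old value directly. The equivalence, checked with C11 atomics and an illustrative bias value:

#include <stdio.h>
#include <stdatomic.h>

#define QR_BIAS 0x100		/* illustrative reader-bias increment */

int main(void)
{
	atomic_uint cnts = 0x1;	/* some pre-existing lock state */
	unsigned int old, newv;

	/* fetch_add returns the value *before* the addition ... */
	old = atomic_fetch_add(&cnts, QR_BIAS);

	/* ... which equals "new value minus the bias". */
	newv = atomic_load(&cnts);
	printf("old=%#x new-bias=%#x equal=%d\n",
	       old, newv - QR_BIAS, old == newv - QR_BIAS);
	return 0;
}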
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce2f75e32ae1..b2caec7315af 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
90 * therefore increment the cpu number by one. 90 * therefore increment the cpu number by one.
91 */ 91 */
92 92
93static inline u32 encode_tail(int cpu, int idx) 93static inline __pure u32 encode_tail(int cpu, int idx)
94{ 94{
95 u32 tail; 95 u32 tail;
96 96
@@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx)
103 return tail; 103 return tail;
104} 104}
105 105
106static inline struct mcs_spinlock *decode_tail(u32 tail) 106static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
107{ 107{
108 int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; 108 int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
109 int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; 109 int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
@@ -267,6 +267,123 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
267#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath 267#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
268#endif 268#endif
269 269
270/*
271 * Various notes on spin_is_locked() and spin_unlock_wait(), which are
272 * 'interesting' functions:
273 *
274 * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
275 * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
276 * PPC). Also qspinlock has a similar issue per construction, the setting of
277 * the locked byte can be unordered with respect to acquiring the lock proper.
278 *
279 * This gets to be 'interesting' in the following cases, where the /should/s
280 * end up false because of this issue.
281 *
282 *
283 * CASE 1:
284 *
285 * So the spin_is_locked() correctness issue comes from something like:
286 *
287 * CPU0 CPU1
288 *
289 * global_lock(); local_lock(i)
290 * spin_lock(&G) spin_lock(&L[i])
291 * for (i) if (!spin_is_locked(&G)) {
292 * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
293 * return;
294 * }
295 * // deal with fail
296 *
297 * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
298 * that there is exclusion between the two critical sections.
299 *
300 * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
301 * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
302 * /should/ be constrained by the ACQUIRE from spin_lock(&G).
303 *
304 * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
305 *
306 *
307 * CASE 2:
308 *
309 * For spin_unlock_wait() there is a second correctness issue, namely:
310 *
311 * CPU0 CPU1
312 *
313 * flag = set;
314 * smp_mb(); spin_lock(&l)
315 * spin_unlock_wait(&l); if (!flag)
316 * // add to lockless list
317 * spin_unlock(&l);
318 * // iterate lockless list
319 *
320 * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
321 * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
322 * semantics etc..)
323 *
324 * Where flag /should/ be ordered against the locked store of l.
325 */
326
327/*
328 * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
329 * issuing an _unordered_ store to set _Q_LOCKED_VAL.
330 *
331 * This means that the store can be delayed, but no later than the
332 * store-release from the unlock. This means that simply observing
333 * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
334 *
335 * There are two paths that can issue the unordered store:
336 *
337 * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
338 *
339 * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
340 * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
341 *
342 * However, in both cases we have other !0 state we've set before to queue
343 * ourselves:
344 *
345 * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
346 * load is constrained by that ACQUIRE to not pass before that, and thus must
347 * observe the store.
348 *
349 * For (2) we have a more interesting scenario. We enqueue ourselves using
350 * xchg_tail(), which ends up being a RELEASE. This in itself is not
351 * sufficient, however that is followed by an smp_cond_acquire() on the same
352 * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
353 * guarantees we must observe that store.
354 *
355 * Therefore both cases have other !0 state that is observable before the
356 * unordered locked byte store comes through. This means we can use that to
357 * wait for the lock store, and then wait for an unlock.
358 */
359#ifndef queued_spin_unlock_wait
360void queued_spin_unlock_wait(struct qspinlock *lock)
361{
362 u32 val;
363
364 for (;;) {
365 val = atomic_read(&lock->val);
366
367 if (!val) /* not locked, we're done */
368 goto done;
369
370 if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
371 break;
372
373 /* not locked, but pending, wait until we observe the lock */
374 cpu_relax();
375 }
376
377 /* any unlock is good */
378 while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
379 cpu_relax();
380
381done:
382 smp_acquire__after_ctrl_dep();
383}
384EXPORT_SYMBOL(queued_spin_unlock_wait);
385#endif
386
270#endif /* _GEN_PV_LOCK_SLOWPATH */ 387#endif /* _GEN_PV_LOCK_SLOWPATH */
271 388
272/** 389/**
@@ -358,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
358 * sequentiality; this is because not all clear_pending_set_locked() 475 * sequentiality; this is because not all clear_pending_set_locked()
359 * implementations imply full barriers. 476 * implementations imply full barriers.
360 */ 477 */
361 smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); 478 smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
362 479
363 /* 480 /*
364 * take ownership and clear the pending bit. 481 * take ownership and clear the pending bit.
@@ -395,6 +512,8 @@ queue:
395 * pending stuff. 512 * pending stuff.
396 * 513 *
397 * p,*,* -> n,*,* 514 * p,*,* -> n,*,*
515 *
516 * RELEASE, such that the stores to @node must be complete.
398 */ 517 */
399 old = xchg_tail(lock, tail); 518 old = xchg_tail(lock, tail);
400 next = NULL; 519 next = NULL;
@@ -405,6 +524,15 @@ queue:
405 */ 524 */
406 if (old & _Q_TAIL_MASK) { 525 if (old & _Q_TAIL_MASK) {
407 prev = decode_tail(old); 526 prev = decode_tail(old);
527 /*
528 * The above xchg_tail() is also a load of @lock which generates,
529 * through decode_tail(), a pointer.
530 *
531 * The address dependency matches the RELEASE of xchg_tail()
532 * such that the access to @prev must happen after.
533 */
534 smp_read_barrier_depends();
535
408 WRITE_ONCE(prev->next, node); 536 WRITE_ONCE(prev->next, node);
409 537
410 pv_wait_node(node, prev); 538 pv_wait_node(node, prev);
@@ -434,7 +562,7 @@ queue:
434 * 562 *
435 * The PV pv_wait_head_or_lock function, if active, will acquire 563 * The PV pv_wait_head_or_lock function, if active, will acquire
436 * the lock and return a non-zero value. So we have to skip the 564 * the lock and return a non-zero value. So we have to skip the
437 * smp_cond_acquire() call. As the next PV queue head hasn't been 565 * smp_cond_load_acquire() call. As the next PV queue head hasn't been
438 * designated yet, there is no way for the locked value to become 566 * designated yet, there is no way for the locked value to become
439 * _Q_SLOW_VAL. So both the set_locked() and the 567 * _Q_SLOW_VAL. So both the set_locked() and the
440 * atomic_cmpxchg_relaxed() calls will be safe. 568 * atomic_cmpxchg_relaxed() calls will be safe.
@@ -445,7 +573,7 @@ queue:
445 if ((val = pv_wait_head_or_lock(lock, node))) 573 if ((val = pv_wait_head_or_lock(lock, node)))
446 goto locked; 574 goto locked;
447 575
448 smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); 576 val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
449 577
450locked: 578locked:
451 /* 579 /*
@@ -465,9 +593,9 @@ locked:
465 break; 593 break;
466 } 594 }
467 /* 595 /*
468 * The smp_cond_acquire() call above has provided the necessary 596 * The smp_cond_load_acquire() call above has provided the
469 * acquire semantics required for locking. At most two 597 * necessary acquire semantics required for locking. At most
470 * iterations of this loop may be run. 598 * two iterations of this loop may be run.
471 */ 599 */
472 old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); 600 old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
473 if (old == val) 601 if (old == val)
@@ -491,7 +619,7 @@ release:
491 /* 619 /*
492 * release the node 620 * release the node
493 */ 621 */
494 this_cpu_dec(mcs_nodes[0].count); 622 __this_cpu_dec(mcs_nodes[0].count);
495} 623}
496EXPORT_SYMBOL(queued_spin_lock_slowpath); 624EXPORT_SYMBOL(queued_spin_lock_slowpath);
497 625
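The hunks above swap smp_cond_acquire() for smp_cond_load_acquire(), which spins on one variable until a predicate over the loaded value (exposed as VAL) holds, and gives the satisfying load acquire semantics. A minimal userspace analogue using C11 atomics is sketched below; the macro name, the acquire-on-every-iteration shortcut, and the demo values are illustrative, not the kernel implementation.

/* cond_load_acquire_demo.c -- illustrative sketch, not kernel code. */
#include <stdatomic.h>
#include <stdio.h>

/*
 * Spin until cond_expr (written in terms of VAL) is true. Loading with
 * acquire on every iteration is stronger than the kernel primitive needs,
 * but keeps the sketch simple: whatever was published before the matching
 * release store is visible once the loop exits.
 */
#define cond_load_acquire(ptr, cond_expr)                               \
({                                                                      \
        unsigned int VAL;                                               \
        do {                                                            \
                VAL = atomic_load_explicit(ptr, memory_order_acquire);  \
        } while (!(cond_expr));                                         \
        VAL;                                                            \
})

static atomic_uint lockword = 1;        /* bit 0 stands in for _Q_LOCKED_MASK */

int main(void)
{
        /* Pretend another CPU released the lock with a release store. */
        atomic_store_explicit(&lockword, 0, memory_order_release);

        unsigned int v = cond_load_acquire(&lockword, !(VAL & 1));
        printf("observed lock word: %u\n", v);
        return 0;
}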
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 21ede57f68b3..37649e69056c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
112#else /* _Q_PENDING_BITS == 8 */ 112#else /* _Q_PENDING_BITS == 8 */
113static __always_inline void set_pending(struct qspinlock *lock) 113static __always_inline void set_pending(struct qspinlock *lock)
114{ 114{
115 atomic_set_mask(_Q_PENDING_VAL, &lock->val); 115 atomic_or(_Q_PENDING_VAL, &lock->val);
116} 116}
117 117
118static __always_inline void clear_pending(struct qspinlock *lock) 118static __always_inline void clear_pending(struct qspinlock *lock)
119{ 119{
120 atomic_clear_mask(_Q_PENDING_VAL, &lock->val); 120 atomic_andnot(_Q_PENDING_VAL, &lock->val);
121} 121}
122 122
123static __always_inline int trylock_clear_pending(struct qspinlock *lock) 123static __always_inline int trylock_clear_pending(struct qspinlock *lock)
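For reference, the set_pending()/clear_pending() conversion above is a straight move to fetch-or / fetch-and-with-complement; a standalone C11 sketch of the same idea (the bit value is an assumption made for illustration):

/* pending_bit_demo.c -- illustrative only; mirrors atomic_or()/atomic_andnot(). */
#include <stdatomic.h>
#include <stdio.h>

#define Q_PENDING_VAL (1u << 8)         /* assumed bit position */

static atomic_uint lockval;

static void set_pending(void)
{
        atomic_fetch_or_explicit(&lockval, Q_PENDING_VAL, memory_order_relaxed);
}

static void clear_pending(void)
{
        atomic_fetch_and_explicit(&lockval, ~Q_PENDING_VAL, memory_order_relaxed);
}

int main(void)
{
        set_pending();
        printf("after set:   0x%x\n", atomic_load(&lockval));
        clear_pending();
        printf("after clear: 0x%x\n", atomic_load(&lockval));
        return 0;
}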
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3e746607abe5..1ec0f48962b3 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
1478 */ 1478 */
1479int __sched rt_mutex_trylock(struct rt_mutex *lock) 1479int __sched rt_mutex_trylock(struct rt_mutex *lock)
1480{ 1480{
1481 if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) 1481 if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
1482 return 0; 1482 return 0;
1483 1483
1484 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); 1484 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 09e30c6225e5..447e08de1fab 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
80 debug_check_no_locks_freed((void *)sem, sizeof(*sem)); 80 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
81 lockdep_init_map(&sem->dep_map, name, key, 0); 81 lockdep_init_map(&sem->dep_map, name, key, 0);
82#endif 82#endif
83 sem->count = RWSEM_UNLOCKED_VALUE; 83 atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
84 raw_spin_lock_init(&sem->wait_lock); 84 raw_spin_lock_init(&sem->wait_lock);
85 INIT_LIST_HEAD(&sem->wait_list); 85 INIT_LIST_HEAD(&sem->wait_list);
86#ifdef CONFIG_RWSEM_SPIN_ON_OWNER 86#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
@@ -114,12 +114,16 @@ enum rwsem_wake_type {
114 * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) 114 * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
115 * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) 115 * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
116 * - there must be someone on the queue 116 * - there must be someone on the queue
117 * - the spinlock must be held by the caller 117 * - the wait_lock must be held by the caller
118 * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
119 * to actually wake up the blocked task(s) and drop the reference count,
120 * preferably when the wait_lock is released
118 * - woken process blocks are discarded from the list after having task zeroed 121 * - woken process blocks are discarded from the list after having task zeroed
119 * - writers are only woken if downgrading is false 122 * - writers are only marked woken if downgrading is false
120 */ 123 */
121static struct rw_semaphore * 124static struct rw_semaphore *
122__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) 125__rwsem_mark_wake(struct rw_semaphore *sem,
126 enum rwsem_wake_type wake_type, struct wake_q_head *wake_q)
123{ 127{
124 struct rwsem_waiter *waiter; 128 struct rwsem_waiter *waiter;
125 struct task_struct *tsk; 129 struct task_struct *tsk;
@@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
128 132
129 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); 133 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
130 if (waiter->type == RWSEM_WAITING_FOR_WRITE) { 134 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
131 if (wake_type == RWSEM_WAKE_ANY) 135 if (wake_type == RWSEM_WAKE_ANY) {
132 /* Wake writer at the front of the queue, but do not 136 /*
133 * grant it the lock yet as we want other writers 137 * Mark writer at the front of the queue for wakeup.
134 * to be able to steal it. Readers, on the other hand, 138 * Until the task is actually awoken later by
135 * will block as they will notice the queued writer. 139 * the caller, other writers are able to steal it.
140 * Readers, on the other hand, will block as they
141 * will notice the queued writer.
136 */ 142 */
137 wake_up_process(waiter->task); 143 wake_q_add(wake_q, waiter->task);
144 }
138 goto out; 145 goto out;
139 } 146 }
140 147
@@ -146,15 +153,27 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
146 if (wake_type != RWSEM_WAKE_READ_OWNED) { 153 if (wake_type != RWSEM_WAKE_READ_OWNED) {
147 adjustment = RWSEM_ACTIVE_READ_BIAS; 154 adjustment = RWSEM_ACTIVE_READ_BIAS;
148 try_reader_grant: 155 try_reader_grant:
149 oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; 156 oldcount = atomic_long_fetch_add(adjustment, &sem->count);
157
150 if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { 158 if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
151 /* A writer stole the lock. Undo our reader grant. */ 159 /*
152 if (rwsem_atomic_update(-adjustment, sem) & 160 * If the count is still less than RWSEM_WAITING_BIAS
153 RWSEM_ACTIVE_MASK) 161 * after removing the adjustment, it is assumed that
162 * a writer has stolen the lock. We have to undo our
163 * reader grant.
164 */
165 if (atomic_long_add_return(-adjustment, &sem->count) <
166 RWSEM_WAITING_BIAS)
154 goto out; 167 goto out;
155 /* Last active locker left. Retry waking readers. */ 168 /* Last active locker left. Retry waking readers. */
156 goto try_reader_grant; 169 goto try_reader_grant;
157 } 170 }
171 /*
172 * It is not really necessary to set it to reader-owned here,
173 * but it gives the spinners an early indication that the
174 * readers now have the lock.
175 */
176 rwsem_set_reader_owned(sem);
158 } 177 }
159 178
160 /* Grant an infinite number of read locks to the readers at the front 179 /* Grant an infinite number of read locks to the readers at the front
@@ -179,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
179 adjustment -= RWSEM_WAITING_BIAS; 198 adjustment -= RWSEM_WAITING_BIAS;
180 199
181 if (adjustment) 200 if (adjustment)
182 rwsem_atomic_add(adjustment, sem); 201 atomic_long_add(adjustment, &sem->count);
183 202
184 next = sem->wait_list.next; 203 next = sem->wait_list.next;
185 loop = woken; 204 loop = woken;
@@ -187,17 +206,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
187 waiter = list_entry(next, struct rwsem_waiter, list); 206 waiter = list_entry(next, struct rwsem_waiter, list);
188 next = waiter->list.next; 207 next = waiter->list.next;
189 tsk = waiter->task; 208 tsk = waiter->task;
209
210 wake_q_add(wake_q, tsk);
190 /* 211 /*
191 * Make sure we do not wakeup the next reader before 212 * Ensure that the last operation is setting the reader
192 * setting the nil condition to grant the next reader; 213 * waiter to nil such that rwsem_down_read_failed() cannot
193 * otherwise we could miss the wakeup on the other 214 * race with do_exit() by always holding a reference count
194 * side and end up sleeping again. See the pairing 215 * to the task to wakeup.
195 * in rwsem_down_read_failed().
196 */ 216 */
197 smp_mb(); 217 smp_store_release(&waiter->task, NULL);
198 waiter->task = NULL;
199 wake_up_process(tsk);
200 put_task_struct(tsk);
201 } while (--loop); 218 } while (--loop);
202 219
203 sem->wait_list.next = next; 220 sem->wait_list.next = next;
@@ -216,11 +233,11 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
216 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; 233 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
217 struct rwsem_waiter waiter; 234 struct rwsem_waiter waiter;
218 struct task_struct *tsk = current; 235 struct task_struct *tsk = current;
236 WAKE_Q(wake_q);
219 237
220 /* set up my own style of waitqueue */ 238 /* set up my own style of waitqueue */
221 waiter.task = tsk; 239 waiter.task = tsk;
222 waiter.type = RWSEM_WAITING_FOR_READ; 240 waiter.type = RWSEM_WAITING_FOR_READ;
223 get_task_struct(tsk);
224 241
225 raw_spin_lock_irq(&sem->wait_lock); 242 raw_spin_lock_irq(&sem->wait_lock);
226 if (list_empty(&sem->wait_list)) 243 if (list_empty(&sem->wait_list))
@@ -228,7 +245,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
228 list_add_tail(&waiter.list, &sem->wait_list); 245 list_add_tail(&waiter.list, &sem->wait_list);
229 246
230 /* we're now waiting on the lock, but no longer actively locking */ 247 /* we're now waiting on the lock, but no longer actively locking */
231 count = rwsem_atomic_update(adjustment, sem); 248 count = atomic_long_add_return(adjustment, &sem->count);
232 249
233 /* If there are no active locks, wake the front queued process(es). 250 /* If there are no active locks, wake the front queued process(es).
234 * 251 *
@@ -238,9 +255,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
238 if (count == RWSEM_WAITING_BIAS || 255 if (count == RWSEM_WAITING_BIAS ||
239 (count > RWSEM_WAITING_BIAS && 256 (count > RWSEM_WAITING_BIAS &&
240 adjustment != -RWSEM_ACTIVE_READ_BIAS)) 257 adjustment != -RWSEM_ACTIVE_READ_BIAS))
241 sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); 258 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
242 259
243 raw_spin_unlock_irq(&sem->wait_lock); 260 raw_spin_unlock_irq(&sem->wait_lock);
261 wake_up_q(&wake_q);
244 262
245 /* wait to be given the lock */ 263 /* wait to be given the lock */
246 while (true) { 264 while (true) {
@@ -255,17 +273,29 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
255} 273}
256EXPORT_SYMBOL(rwsem_down_read_failed); 274EXPORT_SYMBOL(rwsem_down_read_failed);
257 275
276/*
277 * This function must be called with the sem->wait_lock held to prevent
278 * race conditions between checking the rwsem wait list and setting the
279 * sem->count accordingly.
280 */
258static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) 281static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
259{ 282{
260 /* 283 /*
261 * Try acquiring the write lock. Check count first in order 284 * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
262 * to reduce unnecessary expensive cmpxchg() operations.
263 */ 285 */
264 if (count == RWSEM_WAITING_BIAS && 286 if (count != RWSEM_WAITING_BIAS)
265 cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, 287 return false;
266 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { 288
267 if (!list_is_singular(&sem->wait_list)) 289 /*
268 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); 290 * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
291 * are other tasks on the wait list, we need to add on WAITING_BIAS.
292 */
293 count = list_is_singular(&sem->wait_list) ?
294 RWSEM_ACTIVE_WRITE_BIAS :
295 RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
296
297 if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
298 == RWSEM_WAITING_BIAS) {
269 rwsem_set_owner(sem); 299 rwsem_set_owner(sem);
270 return true; 300 return true;
271 } 301 }
@@ -279,13 +309,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
279 */ 309 */
280static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) 310static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
281{ 311{
282 long old, count = READ_ONCE(sem->count); 312 long old, count = atomic_long_read(&sem->count);
283 313
284 while (true) { 314 while (true) {
285 if (!(count == 0 || count == RWSEM_WAITING_BIAS)) 315 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
286 return false; 316 return false;
287 317
288 old = cmpxchg_acquire(&sem->count, count, 318 old = atomic_long_cmpxchg_acquire(&sem->count, count,
289 count + RWSEM_ACTIVE_WRITE_BIAS); 319 count + RWSEM_ACTIVE_WRITE_BIAS);
290 if (old == count) { 320 if (old == count) {
291 rwsem_set_owner(sem); 321 rwsem_set_owner(sem);
@@ -306,16 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
306 336
307 rcu_read_lock(); 337 rcu_read_lock();
308 owner = READ_ONCE(sem->owner); 338 owner = READ_ONCE(sem->owner);
309 if (!owner) { 339 if (!rwsem_owner_is_writer(owner)) {
310 long count = READ_ONCE(sem->count);
311 /* 340 /*
312 * If sem->owner is not set, yet we have just recently entered the 341 * Don't spin if the rwsem is readers owned.
313 * slowpath with the lock being active, then there is a possibility
314 * reader(s) may have the lock. To be safe, bail spinning in these
315 * situations.
316 */ 342 */
317 if (count & RWSEM_ACTIVE_MASK) 343 ret = !rwsem_owner_is_reader(owner);
318 ret = false;
319 goto done; 344 goto done;
320 } 345 }
321 346
@@ -325,10 +350,15 @@ done:
325 return ret; 350 return ret;
326} 351}
327 352
328static noinline 353/*
329bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) 354 * Return true only if we can still spin on the owner field of the rwsem.
355 */
356static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
330{ 357{
331 long count; 358 struct task_struct *owner = READ_ONCE(sem->owner);
359
360 if (!rwsem_owner_is_writer(owner))
361 goto out;
332 362
333 rcu_read_lock(); 363 rcu_read_lock();
334 while (sem->owner == owner) { 364 while (sem->owner == owner) {
@@ -349,22 +379,16 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
349 cpu_relax_lowlatency(); 379 cpu_relax_lowlatency();
350 } 380 }
351 rcu_read_unlock(); 381 rcu_read_unlock();
352 382out:
353 if (READ_ONCE(sem->owner))
354 return true; /* new owner, continue spinning */
355
356 /* 383 /*
357 * When the owner is not set, the lock could be free or 384 * If there is a new owner or the owner is not set, we continue
358 * held by readers. Check the counter to verify the 385 * spinning.
359 * state.
360 */ 386 */
361 count = READ_ONCE(sem->count); 387 return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
362 return (count == 0 || count == RWSEM_WAITING_BIAS);
363} 388}
364 389
365static bool rwsem_optimistic_spin(struct rw_semaphore *sem) 390static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
366{ 391{
367 struct task_struct *owner;
368 bool taken = false; 392 bool taken = false;
369 393
370 preempt_disable(); 394 preempt_disable();
@@ -376,12 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
376 if (!osq_lock(&sem->osq)) 400 if (!osq_lock(&sem->osq))
377 goto done; 401 goto done;
378 402
379 while (true) { 403 /*
380 owner = READ_ONCE(sem->owner); 404 * Optimistically spin on the owner field and attempt to acquire the
381 if (owner && !rwsem_spin_on_owner(sem, owner)) 405 * lock whenever the owner changes. Spinning will be stopped when:
382 break; 406 * 1) the owning writer isn't running; or
383 407 * 2) readers own the lock as we can't determine if they are
384 /* wait_lock will be acquired if write_lock is obtained */ 408 * actively running or not.
409 */
410 while (rwsem_spin_on_owner(sem)) {
411 /*
412 * Try to acquire the lock
413 */
385 if (rwsem_try_write_lock_unqueued(sem)) { 414 if (rwsem_try_write_lock_unqueued(sem)) {
386 taken = true; 415 taken = true;
387 break; 416 break;
@@ -393,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
393 * we're an RT task that will live-lock because we won't let 422 * we're an RT task that will live-lock because we won't let
394 * the owner complete. 423 * the owner complete.
395 */ 424 */
396 if (!owner && (need_resched() || rt_task(current))) 425 if (!sem->owner && (need_resched() || rt_task(current)))
397 break; 426 break;
398 427
399 /* 428 /*
@@ -440,9 +469,10 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
440 bool waiting = true; /* any queued threads before us */ 469 bool waiting = true; /* any queued threads before us */
441 struct rwsem_waiter waiter; 470 struct rwsem_waiter waiter;
442 struct rw_semaphore *ret = sem; 471 struct rw_semaphore *ret = sem;
472 WAKE_Q(wake_q);
443 473
444 /* undo write bias from down_write operation, stop active locking */ 474 /* undo write bias from down_write operation, stop active locking */
445 count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); 475 count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
446 476
447 /* do optimistic spinning and steal lock if possible */ 477 /* do optimistic spinning and steal lock if possible */
448 if (rwsem_optimistic_spin(sem)) 478 if (rwsem_optimistic_spin(sem))
@@ -465,18 +495,29 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
465 495
466 /* we're now waiting on the lock, but no longer actively locking */ 496 /* we're now waiting on the lock, but no longer actively locking */
467 if (waiting) { 497 if (waiting) {
468 count = READ_ONCE(sem->count); 498 count = atomic_long_read(&sem->count);
469 499
470 /* 500 /*
471 * If there were already threads queued before us and there are 501 * If there were already threads queued before us and there are
472 * no active writers, the lock must be read owned; so we try to 502 * no active writers, the lock must be read owned; so we try to
473 * wake any read locks that were queued ahead of us. 503 * wake any read locks that were queued ahead of us.
474 */ 504 */
475 if (count > RWSEM_WAITING_BIAS) 505 if (count > RWSEM_WAITING_BIAS) {
476 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); 506 WAKE_Q(wake_q);
507
508 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
509 /*
510 * The wakeup is normally called _after_ the wait_lock
511 * is released, but given that we are proactively waking
512 * readers we can deal with the wake_q overhead as it is
513 * similar to releasing and taking the wait_lock again
514 * for attempting rwsem_try_write_lock().
515 */
516 wake_up_q(&wake_q);
517 }
477 518
478 } else 519 } else
479 count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); 520 count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
480 521
481 /* wait until we successfully acquire the lock */ 522 /* wait until we successfully acquire the lock */
482 set_current_state(state); 523 set_current_state(state);
@@ -492,7 +533,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
492 533
493 schedule(); 534 schedule();
494 set_current_state(state); 535 set_current_state(state);
495 } while ((count = sem->count) & RWSEM_ACTIVE_MASK); 536 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
496 537
497 raw_spin_lock_irq(&sem->wait_lock); 538 raw_spin_lock_irq(&sem->wait_lock);
498 } 539 }
@@ -507,10 +548,11 @@ out_nolock:
507 raw_spin_lock_irq(&sem->wait_lock); 548 raw_spin_lock_irq(&sem->wait_lock);
508 list_del(&waiter.list); 549 list_del(&waiter.list);
509 if (list_empty(&sem->wait_list)) 550 if (list_empty(&sem->wait_list))
510 rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); 551 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
511 else 552 else
512 __rwsem_do_wake(sem, RWSEM_WAKE_ANY); 553 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
513 raw_spin_unlock_irq(&sem->wait_lock); 554 raw_spin_unlock_irq(&sem->wait_lock);
555 wake_up_q(&wake_q);
514 556
515 return ERR_PTR(-EINTR); 557 return ERR_PTR(-EINTR);
516} 558}
@@ -537,6 +579,7 @@ __visible
537struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) 579struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
538{ 580{
539 unsigned long flags; 581 unsigned long flags;
582 WAKE_Q(wake_q);
540 583
541 /* 584 /*
542 * If a spinner is present, it is not necessary to do the wakeup. 585 * If a spinner is present, it is not necessary to do the wakeup.
@@ -573,9 +616,10 @@ locked:
573 616
574 /* do nothing if list empty */ 617 /* do nothing if list empty */
575 if (!list_empty(&sem->wait_list)) 618 if (!list_empty(&sem->wait_list))
576 sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); 619 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
577 620
578 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 621 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
622 wake_up_q(&wake_q);
579 623
580 return sem; 624 return sem;
581} 625}
@@ -590,14 +634,16 @@ __visible
590struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) 634struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
591{ 635{
592 unsigned long flags; 636 unsigned long flags;
637 WAKE_Q(wake_q);
593 638
594 raw_spin_lock_irqsave(&sem->wait_lock, flags); 639 raw_spin_lock_irqsave(&sem->wait_lock, flags);
595 640
596 /* do nothing if list empty */ 641 /* do nothing if list empty */
597 if (!list_empty(&sem->wait_list)) 642 if (!list_empty(&sem->wait_list))
598 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); 643 sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
599 644
600 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 645 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
646 wake_up_q(&wake_q);
601 647
602 return sem; 648 return sem;
603} 649}
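The rwsem-xadd.c changes convert __rwsem_do_wake() into __rwsem_mark_wake(): waiters are only collected on a wake_q while sem->wait_lock is held, and the actual wakeups happen via wake_up_q() after the lock is dropped, which shortens the critical section. A rough userspace analogue of that pattern is sketched below; the names mirror the kernel API but the code uses POSIX primitives and is not the kernel implementation.

/* wake_q_demo.c -- deferred-wakeup sketch. Build with: cc wake_q_demo.c -lpthread */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

struct waiter {
        sem_t sleep;                    /* what the blocked "task" sleeps on */
        struct waiter *next;
};

struct wake_q {
        struct waiter *head;
};

static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static struct waiter *wait_list;        /* protected by wait_lock */

static void wake_q_add(struct wake_q *q, struct waiter *w)
{
        w->next = q->head;
        q->head = w;
}

static void wake_up_q(struct wake_q *q)
{
        for (struct waiter *w = q->head; w; w = w->next)
                sem_post(&w->sleep);    /* the real wakeup, issued after unlock */
}

static void *waiter_fn(void *arg)
{
        struct waiter *w = arg;

        sem_wait(&w->sleep);
        puts("waiter: woken up");
        return NULL;
}

int main(void)
{
        struct waiter w;
        struct wake_q q = { NULL };
        pthread_t t;

        sem_init(&w.sleep, 0, 0);

        pthread_mutex_lock(&wait_lock);
        w.next = wait_list;
        wait_list = &w;                 /* enqueue, as the slowpaths do */
        pthread_mutex_unlock(&wait_lock);

        pthread_create(&t, NULL, waiter_fn, &w);

        /* "Unlock" path: mark waiters under the lock, wake them afterwards. */
        pthread_mutex_lock(&wait_lock);
        wake_q_add(&q, wait_list);
        wait_list = NULL;
        pthread_mutex_unlock(&wait_lock);
        wake_up_q(&q);                  /* no wait_lock held while waking */

        pthread_join(t, NULL);
        sem_destroy(&w.sleep);
        return 0;
}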
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2e853ad93a3a..45ba475d4be3 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem)
22 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 22 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
23 23
24 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 24 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
25 rwsem_set_reader_owned(sem);
25} 26}
26 27
27EXPORT_SYMBOL(down_read); 28EXPORT_SYMBOL(down_read);
@@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem)
33{ 34{
34 int ret = __down_read_trylock(sem); 35 int ret = __down_read_trylock(sem);
35 36
36 if (ret == 1) 37 if (ret == 1) {
37 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); 38 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
39 rwsem_set_reader_owned(sem);
40 }
38 return ret; 41 return ret;
39} 42}
40 43
@@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem)
124 * lockdep: a downgraded write will live on as a write 127 * lockdep: a downgraded write will live on as a write
125 * dependency. 128 * dependency.
126 */ 129 */
127 rwsem_clear_owner(sem); 130 rwsem_set_reader_owned(sem);
128 __downgrade_write(sem); 131 __downgrade_write(sem);
129} 132}
130 133
@@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
138 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 141 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
139 142
140 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 143 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
144 rwsem_set_reader_owned(sem);
141} 145}
142 146
143EXPORT_SYMBOL(down_read_nested); 147EXPORT_SYMBOL(down_read_nested);
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 870ed9a5b426..a699f4048ba1 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,14 +1,58 @@
1/*
2 * The owner field of the rw_semaphore structure will be set to
3 * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
4 * the owner field when it unlocks. A reader, on the other hand, will
5 * not touch the owner field when it unlocks.
6 *
7 * In essence, the owner field now has the following 3 states:
8 * 1) 0
9 * - lock is free or the owner hasn't set the field yet
10 * 2) RWSEM_READER_OWNED
11 * - lock is currently or previously owned by readers (lock is free
12 * or not set by owner yet)
13 * 3) Other non-zero value
14 * - a writer owns the lock
15 */
16#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
17
1#ifdef CONFIG_RWSEM_SPIN_ON_OWNER 18#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
19/*
20 * All writes to owner are protected by WRITE_ONCE() to make sure that
21 * store tearing can't happen as optimistic spinners may read and use
22 * the owner value concurrently without lock. Read from owner, however,
23 * may not need READ_ONCE() as long as the pointer value is only used
24 * for comparison and isn't being dereferenced.
25 */
2static inline void rwsem_set_owner(struct rw_semaphore *sem) 26static inline void rwsem_set_owner(struct rw_semaphore *sem)
3{ 27{
4 sem->owner = current; 28 WRITE_ONCE(sem->owner, current);
5} 29}
6 30
7static inline void rwsem_clear_owner(struct rw_semaphore *sem) 31static inline void rwsem_clear_owner(struct rw_semaphore *sem)
8{ 32{
9 sem->owner = NULL; 33 WRITE_ONCE(sem->owner, NULL);
34}
35
36static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
37{
38 /*
39 * We check the owner value first to make sure that we will only
40 * do a write to the rwsem cacheline when it is really necessary
41 * to minimize cacheline contention.
42 */
43 if (sem->owner != RWSEM_READER_OWNED)
44 WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
45}
46
47static inline bool rwsem_owner_is_writer(struct task_struct *owner)
48{
49 return owner && owner != RWSEM_READER_OWNED;
10} 50}
11 51
52static inline bool rwsem_owner_is_reader(struct task_struct *owner)
53{
54 return owner == RWSEM_READER_OWNED;
55}
12#else 56#else
13static inline void rwsem_set_owner(struct rw_semaphore *sem) 57static inline void rwsem_set_owner(struct rw_semaphore *sem)
14{ 58{
@@ -17,4 +61,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)
17static inline void rwsem_clear_owner(struct rw_semaphore *sem) 61static inline void rwsem_clear_owner(struct rw_semaphore *sem)
18{ 62{
19} 63}
64
65static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
66{
67}
20#endif 68#endif
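The new rwsem.h encodes three owner states in one pointer by reserving address 1 as a "reader owned" sentinel that is compared against but never dereferenced. A small standalone sketch of the same encoding, with hypothetical type and helper names:

/* owner_state_demo.c -- illustrative sketch of the 3-state owner pointer. */
#include <stdio.h>

struct task { const char *name; };

#define READER_OWNED ((struct task *)1UL)       /* sentinel, never dereferenced */

static int owner_is_writer(struct task *owner)
{
        return owner && owner != READER_OWNED;
}

static int owner_is_reader(struct task *owner)
{
        return owner == READER_OWNED;
}

int main(void)
{
        struct task writer = { "writer" };
        struct task *owner;

        owner = NULL;                   /* free, or owner not recorded yet */
        printf("NULL:   writer=%d reader=%d\n",
               owner_is_writer(owner), owner_is_reader(owner));

        owner = READER_OWNED;           /* readers hold (or recently held) it */
        printf("reader: writer=%d reader=%d\n",
               owner_is_writer(owner), owner_is_reader(owner));

        owner = &writer;                /* a writer owns the lock */
        printf("writer: writer=%d reader=%d\n",
               owner_is_writer(owner), owner_is_reader(owner));
        return 0;
}

Optimistic spinners only ever compare the pointer, which is why the sentinel is safe to store without any associated task.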
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 017532193fb1..251d16b4cb41 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr)
169} 169}
170EXPORT_SYMBOL(devm_memunmap); 170EXPORT_SYMBOL(devm_memunmap);
171 171
172pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
173{
174 return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
175}
176EXPORT_SYMBOL(phys_to_pfn_t);
177
178#ifdef CONFIG_ZONE_DEVICE 172#ifdef CONFIG_ZONE_DEVICE
179static DEFINE_MUTEX(pgmap_lock); 173static DEFINE_MUTEX(pgmap_lock);
180static RADIX_TREE(pgmap_radix, GFP_KERNEL); 174static RADIX_TREE(pgmap_radix, GFP_KERNEL);
@@ -308,12 +302,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
308 if (is_ram == REGION_INTERSECTS) 302 if (is_ram == REGION_INTERSECTS)
309 return __va(res->start); 303 return __va(res->start);
310 304
311 if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
312 dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
313 __func__);
314 return ERR_PTR(-ENXIO);
315 }
316
317 if (!ref) 305 if (!ref)
318 return ERR_PTR(-EINVAL); 306 return ERR_PTR(-EINVAL);
319 307
@@ -401,7 +389,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
401 altmap->alloc -= nr_pfns; 389 altmap->alloc -= nr_pfns;
402} 390}
403 391
404#ifdef CONFIG_SPARSEMEM_VMEMMAP
405struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) 392struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
406{ 393{
407 /* 394 /*
@@ -427,5 +414,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
427 414
428 return pgmap ? pgmap->altmap : NULL; 415 return pgmap ? pgmap->altmap : NULL;
429} 416}
430#endif /* CONFIG_SPARSEMEM_VMEMMAP */
431#endif /* CONFIG_ZONE_DEVICE */ 417#endif /* CONFIG_ZONE_DEVICE */
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index cb880a14cc39..eb4f717705ba 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4KASAN_SANITIZE_snapshot.o := n
5
4obj-y += qos.o 6obj-y += qos.o
5obj-$(CONFIG_PM) += main.o 7obj-$(CONFIG_PM) += main.o
6obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o 8obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index aba9c545a0e3..0e781798b0b3 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -126,17 +126,17 @@ out:
126 return ret; 126 return ret;
127} 127}
128 128
129int pm_prepare_console(void) 129void pm_prepare_console(void)
130{ 130{
131 if (!pm_vt_switch()) 131 if (!pm_vt_switch())
132 return 0; 132 return;
133 133
134 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); 134 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
135 if (orig_fgconsole < 0) 135 if (orig_fgconsole < 0)
136 return 1; 136 return;
137 137
138 orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); 138 orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
139 return 0; 139 return;
140} 140}
141 141
142void pm_restore_console(void) 142void pm_restore_console(void)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fca9254280ee..a881c6a7ba74 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -52,6 +52,7 @@ enum {
52#ifdef CONFIG_SUSPEND 52#ifdef CONFIG_SUSPEND
53 HIBERNATION_SUSPEND, 53 HIBERNATION_SUSPEND,
54#endif 54#endif
55 HIBERNATION_TEST_RESUME,
55 /* keep last */ 56 /* keep last */
56 __HIBERNATION_AFTER_LAST 57 __HIBERNATION_AFTER_LAST
57}; 58};
@@ -409,6 +410,11 @@ int hibernation_snapshot(int platform_mode)
409 goto Close; 410 goto Close;
410} 411}
411 412
413int __weak hibernate_resume_nonboot_cpu_disable(void)
414{
415 return disable_nonboot_cpus();
416}
417
412/** 418/**
413 * resume_target_kernel - Restore system state from a hibernation image. 419 * resume_target_kernel - Restore system state from a hibernation image.
414 * @platform_mode: Whether or not to use the platform driver. 420 * @platform_mode: Whether or not to use the platform driver.
@@ -433,7 +439,7 @@ static int resume_target_kernel(bool platform_mode)
433 if (error) 439 if (error)
434 goto Cleanup; 440 goto Cleanup;
435 441
436 error = disable_nonboot_cpus(); 442 error = hibernate_resume_nonboot_cpu_disable();
437 if (error) 443 if (error)
438 goto Enable_cpus; 444 goto Enable_cpus;
439 445
@@ -642,12 +648,39 @@ static void power_down(void)
642 cpu_relax(); 648 cpu_relax();
643} 649}
644 650
651static int load_image_and_restore(void)
652{
653 int error;
654 unsigned int flags;
655
656 pr_debug("PM: Loading hibernation image.\n");
657
658 lock_device_hotplug();
659 error = create_basic_memory_bitmaps();
660 if (error)
661 goto Unlock;
662
663 error = swsusp_read(&flags);
664 swsusp_close(FMODE_READ);
665 if (!error)
666 hibernation_restore(flags & SF_PLATFORM_MODE);
667
668 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
669 swsusp_free();
670 free_basic_memory_bitmaps();
671 Unlock:
672 unlock_device_hotplug();
673
674 return error;
675}
676
645/** 677/**
646 * hibernate - Carry out system hibernation, including saving the image. 678 * hibernate - Carry out system hibernation, including saving the image.
647 */ 679 */
648int hibernate(void) 680int hibernate(void)
649{ 681{
650 int error; 682 int error, nr_calls = 0;
683 bool snapshot_test = false;
651 684
652 if (!hibernation_available()) { 685 if (!hibernation_available()) {
653 pr_debug("PM: Hibernation not available.\n"); 686 pr_debug("PM: Hibernation not available.\n");
@@ -662,9 +695,11 @@ int hibernate(void)
662 } 695 }
663 696
664 pm_prepare_console(); 697 pm_prepare_console();
665 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 698 error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
666 if (error) 699 if (error) {
700 nr_calls--;
667 goto Exit; 701 goto Exit;
702 }
668 703
669 printk(KERN_INFO "PM: Syncing filesystems ... "); 704 printk(KERN_INFO "PM: Syncing filesystems ... ");
670 sys_sync(); 705 sys_sync();
@@ -697,8 +732,12 @@ int hibernate(void)
697 pr_debug("PM: writing image.\n"); 732 pr_debug("PM: writing image.\n");
698 error = swsusp_write(flags); 733 error = swsusp_write(flags);
699 swsusp_free(); 734 swsusp_free();
700 if (!error) 735 if (!error) {
701 power_down(); 736 if (hibernation_mode == HIBERNATION_TEST_RESUME)
737 snapshot_test = true;
738 else
739 power_down();
740 }
702 in_suspend = 0; 741 in_suspend = 0;
703 pm_restore_gfp_mask(); 742 pm_restore_gfp_mask();
704 } else { 743 } else {
@@ -709,12 +748,18 @@ int hibernate(void)
709 free_basic_memory_bitmaps(); 748 free_basic_memory_bitmaps();
710 Thaw: 749 Thaw:
711 unlock_device_hotplug(); 750 unlock_device_hotplug();
751 if (snapshot_test) {
752 pr_debug("PM: Checking hibernation image\n");
753 error = swsusp_check();
754 if (!error)
755 error = load_image_and_restore();
756 }
712 thaw_processes(); 757 thaw_processes();
713 758
714 /* Don't bother checking whether freezer_test_done is true */ 759 /* Don't bother checking whether freezer_test_done is true */
715 freezer_test_done = false; 760 freezer_test_done = false;
716 Exit: 761 Exit:
717 pm_notifier_call_chain(PM_POST_HIBERNATION); 762 __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
718 pm_restore_console(); 763 pm_restore_console();
719 atomic_inc(&snapshot_device_available); 764 atomic_inc(&snapshot_device_available);
720 Unlock: 765 Unlock:
@@ -740,8 +785,7 @@ int hibernate(void)
740 */ 785 */
741static int software_resume(void) 786static int software_resume(void)
742{ 787{
743 int error; 788 int error, nr_calls = 0;
744 unsigned int flags;
745 789
746 /* 790 /*
747 * If the user said "noresume".. bail out early. 791 * If the user said "noresume".. bail out early.
@@ -827,35 +871,20 @@ static int software_resume(void)
827 } 871 }
828 872
829 pm_prepare_console(); 873 pm_prepare_console();
830 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 874 error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
831 if (error) 875 if (error) {
876 nr_calls--;
832 goto Close_Finish; 877 goto Close_Finish;
878 }
833 879
834 pr_debug("PM: Preparing processes for restore.\n"); 880 pr_debug("PM: Preparing processes for restore.\n");
835 error = freeze_processes(); 881 error = freeze_processes();
836 if (error) 882 if (error)
837 goto Close_Finish; 883 goto Close_Finish;
838 884 error = load_image_and_restore();
839 pr_debug("PM: Loading hibernation image.\n");
840
841 lock_device_hotplug();
842 error = create_basic_memory_bitmaps();
843 if (error)
844 goto Thaw;
845
846 error = swsusp_read(&flags);
847 swsusp_close(FMODE_READ);
848 if (!error)
849 hibernation_restore(flags & SF_PLATFORM_MODE);
850
851 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
852 swsusp_free();
853 free_basic_memory_bitmaps();
854 Thaw:
855 unlock_device_hotplug();
856 thaw_processes(); 885 thaw_processes();
857 Finish: 886 Finish:
858 pm_notifier_call_chain(PM_POST_RESTORE); 887 __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
859 pm_restore_console(); 888 pm_restore_console();
860 atomic_inc(&snapshot_device_available); 889 atomic_inc(&snapshot_device_available);
861 /* For success case, the suspend path will release the lock */ 890 /* For success case, the suspend path will release the lock */
@@ -878,6 +907,7 @@ static const char * const hibernation_modes[] = {
878#ifdef CONFIG_SUSPEND 907#ifdef CONFIG_SUSPEND
879 [HIBERNATION_SUSPEND] = "suspend", 908 [HIBERNATION_SUSPEND] = "suspend",
880#endif 909#endif
910 [HIBERNATION_TEST_RESUME] = "test_resume",
881}; 911};
882 912
883/* 913/*
@@ -924,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
924#ifdef CONFIG_SUSPEND 954#ifdef CONFIG_SUSPEND
925 case HIBERNATION_SUSPEND: 955 case HIBERNATION_SUSPEND:
926#endif 956#endif
957 case HIBERNATION_TEST_RESUME:
927 break; 958 break;
928 case HIBERNATION_PLATFORM: 959 case HIBERNATION_PLATFORM:
929 if (hibernation_ops) 960 if (hibernation_ops)
@@ -970,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
970#ifdef CONFIG_SUSPEND 1001#ifdef CONFIG_SUSPEND
971 case HIBERNATION_SUSPEND: 1002 case HIBERNATION_SUSPEND:
972#endif 1003#endif
1004 case HIBERNATION_TEST_RESUME:
973 hibernation_mode = mode; 1005 hibernation_mode = mode;
974 break; 1006 break;
975 case HIBERNATION_PLATFORM: 1007 case HIBERNATION_PLATFORM:
@@ -1115,13 +1147,16 @@ static int __init resume_offset_setup(char *str)
1115 1147
1116static int __init hibernate_setup(char *str) 1148static int __init hibernate_setup(char *str)
1117{ 1149{
1118 if (!strncmp(str, "noresume", 8)) 1150 if (!strncmp(str, "noresume", 8)) {
1119 noresume = 1; 1151 noresume = 1;
1120 else if (!strncmp(str, "nocompress", 10)) 1152 } else if (!strncmp(str, "nocompress", 10)) {
1121 nocompress = 1; 1153 nocompress = 1;
1122 else if (!strncmp(str, "no", 2)) { 1154 } else if (!strncmp(str, "no", 2)) {
1123 noresume = 1; 1155 noresume = 1;
1124 nohibernate = 1; 1156 nohibernate = 1;
1157 } else if (IS_ENABLED(CONFIG_DEBUG_RODATA)
1158 && !strncmp(str, "protect_image", 13)) {
1159 enable_restore_image_protection();
1125 } 1160 }
1126 return 1; 1161 return 1;
1127} 1162}
@@ -1154,11 +1189,6 @@ static int __init nohibernate_setup(char *str)
1154 return 1; 1189 return 1;
1155} 1190}
1156 1191
1157static int __init kaslr_nohibernate_setup(char *str)
1158{
1159 return nohibernate_setup(str);
1160}
1161
1162static int __init page_poison_nohibernate_setup(char *str) 1192static int __init page_poison_nohibernate_setup(char *str)
1163{ 1193{
1164#ifdef CONFIG_PAGE_POISONING_ZERO 1194#ifdef CONFIG_PAGE_POISONING_ZERO
@@ -1182,5 +1212,4 @@ __setup("hibernate=", hibernate_setup);
1182__setup("resumewait", resumewait_setup); 1212__setup("resumewait", resumewait_setup);
1183__setup("resumedelay=", resumedelay_setup); 1213__setup("resumedelay=", resumedelay_setup);
1184__setup("nohibernate", nohibernate_setup); 1214__setup("nohibernate", nohibernate_setup);
1185__setup("kaslr", kaslr_nohibernate_setup);
1186__setup("page_poison=", page_poison_nohibernate_setup); 1215__setup("page_poison=", page_poison_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 27946975eff0..5ea50b1b7595 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
38} 38}
39EXPORT_SYMBOL_GPL(unregister_pm_notifier); 39EXPORT_SYMBOL_GPL(unregister_pm_notifier);
40 40
41int pm_notifier_call_chain(unsigned long val) 41int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
42{ 42{
43 int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); 43 int ret;
44
45 ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
46 nr_to_call, nr_calls);
44 47
45 return notifier_to_errno(ret); 48 return notifier_to_errno(ret);
46} 49}
50int pm_notifier_call_chain(unsigned long val)
51{
52 return __pm_notifier_call_chain(val, -1, NULL);
53}
47 54
48/* If set, devices may be suspended and resumed asynchronously. */ 55/* If set, devices may be suspended and resumed asynchronously. */
49int pm_async_enabled = 1; 56int pm_async_enabled = 1;
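__pm_notifier_call_chain() above exists so that callers can learn how many notifiers actually ran for a PREPARE event; on failure they decrement that count and send the matching POST event only to the callbacks that succeeded. The standalone sketch below mimics that bookkeeping with a hypothetical callback table; it is not the kernel notifier API.

/* notifier_rollback_demo.c -- nr_calls-style partial rollback sketch. */
#include <stdio.h>

enum { PM_PREPARE, PM_POST };

typedef int (*notifier_fn)(int event);

static int cb_a(int event) { printf("A: event %d\n", event); return 0; }
static int cb_b(int event) { printf("B: event %d\n", event); return -1; }  /* fails */
static int cb_c(int event) { printf("C: event %d\n", event); return 0; }

static notifier_fn chain[] = { cb_a, cb_b, cb_c };
#define NR_NOTIFIERS ((int)(sizeof(chain) / sizeof(chain[0])))

/* Call up to nr_to_call notifiers (-1 means all); report how many ran. */
static int call_chain(int event, int nr_to_call, int *nr_calls)
{
        int ret = 0;

        for (int i = 0; i < NR_NOTIFIERS && nr_to_call; i++, nr_to_call--) {
                if (nr_calls)
                        (*nr_calls)++;
                ret = chain[i](event);
                if (ret)
                        break;
        }
        return ret;
}

int main(void)
{
        int nr_calls = 0;

        if (call_chain(PM_PREPARE, -1, &nr_calls)) {
                /* The callback that failed must not see the POST event. */
                nr_calls--;
        }
        call_chain(PM_POST, nr_calls, NULL);
        return 0;
}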
diff --git a/kernel/power/power.h b/kernel/power/power.h
index efe1b3b17c88..242d8b827dd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
38} 38}
39#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ 39#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
40 40
41extern int hibernate_resume_nonboot_cpu_disable(void);
42
41/* 43/*
42 * Keep some memory free so that I/O operations can succeed without paging 44 * Keep some memory free so that I/O operations can succeed without paging
43 * [Might this be more than 4 MB?] 45 * [Might this be more than 4 MB?]
@@ -59,6 +61,13 @@ extern int hibernation_snapshot(int platform_mode);
59extern int hibernation_restore(int platform_mode); 61extern int hibernation_restore(int platform_mode);
60extern int hibernation_platform_enter(void); 62extern int hibernation_platform_enter(void);
61 63
64#ifdef CONFIG_DEBUG_RODATA
65/* kernel/power/snapshot.c */
66extern void enable_restore_image_protection(void);
67#else
68static inline void enable_restore_image_protection(void) {}
69#endif /* CONFIG_DEBUG_RODATA */
70
62#else /* !CONFIG_HIBERNATION */ 71#else /* !CONFIG_HIBERNATION */
63 72
64static inline void hibernate_reserved_size_init(void) {} 73static inline void hibernate_reserved_size_init(void) {}
@@ -200,6 +209,8 @@ static inline void suspend_test_finish(const char *label) {}
200 209
201#ifdef CONFIG_PM_SLEEP 210#ifdef CONFIG_PM_SLEEP
202/* kernel/power/main.c */ 211/* kernel/power/main.c */
212extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
213 int *nr_calls);
203extern int pm_notifier_call_chain(unsigned long val); 214extern int pm_notifier_call_chain(unsigned long val);
204#endif 215#endif
205 216
diff --git a/kernel/power/process.c b/kernel/power/process.c
index df058bed53ce..8f27d5a8adf6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only)
89 elapsed_msecs / 1000, elapsed_msecs % 1000, 89 elapsed_msecs / 1000, elapsed_msecs % 1000,
90 todo - wq_busy, wq_busy); 90 todo - wq_busy, wq_busy);
91 91
92 if (wq_busy)
93 show_workqueue_state();
94
92 if (!wakeup) { 95 if (!wakeup) {
93 read_lock(&tasklist_lock); 96 read_lock(&tasklist_lock);
94 for_each_process_thread(g, p) { 97 for_each_process_thread(g, p) {
@@ -146,6 +149,18 @@ int freeze_processes(void)
146 if (!error && !oom_killer_disable()) 149 if (!error && !oom_killer_disable())
147 error = -EBUSY; 150 error = -EBUSY;
148 151
152 /*
153 * There is a hard to fix race between oom_reaper kernel thread
154 * and oom_killer_disable. oom_reaper calls exit_oom_victim
155 * before the victim reaches exit_mm so try to freeze all the tasks
156 * again and catch such a left over task.
157 */
158 if (!error) {
159 pr_info("Double checking all user space processes after OOM killer disable... ");
160 error = try_to_freeze_tasks(true);
161 pr_cont("\n");
162 }
163
149 if (error) 164 if (error)
150 thaw_processes(); 165 thaw_processes();
151 return error; 166 return error;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3a970604308f..9a0178c2ac1d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -38,6 +38,43 @@
38 38
39#include "power.h" 39#include "power.h"
40 40
41#ifdef CONFIG_DEBUG_RODATA
42static bool hibernate_restore_protection;
43static bool hibernate_restore_protection_active;
44
45void enable_restore_image_protection(void)
46{
47 hibernate_restore_protection = true;
48}
49
50static inline void hibernate_restore_protection_begin(void)
51{
52 hibernate_restore_protection_active = hibernate_restore_protection;
53}
54
55static inline void hibernate_restore_protection_end(void)
56{
57 hibernate_restore_protection_active = false;
58}
59
60static inline void hibernate_restore_protect_page(void *page_address)
61{
62 if (hibernate_restore_protection_active)
63 set_memory_ro((unsigned long)page_address, 1);
64}
65
66static inline void hibernate_restore_unprotect_page(void *page_address)
67{
68 if (hibernate_restore_protection_active)
69 set_memory_rw((unsigned long)page_address, 1);
70}
71#else
72static inline void hibernate_restore_protection_begin(void) {}
73static inline void hibernate_restore_protection_end(void) {}
74static inline void hibernate_restore_protect_page(void *page_address) {}
75static inline void hibernate_restore_unprotect_page(void *page_address) {}
76#endif /* CONFIG_DEBUG_RODATA */
77
41static int swsusp_page_is_free(struct page *); 78static int swsusp_page_is_free(struct page *);
42static void swsusp_set_page_forbidden(struct page *); 79static void swsusp_set_page_forbidden(struct page *);
43static void swsusp_unset_page_forbidden(struct page *); 80static void swsusp_unset_page_forbidden(struct page *);
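The hibernate_restore_protect_page()/hibernate_restore_unprotect_page() helpers added above flip restored image pages to read-only while the restore is in flight, relying on set_memory_ro()/set_memory_rw(). As a rough userspace analogue of the same idea, mprotect() can toggle a page the same way; the sketch below is illustrative only.

/* restore_protect_demo.c -- page-protection toggle sketch using mprotect(). */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *page = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (page == MAP_FAILED)
                return 1;

        strcpy(page, "restored image data");

        /* roughly: hibernate_restore_protect_page(page) */
        mprotect(page, psz, PROT_READ);
        printf("page is now read-only: %s\n", page);

        /* roughly: hibernate_restore_unprotect_page(page) before reuse */
        mprotect(page, psz, PROT_READ | PROT_WRITE);
        page[0] = 'R';
        printf("writable again: %s\n", page);

        munmap(page, psz);
        return 0;
}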
@@ -67,25 +104,32 @@ void __init hibernate_image_size_init(void)
67 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; 104 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
68} 105}
69 106
70/* List of PBEs needed for restoring the pages that were allocated before 107/*
108 * List of PBEs needed for restoring the pages that were allocated before
71 * the suspend and included in the suspend image, but have also been 109 * the suspend and included in the suspend image, but have also been
72 * allocated by the "resume" kernel, so their contents cannot be written 110 * allocated by the "resume" kernel, so their contents cannot be written
73 * directly to their "original" page frames. 111 * directly to their "original" page frames.
74 */ 112 */
75struct pbe *restore_pblist; 113struct pbe *restore_pblist;
76 114
77/* Pointer to an auxiliary buffer (1 page) */ 115/* struct linked_page is used to build chains of pages */
78static void *buffer;
79 116
80/** 117#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
81 * @safe_needed - on resume, for storing the PBE list and the image, 118
82 * we can only use memory pages that do not conflict with the pages 119struct linked_page {
83 * used before suspend. The unsafe pages have PageNosaveFree set 120 struct linked_page *next;
84 * and we count them using unsafe_pages. 121 char data[LINKED_PAGE_DATA_SIZE];
85 * 122} __packed;
86 * Each allocated image page is marked as PageNosave and PageNosaveFree 123
87 * so that swsusp_free() can release it. 124/*
125 * List of "safe" pages (ie. pages that were not used by the image kernel
126 * before hibernation) that may be used as temporary storage for image kernel
127 * memory contents.
88 */ 128 */
129static struct linked_page *safe_pages_list;
130
131/* Pointer to an auxiliary buffer (1 page) */
132static void *buffer;
89 133
90#define PG_ANY 0 134#define PG_ANY 0
91#define PG_SAFE 1 135#define PG_SAFE 1
@@ -94,6 +138,19 @@ static void *buffer;
94 138
95static unsigned int allocated_unsafe_pages; 139static unsigned int allocated_unsafe_pages;
96 140
141/**
142 * get_image_page - Allocate a page for a hibernation image.
143 * @gfp_mask: GFP mask for the allocation.
144 * @safe_needed: Get pages that were not used before hibernation (restore only)
145 *
146 * During image restoration, for storing the PBE list and the image data, we can
147 * only use memory pages that do not conflict with the pages used before
148 * hibernation. The "unsafe" pages have PageNosaveFree set and we count them
149 * using allocated_unsafe_pages.
150 *
151 * Each allocated image page is marked as PageNosave and PageNosaveFree so that
152 * swsusp_free() can release it.
153 */
97static void *get_image_page(gfp_t gfp_mask, int safe_needed) 154static void *get_image_page(gfp_t gfp_mask, int safe_needed)
98{ 155{
99 void *res; 156 void *res;
@@ -113,9 +170,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
113 return res; 170 return res;
114} 171}
115 172
173static void *__get_safe_page(gfp_t gfp_mask)
174{
175 if (safe_pages_list) {
176 void *ret = safe_pages_list;
177
178 safe_pages_list = safe_pages_list->next;
179 memset(ret, 0, PAGE_SIZE);
180 return ret;
181 }
182 return get_image_page(gfp_mask, PG_SAFE);
183}
184
116unsigned long get_safe_page(gfp_t gfp_mask) 185unsigned long get_safe_page(gfp_t gfp_mask)
117{ 186{
118 return (unsigned long)get_image_page(gfp_mask, PG_SAFE); 187 return (unsigned long)__get_safe_page(gfp_mask);
119} 188}
120 189
121static struct page *alloc_image_page(gfp_t gfp_mask) 190static struct page *alloc_image_page(gfp_t gfp_mask)
@@ -130,11 +199,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask)
130 return page; 199 return page;
131} 200}
132 201
202static void recycle_safe_page(void *page_address)
203{
204 struct linked_page *lp = page_address;
205
206 lp->next = safe_pages_list;
207 safe_pages_list = lp;
208}
209
133/** 210/**
134 * free_image_page - free page represented by @addr, allocated with 211 * free_image_page - Free a page allocated for hibernation image.
135 * get_image_page (page flags set by it must be cleared) 212 * @addr: Address of the page to free.
213 * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page.
214 *
215 * The page to free should have been allocated by get_image_page() (page flags
216 * set by it are affected).
136 */ 217 */
137
138static inline void free_image_page(void *addr, int clear_nosave_free) 218static inline void free_image_page(void *addr, int clear_nosave_free)
139{ 219{
140 struct page *page; 220 struct page *page;
@@ -150,17 +230,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
150 __free_page(page); 230 __free_page(page);
151} 231}
152 232
153/* struct linked_page is used to build chains of pages */ 233static inline void free_list_of_pages(struct linked_page *list,
154 234 int clear_page_nosave)
155#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
156
157struct linked_page {
158 struct linked_page *next;
159 char data[LINKED_PAGE_DATA_SIZE];
160} __packed;
161
162static inline void
163free_list_of_pages(struct linked_page *list, int clear_page_nosave)
164{ 235{
165 while (list) { 236 while (list) {
166 struct linked_page *lp = list->next; 237 struct linked_page *lp = list->next;
@@ -170,30 +241,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave)
170 } 241 }
171} 242}
172 243
173/** 244/*
174 * struct chain_allocator is used for allocating small objects out of 245 * struct chain_allocator is used for allocating small objects out of
175 * a linked list of pages called 'the chain'. 246 * a linked list of pages called 'the chain'.
176 * 247 *
177 * The chain grows each time there is no room for a new object in 248 * The chain grows each time there is no room for a new object in
178 * the current page. The allocated objects cannot be freed individually. 249 * the current page. The allocated objects cannot be freed individually.
179 * It is only possible to free them all at once, by freeing the entire 250 * It is only possible to free them all at once, by freeing the entire
180 * chain. 251 * chain.
181 * 252 *
182 * NOTE: The chain allocator may be inefficient if the allocated objects 253 * NOTE: The chain allocator may be inefficient if the allocated objects
183 * are not much smaller than PAGE_SIZE. 254 * are not much smaller than PAGE_SIZE.
184 */ 255 */
185
186struct chain_allocator { 256struct chain_allocator {
187 struct linked_page *chain; /* the chain */ 257 struct linked_page *chain; /* the chain */
188 unsigned int used_space; /* total size of objects allocated out 258 unsigned int used_space; /* total size of objects allocated out
189 * of the current page 259 of the current page */
190 */
191 gfp_t gfp_mask; /* mask for allocating pages */ 260 gfp_t gfp_mask; /* mask for allocating pages */
192 int safe_needed; /* if set, only "safe" pages are allocated */ 261 int safe_needed; /* if set, only "safe" pages are allocated */
193}; 262};
194 263
195static void 264static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask,
196chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) 265 int safe_needed)
197{ 266{
198 ca->chain = NULL; 267 ca->chain = NULL;
199 ca->used_space = LINKED_PAGE_DATA_SIZE; 268 ca->used_space = LINKED_PAGE_DATA_SIZE;
@@ -208,7 +277,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
208 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { 277 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
209 struct linked_page *lp; 278 struct linked_page *lp;
210 279
211 lp = get_image_page(ca->gfp_mask, ca->safe_needed); 280 lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) :
281 get_image_page(ca->gfp_mask, PG_ANY);
212 if (!lp) 282 if (!lp)
213 return NULL; 283 return NULL;
214 284
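The chain allocator described in the comment block above carves small objects out of the current linked page and grabs a fresh page whenever a request no longer fits; objects can only be released by freeing the whole chain at once. A self-contained userspace sketch of that behaviour follows, with malloc() standing in for the image-page allocator; it is a simplification, not the kernel code.

/* chain_alloc_demo.c -- bump allocation out of a chain of pages. */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096
#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))

struct linked_page {
        struct linked_page *next;
        char data[LINKED_PAGE_DATA_SIZE];
};

struct chain_allocator {
        struct linked_page *chain;      /* the chain of pages */
        size_t used_space;              /* bytes used in the current page */
};

static void *chain_alloc(struct chain_allocator *ca, size_t size)
{
        void *ret;

        if (!ca->chain || LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
                struct linked_page *lp = malloc(sizeof(*lp));

                if (!lp)
                        return NULL;
                lp->next = ca->chain;   /* grow the chain by one page */
                ca->chain = lp;
                ca->used_space = 0;
        }
        ret = ca->chain->data + ca->used_space;
        ca->used_space += size;
        return ret;
}

static void chain_free_all(struct chain_allocator *ca)
{
        while (ca->chain) {
                struct linked_page *next = ca->chain->next;

                free(ca->chain);
                ca->chain = next;
        }
}

int main(void)
{
        struct chain_allocator ca = { NULL, 0 };

        for (int i = 0; i < 1000; i++) {
                char *obj = chain_alloc(&ca, 64);

                if (obj)
                        snprintf(obj, 64, "object %d", i);
        }
        puts("allocated 1000 small objects, freeing the whole chain");
        chain_free_all(&ca);
        return 0;
}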
@@ -222,44 +292,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
222} 292}
223 293
224/** 294/**
225 * Data types related to memory bitmaps. 295 * Data types related to memory bitmaps.
226 * 296 *
227 * Memory bitmap is a structure consisting of many linked lists of 297 * Memory bitmap is a structure consisting of many linked lists of
228 * objects. The main list's elements are of type struct zone_bitmap 298 * objects. The main list's elements are of type struct zone_bitmap
229 * and each of them corresponds to one zone. For each zone bitmap 299 * and each of them corresponds to one zone. For each zone bitmap
230 * object there is a list of objects of type struct bm_block that 300 * object there is a list of objects of type struct bm_block that
231 * represent each block of the bitmap in which information is stored. 301 * represent each block of the bitmap in which information is stored.
232 * 302 *
233 * struct memory_bitmap contains a pointer to the main list of zone 303 * struct memory_bitmap contains a pointer to the main list of zone
234 * bitmap objects, a struct bm_position used for browsing the bitmap, 304 * bitmap objects, a struct bm_position used for browsing the bitmap,
235 * and a pointer to the list of pages used for allocating all of the 305 * and a pointer to the list of pages used for allocating all of the
236 * zone bitmap objects and bitmap block objects. 306 * zone bitmap objects and bitmap block objects.
237 * 307 *
238 * NOTE: It has to be possible to lay out the bitmap in memory 308 * NOTE: It has to be possible to lay out the bitmap in memory
239 * using only allocations of order 0. Additionally, the bitmap is 309 * using only allocations of order 0. Additionally, the bitmap is
240 * designed to work with arbitrary number of zones (this is over the 310 * designed to work with arbitrary number of zones (this is over the
241 * top for now, but let's avoid making unnecessary assumptions ;-). 311 * top for now, but let's avoid making unnecessary assumptions ;-).
242 * 312 *
243 * struct zone_bitmap contains a pointer to a list of bitmap block 313 * struct zone_bitmap contains a pointer to a list of bitmap block
244 * objects and a pointer to the bitmap block object that has been 314 * objects and a pointer to the bitmap block object that has been
245 * most recently used for setting bits. Additionally, it contains the 315 * most recently used for setting bits. Additionally, it contains the
246 * pfns that correspond to the start and end of the represented zone. 316 * PFNs that correspond to the start and end of the represented zone.
247 * 317 *
248 * struct bm_block contains a pointer to the memory page in which 318 * struct bm_block contains a pointer to the memory page in which
249 * information is stored (in the form of a block of bitmap) 319 * information is stored (in the form of a block of bitmap)
250 * It also contains the pfns that correspond to the start and end of 320 * It also contains the pfns that correspond to the start and end of
251 * the represented memory area. 321 * the represented memory area.
252 * 322 *
253 * The memory bitmap is organized as a radix tree to guarantee fast random 323 * The memory bitmap is organized as a radix tree to guarantee fast random
254 * access to the bits. There is one radix tree for each zone (as returned 324 * access to the bits. There is one radix tree for each zone (as returned
255 * from create_mem_extents). 325 * from create_mem_extents).
256 * 326 *
257 * One radix tree is represented by one struct mem_zone_bm_rtree. There are 327 * One radix tree is represented by one struct mem_zone_bm_rtree. There are
258 * two linked lists for the nodes of the tree, one for the inner nodes and 328 * two linked lists for the nodes of the tree, one for the inner nodes and
259 * one for the leaf nodes. The linked leaf nodes are used for fast linear 329 * one for the leaf nodes. The linked leaf nodes are used for fast linear
260 * access of the memory bitmap. 330 * access of the memory bitmap.
261 * 331 *
262 * The struct rtree_node represents one node of the radix tree. 332 * The struct rtree_node represents one node of the radix tree.
263 */ 333 */
264 334
265#define BM_END_OF_MAP (~0UL) 335#define BM_END_OF_MAP (~0UL)
@@ -305,9 +375,8 @@ struct bm_position {
305struct memory_bitmap { 375struct memory_bitmap {
306 struct list_head zones; 376 struct list_head zones;
307 struct linked_page *p_list; /* list of pages used to store zone 377 struct linked_page *p_list; /* list of pages used to store zone
308 * bitmap objects and bitmap block 378 bitmap objects and bitmap block
309 * objects 379 objects */
310 */
311 struct bm_position cur; /* most recently used bit position */ 380 struct bm_position cur; /* most recently used bit position */
312}; 381};
313 382
@@ -321,12 +390,12 @@ struct memory_bitmap {
321#endif 390#endif
322#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) 391#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
323 392
324/* 393/**
325 * alloc_rtree_node - Allocate a new node and add it to the radix tree. 394 * alloc_rtree_node - Allocate a new node and add it to the radix tree.
326 * 395 *
327 * This function is used to allocate inner nodes as well as the 396 * This function is used to allocate inner nodes as well as the
328 * leaf nodes of the radix tree. It also adds the node to the 397 * leaf nodes of the radix tree. It also adds the node to the
329 * corresponding linked list passed in by the *list parameter. 398 * corresponding linked list passed in by the *list parameter.
330 */ 399 */
331static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, 400static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
332 struct chain_allocator *ca, 401 struct chain_allocator *ca,
@@ -347,12 +416,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
347 return node; 416 return node;
348} 417}
349 418
350/* 419/**
351 * add_rtree_block - Add a new leave node to the radix tree 420 * add_rtree_block - Add a new leaf node to the radix tree.
352 * 421 *
353 * The leaf nodes need to be allocated in order to keep the leaves 422 * The leaf nodes need to be allocated in order to keep the leaves
354 * linked list in order. This is guaranteed by the zone->blocks 423 * linked list in order. This is guaranteed by the zone->blocks
355 * counter. 424 * counter.
356 */ 425 */
357static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, 426static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
358 int safe_needed, struct chain_allocator *ca) 427 int safe_needed, struct chain_allocator *ca)
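As a rough sketch of how the zone->blocks counter drives the tree depth, the number of radix-tree levels needed to index leaf block N grows by one for every BM_RTREE_LEVEL_SHIFT bits of N; something along the lines of the following illustrative helper (not part of the file):

    /* Illustrative only: levels needed to index leaf block 'block_nr'. */
    static int rtree_levels_needed(unsigned long block_nr)
    {
            int levels = 0;

            while (block_nr) {
                    block_nr >>= BM_RTREE_LEVEL_SHIFT;  /* one level consumes this many index bits */
                    levels++;
            }
            return levels;
    }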
@@ -417,17 +486,18 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
417static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, 486static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
418 int clear_nosave_free); 487 int clear_nosave_free);
419 488
420/* 489/**
421 * create_zone_bm_rtree - create a radix tree for one zone 490 * create_zone_bm_rtree - Create a radix tree for one zone.
422 * 491 *
423 * Allocates the mem_zone_bm_rtree structure and initializes it. 492 * Allocates the mem_zone_bm_rtree structure and initializes it.
424 * This function also allocates and builds the radix tree for the 493 * This function also allocates and builds the radix tree for the
425 * zone. 494 * zone.
426 */ 495 */
427static struct mem_zone_bm_rtree * 496static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
428create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, 497 int safe_needed,
429 struct chain_allocator *ca, 498 struct chain_allocator *ca,
430 unsigned long start, unsigned long end) 499 unsigned long start,
500 unsigned long end)
431{ 501{
432 struct mem_zone_bm_rtree *zone; 502 struct mem_zone_bm_rtree *zone;
433 unsigned int i, nr_blocks; 503 unsigned int i, nr_blocks;
@@ -454,12 +524,12 @@ create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
454 return zone; 524 return zone;
455} 525}
456 526
457/* 527/**
458 * free_zone_bm_rtree - Free the memory of the radix tree 528 * free_zone_bm_rtree - Free the memory of the radix tree.
459 * 529 *
460 * Free all node pages of the radix tree. The mem_zone_bm_rtree 530 * Free all node pages of the radix tree. The mem_zone_bm_rtree
461 * structure itself is not freed here nor are the rtree_node 531 * structure itself is not freed here nor are the rtree_node
462 * structs. 532 * structs.
463 */ 533 */
464static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, 534static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
465 int clear_nosave_free) 535 int clear_nosave_free)
@@ -492,8 +562,8 @@ struct mem_extent {
492}; 562};
493 563
494/** 564/**
495 * free_mem_extents - free a list of memory extents 565 * free_mem_extents - Free a list of memory extents.
496 * @list - list of extents to empty 566 * @list: List of extents to free.
497 */ 567 */
498static void free_mem_extents(struct list_head *list) 568static void free_mem_extents(struct list_head *list)
499{ 569{
@@ -506,10 +576,11 @@ static void free_mem_extents(struct list_head *list)
506} 576}
507 577
508/** 578/**
509 * create_mem_extents - create a list of memory extents representing 579 * create_mem_extents - Create a list of memory extents.
510 * contiguous ranges of PFNs 580 * @list: List to put the extents into.
511 * @list - list to put the extents into 581 * @gfp_mask: Mask to use for memory allocations.
512 * @gfp_mask - mask to use for memory allocations 582 *
583 * The extents represent contiguous ranges of PFNs.
513 */ 584 */
514static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) 585static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
515{ 586{
@@ -565,10 +636,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
565} 636}
566 637
567/** 638/**
568 * memory_bm_create - allocate memory for a memory bitmap 639 * memory_bm_create - Allocate memory for a memory bitmap.
569 */ 640 */
570static int 641static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
571memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) 642 int safe_needed)
572{ 643{
573 struct chain_allocator ca; 644 struct chain_allocator ca;
574 struct list_head mem_extents; 645 struct list_head mem_extents;
@@ -607,8 +678,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
607} 678}
608 679
609/** 680/**
610 * memory_bm_free - free memory occupied by the memory bitmap @bm 681 * memory_bm_free - Free memory occupied by the memory bitmap.
611 */ 682 * @bm: Memory bitmap.
683 */
612static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) 684static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
613{ 685{
614 struct mem_zone_bm_rtree *zone; 686 struct mem_zone_bm_rtree *zone;
@@ -622,14 +694,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
622} 694}
623 695
624/** 696/**
625 * memory_bm_find_bit - Find the bit for pfn in the memory 697 * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap.
626 * bitmap
627 * 698 *
628 * Find the bit in the bitmap @bm that corresponds to given pfn. 699 * Find the bit in memory bitmap @bm that corresponds to the given PFN.
629 * The cur.zone, cur.block and cur.node_pfn member of @bm are 700 * The cur.zone, cur.block and cur.node_pfn members of @bm are updated.
630 * updated. 701 *
631 * It walks the radix tree to find the page which contains the bit for 702 * Walk the radix tree to find the page containing the bit that represents @pfn
632 * pfn and returns the bit position in **addr and *bit_nr. 703 * and return the position of the bit in @addr and @bit_nr.
633 */ 704 */
634static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, 705static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
635 void **addr, unsigned int *bit_nr) 706 void **addr, unsigned int *bit_nr)
@@ -658,10 +729,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
658 729
659zone_found: 730zone_found:
660 /* 731 /*
661 * We have a zone. Now walk the radix tree to find the leave 732 * We have found the zone. Now walk the radix tree to find the leaf node
662 * node for our pfn. 733 * for our PFN.
663 */ 734 */
664
665 node = bm->cur.node; 735 node = bm->cur.node;
666 if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) 736 if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
667 goto node_found; 737 goto node_found;
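The descent itself consumes BM_RTREE_LEVEL_SHIFT index bits of the block number per level; condensed, the walk looks roughly like this sketch (variable names as in the surrounding function):

    /* Condensed sketch of the level-by-level descent. */
    block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
    for (i = zone->levels; i > 0; i--) {
            int index = (block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT)) & BM_RTREE_LEVEL_MASK;

            node = (struct rtree_node *)node->data[index];  /* descend one level */
    }
    /* 'node' now points at the leaf page that holds the bit for 'pfn'. */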
@@ -754,14 +824,14 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
754} 824}
755 825
756/* 826/*
757 * rtree_next_node - Jumps to the next leave node 827 * rtree_next_node - Jump to the next leaf node.
758 * 828 *
759 * Sets the position to the beginning of the next node in the 829 * Set the position to the beginning of the next node in the
760 * memory bitmap. This is either the next node in the current 830 * memory bitmap. This is either the next node in the current
761 * zone's radix tree or the first node in the radix tree of the 831 * zone's radix tree or the first node in the radix tree of the
762 * next zone. 832 * next zone.
763 * 833 *
764 * Returns true if there is a next node, false otherwise. 834 * Return true if there is a next node, false otherwise.
765 */ 835 */
766static bool rtree_next_node(struct memory_bitmap *bm) 836static bool rtree_next_node(struct memory_bitmap *bm)
767{ 837{
@@ -790,14 +860,15 @@ static bool rtree_next_node(struct memory_bitmap *bm)
790} 860}
791 861
792/** 862/**
793 * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm 863 * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap.
864 * @bm: Memory bitmap.
794 * 865 *
795 * Starting from the last returned position this function searches 866 * Starting from the last returned position this function searches for the next
796 * for the next set bit in the memory bitmap and returns its 867 * set bit in @bm and returns the PFN represented by it. If no more bits are
797 * number. If no more bit is set BM_END_OF_MAP is returned. 868 * set, BM_END_OF_MAP is returned.
798 * 869 *
799 * It is required to run memory_bm_position_reset() before the 870 * It is required to run memory_bm_position_reset() before the first call to
800 * first call to this function. 871 * this function for the given memory bitmap.
801 */ 872 */
802static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) 873static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
803{ 874{
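A typical consumer of this iterator therefore looks like the following sketch (handle_pfn() is a hypothetical stand-in for the per-PFN work):

    /* Sketch of the iteration protocol described above. */
    memory_bm_position_reset(bm);
    for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP; pfn = memory_bm_next_pfn(bm))
            handle_pfn(pfn);  /* hypothetical per-PFN work */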
@@ -819,11 +890,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
819 return BM_END_OF_MAP; 890 return BM_END_OF_MAP;
820} 891}
821 892
822/** 893/*
823 * This structure represents a range of page frames the contents of which 894 * This structure represents a range of page frames the contents of which
824 * should not be saved during the suspend. 895 * should not be saved during hibernation.
825 */ 896 */
826
827struct nosave_region { 897struct nosave_region {
828 struct list_head list; 898 struct list_head list;
829 unsigned long start_pfn; 899 unsigned long start_pfn;
@@ -832,15 +902,42 @@ struct nosave_region {
832 902
833static LIST_HEAD(nosave_regions); 903static LIST_HEAD(nosave_regions);
834 904
905static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone)
906{
907 struct rtree_node *node;
908
909 list_for_each_entry(node, &zone->nodes, list)
910 recycle_safe_page(node->data);
911
912 list_for_each_entry(node, &zone->leaves, list)
913 recycle_safe_page(node->data);
914}
915
916static void memory_bm_recycle(struct memory_bitmap *bm)
917{
918 struct mem_zone_bm_rtree *zone;
919 struct linked_page *p_list;
920
921 list_for_each_entry(zone, &bm->zones, list)
922 recycle_zone_bm_rtree(zone);
923
924 p_list = bm->p_list;
925 while (p_list) {
926 struct linked_page *lp = p_list;
927
928 p_list = lp->next;
929 recycle_safe_page(lp);
930 }
931}
932
835/** 933/**
836 * register_nosave_region - register a range of page frames the contents 934 * register_nosave_region - Register a region of unsaveable memory.
837 * of which should not be saved during the suspend (to be used in the early 935 *
838 * initialization code) 936 * Register a range of page frames the contents of which should not be saved
937 * during hibernation (to be used in the early initialization code).
839 */ 938 */
840 939void __init __register_nosave_region(unsigned long start_pfn,
841void __init 940 unsigned long end_pfn, int use_kmalloc)
842__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
843 int use_kmalloc)
844{ 941{
845 struct nosave_region *region; 942 struct nosave_region *region;
846 943
@@ -857,12 +954,13 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
857 } 954 }
858 } 955 }
859 if (use_kmalloc) { 956 if (use_kmalloc) {
860 /* during init, this shouldn't fail */ 957 /* During init, this shouldn't fail */
861 region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); 958 region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
862 BUG_ON(!region); 959 BUG_ON(!region);
863 } else 960 } else {
864 /* This allocation cannot fail */ 961 /* This allocation cannot fail */
865 region = memblock_virt_alloc(sizeof(struct nosave_region), 0); 962 region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
963 }
866 region->start_pfn = start_pfn; 964 region->start_pfn = start_pfn;
867 region->end_pfn = end_pfn; 965 region->end_pfn = end_pfn;
868 list_add_tail(&region->list, &nosave_regions); 966 list_add_tail(&region->list, &nosave_regions);
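Callers normally go through the register_nosave_region()/register_nosave_region_late() wrappers in include/linux/suspend.h rather than calling this function directly; a hypothetical early-init call might look like the following (the range is purely illustrative):

    /* Illustrative only: exclude a firmware-reserved physical window from the image. */
    register_nosave_region(PFN_DOWN(fw_hole_start), PFN_UP(fw_hole_end));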
@@ -923,10 +1021,12 @@ static void swsusp_unset_page_forbidden(struct page *page)
923} 1021}
924 1022
925/** 1023/**
926 * mark_nosave_pages - set bits corresponding to the page frames the 1024 * mark_nosave_pages - Mark pages that should not be saved.
927 * contents of which should not be saved in a given bitmap. 1025 * @bm: Memory bitmap.
1026 *
1027 * Set the bits in @bm that correspond to the page frames the contents of which
1028 * should not be saved.
928 */ 1029 */
929
930static void mark_nosave_pages(struct memory_bitmap *bm) 1030static void mark_nosave_pages(struct memory_bitmap *bm)
931{ 1031{
932 struct nosave_region *region; 1032 struct nosave_region *region;
@@ -956,13 +1056,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
956} 1056}
957 1057
958/** 1058/**
959 * create_basic_memory_bitmaps - create bitmaps needed for marking page 1059 * create_basic_memory_bitmaps - Create bitmaps to hold basic page information.
960 * frames that should not be saved and free page frames. The pointers 1060 *
961 * forbidden_pages_map and free_pages_map are only modified if everything 1061 * Create bitmaps needed for marking page frames that should not be saved and
962 * goes well, because we don't want the bits to be used before both bitmaps 1062 * free page frames. The forbidden_pages_map and free_pages_map pointers are
963 * are set up. 1063 * only modified if everything goes well, because we don't want the bits to be
1064 * touched before both bitmaps are set up.
964 */ 1065 */
965
966int create_basic_memory_bitmaps(void) 1066int create_basic_memory_bitmaps(void)
967{ 1067{
968 struct memory_bitmap *bm1, *bm2; 1068 struct memory_bitmap *bm1, *bm2;
@@ -1007,12 +1107,12 @@ int create_basic_memory_bitmaps(void)
1007} 1107}
1008 1108
1009/** 1109/**
1010 * free_basic_memory_bitmaps - free memory bitmaps allocated by 1110 * free_basic_memory_bitmaps - Free memory bitmaps holding basic information.
1011 * create_basic_memory_bitmaps(). The auxiliary pointers are necessary 1111 *
1012 * so that the bitmaps themselves are not referred to while they are being 1112 * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The
1013 * freed. 1113 * auxiliary pointers are necessary so that the bitmaps themselves are not
1114 * referred to while they are being freed.
1014 */ 1115 */
1015
1016void free_basic_memory_bitmaps(void) 1116void free_basic_memory_bitmaps(void)
1017{ 1117{
1018 struct memory_bitmap *bm1, *bm2; 1118 struct memory_bitmap *bm1, *bm2;
@@ -1033,11 +1133,13 @@ void free_basic_memory_bitmaps(void)
1033} 1133}
1034 1134
1035/** 1135/**
1036 * snapshot_additional_pages - estimate the number of additional pages 1136 * snapshot_additional_pages - Estimate the number of extra pages needed.
1037 * be needed for setting up the suspend image data structures for given 1137 * @zone: Memory zone to carry out the computation for.
1038 * zone (usually the returned value is greater than the exact number) 1138 *
1139 * Estimate the number of additional pages needed for setting up the hibernation
1140 * image data structures for @zone (usually, the returned value is greater than
1141 * the exact number).
1039 */ 1142 */
1040
1041unsigned int snapshot_additional_pages(struct zone *zone) 1143unsigned int snapshot_additional_pages(struct zone *zone)
1042{ 1144{
1043 unsigned int rtree, nodes; 1145 unsigned int rtree, nodes;
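For a sense of scale, a back-of-the-envelope example (illustrative figures, 4 KiB pages assumed; this is not the function's exact formula):

    /* Illustrative arithmetic only. */
    unsigned long zone_pfns = (1UL << 30) / 4096;            /* 1 GiB zone: 262144 page frames */
    unsigned long bits_per_block = 8 * 4096;                 /* one leaf page holds 32768 bits */
    unsigned long leaf_pages = DIV_ROUND_UP(zone_pfns, bits_per_block);  /* = 8 per bitmap */
    /* plus a handful of rtree_node pages on top, which is tiny compared with the zone itself */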
@@ -1055,10 +1157,10 @@ unsigned int snapshot_additional_pages(struct zone *zone)
1055 1157
1056#ifdef CONFIG_HIGHMEM 1158#ifdef CONFIG_HIGHMEM
1057/** 1159/**
1058 * count_free_highmem_pages - compute the total number of free highmem 1160 * count_free_highmem_pages - Compute the total number of free highmem pages.
1059 * pages, system-wide. 1161 *
1162 * The returned number is system-wide.
1060 */ 1163 */
1061
1062static unsigned int count_free_highmem_pages(void) 1164static unsigned int count_free_highmem_pages(void)
1063{ 1165{
1064 struct zone *zone; 1166 struct zone *zone;
@@ -1072,11 +1174,12 @@ static unsigned int count_free_highmem_pages(void)
1072} 1174}
1073 1175
1074/** 1176/**
1075 * saveable_highmem_page - Determine whether a highmem page should be 1177 * saveable_highmem_page - Check if a highmem page is saveable.
1076 * included in the suspend image.
1077 * 1178 *
1078 * We should save the page if it isn't Nosave or NosaveFree, or Reserved, 1179 * Determine whether a highmem page should be included in a hibernation image.
1079 * and it isn't a part of a free chunk of pages. 1180 *
1181 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
1182 * and it isn't part of a free chunk of pages.
1080 */ 1183 */
1081static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) 1184static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
1082{ 1185{
@@ -1102,10 +1205,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
1102} 1205}
1103 1206
1104/** 1207/**
1105 * count_highmem_pages - compute the total number of saveable highmem 1208 * count_highmem_pages - Compute the total number of saveable highmem pages.
1106 * pages.
1107 */ 1209 */
1108
1109static unsigned int count_highmem_pages(void) 1210static unsigned int count_highmem_pages(void)
1110{ 1211{
1111 struct zone *zone; 1212 struct zone *zone;
@@ -1133,12 +1234,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
1133#endif /* CONFIG_HIGHMEM */ 1234#endif /* CONFIG_HIGHMEM */
1134 1235
1135/** 1236/**
1136 * saveable_page - Determine whether a non-highmem page should be included 1237 * saveable_page - Check if the given page is saveable.
1137 * in the suspend image.
1138 * 1238 *
1139 * We should save the page if it isn't Nosave, and is not in the range 1239 * Determine whether a non-highmem page should be included in a hibernation
1140 * of pages statically defined as 'unsaveable', and it isn't a part of 1240 * image.
1141 * a free chunk of pages. 1241 *
1242 * We should save the page if it isn't Nosave, and is not in the range
1243 * of pages statically defined as 'unsaveable', and it isn't part of
1244 * a free chunk of pages.
1142 */ 1245 */
1143static struct page *saveable_page(struct zone *zone, unsigned long pfn) 1246static struct page *saveable_page(struct zone *zone, unsigned long pfn)
1144{ 1247{
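Condensed, the checks described above boil down to roughly the following (the real function also validates the PFN and its zone; the helper names used here are the kernel's own):

    /* Sketch of the saveability checks; 'page' and 'pfn' assumed valid and within 'zone'. */
    if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
            return NULL;  /* Nosave, or part of a free chunk */
    if (PageReserved(page) && pfn_is_nosave(pfn))
            return NULL;  /* statically defined as 'unsaveable' */
    return page;          /* otherwise the page should be saved */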
@@ -1167,10 +1270,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
1167} 1270}
1168 1271
1169/** 1272/**
1170 * count_data_pages - compute the total number of saveable non-highmem 1273 * count_data_pages - Compute the total number of saveable non-highmem pages.
1171 * pages.
1172 */ 1274 */
1173
1174static unsigned int count_data_pages(void) 1275static unsigned int count_data_pages(void)
1175{ 1276{
1176 struct zone *zone; 1277 struct zone *zone;
@@ -1190,7 +1291,8 @@ static unsigned int count_data_pages(void)
1190 return n; 1291 return n;
1191} 1292}
1192 1293
1193/* This is needed, because copy_page and memcpy are not usable for copying 1294/*
1295 * This is needed, because copy_page and memcpy are not usable for copying
1194 * task structs. 1296 * task structs.
1195 */ 1297 */
1196static inline void do_copy_page(long *dst, long *src) 1298static inline void do_copy_page(long *dst, long *src)
@@ -1201,12 +1303,12 @@ static inline void do_copy_page(long *dst, long *src)
1201 *dst++ = *src++; 1303 *dst++ = *src++;
1202} 1304}
1203 1305
1204
1205/** 1306/**
1206 * safe_copy_page - check if the page we are going to copy is marked as 1307 * safe_copy_page - Copy a page in a safe way.
1207 * present in the kernel page tables (this always is the case if 1308 *
1208 * CONFIG_DEBUG_PAGEALLOC is not set and in that case 1309 * Check if the page we are going to copy is marked as present in the kernel
1209 * kernel_page_present() always returns 'true'). 1310 * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
1311 * and in that case kernel_page_present() always returns 'true').
1210 */ 1312 */
1211static void safe_copy_page(void *dst, struct page *s_page) 1313static void safe_copy_page(void *dst, struct page *s_page)
1212{ 1314{
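In other words, roughly (a sketch assuming the CONFIG_DEBUG_PAGEALLOC kernel_map_pages() interface):

    /* Sketch: temporarily map the source page if it is not present in the kernel page tables. */
    if (kernel_page_present(s_page)) {
            do_copy_page(dst, page_address(s_page));
    } else {
            kernel_map_pages(s_page, 1, 1);  /* one page, enable the mapping */
            do_copy_page(dst, page_address(s_page));
            kernel_map_pages(s_page, 1, 0);  /* restore the unmapped state */
    }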
@@ -1219,10 +1321,8 @@ static void safe_copy_page(void *dst, struct page *s_page)
1219 } 1321 }
1220} 1322}
1221 1323
1222
1223#ifdef CONFIG_HIGHMEM 1324#ifdef CONFIG_HIGHMEM
1224static inline struct page * 1325static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn)
1225page_is_saveable(struct zone *zone, unsigned long pfn)
1226{ 1326{
1227 return is_highmem(zone) ? 1327 return is_highmem(zone) ?
1228 saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); 1328 saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
@@ -1243,7 +1343,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1243 kunmap_atomic(src); 1343 kunmap_atomic(src);
1244 } else { 1344 } else {
1245 if (PageHighMem(d_page)) { 1345 if (PageHighMem(d_page)) {
1246 /* Page pointed to by src may contain some kernel 1346 /*
1347 * The page pointed to by src may contain some kernel
1247 * data modified by kmap_atomic() 1348 * data modified by kmap_atomic()
1248 */ 1349 */
1249 safe_copy_page(buffer, s_page); 1350 safe_copy_page(buffer, s_page);
@@ -1265,8 +1366,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1265} 1366}
1266#endif /* CONFIG_HIGHMEM */ 1367#endif /* CONFIG_HIGHMEM */
1267 1368
1268static void 1369static void copy_data_pages(struct memory_bitmap *copy_bm,
1269copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) 1370 struct memory_bitmap *orig_bm)
1270{ 1371{
1271 struct zone *zone; 1372 struct zone *zone;
1272 unsigned long pfn; 1373 unsigned long pfn;
@@ -1315,12 +1416,11 @@ static struct memory_bitmap orig_bm;
1315static struct memory_bitmap copy_bm; 1416static struct memory_bitmap copy_bm;
1316 1417
1317/** 1418/**
1318 * swsusp_free - free pages allocated for the suspend. 1419 * swsusp_free - Free pages allocated for hibernation image.
1319 * 1420 *
1320 * Suspend pages are alocated before the atomic copy is made, so we 1421 * Image pages are allocated before snapshot creation, so they need to be
1321 * need to release them after the resume. 1422 * released after resume.
1322 */ 1423 */
1323
1324void swsusp_free(void) 1424void swsusp_free(void)
1325{ 1425{
1326 unsigned long fb_pfn, fr_pfn; 1426 unsigned long fb_pfn, fr_pfn;
@@ -1351,6 +1451,7 @@ loop:
1351 1451
1352 memory_bm_clear_current(forbidden_pages_map); 1452 memory_bm_clear_current(forbidden_pages_map);
1353 memory_bm_clear_current(free_pages_map); 1453 memory_bm_clear_current(free_pages_map);
1454 hibernate_restore_unprotect_page(page_address(page));
1354 __free_page(page); 1455 __free_page(page);
1355 goto loop; 1456 goto loop;
1356 } 1457 }
@@ -1362,6 +1463,7 @@ out:
1362 buffer = NULL; 1463 buffer = NULL;
1363 alloc_normal = 0; 1464 alloc_normal = 0;
1364 alloc_highmem = 0; 1465 alloc_highmem = 0;
1466 hibernate_restore_protection_end();
1365} 1467}
1366 1468
1367/* Helper functions used for the shrinking of memory. */ 1469/* Helper functions used for the shrinking of memory. */
@@ -1369,7 +1471,7 @@ out:
1369#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) 1471#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
1370 1472
1371/** 1473/**
1372 * preallocate_image_pages - Allocate a number of pages for hibernation image 1474 * preallocate_image_pages - Allocate a number of pages for hibernation image.
1373 * @nr_pages: Number of page frames to allocate. 1475 * @nr_pages: Number of page frames to allocate.
1374 * @mask: GFP flags to use for the allocation. 1476 * @mask: GFP flags to use for the allocation.
1375 * 1477 *
@@ -1419,7 +1521,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)
1419} 1521}
1420 1522
1421/** 1523/**
1422 * __fraction - Compute (an approximation of) x * (multiplier / base) 1524 * __fraction - Compute (an approximation of) x * (multiplier / base).
1423 */ 1525 */
1424static unsigned long __fraction(u64 x, u64 multiplier, u64 base) 1526static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1425{ 1527{
@@ -1429,8 +1531,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1429} 1531}
1430 1532
1431static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, 1533static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1432 unsigned long highmem, 1534 unsigned long highmem,
1433 unsigned long total) 1535 unsigned long total)
1434{ 1536{
1435 unsigned long alloc = __fraction(nr_pages, highmem, total); 1537 unsigned long alloc = __fraction(nr_pages, highmem, total);
1436 1538
@@ -1443,15 +1545,15 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
1443} 1545}
1444 1546
1445static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, 1547static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1446 unsigned long highmem, 1548 unsigned long highmem,
1447 unsigned long total) 1549 unsigned long total)
1448{ 1550{
1449 return 0; 1551 return 0;
1450} 1552}
1451#endif /* CONFIG_HIGHMEM */ 1553#endif /* CONFIG_HIGHMEM */
1452 1554
1453/** 1555/**
1454 * free_unnecessary_pages - Release preallocated pages not needed for the image 1556 * free_unnecessary_pages - Release preallocated pages not needed for the image.
1455 */ 1557 */
1456static unsigned long free_unnecessary_pages(void) 1558static unsigned long free_unnecessary_pages(void)
1457{ 1559{
@@ -1505,7 +1607,7 @@ static unsigned long free_unnecessary_pages(void)
1505} 1607}
1506 1608
1507/** 1609/**
1508 * minimum_image_size - Estimate the minimum acceptable size of an image 1610 * minimum_image_size - Estimate the minimum acceptable size of an image.
1509 * @saveable: Number of saveable pages in the system. 1611 * @saveable: Number of saveable pages in the system.
1510 * 1612 *
1511 * We want to avoid attempting to free too much memory too hard, so estimate the 1613 * We want to avoid attempting to free too much memory too hard, so estimate the
@@ -1525,17 +1627,17 @@ static unsigned long minimum_image_size(unsigned long saveable)
1525 unsigned long size; 1627 unsigned long size;
1526 1628
1527 size = global_page_state(NR_SLAB_RECLAIMABLE) 1629 size = global_page_state(NR_SLAB_RECLAIMABLE)
1528 + global_page_state(NR_ACTIVE_ANON) 1630 + global_node_page_state(NR_ACTIVE_ANON)
1529 + global_page_state(NR_INACTIVE_ANON) 1631 + global_node_page_state(NR_INACTIVE_ANON)
1530 + global_page_state(NR_ACTIVE_FILE) 1632 + global_node_page_state(NR_ACTIVE_FILE)
1531 + global_page_state(NR_INACTIVE_FILE) 1633 + global_node_page_state(NR_INACTIVE_FILE)
1532 - global_page_state(NR_FILE_MAPPED); 1634 - global_node_page_state(NR_FILE_MAPPED);
1533 1635
1534 return saveable <= size ? 0 : saveable - size; 1636 return saveable <= size ? 0 : saveable - size;
1535} 1637}
1536 1638
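A quick numeric illustration of the estimate above (made-up figures):

    /* Illustrative figures only. */
    unsigned long saveable = 1000000;  /* saveable pages in the system */
    unsigned long size = 600000;       /* reclaimable slab plus anon/file LRU, minus NR_FILE_MAPPED */
    unsigned long min_image = saveable <= size ? 0 : saveable - size;  /* = 400000 pages */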
1537/** 1639/**
1538 * hibernate_preallocate_memory - Preallocate memory for hibernation image 1640 * hibernate_preallocate_memory - Preallocate memory for hibernation image.
1539 * 1641 *
1540 * To create a hibernation image it is necessary to make a copy of every page 1642 * To create a hibernation image it is necessary to make a copy of every page
1541 * frame in use. We also need a number of page frames to be free during 1643 * frame in use. We also need a number of page frames to be free during
@@ -1708,10 +1810,11 @@ int hibernate_preallocate_memory(void)
1708 1810
1709#ifdef CONFIG_HIGHMEM 1811#ifdef CONFIG_HIGHMEM
1710/** 1812/**
1711 * count_pages_for_highmem - compute the number of non-highmem pages 1813 * count_pages_for_highmem - Count non-highmem pages needed for copying highmem.
1712 * that will be necessary for creating copies of highmem pages. 1814 *
1713 */ 1815 * Compute the number of non-highmem pages that will be necessary for creating
1714 1816 * copies of highmem pages.
1817 */
1715static unsigned int count_pages_for_highmem(unsigned int nr_highmem) 1818static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1716{ 1819{
1717 unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; 1820 unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
@@ -1724,15 +1827,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1724 return nr_highmem; 1827 return nr_highmem;
1725} 1828}
1726#else 1829#else
1727static unsigned int 1830static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1728count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1729#endif /* CONFIG_HIGHMEM */ 1831#endif /* CONFIG_HIGHMEM */
1730 1832
1731/** 1833/**
1732 * enough_free_mem - Make sure we have enough free memory for the 1834 * enough_free_mem - Check if there is enough free memory for the image.
1733 * snapshot image.
1734 */ 1835 */
1735
1736static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) 1836static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1737{ 1837{
1738 struct zone *zone; 1838 struct zone *zone;
@@ -1751,10 +1851,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1751 1851
1752#ifdef CONFIG_HIGHMEM 1852#ifdef CONFIG_HIGHMEM
1753/** 1853/**
1754 * get_highmem_buffer - if there are some highmem pages in the suspend 1854 * get_highmem_buffer - Allocate a buffer for highmem pages.
1755 * image, we may need the buffer to copy them and/or load their data. 1855 *
1856 * If there are some highmem pages in the hibernation image, we may need a
1857 * buffer to copy them and/or load their data.
1756 */ 1858 */
1757
1758static inline int get_highmem_buffer(int safe_needed) 1859static inline int get_highmem_buffer(int safe_needed)
1759{ 1860{
1760 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); 1861 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
@@ -1762,13 +1863,13 @@ static inline int get_highmem_buffer(int safe_needed)
1762} 1863}
1763 1864
1764/** 1865/**
1765 * alloc_highmem_image_pages - allocate some highmem pages for the image. 1866 * alloc_highmem_image_pages - Allocate some highmem pages for the image.
1766 * Try to allocate as many pages as needed, but if the number of free 1867 *
1767 * highmem pages is lesser than that, allocate them all. 1868 * Try to allocate as many pages as needed, but if the number of free highmem
1869 * pages is less than that, allocate them all.
1768 */ 1870 */
1769 1871static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
1770static inline unsigned int 1872 unsigned int nr_highmem)
1771alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1772{ 1873{
1773 unsigned int to_alloc = count_free_highmem_pages(); 1874 unsigned int to_alloc = count_free_highmem_pages();
1774 1875
@@ -1787,25 +1888,24 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1787#else 1888#else
1788static inline int get_highmem_buffer(int safe_needed) { return 0; } 1889static inline int get_highmem_buffer(int safe_needed) { return 0; }
1789 1890
1790static inline unsigned int 1891static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
1791alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } 1892 unsigned int n) { return 0; }
1792#endif /* CONFIG_HIGHMEM */ 1893#endif /* CONFIG_HIGHMEM */
1793 1894
1794/** 1895/**
1795 * swsusp_alloc - allocate memory for the suspend image 1896 * swsusp_alloc - Allocate memory for hibernation image.
1796 * 1897 *
1797 * We first try to allocate as many highmem pages as there are 1898 * We first try to allocate as many highmem pages as there are
1798 * saveable highmem pages in the system. If that fails, we allocate 1899 * saveable highmem pages in the system. If that fails, we allocate
1799 * non-highmem pages for the copies of the remaining highmem ones. 1900 * non-highmem pages for the copies of the remaining highmem ones.
1800 * 1901 *
1801 * In this approach it is likely that the copies of highmem pages will 1902 * In this approach it is likely that the copies of highmem pages will
1802 * also be located in the high memory, because of the way in which 1903 * also be located in the high memory, because of the way in which
1803 * copy_data_pages() works. 1904 * copy_data_pages() works.
1804 */ 1905 */
1805 1906static int swsusp_alloc(struct memory_bitmap *orig_bm,
1806static int 1907 struct memory_bitmap *copy_bm,
1807swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1908 unsigned int nr_pages, unsigned int nr_highmem)
1808 unsigned int nr_pages, unsigned int nr_highmem)
1809{ 1909{
1810 if (nr_highmem > 0) { 1910 if (nr_highmem > 0) {
1811 if (get_highmem_buffer(PG_ANY)) 1911 if (get_highmem_buffer(PG_ANY))
@@ -1855,7 +1955,8 @@ asmlinkage __visible int swsusp_save(void)
1855 return -ENOMEM; 1955 return -ENOMEM;
1856 } 1956 }
1857 1957
1858 /* During allocating of suspend pagedir, new cold pages may appear. 1958 /*
1959 * During allocating of suspend pagedir, new cold pages may appear.
1859 * Kill them. 1960 * Kill them.
1860 */ 1961 */
1861 drain_local_pages(NULL); 1962 drain_local_pages(NULL);
@@ -1918,12 +2019,14 @@ static int init_header(struct swsusp_info *info)
1918} 2019}
1919 2020
1920/** 2021/**
1921 * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm 2022 * pack_pfns - Prepare PFNs for saving.
1922 * are stored in the array @buf[] (1 page at a time) 2023 * @bm: Memory bitmap.
2024 * @buf: Memory buffer to store the PFNs in.
2025 *
2026 * PFNs corresponding to set bits in @bm are stored in the area of memory
2027 * pointed to by @buf (1 page at a time).
1923 */ 2028 */
1924 2029static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1925static inline void
1926pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1927{ 2030{
1928 int j; 2031 int j;
1929 2032
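The loop this comment describes is essentially the following sketch (the real function additionally records the s390 page key for each entry):

    /* Sketch of pack_pfns(): one page worth of PFNs per call. */
    for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
            buf[j] = memory_bm_next_pfn(bm);
            if (unlikely(buf[j] == BM_END_OF_MAP))
                    break;
    }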
@@ -1937,22 +2040,21 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1937} 2040}
1938 2041
1939/** 2042/**
1940 * snapshot_read_next - used for reading the system memory snapshot. 2043 * snapshot_read_next - Get the address to read the next image page from.
2044 * @handle: Snapshot handle to be used for the reading.
1941 * 2045 *
1942 * On the first call to it @handle should point to a zeroed 2046 * On the first call, @handle should point to a zeroed snapshot_handle
1943 * snapshot_handle structure. The structure gets updated and a pointer 2047 * structure. The structure is then populated and a pointer to it should be
1944 * to it should be passed to this function every next time. 2048 * passed to this function on every subsequent call.
1945 * 2049 *
1946 * On success the function returns a positive number. Then, the caller 2050 * On success, the function returns a positive number. Then, the caller
1947 * is allowed to read up to the returned number of bytes from the memory 2051 * is allowed to read up to the returned number of bytes from the memory
1948 * location computed by the data_of() macro. 2052 * location computed by the data_of() macro.
1949 * 2053 *
1950 * The function returns 0 to indicate the end of data stream condition, 2054 * The function returns 0 to indicate the end of the data stream condition,
1951 * and a negative number is returned on error. In such cases the 2055 * and negative numbers are returned on errors. If that happens, the structure
1952 * structure pointed to by @handle is not updated and should not be used 2056 * pointed to by @handle is not updated and should not be used any more.
1953 * any more.
1954 */ 2057 */
1955
1956int snapshot_read_next(struct snapshot_handle *handle) 2058int snapshot_read_next(struct snapshot_handle *handle)
1957{ 2059{
1958 if (handle->cur > nr_meta_pages + nr_copy_pages) 2060 if (handle->cur > nr_meta_pages + nr_copy_pages)
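Seen from the caller's side, the protocol described above amounts to something like the sketch below (write_out() is a hypothetical stand-in for whatever sink the image writer uses; error handling elided):

    /* Sketch of a snapshot reader loop. */
    struct snapshot_handle handle;
    int n;

    memset(&handle, 0, sizeof(handle));
    while ((n = snapshot_read_next(&handle)) > 0)
            write_out(data_of(handle), n);  /* up to n bytes may be read from data_of(handle) */
    /* n == 0: end of the image data; n < 0: error */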
@@ -1981,7 +2083,8 @@ int snapshot_read_next(struct snapshot_handle *handle)
1981 2083
1982 page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); 2084 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1983 if (PageHighMem(page)) { 2085 if (PageHighMem(page)) {
1984 /* Highmem pages are copied to the buffer, 2086 /*
2087 * Highmem pages are copied to the buffer,
1985 * because we can't return with a kmapped 2088 * because we can't return with a kmapped
1986 * highmem page (we may not be called again). 2089 * highmem page (we may not be called again).
1987 */ 2090 */
@@ -1999,53 +2102,41 @@ int snapshot_read_next(struct snapshot_handle *handle)
1999 return PAGE_SIZE; 2102 return PAGE_SIZE;
2000} 2103}
2001 2104
2002/** 2105static void duplicate_memory_bitmap(struct memory_bitmap *dst,
2003 * mark_unsafe_pages - mark the pages that cannot be used for storing 2106 struct memory_bitmap *src)
2004 * the image during resume, because they conflict with the pages that
2005 * had been used before suspend
2006 */
2007
2008static int mark_unsafe_pages(struct memory_bitmap *bm)
2009{ 2107{
2010 struct zone *zone; 2108 unsigned long pfn;
2011 unsigned long pfn, max_zone_pfn;
2012 2109
2013 /* Clear page flags */ 2110 memory_bm_position_reset(src);
2014 for_each_populated_zone(zone) { 2111 pfn = memory_bm_next_pfn(src);
2015 max_zone_pfn = zone_end_pfn(zone); 2112 while (pfn != BM_END_OF_MAP) {
2016 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 2113 memory_bm_set_bit(dst, pfn);
2017 if (pfn_valid(pfn)) 2114 pfn = memory_bm_next_pfn(src);
2018 swsusp_unset_page_free(pfn_to_page(pfn));
2019 } 2115 }
2020
2021 /* Mark pages that correspond to the "original" pfns as "unsafe" */
2022 memory_bm_position_reset(bm);
2023 do {
2024 pfn = memory_bm_next_pfn(bm);
2025 if (likely(pfn != BM_END_OF_MAP)) {
2026 if (likely(pfn_valid(pfn)))
2027 swsusp_set_page_free(pfn_to_page(pfn));
2028 else
2029 return -EFAULT;
2030 }
2031 } while (pfn != BM_END_OF_MAP);
2032
2033 allocated_unsafe_pages = 0;
2034
2035 return 0;
2036} 2116}
2037 2117
2038static void 2118/**
2039duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) 2119 * mark_unsafe_pages - Mark pages that were used before hibernation.
2120 *
2121 * Mark the pages that cannot be used for storing the image during restoration,
2122 * because they conflict with the pages that had been used before hibernation.
2123 */
2124static void mark_unsafe_pages(struct memory_bitmap *bm)
2040{ 2125{
2041 unsigned long pfn; 2126 unsigned long pfn;
2042 2127
2043 memory_bm_position_reset(src); 2128 /* Clear the "free"/"unsafe" bit for all PFNs */
2044 pfn = memory_bm_next_pfn(src); 2129 memory_bm_position_reset(free_pages_map);
2130 pfn = memory_bm_next_pfn(free_pages_map);
2045 while (pfn != BM_END_OF_MAP) { 2131 while (pfn != BM_END_OF_MAP) {
2046 memory_bm_set_bit(dst, pfn); 2132 memory_bm_clear_current(free_pages_map);
2047 pfn = memory_bm_next_pfn(src); 2133 pfn = memory_bm_next_pfn(free_pages_map);
2048 } 2134 }
2135
2136 /* Mark pages that correspond to the "original" PFNs as "unsafe" */
2137 duplicate_memory_bitmap(free_pages_map, bm);
2138
2139 allocated_unsafe_pages = 0;
2049} 2140}
2050 2141
2051static int check_header(struct swsusp_info *info) 2142static int check_header(struct swsusp_info *info)
@@ -2063,11 +2154,9 @@ static int check_header(struct swsusp_info *info)
2063} 2154}
2064 2155
2065/** 2156/**
2066 * load header - check the image header and copy data from it 2157 * load_header - Check the image header and copy the data from it.
2067 */ 2158 */
2068 2159static int load_header(struct swsusp_info *info)
2069static int
2070load_header(struct swsusp_info *info)
2071{ 2160{
2072 int error; 2161 int error;
2073 2162
@@ -2081,8 +2170,12 @@ load_header(struct swsusp_info *info)
2081} 2170}
2082 2171
2083/** 2172/**
2084 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set 2173 * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap.
2085 * the corresponding bit in the memory bitmap @bm 2174 * @bm: Memory bitmap.
2175 * @buf: Area of memory containing the PFNs.
2176 *
2177 * For each element of the array pointed to by @buf (1 page at a time), set the
2178 * corresponding bit in @bm.
2086 */ 2179 */
2087static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) 2180static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
2088{ 2181{
@@ -2095,7 +2188,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
2095 /* Extract and buffer page key for data page (s390 only). */ 2188 /* Extract and buffer page key for data page (s390 only). */
2096 page_key_memorize(buf + j); 2189 page_key_memorize(buf + j);
2097 2190
2098 if (memory_bm_pfn_present(bm, buf[j])) 2191 if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
2099 memory_bm_set_bit(bm, buf[j]); 2192 memory_bm_set_bit(bm, buf[j]);
2100 else 2193 else
2101 return -EFAULT; 2194 return -EFAULT;
@@ -2104,13 +2197,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
2104 return 0; 2197 return 0;
2105} 2198}
2106 2199
2107/* List of "safe" pages that may be used to store data loaded from the suspend
2108 * image
2109 */
2110static struct linked_page *safe_pages_list;
2111
2112#ifdef CONFIG_HIGHMEM 2200#ifdef CONFIG_HIGHMEM
2113/* struct highmem_pbe is used for creating the list of highmem pages that 2201/*
2202 * struct highmem_pbe is used for creating the list of highmem pages that
2114 * should be restored atomically during the resume from disk, because the page 2203 * should be restored atomically during the resume from disk, because the page
2115 * frames they have occupied before the suspend are in use. 2204 * frames they have occupied before the suspend are in use.
2116 */ 2205 */
@@ -2120,7 +2209,8 @@ struct highmem_pbe {
2120 struct highmem_pbe *next; 2209 struct highmem_pbe *next;
2121}; 2210};
2122 2211
2123/* List of highmem PBEs needed for restoring the highmem pages that were 2212/*
2213 * List of highmem PBEs needed for restoring the highmem pages that were
2124 * allocated before the suspend and included in the suspend image, but have 2214 * allocated before the suspend and included in the suspend image, but have
2125 * also been allocated by the "resume" kernel, so their contents cannot be 2215 * also been allocated by the "resume" kernel, so their contents cannot be
2126 * written directly to their "original" page frames. 2216 * written directly to their "original" page frames.
@@ -2128,11 +2218,11 @@ struct highmem_pbe {
2128static struct highmem_pbe *highmem_pblist; 2218static struct highmem_pbe *highmem_pblist;
2129 2219
2130/** 2220/**
2131 * count_highmem_image_pages - compute the number of highmem pages in the 2221 * count_highmem_image_pages - Compute the number of highmem pages in the image.
2132 * suspend image. The bits in the memory bitmap @bm that correspond to the 2222 * @bm: Memory bitmap.
2133 * image pages are assumed to be set. 2223 *
2224 * The bits in @bm that correspond to image pages are assumed to be set.
2134 */ 2225 */
2135
2136static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) 2226static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
2137{ 2227{
2138 unsigned long pfn; 2228 unsigned long pfn;
@@ -2149,24 +2239,25 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
2149 return cnt; 2239 return cnt;
2150} 2240}
2151 2241
2152/**
2153 * prepare_highmem_image - try to allocate as many highmem pages as
2154 * there are highmem image pages (@nr_highmem_p points to the variable
2155 * containing the number of highmem image pages). The pages that are
2156 * "safe" (ie. will not be overwritten when the suspend image is
2157 * restored) have the corresponding bits set in @bm (it must be
2158 * unitialized).
2159 *
2160 * NOTE: This function should not be called if there are no highmem
2161 * image pages.
2162 */
2163
2164static unsigned int safe_highmem_pages; 2242static unsigned int safe_highmem_pages;
2165 2243
2166static struct memory_bitmap *safe_highmem_bm; 2244static struct memory_bitmap *safe_highmem_bm;
2167 2245
2168static int 2246/**
2169prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) 2247 * prepare_highmem_image - Allocate memory for loading highmem data from image.
2248 * @bm: Pointer to an uninitialized memory bitmap structure.
2249 * @nr_highmem_p: Pointer to the number of highmem image pages.
2250 *
2251 * Try to allocate as many highmem pages as there are highmem image pages
2252 * (@nr_highmem_p points to the variable containing the number of highmem image
2253 * pages). The pages that are "safe" (ie. will not be overwritten when the
2254 * hibernation image is restored entirely) have the corresponding bits set in
2255 * @bm (it must be unitialized).
2256 *
2257 * NOTE: This function should not be called if there are no highmem image pages.
2258 */
2259static int prepare_highmem_image(struct memory_bitmap *bm,
2260 unsigned int *nr_highmem_p)
2170{ 2261{
2171 unsigned int to_alloc; 2262 unsigned int to_alloc;
2172 2263
@@ -2201,39 +2292,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
2201 return 0; 2292 return 0;
2202} 2293}
2203 2294
2295static struct page *last_highmem_page;
2296
2204/** 2297/**
2205 * get_highmem_page_buffer - for given highmem image page find the buffer 2298 * get_highmem_page_buffer - Prepare a buffer to store a highmem image page.
2206 * that suspend_write_next() should set for its caller to write to.
2207 * 2299 *
2208 * If the page is to be saved to its "original" page frame or a copy of 2300 * For a given highmem image page get a buffer that suspend_write_next() should
2209 * the page is to be made in the highmem, @buffer is returned. Otherwise, 2301 * return to its caller to write to.
2210 * the copy of the page is to be made in normal memory, so the address of
2211 * the copy is returned.
2212 * 2302 *
2213 * If @buffer is returned, the caller of suspend_write_next() will write 2303 * If the page is to be saved to its "original" page frame or a copy of
2214 * the page's contents to @buffer, so they will have to be copied to the 2304 * the page is to be made in the highmem, @buffer is returned. Otherwise,
2215 * right location on the next call to suspend_write_next() and it is done 2305 * the copy of the page is to be made in normal memory, so the address of
2216 * with the help of copy_last_highmem_page(). For this purpose, if 2306 * the copy is returned.
2217 * @buffer is returned, @last_highmem page is set to the page to which 2307 *
2218 * the data will have to be copied from @buffer. 2308 * If @buffer is returned, the caller of suspend_write_next() will write
2309 * the page's contents to @buffer, so they will have to be copied to the
2310 * right location on the next call to suspend_write_next() and it is done
2311 * with the help of copy_last_highmem_page(). For this purpose, if
2312 * @buffer is returned, @last_highmem_page is set to the page to which
2313 * the data will have to be copied from @buffer.
2219 */ 2314 */
2220 2315static void *get_highmem_page_buffer(struct page *page,
2221static struct page *last_highmem_page; 2316 struct chain_allocator *ca)
2222
2223static void *
2224get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
2225{ 2317{
2226 struct highmem_pbe *pbe; 2318 struct highmem_pbe *pbe;
2227 void *kaddr; 2319 void *kaddr;
2228 2320
2229 if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { 2321 if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
2230 /* We have allocated the "original" page frame and we can 2322 /*
2323 * We have allocated the "original" page frame and we can
2231 * use it directly to store the loaded page. 2324 * use it directly to store the loaded page.
2232 */ 2325 */
2233 last_highmem_page = page; 2326 last_highmem_page = page;
2234 return buffer; 2327 return buffer;
2235 } 2328 }
2236 /* The "original" page frame has not been allocated and we have to 2329 /*
2330 * The "original" page frame has not been allocated and we have to
2237 * use a "safe" page frame to store the loaded page. 2331 * use a "safe" page frame to store the loaded page.
2238 */ 2332 */
2239 pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); 2333 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
@@ -2263,11 +2357,12 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
2263} 2357}
2264 2358
2265/** 2359/**
2266 * copy_last_highmem_page - copy the contents of a highmem image from 2360 * copy_last_highmem_page - Copy the most recent highmem image page.
2267 * @buffer, where the caller of snapshot_write_next() has place them, 2361 *
2268 * to the right location represented by @last_highmem_page . 2362 * Copy the contents of a highmem image from @buffer, where the caller of
2363 * snapshot_write_next() has stored them, to the right location represented by
2364 * @last_highmem_page.
2269 */ 2365 */
2270
2271static void copy_last_highmem_page(void) 2366static void copy_last_highmem_page(void)
2272{ 2367{
2273 if (last_highmem_page) { 2368 if (last_highmem_page) {
@@ -2294,17 +2389,13 @@ static inline void free_highmem_data(void)
2294 free_image_page(buffer, PG_UNSAFE_CLEAR); 2389 free_image_page(buffer, PG_UNSAFE_CLEAR);
2295} 2390}
2296#else 2391#else
2297static unsigned int 2392static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
2298count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
2299 2393
2300static inline int 2394static inline int prepare_highmem_image(struct memory_bitmap *bm,
2301prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) 2395 unsigned int *nr_highmem_p) { return 0; }
2302{
2303 return 0;
2304}
2305 2396
2306static inline void * 2397static inline void *get_highmem_page_buffer(struct page *page,
2307get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) 2398 struct chain_allocator *ca)
2308{ 2399{
2309 return ERR_PTR(-EINVAL); 2400 return ERR_PTR(-EINVAL);
2310} 2401}
@@ -2314,27 +2405,27 @@ static inline int last_highmem_page_copied(void) { return 1; }
2314static inline void free_highmem_data(void) {} 2405static inline void free_highmem_data(void) {}
2315#endif /* CONFIG_HIGHMEM */ 2406#endif /* CONFIG_HIGHMEM */
2316 2407
2408#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
2409
2317/** 2410/**
2318 * prepare_image - use the memory bitmap @bm to mark the pages that will 2411 * prepare_image - Make room for loading hibernation image.
2319 * be overwritten in the process of restoring the system memory state 2412 * @new_bm: Unitialized memory bitmap structure.
2320 * from the suspend image ("unsafe" pages) and allocate memory for the 2413 * @bm: Memory bitmap with unsafe pages marked.
2321 * image. 2414 *
2415 * Use @bm to mark the pages that will be overwritten in the process of
2416 * restoring the system memory state from the suspend image ("unsafe" pages)
2417 * and allocate memory for the image.
2322 * 2418 *
2323 * The idea is to allocate a new memory bitmap first and then allocate 2419 * The idea is to allocate a new memory bitmap first and then allocate
2324 * as many pages as needed for the image data, but not to assign these 2420 * as many pages as needed for image data, but without specifying what those
2325 * pages to specific tasks initially. Instead, we just mark them as 2421 * pages will be used for just yet. Instead, we mark them all as allocated and
2326 * allocated and create a lists of "safe" pages that will be used 2422 * create a lists of "safe" pages to be used later. On systems with high
2327 * later. On systems with high memory a list of "safe" highmem pages is 2423 * memory a list of "safe" highmem pages is created too.
2328 * also created.
2329 */ 2424 */
2330 2425static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2331#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
2332
2333static int
2334prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2335{ 2426{
2336 unsigned int nr_pages, nr_highmem; 2427 unsigned int nr_pages, nr_highmem;
2337 struct linked_page *sp_list, *lp; 2428 struct linked_page *lp;
2338 int error; 2429 int error;
2339 2430
2340 /* If there is no highmem, the buffer will not be necessary */ 2431 /* If there is no highmem, the buffer will not be necessary */
@@ -2342,9 +2433,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2342 buffer = NULL; 2433 buffer = NULL;
2343 2434
2344 nr_highmem = count_highmem_image_pages(bm); 2435 nr_highmem = count_highmem_image_pages(bm);
2345 error = mark_unsafe_pages(bm); 2436 mark_unsafe_pages(bm);
2346 if (error)
2347 goto Free;
2348 2437
2349 error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); 2438 error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
2350 if (error) 2439 if (error)
@@ -2357,14 +2446,15 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2357 if (error) 2446 if (error)
2358 goto Free; 2447 goto Free;
2359 } 2448 }
2360 /* Reserve some safe pages for potential later use. 2449 /*
2450 * Reserve some safe pages for potential later use.
2361 * 2451 *
2362 * NOTE: This way we make sure there will be enough safe pages for the 2452 * NOTE: This way we make sure there will be enough safe pages for the
2363 * chain_alloc() in get_buffer(). It is a bit wasteful, but 2453 * chain_alloc() in get_buffer(). It is a bit wasteful, but
2364 * nr_copy_pages cannot be greater than 50% of the memory anyway. 2454 * nr_copy_pages cannot be greater than 50% of the memory anyway.
2455 *
2456 * nr_copy_pages cannot be less than allocated_unsafe_pages either.
2365 */ 2457 */
2366 sp_list = NULL;
2367 /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
2368 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; 2458 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
2369 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); 2459 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
2370 while (nr_pages > 0) { 2460 while (nr_pages > 0) {
@@ -2373,12 +2463,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2373 error = -ENOMEM; 2463 error = -ENOMEM;
2374 goto Free; 2464 goto Free;
2375 } 2465 }
2376 lp->next = sp_list; 2466 lp->next = safe_pages_list;
2377 sp_list = lp; 2467 safe_pages_list = lp;
2378 nr_pages--; 2468 nr_pages--;
2379 } 2469 }
2380 /* Preallocate memory for the image */ 2470 /* Preallocate memory for the image */
2381 safe_pages_list = NULL;
2382 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; 2471 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
2383 while (nr_pages > 0) { 2472 while (nr_pages > 0) {
2384 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); 2473 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
@@ -2396,12 +2485,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2396 swsusp_set_page_free(virt_to_page(lp)); 2485 swsusp_set_page_free(virt_to_page(lp));
2397 nr_pages--; 2486 nr_pages--;
2398 } 2487 }
2399 /* Free the reserved safe pages so that chain_alloc() can use them */
2400 while (sp_list) {
2401 lp = sp_list->next;
2402 free_image_page(sp_list, PG_UNSAFE_CLEAR);
2403 sp_list = lp;
2404 }
2405 return 0; 2488 return 0;
2406 2489
2407 Free: 2490 Free:
@@ -2410,10 +2493,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
2410} 2493}
2411 2494
2412/** 2495/**
2413 * get_buffer - compute the address that snapshot_write_next() should 2496 * get_buffer - Get the address to store the next image data page.
2414 * set for its caller to write to. 2497 *
2498 * Get the address that snapshot_write_next() should return to its caller to
2499 * write to.
2415 */ 2500 */
2416
2417static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) 2501static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2418{ 2502{
2419 struct pbe *pbe; 2503 struct pbe *pbe;
@@ -2428,12 +2512,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2428 return get_highmem_page_buffer(page, ca); 2512 return get_highmem_page_buffer(page, ca);
2429 2513
2430 if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) 2514 if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
2431 /* We have allocated the "original" page frame and we can 2515 /*
2516 * We have allocated the "original" page frame and we can
2432 * use it directly to store the loaded page. 2517 * use it directly to store the loaded page.
2433 */ 2518 */
2434 return page_address(page); 2519 return page_address(page);
2435 2520
2436 /* The "original" page frame has not been allocated and we have to 2521 /*
2522 * The "original" page frame has not been allocated and we have to
2437 * use a "safe" page frame to store the loaded page. 2523 * use a "safe" page frame to store the loaded page.
2438 */ 2524 */
2439 pbe = chain_alloc(ca, sizeof(struct pbe)); 2525 pbe = chain_alloc(ca, sizeof(struct pbe));
@@ -2450,22 +2536,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2450} 2536}
2451 2537
2452/** 2538/**
2453 * snapshot_write_next - used for writing the system memory snapshot. 2539 * snapshot_write_next - Get the address to store the next image page.
2540 * @handle: Snapshot handle structure to guide the writing.
2454 * 2541 *
2455 * On the first call to it @handle should point to a zeroed 2542 * On the first call, @handle should point to a zeroed snapshot_handle
2456 * snapshot_handle structure. The structure gets updated and a pointer 2543 * structure. The structure gets populated then and a pointer to it should be
2457 * to it should be passed to this function every next time. 2544 * passed to this function every next time.
2458 * 2545 *
2459 * On success the function returns a positive number. Then, the caller 2546 * On success, the function returns a positive number. Then, the caller
2460 * is allowed to write up to the returned number of bytes to the memory 2547 * is allowed to write up to the returned number of bytes to the memory
2461 * location computed by the data_of() macro. 2548 * location computed by the data_of() macro.
2462 * 2549 *
2463 * The function returns 0 to indicate the "end of file" condition, 2550 * The function returns 0 to indicate the "end of file" condition. Negative
2464 * and a negative number is returned on error. In such cases the 2551 * numbers are returned on errors, in which cases the structure pointed to by
2465 * structure pointed to by @handle is not updated and should not be used 2552 * @handle is not updated and should not be used any more.
2466 * any more.
2467 */ 2553 */
2468
2469int snapshot_write_next(struct snapshot_handle *handle) 2554int snapshot_write_next(struct snapshot_handle *handle)
2470{ 2555{
2471 static struct chain_allocator ca; 2556 static struct chain_allocator ca;
@@ -2491,6 +2576,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
2491 if (error) 2576 if (error)
2492 return error; 2577 return error;
2493 2578
2579 safe_pages_list = NULL;
2580
2494 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY); 2581 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2495 if (error) 2582 if (error)
2496 return error; 2583 return error;
@@ -2500,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
2500 if (error) 2587 if (error)
2501 return error; 2588 return error;
2502 2589
2590 hibernate_restore_protection_begin();
2503 } else if (handle->cur <= nr_meta_pages + 1) { 2591 } else if (handle->cur <= nr_meta_pages + 1) {
2504 error = unpack_orig_pfns(buffer, &copy_bm); 2592 error = unpack_orig_pfns(buffer, &copy_bm);
2505 if (error) 2593 if (error)
@@ -2522,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
2522 copy_last_highmem_page(); 2610 copy_last_highmem_page();
2523 /* Restore page key for data page (s390 only). */ 2611 /* Restore page key for data page (s390 only). */
2524 page_key_write(handle->buffer); 2612 page_key_write(handle->buffer);
2613 hibernate_restore_protect_page(handle->buffer);
2525 handle->buffer = get_buffer(&orig_bm, &ca); 2614 handle->buffer = get_buffer(&orig_bm, &ca);
2526 if (IS_ERR(handle->buffer)) 2615 if (IS_ERR(handle->buffer))
2527 return PTR_ERR(handle->buffer); 2616 return PTR_ERR(handle->buffer);
@@ -2533,22 +2622,23 @@ int snapshot_write_next(struct snapshot_handle *handle)
2533} 2622}
2534 2623
2535/** 2624/**
2536 * snapshot_write_finalize - must be called after the last call to 2625 * snapshot_write_finalize - Complete the loading of a hibernation image.
2537 * snapshot_write_next() in case the last page in the image happens 2626 *
2538 * to be a highmem page and its contents should be stored in the 2627 * Must be called after the last call to snapshot_write_next() in case the last
2539 * highmem. Additionally, it releases the memory that will not be 2628 * page in the image happens to be a highmem page and its contents should be
2540 * used any more. 2629 * stored in highmem. Additionally, it recycles bitmap memory that's not
2630 * necessary any more.
2541 */ 2631 */
2542
2543void snapshot_write_finalize(struct snapshot_handle *handle) 2632void snapshot_write_finalize(struct snapshot_handle *handle)
2544{ 2633{
2545 copy_last_highmem_page(); 2634 copy_last_highmem_page();
2546 /* Restore page key for data page (s390 only). */ 2635 /* Restore page key for data page (s390 only). */
2547 page_key_write(handle->buffer); 2636 page_key_write(handle->buffer);
2548 page_key_free(); 2637 page_key_free();
2549 /* Free only if we have loaded the image entirely */ 2638 hibernate_restore_protect_page(handle->buffer);
2639 /* Do that only if we have loaded the image entirely */
2550 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { 2640 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2551 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2641 memory_bm_recycle(&orig_bm);
2552 free_highmem_data(); 2642 free_highmem_data();
2553 } 2643 }
2554} 2644}
@@ -2561,8 +2651,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle)
2561 2651
2562#ifdef CONFIG_HIGHMEM 2652#ifdef CONFIG_HIGHMEM
2563/* Assumes that @buf is ready and points to a "safe" page */ 2653/* Assumes that @buf is ready and points to a "safe" page */
2564static inline void 2654static inline void swap_two_pages_data(struct page *p1, struct page *p2,
2565swap_two_pages_data(struct page *p1, struct page *p2, void *buf) 2655 void *buf)
2566{ 2656{
2567 void *kaddr1, *kaddr2; 2657 void *kaddr1, *kaddr2;
2568 2658
@@ -2576,15 +2666,15 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2576} 2666}
2577 2667
2578/** 2668/**
2579 * restore_highmem - for each highmem page that was allocated before 2669 * restore_highmem - Put highmem image pages into their original locations.
2580 * the suspend and included in the suspend image, and also has been 2670 *
2581 * allocated by the "resume" kernel swap its current (ie. "before 2671 * For each highmem page that was in use before hibernation and is included in
2582 * resume") contents with the previous (ie. "before suspend") one. 2672 * the image, and also has been allocated by the "restore" kernel, swap its
2673 * current contents with the previous (ie. "before hibernation") ones.
2583 * 2674 *
2584 * If the resume eventually fails, we can call this function once 2675 * If the restore eventually fails, we can call this function once again and
2585 * again and restore the "before resume" highmem state. 2676 * restore the highmem state as seen by the restore kernel.
2586 */ 2677 */
2587
2588int restore_highmem(void) 2678int restore_highmem(void)
2589{ 2679{
2590 struct highmem_pbe *pbe = highmem_pblist; 2680 struct highmem_pbe *pbe = highmem_pblist;
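Note on the prepare_image() hunks above: the temporary sp_list is gone and the reserved linked pages are threaded directly onto safe_pages_list, which snapshot_write_next() now resets before loading an image. A minimal user-space model of that reservation step is sketched below; linked_page, PBES_PER_LINKED_PAGE and the page size are simplified stand-ins, not the kernel definitions.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_BYTES		4096	/* stand-in page size */
#define PBES_PER_LINKED_PAGE	128	/* stand-in value, not the kernel's */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

struct linked_page {
	struct linked_page *next;
	char data[PAGE_BYTES - sizeof(struct linked_page *)];
};

static struct linked_page *safe_pages_list;

/*
 * Reserve enough linked pages up front that later chain allocations of
 * nr_pbes page-backup entries cannot fail; each reserved page goes
 * straight onto safe_pages_list, mirroring the loop in prepare_image().
 */
static int reserve_safe_pages(unsigned long nr_pbes)
{
	unsigned long nr_pages = DIV_ROUND_UP(nr_pbes, PBES_PER_LINKED_PAGE);
	struct linked_page *lp;

	while (nr_pages > 0) {
		lp = calloc(1, sizeof(*lp));
		if (!lp)
			return -1;
		lp->next = safe_pages_list;
		safe_pages_list = lp;
		nr_pages--;
	}
	return 0;
}

int main(void)
{
	struct linked_page *lp;

	if (reserve_safe_pages(1000))
		return 1;
	for (lp = safe_pages_list; lp; lp = lp->next)
		printf("reserved linked page at %p\n", (void *)lp);
	return 0;
}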
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 5b70d64b871e..0acab9d7f96f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -266,16 +266,18 @@ static int suspend_test(int level)
266 */ 266 */
267static int suspend_prepare(suspend_state_t state) 267static int suspend_prepare(suspend_state_t state)
268{ 268{
269 int error; 269 int error, nr_calls = 0;
270 270
271 if (!sleep_state_supported(state)) 271 if (!sleep_state_supported(state))
272 return -EPERM; 272 return -EPERM;
273 273
274 pm_prepare_console(); 274 pm_prepare_console();
275 275
276 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); 276 error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
277 if (error) 277 if (error) {
278 nr_calls--;
278 goto Finish; 279 goto Finish;
280 }
279 281
280 trace_suspend_resume(TPS("freeze_processes"), 0, true); 282 trace_suspend_resume(TPS("freeze_processes"), 0, true);
281 error = suspend_freeze_processes(); 283 error = suspend_freeze_processes();
@@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state)
286 suspend_stats.failed_freeze++; 288 suspend_stats.failed_freeze++;
287 dpm_save_failed_step(SUSPEND_FREEZE); 289 dpm_save_failed_step(SUSPEND_FREEZE);
288 Finish: 290 Finish:
289 pm_notifier_call_chain(PM_POST_SUSPEND); 291 __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
290 pm_restore_console(); 292 pm_restore_console();
291 return error; 293 return error;
292} 294}
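Note on the suspend_prepare() hunk above: switching to __pm_notifier_call_chain() lets the failure path hand PM_POST_SUSPEND only to the nr_calls notifiers that actually ran (minus the failing one, which cleans up after itself). A stand-alone model of that partial-rollback pattern, using plain function pointers instead of the kernel's notifier blocks:

#include <stdio.h>

#define PREPARE 0
#define CLEANUP 1

typedef int (*notifier_fn)(int event);

static int fs_notifier(int event)   { printf("fs: event %d\n", event); return 0; }
static int net_notifier(int event)  { printf("net: event %d\n", event); return 0; }
static int fail_notifier(int event) { return event == PREPARE ? -1 : 0; }

static notifier_fn chain[] = { fs_notifier, net_notifier, fail_notifier };
#define CHAIN_LEN (int)(sizeof(chain) / sizeof(chain[0]))

/* Call up to nr_to_call notifiers (-1 means all); report how many ran. */
static int call_chain(int event, int nr_to_call, int *nr_calls)
{
	int i, ret = 0;

	for (i = 0; i < CHAIN_LEN && nr_to_call != 0; i++, nr_to_call--) {
		if (nr_calls)
			(*nr_calls)++;
		ret = chain[i](event);
		if (ret)
			break;
	}
	return ret;
}

int main(void)
{
	int nr_calls = 0;

	if (call_chain(PREPARE, -1, &nr_calls))
		nr_calls--;	/* the failing notifier cleans up itself */
	/* Undo only what ran successfully, as suspend_prepare() now does. */
	call_chain(CLEANUP, nr_calls, NULL);
	return 0;
}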
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 160e1006640d..a3b1e617bcdc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio)
261 bio_put(bio); 261 bio_put(bio);
262} 262}
263 263
264static int hib_submit_io(int rw, pgoff_t page_off, void *addr, 264static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
265 struct hib_bio_batch *hb) 265 struct hib_bio_batch *hb)
266{ 266{
267 struct page *page = virt_to_page(addr); 267 struct page *page = virt_to_page(addr);
@@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
271 bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); 271 bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
272 bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); 272 bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
273 bio->bi_bdev = hib_resume_bdev; 273 bio->bi_bdev = hib_resume_bdev;
274 bio_set_op_attrs(bio, op, op_flags);
274 275
275 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { 276 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
276 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", 277 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
@@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
283 bio->bi_end_io = hib_end_io; 284 bio->bi_end_io = hib_end_io;
284 bio->bi_private = hb; 285 bio->bi_private = hb;
285 atomic_inc(&hb->count); 286 atomic_inc(&hb->count);
286 submit_bio(rw, bio); 287 submit_bio(bio);
287 } else { 288 } else {
288 error = submit_bio_wait(rw, bio); 289 error = submit_bio_wait(bio);
289 bio_put(bio); 290 bio_put(bio);
290 } 291 }
291 292
@@ -306,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
306{ 307{
307 int error; 308 int error;
308 309
309 hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); 310 hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
311 swsusp_header, NULL);
310 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 312 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
311 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 313 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
312 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 314 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -315,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
315 swsusp_header->flags = flags; 317 swsusp_header->flags = flags;
316 if (flags & SF_CRC32_MODE) 318 if (flags & SF_CRC32_MODE)
317 swsusp_header->crc32 = handle->crc32; 319 swsusp_header->crc32 = handle->crc32;
318 error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, 320 error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
319 swsusp_header, NULL); 321 swsusp_resume_block, swsusp_header, NULL);
320 } else { 322 } else {
321 printk(KERN_ERR "PM: Swap header not found!\n"); 323 printk(KERN_ERR "PM: Swap header not found!\n");
322 error = -ENODEV; 324 error = -ENODEV;
@@ -348,6 +350,12 @@ static int swsusp_swap_check(void)
348 if (res < 0) 350 if (res < 0)
349 blkdev_put(hib_resume_bdev, FMODE_WRITE); 351 blkdev_put(hib_resume_bdev, FMODE_WRITE);
350 352
353 /*
354 * Update the resume device to the one actually used,
355 * so the test_resume mode can use it in case it is
356 * invoked from hibernate() to test the snapshot.
357 */
358 swsusp_resume_device = hib_resume_bdev->bd_dev;
351 return res; 359 return res;
352} 360}
353 361
@@ -389,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
389 } else { 397 } else {
390 src = buf; 398 src = buf;
391 } 399 }
392 return hib_submit_io(WRITE_SYNC, offset, src, hb); 400 return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb);
393} 401}
394 402
395static void release_swap_writer(struct swap_map_handle *handle) 403static void release_swap_writer(struct swap_map_handle *handle)
@@ -992,7 +1000,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
992 return -ENOMEM; 1000 return -ENOMEM;
993 } 1001 }
994 1002
995 error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL); 1003 error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset,
1004 tmp->map, NULL);
996 if (error) { 1005 if (error) {
997 release_swap_reader(handle); 1006 release_swap_reader(handle);
998 return error; 1007 return error;
@@ -1016,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
1016 offset = handle->cur->entries[handle->k]; 1025 offset = handle->cur->entries[handle->k];
1017 if (!offset) 1026 if (!offset)
1018 return -EFAULT; 1027 return -EFAULT;
1019 error = hib_submit_io(READ_SYNC, offset, buf, hb); 1028 error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb);
1020 if (error) 1029 if (error)
1021 return error; 1030 return error;
1022 if (++handle->k >= MAP_PAGE_ENTRIES) { 1031 if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1525,7 +1534,8 @@ int swsusp_check(void)
1525 if (!IS_ERR(hib_resume_bdev)) { 1534 if (!IS_ERR(hib_resume_bdev)) {
1526 set_blocksize(hib_resume_bdev, PAGE_SIZE); 1535 set_blocksize(hib_resume_bdev, PAGE_SIZE);
1527 clear_page(swsusp_header); 1536 clear_page(swsusp_header);
1528 error = hib_submit_io(READ_SYNC, swsusp_resume_block, 1537 error = hib_submit_io(REQ_OP_READ, READ_SYNC,
1538 swsusp_resume_block,
1529 swsusp_header, NULL); 1539 swsusp_header, NULL);
1530 if (error) 1540 if (error)
1531 goto put; 1541 goto put;
@@ -1533,7 +1543,8 @@ int swsusp_check(void)
1533 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { 1543 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
1534 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 1544 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
1535 /* Reset swap signature now */ 1545 /* Reset swap signature now */
1536 error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, 1546 error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
1547 swsusp_resume_block,
1537 swsusp_header, NULL); 1548 swsusp_header, NULL);
1538 } else { 1549 } else {
1539 error = -EINVAL; 1550 error = -EINVAL;
@@ -1577,10 +1588,12 @@ int swsusp_unmark(void)
1577{ 1588{
1578 int error; 1589 int error;
1579 1590
1580 hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); 1591 hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
1592 swsusp_header, NULL);
1581 if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { 1593 if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
1582 memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); 1594 memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
1583 error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, 1595 error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
1596 swsusp_resume_block,
1584 swsusp_header, NULL); 1597 swsusp_header, NULL);
1585 } else { 1598 } else {
1586 printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); 1599 printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
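Note on the hib_submit_io() hunks above: they follow the block layer's split of the old combined rw argument into a request operation (REQ_OP_READ/REQ_OP_WRITE) plus modifier flags, recorded on the bio via bio_set_op_attrs() before submit_bio(). A tiny model of why the call sites change shape; the names below mimic, but are not, the kernel's:

#include <stdio.h>

enum req_op { REQ_OP_READ, REQ_OP_WRITE };	/* illustrative only */
#define SYNC_FLAG 0x1

struct fake_bio {
	enum req_op op;
	unsigned int op_flags;
	unsigned long sector;
};

/* Store the operation and its modifier flags separately on the request. */
static void bio_set_op_attrs(struct fake_bio *bio, enum req_op op,
			     unsigned int op_flags)
{
	bio->op = op;
	bio->op_flags = op_flags;
}

static void submit(struct fake_bio *bio)
{
	printf("%s sector %lu%s\n",
	       bio->op == REQ_OP_WRITE ? "write" : "read",
	       bio->sector, (bio->op_flags & SYNC_FLAG) ? " (sync)" : "");
}

int main(void)
{
	struct fake_bio bio = { .sector = 42 };

	/* Old call sites encoded both in one int; new ones pass them apart. */
	bio_set_op_attrs(&bio, REQ_OP_READ, SYNC_FLAG);
	submit(&bio);
	return 0;
}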
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 526e8911460a..35310b627388 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1);
47static int snapshot_open(struct inode *inode, struct file *filp) 47static int snapshot_open(struct inode *inode, struct file *filp)
48{ 48{
49 struct snapshot_data *data; 49 struct snapshot_data *data;
50 int error; 50 int error, nr_calls = 0;
51 51
52 if (!hibernation_available()) 52 if (!hibernation_available())
53 return -EPERM; 53 return -EPERM;
@@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
74 swap_type_of(swsusp_resume_device, 0, NULL) : -1; 74 swap_type_of(swsusp_resume_device, 0, NULL) : -1;
75 data->mode = O_RDONLY; 75 data->mode = O_RDONLY;
76 data->free_bitmaps = false; 76 data->free_bitmaps = false;
77 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 77 error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
78 if (error) 78 if (error)
79 pm_notifier_call_chain(PM_POST_HIBERNATION); 79 __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
80 } else { 80 } else {
81 /* 81 /*
82 * Resuming. We may need to wait for the image device to 82 * Resuming. We may need to wait for the image device to
@@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)
86 86
87 data->swap = -1; 87 data->swap = -1;
88 data->mode = O_WRONLY; 88 data->mode = O_WRONLY;
89 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 89 error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
90 if (!error) { 90 if (!error) {
91 error = create_basic_memory_bitmaps(); 91 error = create_basic_memory_bitmaps();
92 data->free_bitmaps = !error; 92 data->free_bitmaps = !error;
93 } 93 } else
94 nr_calls--;
95
94 if (error) 96 if (error)
95 pm_notifier_call_chain(PM_POST_RESTORE); 97 __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
96 } 98 }
97 if (error) 99 if (error)
98 atomic_inc(&snapshot_device_available); 100 atomic_inc(&snapshot_device_available);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 60cdf6386763..d4de33934dac 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl)
3177{ 3177{
3178 dump_stack_print_info(log_lvl); 3178 dump_stack_print_info(log_lvl);
3179 3179
3180 printk("%stask: %p ti: %p task.ti: %p\n", 3180 printk("%stask: %p task.stack: %p\n",
3181 log_lvl, current, current_thread_info(), 3181 log_lvl, current, task_stack_page(current));
3182 task_thread_info(current));
3183} 3182}
3184 3183
3185#endif 3184#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index c2199e9901c9..2dbccf2d806c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -328,68 +328,57 @@ out:
328 put_cpu(); 328 put_cpu();
329} 329}
330 330
331static int profile_cpu_callback(struct notifier_block *info, 331static int profile_dead_cpu(unsigned int cpu)
332 unsigned long action, void *__cpu)
333{ 332{
334 int node, cpu = (unsigned long)__cpu;
335 struct page *page; 333 struct page *page;
334 int i;
336 335
337 switch (action) { 336 if (prof_cpu_mask != NULL)
338 case CPU_UP_PREPARE: 337 cpumask_clear_cpu(cpu, prof_cpu_mask);
339 case CPU_UP_PREPARE_FROZEN: 338
340 node = cpu_to_mem(cpu); 339 for (i = 0; i < 2; i++) {
341 per_cpu(cpu_profile_flip, cpu) = 0; 340 if (per_cpu(cpu_profile_hits, cpu)[i]) {
342 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 341 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
343 page = __alloc_pages_node(node, 342 per_cpu(cpu_profile_hits, cpu)[i] = NULL;
344 GFP_KERNEL | __GFP_ZERO,
345 0);
346 if (!page)
347 return notifier_from_errno(-ENOMEM);
348 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
349 }
350 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
351 page = __alloc_pages_node(node,
352 GFP_KERNEL | __GFP_ZERO,
353 0);
354 if (!page)
355 goto out_free;
356 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
357 }
358 break;
359out_free:
360 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
361 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
362 __free_page(page);
363 return notifier_from_errno(-ENOMEM);
364 case CPU_ONLINE:
365 case CPU_ONLINE_FROZEN:
366 if (prof_cpu_mask != NULL)
367 cpumask_set_cpu(cpu, prof_cpu_mask);
368 break;
369 case CPU_UP_CANCELED:
370 case CPU_UP_CANCELED_FROZEN:
371 case CPU_DEAD:
372 case CPU_DEAD_FROZEN:
373 if (prof_cpu_mask != NULL)
374 cpumask_clear_cpu(cpu, prof_cpu_mask);
375 if (per_cpu(cpu_profile_hits, cpu)[0]) {
376 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
377 per_cpu(cpu_profile_hits, cpu)[0] = NULL;
378 __free_page(page); 343 __free_page(page);
379 } 344 }
380 if (per_cpu(cpu_profile_hits, cpu)[1]) { 345 }
381 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 346 return 0;
382 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 347}
383 __free_page(page); 348
349static int profile_prepare_cpu(unsigned int cpu)
350{
351 int i, node = cpu_to_mem(cpu);
352 struct page *page;
353
354 per_cpu(cpu_profile_flip, cpu) = 0;
355
356 for (i = 0; i < 2; i++) {
357 if (per_cpu(cpu_profile_hits, cpu)[i])
358 continue;
359
360 page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
361 if (!page) {
362 profile_dead_cpu(cpu);
363 return -ENOMEM;
384 } 364 }
385 break; 365 per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
366
386 } 367 }
387 return NOTIFY_OK; 368 return 0;
369}
370
371static int profile_online_cpu(unsigned int cpu)
372{
373 if (prof_cpu_mask != NULL)
374 cpumask_set_cpu(cpu, prof_cpu_mask);
375
376 return 0;
388} 377}
378
389#else /* !CONFIG_SMP */ 379#else /* !CONFIG_SMP */
390#define profile_flip_buffers() do { } while (0) 380#define profile_flip_buffers() do { } while (0)
391#define profile_discard_flip_buffers() do { } while (0) 381#define profile_discard_flip_buffers() do { } while (0)
392#define profile_cpu_callback NULL
393 382
394static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) 383static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
395{ 384{
@@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = {
531 .llseek = default_llseek, 520 .llseek = default_llseek,
532}; 521};
533 522
534#ifdef CONFIG_SMP 523int __ref create_proc_profile(void)
535static void profile_nop(void *unused)
536{
537}
538
539static int create_hash_tables(void)
540{ 524{
541 int cpu; 525 struct proc_dir_entry *entry;
542 526#ifdef CONFIG_SMP
543 for_each_online_cpu(cpu) { 527 enum cpuhp_state online_state;
544 int node = cpu_to_mem(cpu);
545 struct page *page;
546
547 page = __alloc_pages_node(node,
548 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
549 0);
550 if (!page)
551 goto out_cleanup;
552 per_cpu(cpu_profile_hits, cpu)[1]
553 = (struct profile_hit *)page_address(page);
554 page = __alloc_pages_node(node,
555 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
556 0);
557 if (!page)
558 goto out_cleanup;
559 per_cpu(cpu_profile_hits, cpu)[0]
560 = (struct profile_hit *)page_address(page);
561 }
562 return 0;
563out_cleanup:
564 prof_on = 0;
565 smp_mb();
566 on_each_cpu(profile_nop, NULL, 1);
567 for_each_online_cpu(cpu) {
568 struct page *page;
569
570 if (per_cpu(cpu_profile_hits, cpu)[0]) {
571 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
572 per_cpu(cpu_profile_hits, cpu)[0] = NULL;
573 __free_page(page);
574 }
575 if (per_cpu(cpu_profile_hits, cpu)[1]) {
576 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
577 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
578 __free_page(page);
579 }
580 }
581 return -1;
582}
583#else
584#define create_hash_tables() ({ 0; })
585#endif 528#endif
586 529
587int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
588{
589 struct proc_dir_entry *entry;
590 int err = 0; 530 int err = 0;
591 531
592 if (!prof_on) 532 if (!prof_on)
593 return 0; 533 return 0;
594 534#ifdef CONFIG_SMP
595 cpu_notifier_register_begin(); 535 err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
596 536 profile_prepare_cpu, profile_dead_cpu);
597 if (create_hash_tables()) { 537 if (err)
598 err = -ENOMEM; 538 return err;
599 goto out; 539
600 } 540 err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
601 541 profile_online_cpu, NULL);
542 if (err < 0)
543 goto err_state_prep;
544 online_state = err;
545 err = 0;
546#endif
602 entry = proc_create("profile", S_IWUSR | S_IRUGO, 547 entry = proc_create("profile", S_IWUSR | S_IRUGO,
603 NULL, &proc_profile_operations); 548 NULL, &proc_profile_operations);
604 if (!entry) 549 if (!entry)
605 goto out; 550 goto err_state_onl;
606 proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); 551 proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
607 __hotcpu_notifier(profile_cpu_callback, 0);
608 552
609out: 553 return err;
610 cpu_notifier_register_done(); 554err_state_onl:
555#ifdef CONFIG_SMP
556 cpuhp_remove_state(online_state);
557err_state_prep:
558 cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
559#endif
611 return err; 560 return err;
612} 561}
613subsys_initcall(create_proc_profile); 562subsys_initcall(create_proc_profile);
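Note on the profile.c conversion above: create_proc_profile() now installs a prepare/dead pair and a dynamic online callback through cpuhp_setup_state() instead of a CPU-notifier switch statement. The essential contract, symmetric per-CPU setup/teardown with unwinding of states already brought up when a later one fails, can be modelled in user space like this (a sketch, not the kernel's cpuhp implementation):

#include <stdio.h>

struct hp_state {
	const char *name;
	int (*startup)(unsigned int cpu);
	int (*teardown)(unsigned int cpu);
};

static int prep(unsigned int cpu) { printf("cpu%u: prepare\n", cpu); return 0; }
static int dead(unsigned int cpu) { printf("cpu%u: dead\n", cpu); return 0; }
static int onl(unsigned int cpu)  { printf("cpu%u: online\n", cpu); return 0; }

static struct hp_state states[] = {
	{ "PROFILE_PREPARE",   prep, dead },
	{ "AP_PROFILE_ONLINE", onl,  NULL },
};
#define NR_STATES (int)(sizeof(states) / sizeof(states[0]))

/*
 * Bring a CPU up: run every startup callback in order; if one fails,
 * unwind by running the teardown callbacks of the states already done.
 */
static int cpu_up(unsigned int cpu)
{
	int i, ret;

	for (i = 0; i < NR_STATES; i++) {
		if (!states[i].startup)
			continue;
		ret = states[i].startup(cpu);
		if (ret) {
			while (--i >= 0)
				if (states[i].teardown)
					states[i].teardown(cpu);
			return ret;
		}
	}
	return 0;
}

/* Take a CPU down: run the teardown callbacks in reverse order. */
static void cpu_down(unsigned int cpu)
{
	int i;

	for (i = NR_STATES - 1; i >= 0; i--)
		if (states[i].teardown)
			states[i].teardown(cpu);
}

int main(void)
{
	cpu_up(0);
	cpu_down(0);
	return 0;
}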
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cee0d8393ed..d38ab08a3fe7 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
58#define VERBOSE_PERFOUT_ERRSTRING(s) \ 58#define VERBOSE_PERFOUT_ERRSTRING(s) \
59 do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) 59 do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
60 60
61torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); 61torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
62torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); 62torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
63torture_param(int, nreaders, -1, "Number of RCU reader threads"); 63torture_param(int, nreaders, -1, "Number of RCU reader threads");
64torture_param(int, nwriters, -1, "Number of RCU updater threads"); 64torture_param(int, nwriters, -1, "Number of RCU updater threads");
@@ -96,12 +96,7 @@ static int rcu_perf_writer_state;
96#define MAX_MEAS 10000 96#define MAX_MEAS 10000
97#define MIN_MEAS 100 97#define MIN_MEAS 100
98 98
99#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) 99static int perf_runnable = IS_ENABLED(MODULE);
100#define RCUPERF_RUNNABLE_INIT 1
101#else
102#define RCUPERF_RUNNABLE_INIT 0
103#endif
104static int perf_runnable = RCUPERF_RUNNABLE_INIT;
105module_param(perf_runnable, int, 0444); 100module_param(perf_runnable, int, 0444);
106MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); 101MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot");
107 102
@@ -363,8 +358,6 @@ rcu_perf_writer(void *arg)
363 u64 *wdpp = writer_durations[me]; 358 u64 *wdpp = writer_durations[me];
364 359
365 VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); 360 VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
366 WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp);
367 WARN_ON(rcu_gp_is_normal() && gp_exp);
368 WARN_ON(!wdpp); 361 WARN_ON(!wdpp);
369 set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); 362 set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
370 sp.sched_priority = 1; 363 sp.sched_priority = 1;
@@ -631,12 +624,24 @@ rcu_perf_init(void)
631 firsterr = -ENOMEM; 624 firsterr = -ENOMEM;
632 goto unwind; 625 goto unwind;
633 } 626 }
627 if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) {
628 VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
629 firsterr = -EINVAL;
630 goto unwind;
631 }
632 if (rcu_gp_is_normal() && gp_exp) {
633 VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
634 firsterr = -EINVAL;
635 goto unwind;
636 }
634 for (i = 0; i < nrealwriters; i++) { 637 for (i = 0; i < nrealwriters; i++) {
635 writer_durations[i] = 638 writer_durations[i] =
636 kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), 639 kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
637 GFP_KERNEL); 640 GFP_KERNEL);
638 if (!writer_durations[i]) 641 if (!writer_durations[i]) {
642 firsterr = -ENOMEM;
639 goto unwind; 643 goto unwind;
644 }
640 firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, 645 firsterr = torture_create_kthread(rcu_perf_writer, (void *)i,
641 writer_tasks[i]); 646 writer_tasks[i]);
642 if (firsterr) 647 if (firsterr)
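Note on the rcuperf hunks above: the #if defined(MODULE) || defined(CONFIG_...) block collapses into IS_ENABLED(MODULE), an ordinary C expression rather than preprocessor branching, and the gp_exp sanity checks move from the writer kthreads into rcu_perf_init() so the test fails early with -EINVAL. The preprocessor trick behind IS_ENABLED() can be reproduced outside the kernel as follows; this is a simplified re-implementation for illustration, the real macros live in include/linux/kconfig.h:

#include <stdio.h>

/*
 * Evaluates to 1 if the option macro is defined to 1 and to 0 otherwise,
 * with no #ifdef at the point of use.
 */
#define __ARG_PLACEHOLDER_1			0,
#define __take_second_arg(ignored, val, ...)	val
#define ____is_defined(arg1_or_junk)		__take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val)			____is_defined(__ARG_PLACEHOLDER_##val)
#define IS_ENABLED(option)			___is_defined(option)

#define CONFIG_FOO 1
/* CONFIG_BAR intentionally not defined */

static int foo_runnable = IS_ENABLED(CONFIG_FOO);	/* evaluates to 1 */
static int bar_runnable = IS_ENABLED(CONFIG_BAR);	/* evaluates to 0 */

int main(void)
{
	printf("foo=%d bar=%d\n", foo_runnable, bar_runnable);
	return 0;
}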
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 084a28a732eb..971e2b138063 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void)
182 return rcu_torture_writer_state_names[i]; 182 return rcu_torture_writer_state_names[i];
183} 183}
184 184
185#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 185static int torture_runnable = IS_ENABLED(MODULE);
186#define RCUTORTURE_RUNNABLE_INIT 1
187#else
188#define RCUTORTURE_RUNNABLE_INIT 0
189#endif
190static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
191module_param(torture_runnable, int, 0444); 186module_param(torture_runnable, int, 0444);
192MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); 187MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
193 188
@@ -1476,7 +1471,7 @@ static int rcu_torture_barrier_cbs(void *arg)
1476 break; 1471 break;
1477 /* 1472 /*
1478 * The above smp_load_acquire() ensures barrier_phase load 1473 * The above smp_load_acquire() ensures barrier_phase load
1479 * is ordered before the folloiwng ->call(). 1474 * is ordered before the following ->call().
1480 */ 1475 */
1481 local_irq_disable(); /* Just to test no-irq call_rcu(). */ 1476 local_irq_disable(); /* Just to test no-irq call_rcu(). */
1482 cur_ops->call(&rcu, rcu_torture_barrier_cbf); 1477 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c7f1bc4f817c..5d80925e7fc8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -125,12 +125,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
125/* Number of rcu_nodes at specified level. */ 125/* Number of rcu_nodes at specified level. */
126static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; 126static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
127int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 127int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
128/* panic() on RCU Stall sysctl. */
129int sysctl_panic_on_rcu_stall __read_mostly;
128 130
129/* 131/*
130 * The rcu_scheduler_active variable transitions from zero to one just 132 * The rcu_scheduler_active variable transitions from zero to one just
131 * before the first task is spawned. So when this variable is zero, RCU 133 * before the first task is spawned. So when this variable is zero, RCU
132 * can assume that there is but one task, allowing RCU to (for example) 134 * can assume that there is but one task, allowing RCU to (for example)
133 * optimize synchronize_sched() to a simple barrier(). When this variable 135 * optimize synchronize_rcu() to a simple barrier(). When this variable
134 * is one, RCU must actually do all the hard work required to detect real 136 * is one, RCU must actually do all the hard work required to detect real
135 * grace periods. This variable is also used to suppress boot-time false 137 * grace periods. This variable is also used to suppress boot-time false
136 * positives from lockdep-RCU error checking. 138 * positives from lockdep-RCU error checking.
@@ -159,6 +161,7 @@ static void invoke_rcu_core(void);
159static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 161static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
160static void rcu_report_exp_rdp(struct rcu_state *rsp, 162static void rcu_report_exp_rdp(struct rcu_state *rsp,
161 struct rcu_data *rdp, bool wake); 163 struct rcu_data *rdp, bool wake);
164static void sync_sched_exp_online_cleanup(int cpu);
162 165
163/* rcuc/rcub kthread realtime priority */ 166/* rcuc/rcub kthread realtime priority */
164#ifdef CONFIG_RCU_KTHREAD_PRIO 167#ifdef CONFIG_RCU_KTHREAD_PRIO
@@ -1070,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching);
1070 * offline to continue to use RCU for one jiffy after marking itself 1073 * offline to continue to use RCU for one jiffy after marking itself
1071 * offline in the cpu_online_mask. This leniency is necessary given the 1074 * offline in the cpu_online_mask. This leniency is necessary given the
1072 * non-atomic nature of the online and offline processing, for example, 1075 * non-atomic nature of the online and offline processing, for example,
1073 * the fact that a CPU enters the scheduler after completing the CPU_DYING 1076 * the fact that a CPU enters the scheduler after completing the teardown
1074 * notifiers. 1077 * of the CPU.
1075 * 1078 *
 1076 * This is also why RCU internally marks CPUs online during the 1079 * This is also why RCU internally marks CPUs online during the
1077 * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. 1080 * preparation phase and offline after the CPU has been taken down.
1078 * 1081 *
1079 * Disable checking if in an NMI handler because we cannot safely report 1082 * Disable checking if in an NMI handler because we cannot safely report
1080 * errors from NMI handlers anyway. 1083 * errors from NMI handlers anyway.
@@ -1284,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
1284 rcu_for_each_leaf_node(rsp, rnp) { 1287 rcu_for_each_leaf_node(rsp, rnp) {
1285 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1288 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1286 if (rnp->qsmask != 0) { 1289 if (rnp->qsmask != 0) {
1287 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 1290 for_each_leaf_node_possible_cpu(rnp, cpu)
1288 if (rnp->qsmask & (1UL << cpu)) 1291 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
1289 dump_cpu_task(rnp->grplo + cpu); 1292 dump_cpu_task(cpu);
1290 } 1293 }
1291 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1294 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1292 } 1295 }
@@ -1311,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
1311 } 1314 }
1312} 1315}
1313 1316
1317static inline void panic_on_rcu_stall(void)
1318{
1319 if (sysctl_panic_on_rcu_stall)
1320 panic("RCU Stall\n");
1321}
1322
1314static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) 1323static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1315{ 1324{
1316 int cpu; 1325 int cpu;
@@ -1351,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1351 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1360 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1352 ndetected += rcu_print_task_stall(rnp); 1361 ndetected += rcu_print_task_stall(rnp);
1353 if (rnp->qsmask != 0) { 1362 if (rnp->qsmask != 0) {
1354 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 1363 for_each_leaf_node_possible_cpu(rnp, cpu)
1355 if (rnp->qsmask & (1UL << cpu)) { 1364 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
1356 print_cpu_stall_info(rsp, 1365 print_cpu_stall_info(rsp, cpu);
1357 rnp->grplo + cpu);
1358 ndetected++; 1366 ndetected++;
1359 } 1367 }
1360 } 1368 }
@@ -1390,6 +1398,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1390 1398
1391 rcu_check_gp_kthread_starvation(rsp); 1399 rcu_check_gp_kthread_starvation(rsp);
1392 1400
1401 panic_on_rcu_stall();
1402
1393 force_quiescent_state(rsp); /* Kick them all. */ 1403 force_quiescent_state(rsp); /* Kick them all. */
1394} 1404}
1395 1405
@@ -1430,6 +1440,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
1430 jiffies + 3 * rcu_jiffies_till_stall_check() + 3); 1440 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
1431 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1441 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1432 1442
1443 panic_on_rcu_stall();
1444
1433 /* 1445 /*
1434 * Attempt to revive the RCU machinery by forcing a context switch. 1446 * Attempt to revive the RCU machinery by forcing a context switch.
1435 * 1447 *
@@ -1989,8 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
1989 * of the tree within the rsp->node[] array. Note that other CPUs 2001 * of the tree within the rsp->node[] array. Note that other CPUs
1990 * will access only the leaves of the hierarchy, thus seeing that no 2002 * will access only the leaves of the hierarchy, thus seeing that no
1991 * grace period is in progress, at least until the corresponding 2003 * grace period is in progress, at least until the corresponding
1992 * leaf node has been initialized. In addition, we have excluded 2004 * leaf node has been initialized.
1993 * CPU-hotplug operations.
1994 * 2005 *
1995 * The grace period cannot complete until the initialization 2006 * The grace period cannot complete until the initialization
1996 * process finishes, because this kthread handles both. 2007 * process finishes, because this kthread handles both.
@@ -2872,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
2872 unsigned long *maxj), 2883 unsigned long *maxj),
2873 bool *isidle, unsigned long *maxj) 2884 bool *isidle, unsigned long *maxj)
2874{ 2885{
2875 unsigned long bit;
2876 int cpu; 2886 int cpu;
2877 unsigned long flags; 2887 unsigned long flags;
2878 unsigned long mask; 2888 unsigned long mask;
@@ -2907,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp,
2907 continue; 2917 continue;
2908 } 2918 }
2909 } 2919 }
2910 cpu = rnp->grplo; 2920 for_each_leaf_node_possible_cpu(rnp, cpu) {
2911 bit = 1; 2921 unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
2912 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2913 if ((rnp->qsmask & bit) != 0) { 2922 if ((rnp->qsmask & bit) != 0) {
2914 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2923 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2915 mask |= bit; 2924 mask |= bit;
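Note on the hunks above: force_qs_rnp() and the stall printers stop open-coding the per-leaf bit walk and use for_each_leaf_node_possible_cpu() and leaf_node_cpu_bit() instead. Those helpers are added in kernel/rcu/tree.h (not shown in this section); the runnable model below uses definitions consistent with how they are used here, with a simplified rcu_node and no possible-CPU mask filtering:

#include <stdio.h>

/* Simplified stand-in for a leaf rcu_node: it covers CPUs grplo..grphi. */
struct rcu_node {
	int grplo;
	int grphi;
	unsigned long qsmask;
};

/* Bit for @cpu inside @rnp's masks, as used by the hunks above. */
#define leaf_node_cpu_bit(rnp, cpu)	(1UL << ((cpu) - (rnp)->grplo))

/*
 * Walk every CPU covered by the leaf node. The kernel variant also
 * skips CPUs that are absent from cpu_possible_mask.
 */
#define for_each_leaf_node_possible_cpu(rnp, cpu) \
	for ((cpu) = (rnp)->grplo; (cpu) <= (rnp)->grphi; (cpu)++)

int main(void)
{
	struct rcu_node rnp = { .grplo = 4, .grphi = 7, .qsmask = 0x5 };
	int cpu;

	for_each_leaf_node_possible_cpu(&rnp, cpu)
		if (rnp.qsmask & leaf_node_cpu_bit(&rnp, cpu))
			printf("CPU %d still owes a quiescent state\n", cpu);
	return 0;
}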
@@ -3448,549 +3457,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s)
3448 return ULONG_CMP_GE(READ_ONCE(*sp), s); 3457 return ULONG_CMP_GE(READ_ONCE(*sp), s);
3449} 3458}
3450 3459
3451/* Wrapper functions for expedited grace periods. */
3452static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
3453{
3454 rcu_seq_start(&rsp->expedited_sequence);
3455}
3456static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
3457{
3458 rcu_seq_end(&rsp->expedited_sequence);
3459 smp_mb(); /* Ensure that consecutive grace periods serialize. */
3460}
3461static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
3462{
3463 unsigned long s;
3464
3465 smp_mb(); /* Caller's modifications seen first by other CPUs. */
3466 s = rcu_seq_snap(&rsp->expedited_sequence);
3467 trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
3468 return s;
3469}
3470static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
3471{
3472 return rcu_seq_done(&rsp->expedited_sequence, s);
3473}
3474
3475/*
3476 * Reset the ->expmaskinit values in the rcu_node tree to reflect any
3477 * recent CPU-online activity. Note that these masks are not cleared
3478 * when CPUs go offline, so they reflect the union of all CPUs that have
3479 * ever been online. This means that this function normally takes its
3480 * no-work-to-do fastpath.
3481 */
3482static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
3483{
3484 bool done;
3485 unsigned long flags;
3486 unsigned long mask;
3487 unsigned long oldmask;
3488 int ncpus = READ_ONCE(rsp->ncpus);
3489 struct rcu_node *rnp;
3490 struct rcu_node *rnp_up;
3491
3492 /* If no new CPUs onlined since last time, nothing to do. */
3493 if (likely(ncpus == rsp->ncpus_snap))
3494 return;
3495 rsp->ncpus_snap = ncpus;
3496
3497 /*
3498 * Each pass through the following loop propagates newly onlined
3499 * CPUs for the current rcu_node structure up the rcu_node tree.
3500 */
3501 rcu_for_each_leaf_node(rsp, rnp) {
3502 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3503 if (rnp->expmaskinit == rnp->expmaskinitnext) {
3504 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3505 continue; /* No new CPUs, nothing to do. */
3506 }
3507
3508 /* Update this node's mask, track old value for propagation. */
3509 oldmask = rnp->expmaskinit;
3510 rnp->expmaskinit = rnp->expmaskinitnext;
3511 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3512
3513 /* If was already nonzero, nothing to propagate. */
3514 if (oldmask)
3515 continue;
3516
3517 /* Propagate the new CPU up the tree. */
3518 mask = rnp->grpmask;
3519 rnp_up = rnp->parent;
3520 done = false;
3521 while (rnp_up) {
3522 raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
3523 if (rnp_up->expmaskinit)
3524 done = true;
3525 rnp_up->expmaskinit |= mask;
3526 raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
3527 if (done)
3528 break;
3529 mask = rnp_up->grpmask;
3530 rnp_up = rnp_up->parent;
3531 }
3532 }
3533}
3534
3535/*
3536 * Reset the ->expmask values in the rcu_node tree in preparation for
3537 * a new expedited grace period.
3538 */
3539static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
3540{
3541 unsigned long flags;
3542 struct rcu_node *rnp;
3543
3544 sync_exp_reset_tree_hotplug(rsp);
3545 rcu_for_each_node_breadth_first(rsp, rnp) {
3546 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3547 WARN_ON_ONCE(rnp->expmask);
3548 rnp->expmask = rnp->expmaskinit;
3549 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3550 }
3551}
3552
3553/*
3554 * Return non-zero if there is no RCU expedited grace period in progress
3555 * for the specified rcu_node structure, in other words, if all CPUs and
3556 * tasks covered by the specified rcu_node structure have done their bit
3557 * for the current expedited grace period. Works only for preemptible
3558 * RCU -- other RCU implementation use other means.
3559 *
3560 * Caller must hold the rcu_state's exp_mutex.
3561 */
3562static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
3563{
3564 return rnp->exp_tasks == NULL &&
3565 READ_ONCE(rnp->expmask) == 0;
3566}
3567
3568/*
3569 * Report the exit from RCU read-side critical section for the last task
3570 * that queued itself during or before the current expedited preemptible-RCU
3571 * grace period. This event is reported either to the rcu_node structure on
3572 * which the task was queued or to one of that rcu_node structure's ancestors,
3573 * recursively up the tree. (Calm down, calm down, we do the recursion
3574 * iteratively!)
3575 *
3576 * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
3577 * structure's ->lock.
3578 */
3579static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
3580 bool wake, unsigned long flags)
3581 __releases(rnp->lock)
3582{
3583 unsigned long mask;
3584
3585 for (;;) {
3586 if (!sync_rcu_preempt_exp_done(rnp)) {
3587 if (!rnp->expmask)
3588 rcu_initiate_boost(rnp, flags);
3589 else
3590 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3591 break;
3592 }
3593 if (rnp->parent == NULL) {
3594 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3595 if (wake) {
3596 smp_mb(); /* EGP done before wake_up(). */
3597 swake_up(&rsp->expedited_wq);
3598 }
3599 break;
3600 }
3601 mask = rnp->grpmask;
3602 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
3603 rnp = rnp->parent;
3604 raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
3605 WARN_ON_ONCE(!(rnp->expmask & mask));
3606 rnp->expmask &= ~mask;
3607 }
3608}
3609
3610/*
3611 * Report expedited quiescent state for specified node. This is a
3612 * lock-acquisition wrapper function for __rcu_report_exp_rnp().
3613 *
3614 * Caller must hold the rcu_state's exp_mutex.
3615 */
3616static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
3617 struct rcu_node *rnp, bool wake)
3618{
3619 unsigned long flags;
3620
3621 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3622 __rcu_report_exp_rnp(rsp, rnp, wake, flags);
3623}
3624
3625/*
3626 * Report expedited quiescent state for multiple CPUs, all covered by the
3627 * specified leaf rcu_node structure. Caller must hold the rcu_state's
3628 * exp_mutex.
3629 */
3630static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
3631 unsigned long mask, bool wake)
3632{
3633 unsigned long flags;
3634
3635 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3636 if (!(rnp->expmask & mask)) {
3637 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3638 return;
3639 }
3640 rnp->expmask &= ~mask;
3641 __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
3642}
3643
3644/*
3645 * Report expedited quiescent state for specified rcu_data (CPU).
3646 */
3647static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
3648 bool wake)
3649{
3650 rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
3651}
3652
3653/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
3654static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
3655 unsigned long s)
3656{
3657 if (rcu_exp_gp_seq_done(rsp, s)) {
3658 trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
3659 /* Ensure test happens before caller kfree(). */
3660 smp_mb__before_atomic(); /* ^^^ */
3661 atomic_long_inc(stat);
3662 return true;
3663 }
3664 return false;
3665}
3666
3667/*
3668 * Funnel-lock acquisition for expedited grace periods. Returns true
3669 * if some other task completed an expedited grace period that this task
3670 * can piggy-back on, and with no mutex held. Otherwise, returns false
3671 * with the mutex held, indicating that the caller must actually do the
3672 * expedited grace period.
3673 */
3674static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
3675{
3676 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
3677 struct rcu_node *rnp = rdp->mynode;
3678 struct rcu_node *rnp_root = rcu_get_root(rsp);
3679
3680 /* Low-contention fastpath. */
3681 if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
3682 (rnp == rnp_root ||
3683 ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
3684 !mutex_is_locked(&rsp->exp_mutex) &&
3685 mutex_trylock(&rsp->exp_mutex))
3686 goto fastpath;
3687
3688 /*
3689 * Each pass through the following loop works its way up
3690 * the rcu_node tree, returning if others have done the work or
3691 * otherwise falls through to acquire rsp->exp_mutex. The mapping
3692 * from CPU to rcu_node structure can be inexact, as it is just
3693 * promoting locality and is not strictly needed for correctness.
3694 */
3695 for (; rnp != NULL; rnp = rnp->parent) {
3696 if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
3697 return true;
3698
3699 /* Work not done, either wait here or go up. */
3700 spin_lock(&rnp->exp_lock);
3701 if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
3702
3703 /* Someone else doing GP, so wait for them. */
3704 spin_unlock(&rnp->exp_lock);
3705 trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
3706 rnp->grplo, rnp->grphi,
3707 TPS("wait"));
3708 wait_event(rnp->exp_wq[(s >> 1) & 0x3],
3709 sync_exp_work_done(rsp,
3710 &rdp->exp_workdone2, s));
3711 return true;
3712 }
3713 rnp->exp_seq_rq = s; /* Followers can wait on us. */
3714 spin_unlock(&rnp->exp_lock);
3715 trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
3716 rnp->grphi, TPS("nxtlvl"));
3717 }
3718 mutex_lock(&rsp->exp_mutex);
3719fastpath:
3720 if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
3721 mutex_unlock(&rsp->exp_mutex);
3722 return true;
3723 }
3724 rcu_exp_gp_seq_start(rsp);
3725 trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
3726 return false;
3727}
3728
3729/* Invoked on each online non-idle CPU for expedited quiescent state. */
3730static void sync_sched_exp_handler(void *data)
3731{
3732 struct rcu_data *rdp;
3733 struct rcu_node *rnp;
3734 struct rcu_state *rsp = data;
3735
3736 rdp = this_cpu_ptr(rsp->rda);
3737 rnp = rdp->mynode;
3738 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
3739 __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
3740 return;
3741 if (rcu_is_cpu_rrupt_from_idle()) {
3742 rcu_report_exp_rdp(&rcu_sched_state,
3743 this_cpu_ptr(&rcu_sched_data), true);
3744 return;
3745 }
3746 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
3747 resched_cpu(smp_processor_id());
3748}
3749
3750/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
3751static void sync_sched_exp_online_cleanup(int cpu)
3752{
3753 struct rcu_data *rdp;
3754 int ret;
3755 struct rcu_node *rnp;
3756 struct rcu_state *rsp = &rcu_sched_state;
3757
3758 rdp = per_cpu_ptr(rsp->rda, cpu);
3759 rnp = rdp->mynode;
3760 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
3761 return;
3762 ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
3763 WARN_ON_ONCE(ret);
3764}
3765
3766/*
3767 * Select the nodes that the upcoming expedited grace period needs
3768 * to wait for.
3769 */
3770static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
3771 smp_call_func_t func)
3772{
3773 int cpu;
3774 unsigned long flags;
3775 unsigned long mask;
3776 unsigned long mask_ofl_test;
3777 unsigned long mask_ofl_ipi;
3778 int ret;
3779 struct rcu_node *rnp;
3780
3781 sync_exp_reset_tree(rsp);
3782 rcu_for_each_leaf_node(rsp, rnp) {
3783 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3784
3785 /* Each pass checks a CPU for identity, offline, and idle. */
3786 mask_ofl_test = 0;
3787 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
3788 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3789 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
3790
3791 if (raw_smp_processor_id() == cpu ||
3792 !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3793 mask_ofl_test |= rdp->grpmask;
3794 }
3795 mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
3796
3797 /*
3798 * Need to wait for any blocked tasks as well. Note that
3799 * additional blocking tasks will also block the expedited
3800 * GP until such time as the ->expmask bits are cleared.
3801 */
3802 if (rcu_preempt_has_tasks(rnp))
3803 rnp->exp_tasks = rnp->blkd_tasks.next;
3804 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3805
3806 /* IPI the remaining CPUs for expedited quiescent state. */
3807 mask = 1;
3808 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3809 if (!(mask_ofl_ipi & mask))
3810 continue;
3811retry_ipi:
3812 ret = smp_call_function_single(cpu, func, rsp, 0);
3813 if (!ret) {
3814 mask_ofl_ipi &= ~mask;
3815 continue;
3816 }
3817 /* Failed, raced with offline. */
3818 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3819 if (cpu_online(cpu) &&
3820 (rnp->expmask & mask)) {
3821 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3822 schedule_timeout_uninterruptible(1);
3823 if (cpu_online(cpu) &&
3824 (rnp->expmask & mask))
3825 goto retry_ipi;
3826 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3827 }
3828 if (!(rnp->expmask & mask))
3829 mask_ofl_ipi &= ~mask;
3830 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3831 }
3832 /* Report quiescent states for those that went offline. */
3833 mask_ofl_test |= mask_ofl_ipi;
3834 if (mask_ofl_test)
3835 rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
3836 }
3837}
3838
3839static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3840{
3841 int cpu;
3842 unsigned long jiffies_stall;
3843 unsigned long jiffies_start;
3844 unsigned long mask;
3845 int ndetected;
3846 struct rcu_node *rnp;
3847 struct rcu_node *rnp_root = rcu_get_root(rsp);
3848 int ret;
3849
3850 jiffies_stall = rcu_jiffies_till_stall_check();
3851 jiffies_start = jiffies;
3852
3853 for (;;) {
3854 ret = swait_event_timeout(
3855 rsp->expedited_wq,
3856 sync_rcu_preempt_exp_done(rnp_root),
3857 jiffies_stall);
3858 if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
3859 return;
3860 if (ret < 0) {
3861 /* Hit a signal, disable CPU stall warnings. */
3862 swait_event(rsp->expedited_wq,
3863 sync_rcu_preempt_exp_done(rnp_root));
3864 return;
3865 }
3866 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
3867 rsp->name);
3868 ndetected = 0;
3869 rcu_for_each_leaf_node(rsp, rnp) {
3870 ndetected += rcu_print_task_exp_stall(rnp);
3871 mask = 1;
3872 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3873 struct rcu_data *rdp;
3874
3875 if (!(rnp->expmask & mask))
3876 continue;
3877 ndetected++;
3878 rdp = per_cpu_ptr(rsp->rda, cpu);
3879 pr_cont(" %d-%c%c%c", cpu,
3880 "O."[!!cpu_online(cpu)],
3881 "o."[!!(rdp->grpmask & rnp->expmaskinit)],
3882 "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
3883 }
3884 mask <<= 1;
3885 }
3886 pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
3887 jiffies - jiffies_start, rsp->expedited_sequence,
3888 rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
3889 if (ndetected) {
3890 pr_err("blocking rcu_node structures:");
3891 rcu_for_each_node_breadth_first(rsp, rnp) {
3892 if (rnp == rnp_root)
3893 continue; /* printed unconditionally */
3894 if (sync_rcu_preempt_exp_done(rnp))
3895 continue;
3896 pr_cont(" l=%u:%d-%d:%#lx/%c",
3897 rnp->level, rnp->grplo, rnp->grphi,
3898 rnp->expmask,
3899 ".T"[!!rnp->exp_tasks]);
3900 }
3901 pr_cont("\n");
3902 }
3903 rcu_for_each_leaf_node(rsp, rnp) {
3904 mask = 1;
3905 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3906 if (!(rnp->expmask & mask))
3907 continue;
3908 dump_cpu_task(cpu);
3909 }
3910 }
3911 jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
3912 }
3913}
3914
3915/*
3916 * Wait for the current expedited grace period to complete, and then
3917 * wake up everyone who piggybacked on the just-completed expedited
3918 * grace period. Also update all the ->exp_seq_rq counters as needed
3919 * in order to avoid counter-wrap problems.
3920 */
3921static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
3922{
3923 struct rcu_node *rnp;
3924
3925 synchronize_sched_expedited_wait(rsp);
3926 rcu_exp_gp_seq_end(rsp);
3927 trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
3928
3929 /*
3930 * Switch over to wakeup mode, allowing the next GP, but -only- the
3931 * next GP, to proceed.
3932 */
3933 mutex_lock(&rsp->exp_wake_mutex);
3934 mutex_unlock(&rsp->exp_mutex);
3935
3936 rcu_for_each_node_breadth_first(rsp, rnp) {
3937 if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
3938 spin_lock(&rnp->exp_lock);
3939 /* Recheck, avoid hang in case someone just arrived. */
3940 if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
3941 rnp->exp_seq_rq = s;
3942 spin_unlock(&rnp->exp_lock);
3943 }
3944 wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
3945 }
3946 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
3947 mutex_unlock(&rsp->exp_wake_mutex);
3948}
3949
3950/**
3951 * synchronize_sched_expedited - Brute-force RCU-sched grace period
3952 *
3953 * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
3954 * approach to force the grace period to end quickly. This consumes
3955 * significant time on all CPUs and is unfriendly to real-time workloads,
3956 * so is thus not recommended for any sort of common-case code. In fact,
3957 * if you are using synchronize_sched_expedited() in a loop, please
3958 * restructure your code to batch your updates, and then use a single
3959 * synchronize_sched() instead.
3960 *
3961 * This implementation can be thought of as an application of sequence
3962 * locking to expedited grace periods, but using the sequence counter to
3963 * determine when someone else has already done the work instead of for
3964 * retrying readers.
3965 */
3966void synchronize_sched_expedited(void)
3967{
3968 unsigned long s;
3969 struct rcu_state *rsp = &rcu_sched_state;
3970
3971 /* If only one CPU, this is automatically a grace period. */
3972 if (rcu_blocking_is_gp())
3973 return;
3974
3975 /* If expedited grace periods are prohibited, fall back to normal. */
3976 if (rcu_gp_is_normal()) {
3977 wait_rcu_gp(call_rcu_sched);
3978 return;
3979 }
3980
3981 /* Take a snapshot of the sequence number. */
3982 s = rcu_exp_gp_seq_snap(rsp);
3983 if (exp_funnel_lock(rsp, s))
3984 return; /* Someone else did our work for us. */
3985
3986 /* Initialize the rcu_node tree in preparation for the wait. */
3987 sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
3988
3989 /* Wait and clean up, including waking everyone. */
3990 rcu_exp_wait_wake(rsp, s);
3991}
3992EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
3993
3994/* 3460/*
3995 * Check to see if there is any immediate RCU-related work to be done 3461 * Check to see if there is any immediate RCU-related work to be done
3996 * by the current CPU, for the specified type of RCU, returning 1 if so. 3462 * by the current CPU, for the specified type of RCU, returning 1 if so.
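Note on the large removal above: the expedited grace-period machinery, from the sequence-counter wrappers through synchronize_sched_expedited(), is not deleted outright; per the diffstat it moves into the new kernel/rcu/tree_exp.h. The heart of the "someone else already did my grace period" check is the even/odd sequence counter; a stand-alone model of its semantics:

#include <stdio.h>
#include <assert.h>

/* Even = no expedited GP in flight, odd = one in progress. */
static unsigned long exp_seq;

static void exp_gp_start(void) { exp_seq++; }	/* now odd */
static void exp_gp_end(void)   { exp_seq++; }	/* now even */

/*
 * Snapshot: the smallest even counter value whose being reached means a
 * full grace period has elapsed after this call. If a GP is already in
 * progress it may have started before us, so wait out the next one too.
 */
static unsigned long exp_gp_snap(void)
{
	return (exp_seq + 3) & ~1UL;
}

static int exp_gp_done(unsigned long s)
{
	return exp_seq >= s;	/* the kernel uses ULONG_CMP_GE for wrap safety */
}

int main(void)
{
	unsigned long s;

	exp_gp_start();			/* a GP started before our updates */
	s = exp_gp_snap();
	exp_gp_end();
	assert(!exp_gp_done(s));	/* that GP alone is not enough */
	exp_gp_start();
	exp_gp_end();
	assert(exp_gp_done(s));		/* one full GP after the snapshot */
	printf("sequence-counter model OK\n");
	return 0;
}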
@@ -4281,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
4281 3747
4282 /* Set up local state, ensuring consistent view of global state. */ 3748 /* Set up local state, ensuring consistent view of global state. */
4283 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3749 raw_spin_lock_irqsave_rcu_node(rnp, flags);
4284 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 3750 rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
4285 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 3751 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
4286 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 3752 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
4287 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 3753 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -4340,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
4340 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3806 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4341} 3807}
4342 3808
4343static void rcu_prepare_cpu(int cpu) 3809int rcutree_prepare_cpu(unsigned int cpu)
4344{ 3810{
4345 struct rcu_state *rsp; 3811 struct rcu_state *rsp;
4346 3812
4347 for_each_rcu_flavor(rsp) 3813 for_each_rcu_flavor(rsp)
4348 rcu_init_percpu_data(cpu, rsp); 3814 rcu_init_percpu_data(cpu, rsp);
3815
3816 rcu_prepare_kthreads(cpu);
3817 rcu_spawn_all_nocb_kthreads(cpu);
3818
3819 return 0;
3820}
3821
3822static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3823{
3824 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
3825
3826 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
3827}
3828
3829int rcutree_online_cpu(unsigned int cpu)
3830{
3831 sync_sched_exp_online_cleanup(cpu);
3832 rcutree_affinity_setting(cpu, -1);
3833 return 0;
3834}
3835
3836int rcutree_offline_cpu(unsigned int cpu)
3837{
3838 rcutree_affinity_setting(cpu, cpu);
3839 return 0;
3840}
3841
3842
3843int rcutree_dying_cpu(unsigned int cpu)
3844{
3845 struct rcu_state *rsp;
3846
3847 for_each_rcu_flavor(rsp)
3848 rcu_cleanup_dying_cpu(rsp);
3849 return 0;
3850}
3851
3852int rcutree_dead_cpu(unsigned int cpu)
3853{
3854 struct rcu_state *rsp;
3855
3856 for_each_rcu_flavor(rsp) {
3857 rcu_cleanup_dead_cpu(cpu, rsp);
3858 do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
3859 }
3860 return 0;
4349} 3861}
4350 3862
4351#ifdef CONFIG_HOTPLUG_CPU 3863#ifdef CONFIG_HOTPLUG_CPU
@@ -4364,9 +3876,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
4364 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3876 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
4365 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 3877 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
4366 3878
4367 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
4368 return;
4369
4370 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ 3879 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
4371 mask = rdp->grpmask; 3880 mask = rdp->grpmask;
4372 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ 3881 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
@@ -4388,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu)
4388} 3897}
4389#endif 3898#endif
4390 3899
4391/*
4392 * Handle CPU online/offline notification events.
4393 */
4394int rcu_cpu_notify(struct notifier_block *self,
4395 unsigned long action, void *hcpu)
4396{
4397 long cpu = (long)hcpu;
4398 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
4399 struct rcu_node *rnp = rdp->mynode;
4400 struct rcu_state *rsp;
4401
4402 switch (action) {
4403 case CPU_UP_PREPARE:
4404 case CPU_UP_PREPARE_FROZEN:
4405 rcu_prepare_cpu(cpu);
4406 rcu_prepare_kthreads(cpu);
4407 rcu_spawn_all_nocb_kthreads(cpu);
4408 break;
4409 case CPU_ONLINE:
4410 case CPU_DOWN_FAILED:
4411 sync_sched_exp_online_cleanup(cpu);
4412 rcu_boost_kthread_setaffinity(rnp, -1);
4413 break;
4414 case CPU_DOWN_PREPARE:
4415 rcu_boost_kthread_setaffinity(rnp, cpu);
4416 break;
4417 case CPU_DYING:
4418 case CPU_DYING_FROZEN:
4419 for_each_rcu_flavor(rsp)
4420 rcu_cleanup_dying_cpu(rsp);
4421 break;
4422 case CPU_DEAD:
4423 case CPU_DEAD_FROZEN:
4424 case CPU_UP_CANCELED:
4425 case CPU_UP_CANCELED_FROZEN:
4426 for_each_rcu_flavor(rsp) {
4427 rcu_cleanup_dead_cpu(cpu, rsp);
4428 do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
4429 }
4430 break;
4431 default:
4432 break;
4433 }
4434 return NOTIFY_OK;
4435}
4436
4437static int rcu_pm_notify(struct notifier_block *self, 3900static int rcu_pm_notify(struct notifier_block *self,
4438 unsigned long action, void *hcpu) 3901 unsigned long action, void *hcpu)
4439{ 3902{
@@ -4745,10 +4208,10 @@ void __init rcu_init(void)
4745 * this is called early in boot, before either interrupts 4208 * this is called early in boot, before either interrupts
4746 * or the scheduler are operational. 4209 * or the scheduler are operational.
4747 */ 4210 */
4748 cpu_notifier(rcu_cpu_notify, 0);
4749 pm_notifier(rcu_pm_notify, 0); 4211 pm_notifier(rcu_pm_notify, 0);
4750 for_each_online_cpu(cpu) 4212 for_each_online_cpu(cpu)
4751 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 4213 rcutree_prepare_cpu(cpu);
4752} 4214}
4753 4215
4216#include "tree_exp.h"
4754#include "tree_plugin.h" 4217#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e3959f5e6ddf..f714f873bf9d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -254,6 +254,13 @@ struct rcu_node {
254} ____cacheline_internodealigned_in_smp; 254} ____cacheline_internodealigned_in_smp;
255 255
256/* 256/*
257 * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
258 * are indexed relative to this interval rather than the global CPU ID space.
259 * This generates the bit for a CPU in node-local masks.
260 */
261#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
262
263/*
257 * Do a full breadth-first scan of the rcu_node structures for the 264 * Do a full breadth-first scan of the rcu_node structures for the
258 * specified rcu_state structure. 265 * specified rcu_state structure.
259 */ 266 */
@@ -281,6 +288,14 @@ struct rcu_node {
281 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) 288 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
282 289
283/* 290/*
291 * Iterate over all possible CPUs in a leaf RCU node.
292 */
293#define for_each_leaf_node_possible_cpu(rnp, cpu) \
294 for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
295 cpu <= rnp->grphi; \
296 cpu = cpumask_next((cpu), cpu_possible_mask))
297
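As a quick illustration of how the two new helpers cooperate, here is a minimal userspace model, not kernel code: the struct, CPU range, and possible-CPU bitmap are made up. The point is only that node-local masks are indexed from grplo, and iteration runs up to grphi while skipping CPUs that are not possible.

#include <stdint.h>
#include <stdio.h>

struct leaf {			/* stands in for the rcu_node fields used here */
	int grplo, grphi;	/* global CPU IDs covered by this leaf */
};

/* Node-local bit for a global CPU ID, as in leaf_node_cpu_bit(). */
static uint64_t leaf_cpu_bit(const struct leaf *l, int cpu)
{
	return 1ULL << (cpu - l->grplo);
}

int main(void)
{
	/* Pretend CPUs 16..31 hang off this leaf and all of them are possible. */
	struct leaf l = { .grplo = 16, .grphi = 31 };
	uint64_t possible = 0xffffULL << 16;	/* global possible-CPU bitmap */
	uint64_t mask = 0;

	for (int cpu = l.grplo; cpu <= l.grphi; cpu++) {
		if (!(possible & (1ULL << cpu)))
			continue;	/* for_each_leaf_node_possible_cpu() skips these */
		mask |= leaf_cpu_bit(&l, cpu);
	}
	printf("node-local mask = %#llx\n", (unsigned long long)mask);	/* 0xffff */
	return 0;
}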
298/*
284 * Union to allow "aggregate OR" operation on the need for a quiescent 299 * Union to allow "aggregate OR" operation on the need for a quiescent
285 * state by the normal and expedited grace periods. 300 * state by the normal and expedited grace periods.
286 */ 301 */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
new file mode 100644
index 000000000000..6d86ab6ec2c9
--- /dev/null
+++ b/kernel/rcu/tree_exp.h
@@ -0,0 +1,655 @@
1/*
2 * RCU expedited grace periods
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright IBM Corporation, 2016
19 *
20 * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23/* Wrapper functions for expedited grace periods. */
24static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
25{
26 rcu_seq_start(&rsp->expedited_sequence);
27}
28static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
29{
30 rcu_seq_end(&rsp->expedited_sequence);
31 smp_mb(); /* Ensure that consecutive grace periods serialize. */
32}
33static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
34{
35 unsigned long s;
36
37 smp_mb(); /* Caller's modifications seen first by other CPUs. */
38 s = rcu_seq_snap(&rsp->expedited_sequence);
39 trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
40 return s;
41}
42static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
43{
44 return rcu_seq_done(&rsp->expedited_sequence, s);
45}
46
47/*
48 * Reset the ->expmaskinit values in the rcu_node tree to reflect any
49 * recent CPU-online activity. Note that these masks are not cleared
50 * when CPUs go offline, so they reflect the union of all CPUs that have
51 * ever been online. This means that this function normally takes its
52 * no-work-to-do fastpath.
53 */
54static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
55{
56 bool done;
57 unsigned long flags;
58 unsigned long mask;
59 unsigned long oldmask;
60 int ncpus = READ_ONCE(rsp->ncpus);
61 struct rcu_node *rnp;
62 struct rcu_node *rnp_up;
63
64 /* If no new CPUs onlined since last time, nothing to do. */
65 if (likely(ncpus == rsp->ncpus_snap))
66 return;
67 rsp->ncpus_snap = ncpus;
68
69 /*
70 * Each pass through the following loop propagates newly onlined
71 * CPUs for the current rcu_node structure up the rcu_node tree.
72 */
73 rcu_for_each_leaf_node(rsp, rnp) {
74 raw_spin_lock_irqsave_rcu_node(rnp, flags);
75 if (rnp->expmaskinit == rnp->expmaskinitnext) {
76 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
77 continue; /* No new CPUs, nothing to do. */
78 }
79
80 /* Update this node's mask, track old value for propagation. */
81 oldmask = rnp->expmaskinit;
82 rnp->expmaskinit = rnp->expmaskinitnext;
83 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
84
85 /* If was already nonzero, nothing to propagate. */
86 if (oldmask)
87 continue;
88
89 /* Propagate the new CPU up the tree. */
90 mask = rnp->grpmask;
91 rnp_up = rnp->parent;
92 done = false;
93 while (rnp_up) {
94 raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
95 if (rnp_up->expmaskinit)
96 done = true;
97 rnp_up->expmaskinit |= mask;
98 raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
99 if (done)
100 break;
101 mask = rnp_up->grpmask;
102 rnp_up = rnp_up->parent;
103 }
104 }
105}
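The propagation loop above is compact, so here is a standalone userspace toy of the one idea that matters (field names are borrowed, everything else, including the lack of locking, is a simplification): push the child's bit into each ancestor, and stop once an ancestor already had something recorded, because everything above it was populated on an earlier pass.

#include <stdio.h>

struct node {
	struct node *parent;
	unsigned long expmaskinit;	/* which children have ever had online CPUs */
	unsigned long grpmask;		/* this node's bit in its parent's mask */
};

static void propagate_online(struct node *leaf)
{
	unsigned long mask = leaf->grpmask;

	for (struct node *up = leaf->parent; up; up = up->parent) {
		int done = up->expmaskinit != 0;	/* ancestors already set? */

		up->expmaskinit |= mask;
		if (done)
			break;
		mask = up->grpmask;
	}
}

int main(void)
{
	struct node root = { .parent = NULL };
	struct node mid  = { .parent = &root, .grpmask = 0x1 };
	struct node leaf = { .parent = &mid,  .grpmask = 0x4 };

	propagate_online(&leaf);
	printf("mid=%#lx root=%#lx\n", mid.expmaskinit, root.expmaskinit);
	/* prints mid=0x4 root=0x1 */
	return 0;
}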
106
107/*
108 * Reset the ->expmask values in the rcu_node tree in preparation for
109 * a new expedited grace period.
110 */
111static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
112{
113 unsigned long flags;
114 struct rcu_node *rnp;
115
116 sync_exp_reset_tree_hotplug(rsp);
117 rcu_for_each_node_breadth_first(rsp, rnp) {
118 raw_spin_lock_irqsave_rcu_node(rnp, flags);
119 WARN_ON_ONCE(rnp->expmask);
120 rnp->expmask = rnp->expmaskinit;
121 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
122 }
123}
124
125/*
126 * Return non-zero if there is no RCU expedited grace period in progress
127 * for the specified rcu_node structure, in other words, if all CPUs and
128 * tasks covered by the specified rcu_node structure have done their bit
129 * for the current expedited grace period. Works only for preemptible
130 * RCU -- other RCU implementations use other means.

131 *
132 * Caller must hold the rcu_state's exp_mutex.
133 */
134static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
135{
136 return rnp->exp_tasks == NULL &&
137 READ_ONCE(rnp->expmask) == 0;
138}
139
140/*
141 * Report the exit from RCU read-side critical section for the last task
142 * that queued itself during or before the current expedited preemptible-RCU
143 * grace period. This event is reported either to the rcu_node structure on
144 * which the task was queued or to one of that rcu_node structure's ancestors,
145 * recursively up the tree. (Calm down, calm down, we do the recursion
146 * iteratively!)
147 *
148 * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
149 * structure's ->lock.
150 */
151static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
152 bool wake, unsigned long flags)
153 __releases(rnp->lock)
154{
155 unsigned long mask;
156
157 for (;;) {
158 if (!sync_rcu_preempt_exp_done(rnp)) {
159 if (!rnp->expmask)
160 rcu_initiate_boost(rnp, flags);
161 else
162 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
163 break;
164 }
165 if (rnp->parent == NULL) {
166 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
167 if (wake) {
168 smp_mb(); /* EGP done before wake_up(). */
169 swake_up(&rsp->expedited_wq);
170 }
171 break;
172 }
173 mask = rnp->grpmask;
174 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
175 rnp = rnp->parent;
176 raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
177 WARN_ON_ONCE(!(rnp->expmask & mask));
178 rnp->expmask &= ~mask;
179 }
180}
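To go with the comment above, a tiny userspace model (invented structures, no locks, no boosting or wakeups) of how a quiescent-state report climbs the tree: a level is left alone while it still has blockers, and only the reporter that clears the last bit keeps climbing until it reaches the root.

#include <stdio.h>

struct node {
	struct node *parent;
	unsigned long expmask;	/* children still blocking the expedited GP */
	unsigned long grpmask;	/* this node's bit in its parent's expmask */
	int exp_tasks;		/* nonzero if blocked readers remain (model) */
};

static int exp_done(const struct node *n)
{
	return !n->exp_tasks && !n->expmask;
}

/* Returns 1 when the root finished, i.e. the whole expedited GP is done. */
static int report_exp(struct node *n)
{
	for (;;) {
		if (!exp_done(n))
			return 0;		/* someone below still owes a QS */
		if (!n->parent)
			return 1;		/* root done: time to wake the waiter */
		n->parent->expmask &= ~n->grpmask;
		n = n->parent;
	}
}

int main(void)
{
	struct node root = { .expmask = 0x3 };
	struct node a = { .parent = &root, .grpmask = 0x1, .expmask = 0 };
	struct node b = { .parent = &root, .grpmask = 0x2, .expmask = 0 };

	printf("%d\n", report_exp(&a));	/* 0: b's subtree is still outstanding */
	printf("%d\n", report_exp(&b));	/* 1: the last reporter reaches the root */
	return 0;
}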
181
182/*
183 * Report expedited quiescent state for specified node. This is a
184 * lock-acquisition wrapper function for __rcu_report_exp_rnp().
185 *
186 * Caller must hold the rcu_state's exp_mutex.
187 */
188static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
189 struct rcu_node *rnp, bool wake)
190{
191 unsigned long flags;
192
193 raw_spin_lock_irqsave_rcu_node(rnp, flags);
194 __rcu_report_exp_rnp(rsp, rnp, wake, flags);
195}
196
197/*
198 * Report expedited quiescent state for multiple CPUs, all covered by the
199 * specified leaf rcu_node structure. Caller must hold the rcu_state's
200 * exp_mutex.
201 */
202static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
203 unsigned long mask, bool wake)
204{
205 unsigned long flags;
206
207 raw_spin_lock_irqsave_rcu_node(rnp, flags);
208 if (!(rnp->expmask & mask)) {
209 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
210 return;
211 }
212 rnp->expmask &= ~mask;
213 __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
214}
215
216/*
217 * Report expedited quiescent state for specified rcu_data (CPU).
218 */
219static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
220 bool wake)
221{
222 rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
223}
224
225/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
226static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
227 unsigned long s)
228{
229 if (rcu_exp_gp_seq_done(rsp, s)) {
230 trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
231 /* Ensure test happens before caller kfree(). */
232 smp_mb__before_atomic(); /* ^^^ */
233 atomic_long_inc(stat);
234 return true;
235 }
236 return false;
237}
238
239/*
240 * Funnel-lock acquisition for expedited grace periods. Returns true
241 * if some other task completed an expedited grace period that this task
242 * can piggy-back on, and with no mutex held. Otherwise, returns false
243 * with the mutex held, indicating that the caller must actually do the
244 * expedited grace period.
245 */
246static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
247{
248 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
249 struct rcu_node *rnp = rdp->mynode;
250 struct rcu_node *rnp_root = rcu_get_root(rsp);
251
252 /* Low-contention fastpath. */
253 if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
254 (rnp == rnp_root ||
255 ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
256 mutex_trylock(&rsp->exp_mutex))
257 goto fastpath;
258
259 /*
260 * Each pass through the following loop works its way up
261 * the rcu_node tree, returning if others have done the work or
262 * otherwise falls through to acquire rsp->exp_mutex. The mapping
263 * from CPU to rcu_node structure can be inexact, as it is just
264 * promoting locality and is not strictly needed for correctness.
265 */
266 for (; rnp != NULL; rnp = rnp->parent) {
267 if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
268 return true;
269
270 /* Work not done, either wait here or go up. */
271 spin_lock(&rnp->exp_lock);
272 if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
273
274 /* Someone else doing GP, so wait for them. */
275 spin_unlock(&rnp->exp_lock);
276 trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
277 rnp->grplo, rnp->grphi,
278 TPS("wait"));
279 wait_event(rnp->exp_wq[(s >> 1) & 0x3],
280 sync_exp_work_done(rsp,
281 &rdp->exp_workdone2, s));
282 return true;
283 }
284 rnp->exp_seq_rq = s; /* Followers can wait on us. */
285 spin_unlock(&rnp->exp_lock);
286 trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
287 rnp->grphi, TPS("nxtlvl"));
288 }
289 mutex_lock(&rsp->exp_mutex);
290fastpath:
291 if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
292 mutex_unlock(&rsp->exp_mutex);
293 return true;
294 }
295 rcu_exp_gp_seq_start(rsp);
296 trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
297 return false;
298}
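The funnel's "someone else did our work" checks all reduce to sequence-counter arithmetic. Below is a simplified, single-threaded userspace model of that arithmetic; the helper names are mine, and the wrap-safe comparisons and memory barriers of the real rcu_seq_*() helpers are omitted. A snapshot is the counter value that any grace period starting after the snapshot will have reached, so a caller whose snapshot has already been passed can simply return.

#include <stdio.h>

static unsigned long expedited_sequence;	/* low bit set while a GP is in flight */

static unsigned long seq_snap(void)
{
	/* Next even value at least one full start/end cycle ahead. */
	return (expedited_sequence + 3) & ~0x1UL;
}

static int seq_done(unsigned long s)
{
	return expedited_sequence >= s;	/* kernel uses a wrap-safe comparison */
}

static void gp_start(void) { expedited_sequence++; }	/* counter becomes odd */
static void gp_end(void)   { expedited_sequence++; }	/* even again: GP complete */

int main(void)
{
	unsigned long s = seq_snap();	/* caller needs a GP after this point */

	printf("done before any GP? %d\n", seq_done(s));	/* 0 */
	gp_start();
	gp_end();			/* some other task ran the GP for us */
	printf("done after one GP?  %d\n", seq_done(s));	/* 1: piggy-back and return */
	return 0;
}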
299
300/* Invoked on each online non-idle CPU for expedited quiescent state. */
301static void sync_sched_exp_handler(void *data)
302{
303 struct rcu_data *rdp;
304 struct rcu_node *rnp;
305 struct rcu_state *rsp = data;
306
307 rdp = this_cpu_ptr(rsp->rda);
308 rnp = rdp->mynode;
309 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
310 __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
311 return;
312 if (rcu_is_cpu_rrupt_from_idle()) {
313 rcu_report_exp_rdp(&rcu_sched_state,
314 this_cpu_ptr(&rcu_sched_data), true);
315 return;
316 }
317 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
318 resched_cpu(smp_processor_id());
319}
320
321/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
322static void sync_sched_exp_online_cleanup(int cpu)
323{
324 struct rcu_data *rdp;
325 int ret;
326 struct rcu_node *rnp;
327 struct rcu_state *rsp = &rcu_sched_state;
328
329 rdp = per_cpu_ptr(rsp->rda, cpu);
330 rnp = rdp->mynode;
331 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
332 return;
333 ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
334 WARN_ON_ONCE(ret);
335}
336
337/*
338 * Select the nodes that the upcoming expedited grace period needs
339 * to wait for.
340 */
341static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
342 smp_call_func_t func)
343{
344 int cpu;
345 unsigned long flags;
346 unsigned long mask_ofl_test;
347 unsigned long mask_ofl_ipi;
348 int ret;
349 struct rcu_node *rnp;
350
351 sync_exp_reset_tree(rsp);
352 rcu_for_each_leaf_node(rsp, rnp) {
353 raw_spin_lock_irqsave_rcu_node(rnp, flags);
354
355 /* Each pass checks a CPU for identity, offline, and idle. */
356 mask_ofl_test = 0;
357 for_each_leaf_node_possible_cpu(rnp, cpu) {
358 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
359 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
360
361 if (raw_smp_processor_id() == cpu ||
362 !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
363 mask_ofl_test |= rdp->grpmask;
364 }
365 mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
366
367 /*
368 * Need to wait for any blocked tasks as well. Note that
369 * additional blocking tasks will also block the expedited
370 * GP until such time as the ->expmask bits are cleared.
371 */
372 if (rcu_preempt_has_tasks(rnp))
373 rnp->exp_tasks = rnp->blkd_tasks.next;
374 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
375
376 /* IPI the remaining CPUs for expedited quiescent state. */
377 for_each_leaf_node_possible_cpu(rnp, cpu) {
378 unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
379 if (!(mask_ofl_ipi & mask))
380 continue;
381retry_ipi:
382 ret = smp_call_function_single(cpu, func, rsp, 0);
383 if (!ret) {
384 mask_ofl_ipi &= ~mask;
385 continue;
386 }
387 /* Failed, raced with offline. */
388 raw_spin_lock_irqsave_rcu_node(rnp, flags);
389 if (cpu_online(cpu) &&
390 (rnp->expmask & mask)) {
391 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
392 schedule_timeout_uninterruptible(1);
393 if (cpu_online(cpu) &&
394 (rnp->expmask & mask))
395 goto retry_ipi;
396 raw_spin_lock_irqsave_rcu_node(rnp, flags);
397 }
398 if (!(rnp->expmask & mask))
399 mask_ofl_ipi &= ~mask;
400 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
401 }
402 /* Report quiescent states for those that went offline. */
403 mask_ofl_test |= mask_ofl_ipi;
404 if (mask_ofl_test)
405 rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
406 }
407}
408
409static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
410{
411 int cpu;
412 unsigned long jiffies_stall;
413 unsigned long jiffies_start;
414 unsigned long mask;
415 int ndetected;
416 struct rcu_node *rnp;
417 struct rcu_node *rnp_root = rcu_get_root(rsp);
418 int ret;
419
420 jiffies_stall = rcu_jiffies_till_stall_check();
421 jiffies_start = jiffies;
422
423 for (;;) {
424 ret = swait_event_timeout(
425 rsp->expedited_wq,
426 sync_rcu_preempt_exp_done(rnp_root),
427 jiffies_stall);
428 if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
429 return;
430 if (ret < 0) {
431 /* Hit a signal, disable CPU stall warnings. */
432 swait_event(rsp->expedited_wq,
433 sync_rcu_preempt_exp_done(rnp_root));
434 return;
435 }
436 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
437 rsp->name);
438 ndetected = 0;
439 rcu_for_each_leaf_node(rsp, rnp) {
440 ndetected += rcu_print_task_exp_stall(rnp);
441 for_each_leaf_node_possible_cpu(rnp, cpu) {
442 struct rcu_data *rdp;
443
444 mask = leaf_node_cpu_bit(rnp, cpu);
445 if (!(rnp->expmask & mask))
446 continue;
447 ndetected++;
448 rdp = per_cpu_ptr(rsp->rda, cpu);
449 pr_cont(" %d-%c%c%c", cpu,
450 "O."[!!cpu_online(cpu)],
451 "o."[!!(rdp->grpmask & rnp->expmaskinit)],
452 "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
453 }
454 }
455 pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
456 jiffies - jiffies_start, rsp->expedited_sequence,
457 rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
458 if (ndetected) {
459 pr_err("blocking rcu_node structures:");
460 rcu_for_each_node_breadth_first(rsp, rnp) {
461 if (rnp == rnp_root)
462 continue; /* printed unconditionally */
463 if (sync_rcu_preempt_exp_done(rnp))
464 continue;
465 pr_cont(" l=%u:%d-%d:%#lx/%c",
466 rnp->level, rnp->grplo, rnp->grphi,
467 rnp->expmask,
468 ".T"[!!rnp->exp_tasks]);
469 }
470 pr_cont("\n");
471 }
472 rcu_for_each_leaf_node(rsp, rnp) {
473 for_each_leaf_node_possible_cpu(rnp, cpu) {
474 mask = leaf_node_cpu_bit(rnp, cpu);
475 if (!(rnp->expmask & mask))
476 continue;
477 dump_cpu_task(cpu);
478 }
479 }
480 jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
481 }
482}
483
484/*
485 * Wait for the current expedited grace period to complete, and then
486 * wake up everyone who piggybacked on the just-completed expedited
487 * grace period. Also update all the ->exp_seq_rq counters as needed
488 * in order to avoid counter-wrap problems.
489 */
490static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
491{
492 struct rcu_node *rnp;
493
494 synchronize_sched_expedited_wait(rsp);
495 rcu_exp_gp_seq_end(rsp);
496 trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
497
498 /*
499 * Switch over to wakeup mode, allowing the next GP, but -only- the
500 * next GP, to proceed.
501 */
502 mutex_lock(&rsp->exp_wake_mutex);
503 mutex_unlock(&rsp->exp_mutex);
504
505 rcu_for_each_node_breadth_first(rsp, rnp) {
506 if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
507 spin_lock(&rnp->exp_lock);
508 /* Recheck, avoid hang in case someone just arrived. */
509 if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
510 rnp->exp_seq_rq = s;
511 spin_unlock(&rnp->exp_lock);
512 }
513 wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
514 }
515 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
516 mutex_unlock(&rsp->exp_wake_mutex);
517}
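The wake_up_all() above indexes one of four wait queues, and the arithmetic is worth spelling out: the counter's low bit only flags an in-flight grace period, so shifting it away counts completed grace periods, and masking with 0x3 rotates successive grace periods across four queues so that waiters already parked for later grace periods are not woken early. A short userspace demo of the mapping (purely illustrative):

#include <stdio.h>

int main(void)
{
	for (unsigned long seq = 0; seq <= 14; seq += 2)
		printf("expedited_sequence=%2lu -> wake queue %lu\n",
		       seq, (seq >> 1) & 0x3);
	/* queues cycle 0, 1, 2, 3, 0, 1, 2, 3 as grace periods complete */
	return 0;
}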
518
519/**
520 * synchronize_sched_expedited - Brute-force RCU-sched grace period
521 *
522 * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
523 * approach to force the grace period to end quickly. This consumes
524 * significant time on all CPUs and is unfriendly to real-time workloads,
525 * so is thus not recommended for any sort of common-case code. In fact,
526 * if you are using synchronize_sched_expedited() in a loop, please
527 * restructure your code to batch your updates, and then use a single
528 * synchronize_sched() instead.
529 *
530 * This implementation can be thought of as an application of sequence
531 * locking to expedited grace periods, but using the sequence counter to
532 * determine when someone else has already done the work instead of for
533 * retrying readers.
534 */
535void synchronize_sched_expedited(void)
536{
537 unsigned long s;
538 struct rcu_state *rsp = &rcu_sched_state;
539
540 /* If only one CPU, this is automatically a grace period. */
541 if (rcu_blocking_is_gp())
542 return;
543
544 /* If expedited grace periods are prohibited, fall back to normal. */
545 if (rcu_gp_is_normal()) {
546 wait_rcu_gp(call_rcu_sched);
547 return;
548 }
549
550 /* Take a snapshot of the sequence number. */
551 s = rcu_exp_gp_seq_snap(rsp);
552 if (exp_funnel_lock(rsp, s))
553 return; /* Someone else did our work for us. */
554
555 /* Initialize the rcu_node tree in preparation for the wait. */
556 sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
557
558 /* Wait and clean up, including waking everyone. */
559 rcu_exp_wait_wake(rsp, s);
560}
561EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
562
563#ifdef CONFIG_PREEMPT_RCU
564
565/*
566 * Remote handler for smp_call_function_single(). If there is an
567 * RCU read-side critical section in effect, request that the
568 * next rcu_read_unlock() record the quiescent state up the
569 * ->expmask fields in the rcu_node tree. Otherwise, immediately
570 * report the quiescent state.
571 */
572static void sync_rcu_exp_handler(void *info)
573{
574 struct rcu_data *rdp;
575 struct rcu_state *rsp = info;
576 struct task_struct *t = current;
577
578 /*
579 * Within an RCU read-side critical section, request that the next
 580 * rcu_read_unlock() report the quiescent state, unless this read-side critical
581 * section has already blocked, in which case it is already set
582 * up for the expedited grace period to wait on it.
583 */
584 if (t->rcu_read_lock_nesting > 0 &&
585 !t->rcu_read_unlock_special.b.blocked) {
586 t->rcu_read_unlock_special.b.exp_need_qs = true;
587 return;
588 }
589
590 /*
591 * We are either exiting an RCU read-side critical section (negative
592 * values of t->rcu_read_lock_nesting) or are not in one at all
593 * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
594 * read-side critical section that blocked before this expedited
595 * grace period started. Either way, we can immediately report
596 * the quiescent state.
597 */
598 rdp = this_cpu_ptr(rsp->rda);
599 rcu_report_exp_rdp(rsp, rdp, true);
600}
601
602/**
603 * synchronize_rcu_expedited - Brute-force RCU grace period
604 *
605 * Wait for an RCU-preempt grace period, but expedite it. The basic
606 * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
607 * checks whether the CPU is in an RCU-preempt critical section, and
608 * if so, it sets a flag that causes the outermost rcu_read_unlock()
609 * to report the quiescent state. On the other hand, if the CPU is
610 * not in an RCU read-side critical section, the IPI handler reports
611 * the quiescent state immediately.
612 *
 613 * Although this is a great improvement over previous expedited
614 * implementations, it is still unfriendly to real-time workloads, so is
615 * thus not recommended for any sort of common-case code. In fact, if
616 * you are using synchronize_rcu_expedited() in a loop, please restructure
 617 * your code to batch your updates, and then use a single synchronize_rcu()
618 * instead.
619 */
620void synchronize_rcu_expedited(void)
621{
622 struct rcu_state *rsp = rcu_state_p;
623 unsigned long s;
624
625 /* If expedited grace periods are prohibited, fall back to normal. */
626 if (rcu_gp_is_normal()) {
627 wait_rcu_gp(call_rcu);
628 return;
629 }
630
631 s = rcu_exp_gp_seq_snap(rsp);
632 if (exp_funnel_lock(rsp, s))
633 return; /* Someone else did our work for us. */
634
635 /* Initialize the rcu_node tree in preparation for the wait. */
636 sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
637
638 /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
639 rcu_exp_wait_wake(rsp, s);
640}
641EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
642
643#else /* #ifdef CONFIG_PREEMPT_RCU */
644
645/*
646 * Wait for an rcu-preempt grace period, but make it happen quickly.
647 * But because preemptible RCU does not exist, map to rcu-sched.
648 */
649void synchronize_rcu_expedited(void)
650{
651 synchronize_sched_expedited();
652}
653EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
654
655#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ff1cd4e1188d..0082fce402a0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void)
79 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); 79 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
80 if (IS_ENABLED(CONFIG_PROVE_RCU)) 80 if (IS_ENABLED(CONFIG_PROVE_RCU))
81 pr_info("\tRCU lockdep checking is enabled.\n"); 81 pr_info("\tRCU lockdep checking is enabled.\n");
82 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
83 pr_info("\tRCU torture testing starts during boot.\n");
84 if (RCU_NUM_LVLS >= 4) 82 if (RCU_NUM_LVLS >= 4)
85 pr_info("\tFour(or more)-level hierarchy is enabled.\n"); 83 pr_info("\tFour(or more)-level hierarchy is enabled.\n");
86 if (RCU_FANOUT_LEAF != 16) 84 if (RCU_FANOUT_LEAF != 16)
@@ -681,84 +679,6 @@ void synchronize_rcu(void)
681} 679}
682EXPORT_SYMBOL_GPL(synchronize_rcu); 680EXPORT_SYMBOL_GPL(synchronize_rcu);
683 681
684/*
685 * Remote handler for smp_call_function_single(). If there is an
686 * RCU read-side critical section in effect, request that the
687 * next rcu_read_unlock() record the quiescent state up the
688 * ->expmask fields in the rcu_node tree. Otherwise, immediately
689 * report the quiescent state.
690 */
691static void sync_rcu_exp_handler(void *info)
692{
693 struct rcu_data *rdp;
694 struct rcu_state *rsp = info;
695 struct task_struct *t = current;
696
697 /*
698 * Within an RCU read-side critical section, request that the next
699 * rcu_read_unlock() report. Unless this RCU read-side critical
700 * section has already blocked, in which case it is already set
701 * up for the expedited grace period to wait on it.
702 */
703 if (t->rcu_read_lock_nesting > 0 &&
704 !t->rcu_read_unlock_special.b.blocked) {
705 t->rcu_read_unlock_special.b.exp_need_qs = true;
706 return;
707 }
708
709 /*
710 * We are either exiting an RCU read-side critical section (negative
711 * values of t->rcu_read_lock_nesting) or are not in one at all
712 * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
713 * read-side critical section that blocked before this expedited
714 * grace period started. Either way, we can immediately report
715 * the quiescent state.
716 */
717 rdp = this_cpu_ptr(rsp->rda);
718 rcu_report_exp_rdp(rsp, rdp, true);
719}
720
721/**
722 * synchronize_rcu_expedited - Brute-force RCU grace period
723 *
724 * Wait for an RCU-preempt grace period, but expedite it. The basic
725 * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
726 * checks whether the CPU is in an RCU-preempt critical section, and
727 * if so, it sets a flag that causes the outermost rcu_read_unlock()
728 * to report the quiescent state. On the other hand, if the CPU is
729 * not in an RCU read-side critical section, the IPI handler reports
730 * the quiescent state immediately.
731 *
 732 * Although this is a great improvement over previous expedited
733 * implementations, it is still unfriendly to real-time workloads, so is
734 * thus not recommended for any sort of common-case code. In fact, if
735 * you are using synchronize_rcu_expedited() in a loop, please restructure
 736 * your code to batch your updates, and then use a single synchronize_rcu()
737 * instead.
738 */
739void synchronize_rcu_expedited(void)
740{
741 struct rcu_state *rsp = rcu_state_p;
742 unsigned long s;
743
744 /* If expedited grace periods are prohibited, fall back to normal. */
745 if (rcu_gp_is_normal()) {
746 wait_rcu_gp(call_rcu);
747 return;
748 }
749
750 s = rcu_exp_gp_seq_snap(rsp);
751 if (exp_funnel_lock(rsp, s))
752 return; /* Someone else did our work for us. */
753
754 /* Initialize the rcu_node tree in preparation for the wait. */
755 sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
756
757 /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
758 rcu_exp_wait_wake(rsp, s);
759}
760EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
761
762/** 682/**
763 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 683 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
764 * 684 *
@@ -883,16 +803,6 @@ static void rcu_preempt_check_callbacks(void)
883} 803}
884 804
885/* 805/*
886 * Wait for an rcu-preempt grace period, but make it happen quickly.
887 * But because preemptible RCU does not exist, map to rcu-sched.
888 */
889void synchronize_rcu_expedited(void)
890{
891 synchronize_sched_expedited();
892}
893EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
894
895/*
896 * Because preemptible RCU does not exist, rcu_barrier() is just 806 * Because preemptible RCU does not exist, rcu_barrier() is just
897 * another name for rcu_barrier_sched(). 807 * another name for rcu_barrier_sched().
898 */ 808 */
@@ -1254,8 +1164,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1254 return; 1164 return;
1255 if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) 1165 if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
1256 return; 1166 return;
1257 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1167 for_each_leaf_node_possible_cpu(rnp, cpu)
1258 if ((mask & 0x1) && cpu != outgoingcpu) 1168 if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
1169 cpu != outgoingcpu)
1259 cpumask_set_cpu(cpu, cm); 1170 cpumask_set_cpu(cpu, cm);
1260 if (cpumask_weight(cm) == 0) 1171 if (cpumask_weight(cm) == 0)
1261 cpumask_setall(cm); 1172 cpumask_setall(cm);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3e888cd5a594..f0d8322bc3ec 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
528module_param(rcu_task_stall_timeout, int, 0644); 528module_param(rcu_task_stall_timeout, int, 0644);
529 529
530static void rcu_spawn_tasks_kthread(void); 530static void rcu_spawn_tasks_kthread(void);
531static struct task_struct *rcu_tasks_kthread_ptr;
531 532
532/* 533/*
533 * Post an RCU-tasks callback. First call must be from process context 534 * Post an RCU-tasks callback. First call must be from process context
@@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
537{ 538{
538 unsigned long flags; 539 unsigned long flags;
539 bool needwake; 540 bool needwake;
541 bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);
540 542
541 rhp->next = NULL; 543 rhp->next = NULL;
542 rhp->func = func; 544 rhp->func = func;
@@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
545 *rcu_tasks_cbs_tail = rhp; 547 *rcu_tasks_cbs_tail = rhp;
546 rcu_tasks_cbs_tail = &rhp->next; 548 rcu_tasks_cbs_tail = &rhp->next;
547 raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); 549 raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
548 if (needwake) { 550 /* We can't create the thread unless interrupts are enabled. */
551 if ((needwake && havetask) ||
552 (!havetask && !irqs_disabled_flags(flags))) {
549 rcu_spawn_tasks_kthread(); 553 rcu_spawn_tasks_kthread();
550 wake_up(&rcu_tasks_cbs_wq); 554 wake_up(&rcu_tasks_cbs_wq);
551 } 555 }
@@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg)
790static void rcu_spawn_tasks_kthread(void) 794static void rcu_spawn_tasks_kthread(void)
791{ 795{
792 static DEFINE_MUTEX(rcu_tasks_kthread_mutex); 796 static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
793 static struct task_struct *rcu_tasks_kthread_ptr;
794 struct task_struct *t; 797 struct task_struct *t;
795 798
796 if (READ_ONCE(rcu_tasks_kthread_ptr)) { 799 if (READ_ONCE(rcu_tasks_kthread_ptr)) {
diff --git a/kernel/relay.c b/kernel/relay.c
index 074994bcfa9b..04d7cf3ef8cf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -614,6 +614,7 @@ free_bufs:
614 614
615 kref_put(&chan->kref, relay_destroy_channel); 615 kref_put(&chan->kref, relay_destroy_channel);
616 mutex_unlock(&relay_channels_mutex); 616 mutex_unlock(&relay_channels_mutex);
617 kfree(chan);
617 return NULL; 618 return NULL;
618} 619}
619EXPORT_SYMBOL_GPL(relay_open); 620EXPORT_SYMBOL_GPL(relay_open);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f2cae4620c7..5c883fe8e440 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
1536 for (;;) { 1536 for (;;) {
1537 /* Any allowed, online CPU? */ 1537 /* Any allowed, online CPU? */
1538 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { 1538 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1539 if (!cpu_active(dest_cpu)) 1539 if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
1540 continue;
1541 if (!cpu_online(dest_cpu))
1540 continue; 1542 continue;
1541 goto out; 1543 goto out;
1542 } 1544 }
@@ -1935,7 +1937,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1935 * chain to provide order. Instead we do: 1937 * chain to provide order. Instead we do:
1936 * 1938 *
1937 * 1) smp_store_release(X->on_cpu, 0) 1939 * 1) smp_store_release(X->on_cpu, 0)
1938 * 2) smp_cond_acquire(!X->on_cpu) 1940 * 2) smp_cond_load_acquire(!X->on_cpu)
1939 * 1941 *
1940 * Example: 1942 * Example:
1941 * 1943 *
@@ -1946,7 +1948,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1946 * sched-out X 1948 * sched-out X
1947 * smp_store_release(X->on_cpu, 0); 1949 * smp_store_release(X->on_cpu, 0);
1948 * 1950 *
1949 * smp_cond_acquire(!X->on_cpu); 1951 * smp_cond_load_acquire(&X->on_cpu, !VAL);
1950 * X->state = WAKING 1952 * X->state = WAKING
1951 * set_task_cpu(X,2) 1953 * set_task_cpu(X,2)
1952 * 1954 *
@@ -1972,7 +1974,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1972 * This means that any means of doing remote wakeups must order the CPU doing 1974 * This means that any means of doing remote wakeups must order the CPU doing
1973 * the wakeup against the CPU the task is going to end up running on. This, 1975 * the wakeup against the CPU the task is going to end up running on. This,
1974 * however, is already required for the regular Program-Order guarantee above, 1976 * however, is already required for the regular Program-Order guarantee above,
1975 * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire). 1977 * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
1976 * 1978 *
1977 */ 1979 */
1978 1980
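The ordering argument above maps directly onto C11 release/acquire semantics. This is a userspace analogue rather than kernel code: on_cpu and task_state stand in for p->on_cpu and the task's scheduler state, and the bare busy-wait replaces smp_cond_load_acquire()'s arch-tuned spin.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int on_cpu = 1;
static int task_state;		/* plain data handed over by the release/acquire pair */

static void *sched_out(void *arg)
{
	(void)arg;
	task_state = 42;	/* last write done while "running" on the old CPU */
	atomic_store_explicit(&on_cpu, 0, memory_order_release);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, sched_out, NULL);

	/* smp_cond_load_acquire(&p->on_cpu, !VAL): spin until zero, then acquire. */
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;
	printf("task_state=%d\n", task_state);	/* guaranteed to observe 42 */

	pthread_join(t, NULL);
	return 0;
}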
@@ -2045,7 +2047,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2045 * This ensures that tasks getting woken will be fully ordered against 2047 * This ensures that tasks getting woken will be fully ordered against
2046 * their previous state and preserve Program Order. 2048 * their previous state and preserve Program Order.
2047 */ 2049 */
2048 smp_cond_acquire(!p->on_cpu); 2050 smp_cond_load_acquire(&p->on_cpu, !VAL);
2049 2051
2050 p->sched_contributes_to_load = !!task_contributes_to_load(p); 2052 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2051 p->state = TASK_WAKING; 2053 p->state = TASK_WAKING;
@@ -2253,9 +2255,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
2253#endif 2255#endif
2254#endif 2256#endif
2255 2257
2258#ifdef CONFIG_SCHEDSTATS
2259
2256DEFINE_STATIC_KEY_FALSE(sched_schedstats); 2260DEFINE_STATIC_KEY_FALSE(sched_schedstats);
2261static bool __initdata __sched_schedstats = false;
2257 2262
2258#ifdef CONFIG_SCHEDSTATS
2259static void set_schedstats(bool enabled) 2263static void set_schedstats(bool enabled)
2260{ 2264{
2261 if (enabled) 2265 if (enabled)
@@ -2278,11 +2282,16 @@ static int __init setup_schedstats(char *str)
2278 if (!str) 2282 if (!str)
2279 goto out; 2283 goto out;
2280 2284
2285 /*
2286 * This code is called before jump labels have been set up, so we can't
2287 * change the static branch directly just yet. Instead set a temporary
2288 * variable so init_schedstats() can do it later.
2289 */
2281 if (!strcmp(str, "enable")) { 2290 if (!strcmp(str, "enable")) {
2282 set_schedstats(true); 2291 __sched_schedstats = true;
2283 ret = 1; 2292 ret = 1;
2284 } else if (!strcmp(str, "disable")) { 2293 } else if (!strcmp(str, "disable")) {
2285 set_schedstats(false); 2294 __sched_schedstats = false;
2286 ret = 1; 2295 ret = 1;
2287 } 2296 }
2288out: 2297out:
@@ -2293,6 +2302,11 @@ out:
2293} 2302}
2294__setup("schedstats=", setup_schedstats); 2303__setup("schedstats=", setup_schedstats);
2295 2304
2305static void __init init_schedstats(void)
2306{
2307 set_schedstats(__sched_schedstats);
2308}
2309
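The comment in setup_schedstats() describes a common boot-time pattern: the early parameter parser runs before the machinery it wants to poke (here, jump labels) is ready, so it only records the request and a later init call applies it. A trivial userspace model of the same shape, with all names invented:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool schedstats_enabled;		/* stands in for the static key */
static bool requested;			/* stands in for __sched_schedstats */

static void parse_early_param(const char *str)
{
	if (!strcmp(str, "enable"))
		requested = true;	/* too early to flip the real switch */
	else if (!strcmp(str, "disable"))
		requested = false;
}

static void init_schedstats(void)
{
	schedstats_enabled = requested;	/* infrastructure is up; apply it now */
}

int main(void)
{
	parse_early_param("enable");	/* boot command line, very early */
	init_schedstats();		/* later, from sched_init() */
	printf("schedstats: %s\n", schedstats_enabled ? "on" : "off");
	return 0;
}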
2296#ifdef CONFIG_PROC_SYSCTL 2310#ifdef CONFIG_PROC_SYSCTL
2297int sysctl_schedstats(struct ctl_table *table, int write, 2311int sysctl_schedstats(struct ctl_table *table, int write,
2298 void __user *buffer, size_t *lenp, loff_t *ppos) 2312 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2313,8 +2327,10 @@ int sysctl_schedstats(struct ctl_table *table, int write,
2313 set_schedstats(state); 2327 set_schedstats(state);
2314 return err; 2328 return err;
2315} 2329}
2316#endif 2330#endif /* CONFIG_PROC_SYSCTL */
2317#endif 2331#else /* !CONFIG_SCHEDSTATS */
2332static inline void init_schedstats(void) {}
2333#endif /* CONFIG_SCHEDSTATS */
2318 2334
2319/* 2335/*
2320 * fork()/clone()-time setup: 2336 * fork()/clone()-time setup:
@@ -2326,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2326 2342
2327 __sched_fork(clone_flags, p); 2343 __sched_fork(clone_flags, p);
2328 /* 2344 /*
2329 * We mark the process as running here. This guarantees that 2345 * We mark the process as NEW here. This guarantees that
2330 * nobody will actually run it, and a signal or other external 2346 * nobody will actually run it, and a signal or other external
2331 * event cannot wake it up and insert it on the runqueue either. 2347 * event cannot wake it up and insert it on the runqueue either.
2332 */ 2348 */
2333 p->state = TASK_RUNNING; 2349 p->state = TASK_NEW;
2334 2350
2335 /* 2351 /*
2336 * Make sure we do not leak PI boosting priority to the child. 2352 * Make sure we do not leak PI boosting priority to the child.
@@ -2367,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2367 p->sched_class = &fair_sched_class; 2383 p->sched_class = &fair_sched_class;
2368 } 2384 }
2369 2385
2370 if (p->sched_class->task_fork) 2386 init_entity_runnable_average(&p->se);
2371 p->sched_class->task_fork(p);
2372 2387
2373 /* 2388 /*
2374 * The child is not yet in the pid-hash so no cgroup attach races, 2389 * The child is not yet in the pid-hash so no cgroup attach races,
@@ -2378,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2378 * Silence PROVE_RCU. 2393 * Silence PROVE_RCU.
2379 */ 2394 */
2380 raw_spin_lock_irqsave(&p->pi_lock, flags); 2395 raw_spin_lock_irqsave(&p->pi_lock, flags);
2381 set_task_cpu(p, cpu); 2396 /*
2397 * We're setting the cpu for the first time, we don't migrate,
2398 * so use __set_task_cpu().
2399 */
2400 __set_task_cpu(p, cpu);
2401 if (p->sched_class->task_fork)
2402 p->sched_class->task_fork(p);
2382 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2403 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2383 2404
2384#ifdef CONFIG_SCHED_INFO 2405#ifdef CONFIG_SCHED_INFO
@@ -2510,21 +2531,22 @@ void wake_up_new_task(struct task_struct *p)
2510 struct rq_flags rf; 2531 struct rq_flags rf;
2511 struct rq *rq; 2532 struct rq *rq;
2512 2533
2513 /* Initialize new task's runnable average */
2514 init_entity_runnable_average(&p->se);
2515 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); 2534 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
2535 p->state = TASK_RUNNING;
2516#ifdef CONFIG_SMP 2536#ifdef CONFIG_SMP
2517 /* 2537 /*
2518 * Fork balancing, do it here and not earlier because: 2538 * Fork balancing, do it here and not earlier because:
2519 * - cpus_allowed can change in the fork path 2539 * - cpus_allowed can change in the fork path
2520 * - any previously selected cpu might disappear through hotplug 2540 * - any previously selected cpu might disappear through hotplug
2541 *
2542 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
2543 * as we're not fully set-up yet.
2521 */ 2544 */
2522 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2545 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2523#endif 2546#endif
2524 /* Post initialize new task's util average when its cfs_rq is set */ 2547 rq = __task_rq_lock(p, &rf);
2525 post_init_entity_util_avg(&p->se); 2548 post_init_entity_util_avg(&p->se);
2526 2549
2527 rq = __task_rq_lock(p, &rf);
2528 activate_task(rq, p, 0); 2550 activate_task(rq, p, 0);
2529 p->on_rq = TASK_ON_RQ_QUEUED; 2551 p->on_rq = TASK_ON_RQ_QUEUED;
2530 trace_sched_wakeup_new(p); 2552 trace_sched_wakeup_new(p);
@@ -3146,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
3146 pr_cont("\n"); 3168 pr_cont("\n");
3147 } 3169 }
3148#endif 3170#endif
3171 if (panic_on_warn)
3172 panic("scheduling while atomic\n");
3173
3149 dump_stack(); 3174 dump_stack();
3150 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 3175 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3151} 3176}
@@ -3156,7 +3181,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
3156static inline void schedule_debug(struct task_struct *prev) 3181static inline void schedule_debug(struct task_struct *prev)
3157{ 3182{
3158#ifdef CONFIG_SCHED_STACK_END_CHECK 3183#ifdef CONFIG_SCHED_STACK_END_CHECK
3159 BUG_ON(task_stack_end_corrupted(prev)); 3184 if (task_stack_end_corrupted(prev))
3185 panic("corrupted stack end detected inside scheduler\n");
3160#endif 3186#endif
3161 3187
3162 if (unlikely(in_atomic_preempt_off())) { 3188 if (unlikely(in_atomic_preempt_off())) {
@@ -4736,7 +4762,8 @@ out_unlock:
4736 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4762 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4737 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4763 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4738 * 4764 *
4739 * Return: 0 on success. An error code otherwise. 4765 * Return: size of CPU mask copied to user_mask_ptr on success. An
4766 * error code otherwise.
4740 */ 4767 */
4741SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4768SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4742 unsigned long __user *, user_mask_ptr) 4769 unsigned long __user *, user_mask_ptr)
@@ -5133,14 +5160,16 @@ void show_state_filter(unsigned long state_filter)
5133 /* 5160 /*
5134 * reset the NMI-timeout, listing all files on a slow 5161 * reset the NMI-timeout, listing all files on a slow
5135 * console might take a lot of time: 5162 * console might take a lot of time:
5163 * Also, reset softlockup watchdogs on all CPUs, because
5164 * another CPU might be blocked waiting for us to process
5165 * an IPI.
5136 */ 5166 */
5137 touch_nmi_watchdog(); 5167 touch_nmi_watchdog();
5168 touch_all_softlockup_watchdogs();
5138 if (!state_filter || (p->state & state_filter)) 5169 if (!state_filter || (p->state & state_filter))
5139 sched_show_task(p); 5170 sched_show_task(p);
5140 } 5171 }
5141 5172
5142 touch_all_softlockup_watchdogs();
5143
5144#ifdef CONFIG_SCHED_DEBUG 5173#ifdef CONFIG_SCHED_DEBUG
5145 if (!state_filter) 5174 if (!state_filter)
5146 sysrq_sched_debug_show(); 5175 sysrq_sched_debug_show();
@@ -5376,13 +5405,15 @@ void idle_task_exit(void)
5376/* 5405/*
5377 * Since this CPU is going 'away' for a while, fold any nr_active delta 5406 * Since this CPU is going 'away' for a while, fold any nr_active delta
5378 * we might have. Assumes we're called after migrate_tasks() so that the 5407 * we might have. Assumes we're called after migrate_tasks() so that the
5379 * nr_active count is stable. 5408 * nr_active count is stable. We need to take the teardown thread which
5409 * is calling this into account, so we hand in adjust = 1 to the load
5410 * calculation.
5380 * 5411 *
5381 * Also see the comment "Global load-average calculations". 5412 * Also see the comment "Global load-average calculations".
5382 */ 5413 */
5383static void calc_load_migrate(struct rq *rq) 5414static void calc_load_migrate(struct rq *rq)
5384{ 5415{
5385 long delta = calc_load_fold_active(rq); 5416 long delta = calc_load_fold_active(rq, 1);
5386 if (delta) 5417 if (delta)
5387 atomic_long_add(delta, &calc_load_tasks); 5418 atomic_long_add(delta, &calc_load_tasks);
5388} 5419}
@@ -7213,7 +7244,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
7213 struct rq *rq = cpu_rq(cpu); 7244 struct rq *rq = cpu_rq(cpu);
7214 7245
7215 rq->calc_load_update = calc_load_update; 7246 rq->calc_load_update = calc_load_update;
7216 account_reset_rq(rq);
7217 update_max_interval(); 7247 update_max_interval();
7218} 7248}
7219 7249
@@ -7487,6 +7517,8 @@ void __init sched_init(void)
7487#endif 7517#endif
7488 init_sched_fair_class(); 7518 init_sched_fair_class();
7489 7519
7520 init_schedstats();
7521
7490 scheduler_running = 1; 7522 scheduler_running = 1;
7491} 7523}
7492 7524
@@ -7691,6 +7723,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
7691 INIT_LIST_HEAD(&tg->children); 7723 INIT_LIST_HEAD(&tg->children);
7692 list_add_rcu(&tg->siblings, &parent->children); 7724 list_add_rcu(&tg->siblings, &parent->children);
7693 spin_unlock_irqrestore(&task_group_lock, flags); 7725 spin_unlock_irqrestore(&task_group_lock, flags);
7726
7727 online_fair_sched_group(tg);
7694} 7728}
7695 7729
7696/* rcu callback to free various structures associated with a task group */ 7730/* rcu callback to free various structures associated with a task group */
@@ -7719,27 +7753,9 @@ void sched_offline_group(struct task_group *tg)
7719 spin_unlock_irqrestore(&task_group_lock, flags); 7753 spin_unlock_irqrestore(&task_group_lock, flags);
7720} 7754}
7721 7755
7722/* change task's runqueue when it moves between groups. 7756static void sched_change_group(struct task_struct *tsk, int type)
7723 * The caller of this function should have put the task in its new group
7724 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7725 * reflect its new group.
7726 */
7727void sched_move_task(struct task_struct *tsk)
7728{ 7757{
7729 struct task_group *tg; 7758 struct task_group *tg;
7730 int queued, running;
7731 struct rq_flags rf;
7732 struct rq *rq;
7733
7734 rq = task_rq_lock(tsk, &rf);
7735
7736 running = task_current(rq, tsk);
7737 queued = task_on_rq_queued(tsk);
7738
7739 if (queued)
7740 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
7741 if (unlikely(running))
7742 put_prev_task(rq, tsk);
7743 7759
7744 /* 7760 /*
7745 * All callers are synchronized by task_rq_lock(); we do not use RCU 7761 * All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7752,11 +7768,37 @@ void sched_move_task(struct task_struct *tsk)
7752 tsk->sched_task_group = tg; 7768 tsk->sched_task_group = tg;
7753 7769
7754#ifdef CONFIG_FAIR_GROUP_SCHED 7770#ifdef CONFIG_FAIR_GROUP_SCHED
7755 if (tsk->sched_class->task_move_group) 7771 if (tsk->sched_class->task_change_group)
7756 tsk->sched_class->task_move_group(tsk); 7772 tsk->sched_class->task_change_group(tsk, type);
7757 else 7773 else
7758#endif 7774#endif
7759 set_task_rq(tsk, task_cpu(tsk)); 7775 set_task_rq(tsk, task_cpu(tsk));
7776}
7777
7778/*
7779 * Change task's runqueue when it moves between groups.
7780 *
7781 * The caller of this function should have put the task in its new group by
7782 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
7783 * its new group.
7784 */
7785void sched_move_task(struct task_struct *tsk)
7786{
7787 int queued, running;
7788 struct rq_flags rf;
7789 struct rq *rq;
7790
7791 rq = task_rq_lock(tsk, &rf);
7792
7793 running = task_current(rq, tsk);
7794 queued = task_on_rq_queued(tsk);
7795
7796 if (queued)
7797 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
7798 if (unlikely(running))
7799 put_prev_task(rq, tsk);
7800
7801 sched_change_group(tsk, TASK_MOVE_GROUP);
7760 7802
7761 if (unlikely(running)) 7803 if (unlikely(running))
7762 tsk->sched_class->set_curr_task(rq); 7804 tsk->sched_class->set_curr_task(rq);
@@ -8184,15 +8226,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
8184 sched_free_group(tg); 8226 sched_free_group(tg);
8185} 8227}
8186 8228
8229/*
8230 * This is called before wake_up_new_task(), therefore we really only
8231 * have to set its group bits, all the other stuff does not apply.
8232 */
8187static void cpu_cgroup_fork(struct task_struct *task) 8233static void cpu_cgroup_fork(struct task_struct *task)
8188{ 8234{
8189 sched_move_task(task); 8235 struct rq_flags rf;
8236 struct rq *rq;
8237
8238 rq = task_rq_lock(task, &rf);
8239
8240 sched_change_group(task, TASK_SET_GROUP);
8241
8242 task_rq_unlock(rq, task, &rf);
8190} 8243}
8191 8244
8192static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) 8245static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8193{ 8246{
8194 struct task_struct *task; 8247 struct task_struct *task;
8195 struct cgroup_subsys_state *css; 8248 struct cgroup_subsys_state *css;
8249 int ret = 0;
8196 8250
8197 cgroup_taskset_for_each(task, css, tset) { 8251 cgroup_taskset_for_each(task, css, tset) {
8198#ifdef CONFIG_RT_GROUP_SCHED 8252#ifdef CONFIG_RT_GROUP_SCHED
@@ -8203,8 +8257,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8203 if (task->sched_class != &fair_sched_class) 8257 if (task->sched_class != &fair_sched_class)
8204 return -EINVAL; 8258 return -EINVAL;
8205#endif 8259#endif
8260 /*
8261 * Serialize against wake_up_new_task() such that if its
8262 * running, we're sure to observe its full state.
8263 */
8264 raw_spin_lock_irq(&task->pi_lock);
8265 /*
8266 * Avoid calling sched_move_task() before wake_up_new_task()
8267 * has happened. This would lead to problems with PELT, due to
8268 * move wanting to detach+attach while we're not attached yet.
8269 */
8270 if (task->state == TASK_NEW)
8271 ret = -EINVAL;
8272 raw_spin_unlock_irq(&task->pi_lock);
8273
8274 if (ret)
8275 break;
8206 } 8276 }
8207 return 0; 8277 return ret;
8208} 8278}
8209 8279
8210static void cpu_cgroup_attach(struct cgroup_taskset *tset) 8280static void cpu_cgroup_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 41f85c4d0938..bc0b309c3f19 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,15 +25,13 @@ enum cpuacct_stat_index {
25 CPUACCT_STAT_NSTATS, 25 CPUACCT_STAT_NSTATS,
26}; 26};
27 27
28enum cpuacct_usage_index { 28static const char * const cpuacct_stat_desc[] = {
29 CPUACCT_USAGE_USER, /* ... user mode */ 29 [CPUACCT_STAT_USER] = "user",
30 CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ 30 [CPUACCT_STAT_SYSTEM] = "system",
31
32 CPUACCT_USAGE_NRUSAGE,
33}; 31};
34 32
35struct cpuacct_usage { 33struct cpuacct_usage {
36 u64 usages[CPUACCT_USAGE_NRUSAGE]; 34 u64 usages[CPUACCT_STAT_NSTATS];
37}; 35};
38 36
39/* track cpu usage of a group of tasks and its child groups */ 37/* track cpu usage of a group of tasks and its child groups */
@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
108} 106}
109 107
110static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, 108static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
111 enum cpuacct_usage_index index) 109 enum cpuacct_stat_index index)
112{ 110{
113 struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 111 struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
114 u64 data; 112 u64 data;
115 113
116 /* 114 /*
117 * We allow index == CPUACCT_USAGE_NRUSAGE here to read 115 * We allow index == CPUACCT_STAT_NSTATS here to read
118 * the sum of usages. 116 * the sum of usages.
119 */ 117 */
120 BUG_ON(index > CPUACCT_USAGE_NRUSAGE); 118 BUG_ON(index > CPUACCT_STAT_NSTATS);
121 119
122#ifndef CONFIG_64BIT 120#ifndef CONFIG_64BIT
123 /* 121 /*
@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
126 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 124 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
127#endif 125#endif
128 126
129 if (index == CPUACCT_USAGE_NRUSAGE) { 127 if (index == CPUACCT_STAT_NSTATS) {
130 int i = 0; 128 int i = 0;
131 129
132 data = 0; 130 data = 0;
133 for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) 131 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
134 data += cpuusage->usages[i]; 132 data += cpuusage->usages[i];
135 } else { 133 } else {
136 data = cpuusage->usages[index]; 134 data = cpuusage->usages[index];
@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
155 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 153 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
156#endif 154#endif
157 155
158 for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) 156 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
159 cpuusage->usages[i] = val; 157 cpuusage->usages[i] = val;
160 158
161#ifndef CONFIG_64BIT 159#ifndef CONFIG_64BIT
@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
165 163
166/* return total cpu usage (in nanoseconds) of a group */ 164/* return total cpu usage (in nanoseconds) of a group */
167static u64 __cpuusage_read(struct cgroup_subsys_state *css, 165static u64 __cpuusage_read(struct cgroup_subsys_state *css,
168 enum cpuacct_usage_index index) 166 enum cpuacct_stat_index index)
169{ 167{
170 struct cpuacct *ca = css_ca(css); 168 struct cpuacct *ca = css_ca(css);
171 u64 totalcpuusage = 0; 169 u64 totalcpuusage = 0;
@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css,
180static u64 cpuusage_user_read(struct cgroup_subsys_state *css, 178static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
181 struct cftype *cft) 179 struct cftype *cft)
182{ 180{
183 return __cpuusage_read(css, CPUACCT_USAGE_USER); 181 return __cpuusage_read(css, CPUACCT_STAT_USER);
184} 182}
185 183
186static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, 184static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
187 struct cftype *cft) 185 struct cftype *cft)
188{ 186{
189 return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); 187 return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
190} 188}
191 189
192static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) 190static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
193{ 191{
194 return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); 192 return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
195} 193}
196 194
197static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, 195static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
213} 211}
214 212
215static int __cpuacct_percpu_seq_show(struct seq_file *m, 213static int __cpuacct_percpu_seq_show(struct seq_file *m,
216 enum cpuacct_usage_index index) 214 enum cpuacct_stat_index index)
217{ 215{
218 struct cpuacct *ca = css_ca(seq_css(m)); 216 struct cpuacct *ca = css_ca(seq_css(m));
219 u64 percpu; 217 u64 percpu;
@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m,
229 227
230static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) 228static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
231{ 229{
232 return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); 230 return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
233} 231}
234 232
235static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) 233static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
236{ 234{
237 return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); 235 return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
238} 236}
239 237
240static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) 238static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
241{ 239{
242 return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); 240 return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
243} 241}
244 242
245static const char * const cpuacct_stat_desc[] = { 243static int cpuacct_all_seq_show(struct seq_file *m, void *V)
246 [CPUACCT_STAT_USER] = "user", 244{
247 [CPUACCT_STAT_SYSTEM] = "system", 245 struct cpuacct *ca = css_ca(seq_css(m));
248}; 246 int index;
247 int cpu;
248
249 seq_puts(m, "cpu");
250 for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
251 seq_printf(m, " %s", cpuacct_stat_desc[index]);
252 seq_puts(m, "\n");
253
254 for_each_possible_cpu(cpu) {
255 struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
256
257 seq_printf(m, "%d", cpu);
258
259 for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
260#ifndef CONFIG_64BIT
261 /*
262 * Take rq->lock to make 64-bit read safe on 32-bit
263 * platforms.
264 */
265 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
266#endif
267
268 seq_printf(m, " %llu", cpuusage->usages[index]);
269
270#ifndef CONFIG_64BIT
271 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
272#endif
273 }
274 seq_puts(m, "\n");
275 }
276 return 0;
277}
249 278
250static int cpuacct_stats_show(struct seq_file *sf, void *v) 279static int cpuacct_stats_show(struct seq_file *sf, void *v)
251{ 280{
252 struct cpuacct *ca = css_ca(seq_css(sf)); 281 struct cpuacct *ca = css_ca(seq_css(sf));
282 s64 val[CPUACCT_STAT_NSTATS];
253 int cpu; 283 int cpu;
254 s64 val = 0; 284 int stat;
255 285
286 memset(val, 0, sizeof(val));
256 for_each_possible_cpu(cpu) { 287 for_each_possible_cpu(cpu) {
257 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 288 u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
258 val += kcpustat->cpustat[CPUTIME_USER];
259 val += kcpustat->cpustat[CPUTIME_NICE];
260 }
261 val = cputime64_to_clock_t(val);
262 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
263 289
264 val = 0; 290 val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
265 for_each_possible_cpu(cpu) { 291 val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
266 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 292 val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
267 val += kcpustat->cpustat[CPUTIME_SYSTEM]; 293 val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
268 val += kcpustat->cpustat[CPUTIME_IRQ]; 294 val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
269 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
270 } 295 }
271 296
272 val = cputime64_to_clock_t(val); 297 for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
273 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 298 seq_printf(sf, "%s %lld\n",
299 cpuacct_stat_desc[stat],
300 cputime64_to_clock_t(val[stat]));
301 }
274 302
275 return 0; 303 return 0;
276} 304}
@@ -302,6 +330,10 @@ static struct cftype files[] = {
302 .seq_show = cpuacct_percpu_sys_seq_show, 330 .seq_show = cpuacct_percpu_sys_seq_show,
303 }, 331 },
304 { 332 {
333 .name = "usage_all",
334 .seq_show = cpuacct_all_seq_show,
335 },
336 {
305 .name = "stat", 337 .name = "stat",
306 .seq_show = cpuacct_stats_show, 338 .seq_show = cpuacct_stats_show,
307 }, 339 },
@@ -316,11 +348,11 @@ static struct cftype files[] = {
316void cpuacct_charge(struct task_struct *tsk, u64 cputime) 348void cpuacct_charge(struct task_struct *tsk, u64 cputime)
317{ 349{
318 struct cpuacct *ca; 350 struct cpuacct *ca;
319 int index = CPUACCT_USAGE_SYSTEM; 351 int index = CPUACCT_STAT_SYSTEM;
320 struct pt_regs *regs = task_pt_regs(tsk); 352 struct pt_regs *regs = task_pt_regs(tsk);
321 353
322 if (regs && user_mode(regs)) 354 if (regs && user_mode(regs))
323 index = CPUACCT_USAGE_USER; 355 index = CPUACCT_STAT_USER;
324 356
325 rcu_read_lock(); 357 rcu_read_lock();
326 358
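The new cpuacct.usage_all file prints a header row built from cpuacct_stat_desc[] followed by one row per possible CPU, each holding that CPU's user and system usage in nanoseconds. An illustrative read (the numbers are invented; only the layout follows the seq_printf() calls above):

cpu user system
0 52418509483 3081503626
1 49372919374 2958210057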
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 14c4aa25cc45..a84641b222c1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -47,6 +47,8 @@ struct sugov_cpu {
47 struct update_util_data update_util; 47 struct update_util_data update_util;
48 struct sugov_policy *sg_policy; 48 struct sugov_policy *sg_policy;
49 49
50 unsigned int cached_raw_freq;
51
50 /* The fields below are only needed when sharing a policy. */ 52 /* The fields below are only needed when sharing a policy. */
51 unsigned long util; 53 unsigned long util;
52 unsigned long max; 54 unsigned long max;
@@ -106,7 +108,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
106 108
107/** 109/**
108 * get_next_freq - Compute a new frequency for a given cpufreq policy. 110 * get_next_freq - Compute a new frequency for a given cpufreq policy.
109 * @policy: cpufreq policy object to compute the new frequency for. 111 * @sg_cpu: schedutil cpu object to compute the new frequency for.
110 * @util: Current CPU utilization. 112 * @util: Current CPU utilization.
111 * @max: CPU capacity. 113 * @max: CPU capacity.
112 * 114 *
@@ -121,14 +123,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
121 * next_freq = C * curr_freq * util_raw / max 123 * next_freq = C * curr_freq * util_raw / max
122 * 124 *
123 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. 125 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
126 *
127 * The lowest driver-supported frequency which is equal or greater than the raw
128 * next_freq (as calculated above) is returned, subject to policy min/max and
129 * cpufreq driver limitations.
124 */ 130 */
125static unsigned int get_next_freq(struct cpufreq_policy *policy, 131static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
126 unsigned long util, unsigned long max) 132 unsigned long max)
127{ 133{
134 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
135 struct cpufreq_policy *policy = sg_policy->policy;
128 unsigned int freq = arch_scale_freq_invariant() ? 136 unsigned int freq = arch_scale_freq_invariant() ?
129 policy->cpuinfo.max_freq : policy->cur; 137 policy->cpuinfo.max_freq : policy->cur;
130 138
131 return (freq + (freq >> 2)) * util / max; 139 freq = (freq + (freq >> 2)) * util / max;
140
141 if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
142 return sg_policy->next_freq;
143 sg_cpu->cached_raw_freq = freq;
144 return cpufreq_driver_resolve_freq(policy, freq);
132} 145}
133 146
134static void sugov_update_single(struct update_util_data *hook, u64 time, 147static void sugov_update_single(struct update_util_data *hook, u64 time,
@@ -143,13 +156,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
143 return; 156 return;
144 157
145 next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : 158 next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
146 get_next_freq(policy, util, max); 159 get_next_freq(sg_cpu, util, max);
147 sugov_update_commit(sg_policy, time, next_f); 160 sugov_update_commit(sg_policy, time, next_f);
148} 161}
149 162
150static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, 163static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
151 unsigned long util, unsigned long max) 164 unsigned long util, unsigned long max)
152{ 165{
166 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
153 struct cpufreq_policy *policy = sg_policy->policy; 167 struct cpufreq_policy *policy = sg_policy->policy;
154 unsigned int max_f = policy->cpuinfo.max_freq; 168 unsigned int max_f = policy->cpuinfo.max_freq;
155 u64 last_freq_update_time = sg_policy->last_freq_update_time; 169 u64 last_freq_update_time = sg_policy->last_freq_update_time;
@@ -189,7 +203,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
189 } 203 }
190 } 204 }
191 205
192 return get_next_freq(policy, util, max); 206 return get_next_freq(sg_cpu, util, max);
193} 207}
194 208
195static void sugov_update_shared(struct update_util_data *hook, u64 time, 209static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -206,7 +220,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
206 sg_cpu->last_update = time; 220 sg_cpu->last_update = time;
207 221
208 if (sugov_should_update_freq(sg_policy, time)) { 222 if (sugov_should_update_freq(sg_policy, time)) {
209 next_f = sugov_next_freq_shared(sg_policy, util, max); 223 next_f = sugov_next_freq_shared(sg_cpu, util, max);
210 sugov_update_commit(sg_policy, time, next_f); 224 sugov_update_commit(sg_policy, time, next_f);
211 } 225 }
212 226
@@ -394,7 +408,7 @@ static int sugov_init(struct cpufreq_policy *policy)
394 return ret; 408 return ret;
395} 409}
396 410
397static int sugov_exit(struct cpufreq_policy *policy) 411static void sugov_exit(struct cpufreq_policy *policy)
398{ 412{
399 struct sugov_policy *sg_policy = policy->governor_data; 413 struct sugov_policy *sg_policy = policy->governor_data;
400 struct sugov_tunables *tunables = sg_policy->tunables; 414 struct sugov_tunables *tunables = sg_policy->tunables;
@@ -412,7 +426,6 @@ static int sugov_exit(struct cpufreq_policy *policy)
412 mutex_unlock(&global_tunables_lock); 426 mutex_unlock(&global_tunables_lock);
413 427
414 sugov_policy_free(sg_policy); 428 sugov_policy_free(sg_policy);
415 return 0;
416} 429}
417 430
418static int sugov_start(struct cpufreq_policy *policy) 431static int sugov_start(struct cpufreq_policy *policy)
@@ -434,6 +447,7 @@ static int sugov_start(struct cpufreq_policy *policy)
434 sg_cpu->util = ULONG_MAX; 447 sg_cpu->util = ULONG_MAX;
435 sg_cpu->max = 0; 448 sg_cpu->max = 0;
436 sg_cpu->last_update = 0; 449 sg_cpu->last_update = 0;
450 sg_cpu->cached_raw_freq = 0;
437 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, 451 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
438 sugov_update_shared); 452 sugov_update_shared);
439 } else { 453 } else {
@@ -444,7 +458,7 @@ static int sugov_start(struct cpufreq_policy *policy)
444 return 0; 458 return 0;
445} 459}
446 460
447static int sugov_stop(struct cpufreq_policy *policy) 461static void sugov_stop(struct cpufreq_policy *policy)
448{ 462{
449 struct sugov_policy *sg_policy = policy->governor_data; 463 struct sugov_policy *sg_policy = policy->governor_data;
450 unsigned int cpu; 464 unsigned int cpu;
@@ -456,53 +470,29 @@ static int sugov_stop(struct cpufreq_policy *policy)
456 470
457 irq_work_sync(&sg_policy->irq_work); 471 irq_work_sync(&sg_policy->irq_work);
458 cancel_work_sync(&sg_policy->work); 472 cancel_work_sync(&sg_policy->work);
459 return 0;
460} 473}
461 474
462static int sugov_limits(struct cpufreq_policy *policy) 475static void sugov_limits(struct cpufreq_policy *policy)
463{ 476{
464 struct sugov_policy *sg_policy = policy->governor_data; 477 struct sugov_policy *sg_policy = policy->governor_data;
465 478
466 if (!policy->fast_switch_enabled) { 479 if (!policy->fast_switch_enabled) {
467 mutex_lock(&sg_policy->work_lock); 480 mutex_lock(&sg_policy->work_lock);
468 481 cpufreq_policy_apply_limits(policy);
469 if (policy->max < policy->cur)
470 __cpufreq_driver_target(policy, policy->max,
471 CPUFREQ_RELATION_H);
472 else if (policy->min > policy->cur)
473 __cpufreq_driver_target(policy, policy->min,
474 CPUFREQ_RELATION_L);
475
476 mutex_unlock(&sg_policy->work_lock); 482 mutex_unlock(&sg_policy->work_lock);
477 } 483 }
478 484
479 sg_policy->need_freq_update = true; 485 sg_policy->need_freq_update = true;
480 return 0;
481}
482
483int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
484{
485 if (event == CPUFREQ_GOV_POLICY_INIT) {
486 return sugov_init(policy);
487 } else if (policy->governor_data) {
488 switch (event) {
489 case CPUFREQ_GOV_POLICY_EXIT:
490 return sugov_exit(policy);
491 case CPUFREQ_GOV_START:
492 return sugov_start(policy);
493 case CPUFREQ_GOV_STOP:
494 return sugov_stop(policy);
495 case CPUFREQ_GOV_LIMITS:
496 return sugov_limits(policy);
497 }
498 }
499 return -EINVAL;
500} 486}
501 487
502static struct cpufreq_governor schedutil_gov = { 488static struct cpufreq_governor schedutil_gov = {
503 .name = "schedutil", 489 .name = "schedutil",
504 .governor = sugov_governor,
505 .owner = THIS_MODULE, 490 .owner = THIS_MODULE,
491 .init = sugov_init,
492 .exit = sugov_exit,
493 .start = sugov_start,
494 .stop = sugov_stop,
495 .limits = sugov_limits,
506}; 496};
507 497
508static int __init sugov_module_init(void) 498static int __init sugov_module_init(void)
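To make the reworked get_next_freq() concrete: the raw frequency is still next_freq = 1.25 * freq * util / max, but it is now cached and passed through cpufreq_driver_resolve_freq(), which picks the lowest driver-supported frequency at or above it. A self-contained sketch of that arithmetic (frequencies in kHz; the table and values are invented, and resolve_freq() merely imitates what the cpufreq driver would do):

#include <stdio.h>

/* Hypothetical table of driver-supported frequencies, ascending, in kHz. */
static const unsigned int freq_table[] = { 800000, 1200000, 1600000, 2000000 };
#define NR_FREQS (sizeof(freq_table) / sizeof(freq_table[0]))

/* Imitation of cpufreq_driver_resolve_freq(): lowest entry >= raw. */
static unsigned int resolve_freq(unsigned int raw)
{
	for (size_t i = 0; i < NR_FREQS; i++)
		if (freq_table[i] >= raw)
			return freq_table[i];
	return freq_table[NR_FREQS - 1];
}

int main(void)
{
	unsigned int max_freq = 2000000;	/* used when freq-invariant */
	unsigned long util = 60, max = 100;	/* util / max = 0.6 */

	/* next_freq = 1.25 * freq * util / max, as in the comment above */
	unsigned int raw = (max_freq + (max_freq >> 2)) * util / max;

	printf("raw %u kHz -> resolved %u kHz\n", raw, resolve_freq(raw));
	/* prints: raw 1500000 kHz -> resolved 1600000 kHz */
	return 0;
}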
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 75f98c5498d5..1934f658c036 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
49 */ 49 */
50void irqtime_account_irq(struct task_struct *curr) 50void irqtime_account_irq(struct task_struct *curr)
51{ 51{
52 unsigned long flags;
53 s64 delta; 52 s64 delta;
54 int cpu; 53 int cpu;
55 54
56 if (!sched_clock_irqtime) 55 if (!sched_clock_irqtime)
57 return; 56 return;
58 57
59 local_irq_save(flags);
60
61 cpu = smp_processor_id(); 58 cpu = smp_processor_id();
62 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 59 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
63 __this_cpu_add(irq_start_time, delta); 60 __this_cpu_add(irq_start_time, delta);
@@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr)
75 __this_cpu_add(cpu_softirq_time, delta); 72 __this_cpu_add(cpu_softirq_time, delta);
76 73
77 irq_time_write_end(); 74 irq_time_write_end();
78 local_irq_restore(flags);
79} 75}
80EXPORT_SYMBOL_GPL(irqtime_account_irq); 76EXPORT_SYMBOL_GPL(irqtime_account_irq);
81 77
82static int irqtime_account_hi_update(void) 78static cputime_t irqtime_account_hi_update(cputime_t maxtime)
83{ 79{
84 u64 *cpustat = kcpustat_this_cpu->cpustat; 80 u64 *cpustat = kcpustat_this_cpu->cpustat;
85 unsigned long flags; 81 unsigned long flags;
86 u64 latest_ns; 82 cputime_t irq_cputime;
87 int ret = 0;
88 83
89 local_irq_save(flags); 84 local_irq_save(flags);
90 latest_ns = this_cpu_read(cpu_hardirq_time); 85 irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
91 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) 86 cpustat[CPUTIME_IRQ];
92 ret = 1; 87 irq_cputime = min(irq_cputime, maxtime);
88 cpustat[CPUTIME_IRQ] += irq_cputime;
93 local_irq_restore(flags); 89 local_irq_restore(flags);
94 return ret; 90 return irq_cputime;
95} 91}
96 92
97static int irqtime_account_si_update(void) 93static cputime_t irqtime_account_si_update(cputime_t maxtime)
98{ 94{
99 u64 *cpustat = kcpustat_this_cpu->cpustat; 95 u64 *cpustat = kcpustat_this_cpu->cpustat;
100 unsigned long flags; 96 unsigned long flags;
101 u64 latest_ns; 97 cputime_t softirq_cputime;
102 int ret = 0;
103 98
104 local_irq_save(flags); 99 local_irq_save(flags);
105 latest_ns = this_cpu_read(cpu_softirq_time); 100 softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
106 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) 101 cpustat[CPUTIME_SOFTIRQ];
107 ret = 1; 102 softirq_cputime = min(softirq_cputime, maxtime);
103 cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
108 local_irq_restore(flags); 104 local_irq_restore(flags);
109 return ret; 105 return softirq_cputime;
110} 106}
111 107
112#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 108#else /* CONFIG_IRQ_TIME_ACCOUNTING */
113 109
114#define sched_clock_irqtime (0) 110#define sched_clock_irqtime (0)
115 111
112static cputime_t irqtime_account_hi_update(cputime_t dummy)
113{
114 return 0;
115}
116
117static cputime_t irqtime_account_si_update(cputime_t dummy)
118{
119 return 0;
120}
121
116#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 122#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
117 123
118static inline void task_group_account_field(struct task_struct *p, int index, 124static inline void task_group_account_field(struct task_struct *p, int index,
@@ -257,29 +263,42 @@ void account_idle_time(cputime_t cputime)
257 cpustat[CPUTIME_IDLE] += (__force u64) cputime; 263 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
258} 264}
259 265
260static __always_inline bool steal_account_process_tick(void) 266static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
261{ 267{
262#ifdef CONFIG_PARAVIRT 268#ifdef CONFIG_PARAVIRT
263 if (static_key_false(&paravirt_steal_enabled)) { 269 if (static_key_false(&paravirt_steal_enabled)) {
270 cputime_t steal_cputime;
264 u64 steal; 271 u64 steal;
265 unsigned long steal_jiffies;
266 272
267 steal = paravirt_steal_clock(smp_processor_id()); 273 steal = paravirt_steal_clock(smp_processor_id());
268 steal -= this_rq()->prev_steal_time; 274 steal -= this_rq()->prev_steal_time;
269 275
270 /* 276 steal_cputime = min(nsecs_to_cputime(steal), maxtime);
271 * steal is in nsecs but our caller is expecting steal 277 account_steal_time(steal_cputime);
272 * time in jiffies. Lets cast the result to jiffies 278 this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
273 * granularity and account the rest on the next rounds.
274 */
275 steal_jiffies = nsecs_to_jiffies(steal);
276 this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
277 279
278 account_steal_time(jiffies_to_cputime(steal_jiffies)); 280 return steal_cputime;
279 return steal_jiffies;
280 } 281 }
281#endif 282#endif
282 return false; 283 return 0;
284}
285
286/*
287 * Account how much elapsed time was spent in steal, irq, or softirq time.
288 */
289static inline cputime_t account_other_time(cputime_t max)
290{
291 cputime_t accounted;
292
293 accounted = steal_account_process_time(max);
294
295 if (accounted < max)
296 accounted += irqtime_account_hi_update(max - accounted);
297
298 if (accounted < max)
299 accounted += irqtime_account_si_update(max - accounted);
300
301 return accounted;
283} 302}
284 303
285/* 304/*
@@ -342,21 +361,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
342static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 361static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
343 struct rq *rq, int ticks) 362 struct rq *rq, int ticks)
344{ 363{
345 cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); 364 u64 cputime = (__force u64) cputime_one_jiffy * ticks;
346 u64 cputime = (__force u64) cputime_one_jiffy; 365 cputime_t scaled, other;
347 u64 *cpustat = kcpustat_this_cpu->cpustat;
348 366
349 if (steal_account_process_tick()) 367 /*
368 * When returning from idle, many ticks can get accounted at
369 * once, including some ticks of steal, irq, and softirq time.
370 * Subtract those ticks from the amount of time accounted to
371 * idle, or potentially user or system time. Due to rounding,
372 * other time can exceed ticks occasionally.
373 */
374 other = account_other_time(cputime);
375 if (other >= cputime)
350 return; 376 return;
377 cputime -= other;
378 scaled = cputime_to_scaled(cputime);
351 379
352 cputime *= ticks; 380 if (this_cpu_ksoftirqd() == p) {
353 scaled *= ticks;
354
355 if (irqtime_account_hi_update()) {
356 cpustat[CPUTIME_IRQ] += cputime;
357 } else if (irqtime_account_si_update()) {
358 cpustat[CPUTIME_SOFTIRQ] += cputime;
359 } else if (this_cpu_ksoftirqd() == p) {
360 /* 381 /*
 361 * ksoftirqd time does not get accounted in cpu_softirq_time. 382 * ksoftirqd time does not get accounted in cpu_softirq_time.
362 * So, we have to handle it separately here. 383 * So, we have to handle it separately here.
@@ -406,6 +427,10 @@ void vtime_common_task_switch(struct task_struct *prev)
406} 427}
407#endif 428#endif
408 429
430#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
431
432
433#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
409/* 434/*
410 * Archs that account the whole time spent in the idle task 435 * Archs that account the whole time spent in the idle task
411 * (outside irq) as idle time can rely on this and just implement 436 * (outside irq) as idle time can rely on this and just implement
@@ -415,33 +440,16 @@ void vtime_common_task_switch(struct task_struct *prev)
415 * vtime_account(). 440 * vtime_account().
416 */ 441 */
417#ifndef __ARCH_HAS_VTIME_ACCOUNT 442#ifndef __ARCH_HAS_VTIME_ACCOUNT
418void vtime_common_account_irq_enter(struct task_struct *tsk) 443void vtime_account_irq_enter(struct task_struct *tsk)
419{ 444{
420 if (!in_interrupt()) { 445 if (!in_interrupt() && is_idle_task(tsk))
421 /* 446 vtime_account_idle(tsk);
422 * If we interrupted user, context_tracking_in_user() 447 else
423 * is 1 because the context tracking don't hook 448 vtime_account_system(tsk);
424 * on irq entry/exit. This way we know if
425 * we need to flush user time on kernel entry.
426 */
427 if (context_tracking_in_user()) {
428 vtime_account_user(tsk);
429 return;
430 }
431
432 if (is_idle_task(tsk)) {
433 vtime_account_idle(tsk);
434 return;
435 }
436 }
437 vtime_account_system(tsk);
438} 449}
439EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); 450EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
440#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 451#endif /* __ARCH_HAS_VTIME_ACCOUNT */
441#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
442 452
443
444#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
445void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 453void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
446{ 454{
447 *ut = p->utime; 455 *ut = p->utime;
@@ -466,7 +474,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
466 */ 474 */
467void account_process_tick(struct task_struct *p, int user_tick) 475void account_process_tick(struct task_struct *p, int user_tick)
468{ 476{
469 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 477 cputime_t cputime, scaled, steal;
470 struct rq *rq = this_rq(); 478 struct rq *rq = this_rq();
471 479
472 if (vtime_accounting_cpu_enabled()) 480 if (vtime_accounting_cpu_enabled())
@@ -477,26 +485,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
477 return; 485 return;
478 } 486 }
479 487
480 if (steal_account_process_tick()) 488 cputime = cputime_one_jiffy;
489 steal = steal_account_process_time(cputime);
490
491 if (steal >= cputime)
481 return; 492 return;
482 493
494 cputime -= steal;
495 scaled = cputime_to_scaled(cputime);
496
483 if (user_tick) 497 if (user_tick)
484 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 498 account_user_time(p, cputime, scaled);
485 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 499 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
486 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 500 account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
487 one_jiffy_scaled);
488 else 501 else
489 account_idle_time(cputime_one_jiffy); 502 account_idle_time(cputime);
490}
491
492/*
493 * Account multiple ticks of steal time.
494 * @p: the process from which the cpu time has been stolen
495 * @ticks: number of stolen ticks
496 */
497void account_steal_ticks(unsigned long ticks)
498{
499 account_steal_time(jiffies_to_cputime(ticks));
500} 503}
501 504
502/* 505/*
@@ -681,12 +684,14 @@ static cputime_t vtime_delta(struct task_struct *tsk)
681static cputime_t get_vtime_delta(struct task_struct *tsk) 684static cputime_t get_vtime_delta(struct task_struct *tsk)
682{ 685{
683 unsigned long now = READ_ONCE(jiffies); 686 unsigned long now = READ_ONCE(jiffies);
684 unsigned long delta = now - tsk->vtime_snap; 687 cputime_t delta, other;
685 688
689 delta = jiffies_to_cputime(now - tsk->vtime_snap);
690 other = account_other_time(delta);
686 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); 691 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
687 tsk->vtime_snap = now; 692 tsk->vtime_snap = now;
688 693
689 return jiffies_to_cputime(delta); 694 return delta - other;
690} 695}
691 696
692static void __vtime_account_system(struct task_struct *tsk) 697static void __vtime_account_system(struct task_struct *tsk)
@@ -706,16 +711,6 @@ void vtime_account_system(struct task_struct *tsk)
706 write_seqcount_end(&tsk->vtime_seqcount); 711 write_seqcount_end(&tsk->vtime_seqcount);
707} 712}
708 713
709void vtime_gen_account_irq_exit(struct task_struct *tsk)
710{
711 write_seqcount_begin(&tsk->vtime_seqcount);
712 if (vtime_delta(tsk))
713 __vtime_account_system(tsk);
714 if (context_tracking_in_user())
715 tsk->vtime_snap_whence = VTIME_USER;
716 write_seqcount_end(&tsk->vtime_seqcount);
717}
718
719void vtime_account_user(struct task_struct *tsk) 714void vtime_account_user(struct task_struct *tsk)
720{ 715{
721 cputime_t delta_cpu; 716 cputime_t delta_cpu;
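The cputime.c changes above replace the old "skip the tick if any steal/irq/softirq happened" checks with account_other_time(), which hands out a bounded budget to steal, hardirq and softirq time in that order and leaves the remainder for user/system/idle. A toy, userspace-only sketch of that clamping (the pending_* values are invented nanosecond counts, not kernel state):

#include <stdio.h>

typedef unsigned long long u64;

static u64 min_u64(u64 a, u64 b) { return a < b ? a : b; }

/* Invented amounts of not-yet-accounted "other" time. */
static u64 pending_steal = 300, pending_irq = 500, pending_softirq = 400;

/* Mirrors account_other_time(): consume at most @max, steal first. */
static u64 toy_account_other_time(u64 max)
{
	u64 accounted = min_u64(pending_steal, max);

	if (accounted < max)
		accounted += min_u64(pending_irq, max - accounted);
	if (accounted < max)
		accounted += min_u64(pending_softirq, max - accounted);

	return accounted;
}

int main(void)
{
	u64 tick = 1000;	/* elapsed time to distribute */
	u64 other = toy_account_other_time(tick);

	/* 300 steal + 500 irq + 200 softirq (clamped from 400) = 1000 */
	printf("other = %llu, left for user/system/idle = %llu\n",
	       other, tick - other);
	return 0;
}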
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index cf905f655ba1..2a0a9995256d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -427,19 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
427 SPLIT_NS(p->se.vruntime), 427 SPLIT_NS(p->se.vruntime),
428 (long long)(p->nvcsw + p->nivcsw), 428 (long long)(p->nvcsw + p->nivcsw),
429 p->prio); 429 p->prio);
430#ifdef CONFIG_SCHEDSTATS 430
431 if (schedstat_enabled()) {
432 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
433 SPLIT_NS(p->se.statistics.wait_sum),
434 SPLIT_NS(p->se.sum_exec_runtime),
435 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
436 }
437#else
438 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 431 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
439 0LL, 0L, 432 SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
440 SPLIT_NS(p->se.sum_exec_runtime), 433 SPLIT_NS(p->se.sum_exec_runtime),
441 0LL, 0L); 434 SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
442#endif 435
443#ifdef CONFIG_NUMA_BALANCING 436#ifdef CONFIG_NUMA_BALANCING
444 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); 437 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
445#endif 438#endif
@@ -886,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
886 879
887 nr_switches = p->nvcsw + p->nivcsw; 880 nr_switches = p->nvcsw + p->nivcsw;
888 881
889#ifdef CONFIG_SCHEDSTATS
890 P(se.nr_migrations); 882 P(se.nr_migrations);
891 883
884#ifdef CONFIG_SCHEDSTATS
892 if (schedstat_enabled()) { 885 if (schedstat_enabled()) {
893 u64 avg_atom, avg_per_cpu; 886 u64 avg_atom, avg_per_cpu;
894 887
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e83db73..4088eedea763 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se)
690 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ 690 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
691} 691}
692 692
693static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
694static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
695static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
696static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
697
693/* 698/*
694 * With new tasks being created, their initial util_avgs are extrapolated 699 * With new tasks being created, their initial util_avgs are extrapolated
695 * based on the cfs_rq's current util_avg: 700 * based on the cfs_rq's current util_avg:
@@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
720 struct cfs_rq *cfs_rq = cfs_rq_of(se); 725 struct cfs_rq *cfs_rq = cfs_rq_of(se);
721 struct sched_avg *sa = &se->avg; 726 struct sched_avg *sa = &se->avg;
722 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; 727 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
728 u64 now = cfs_rq_clock_task(cfs_rq);
729 int tg_update;
723 730
724 if (cap > 0) { 731 if (cap > 0) {
725 if (cfs_rq->avg.util_avg != 0) { 732 if (cfs_rq->avg.util_avg != 0) {
@@ -733,18 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se)
733 } 740 }
734 sa->util_sum = sa->util_avg * LOAD_AVG_MAX; 741 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
735 } 742 }
743
744 if (entity_is_task(se)) {
745 struct task_struct *p = task_of(se);
746 if (p->sched_class != &fair_sched_class) {
747 /*
748 * For !fair tasks do:
749 *
750 update_cfs_rq_load_avg(now, cfs_rq, false);
751 attach_entity_load_avg(cfs_rq, se);
752 switched_from_fair(rq, p);
753 *
754 * such that the next switched_to_fair() has the
755 * expected state.
756 */
757 se->avg.last_update_time = now;
758 return;
759 }
760 }
761
762 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
763 attach_entity_load_avg(cfs_rq, se);
764 if (tg_update)
765 update_tg_load_avg(cfs_rq, false);
736} 766}
737 767
738static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); 768#else /* !CONFIG_SMP */
739static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
740#else
741void init_entity_runnable_average(struct sched_entity *se) 769void init_entity_runnable_average(struct sched_entity *se)
742{ 770{
743} 771}
744void post_init_entity_util_avg(struct sched_entity *se) 772void post_init_entity_util_avg(struct sched_entity *se)
745{ 773{
746} 774}
747#endif 775static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
776{
777}
778#endif /* CONFIG_SMP */
748 779
749/* 780/*
750 * Update the current task's runtime statistics. 781 * Update the current task's runtime statistics.
@@ -1305,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env,
1305{ 1336{
1306 if (env->best_task) 1337 if (env->best_task)
1307 put_task_struct(env->best_task); 1338 put_task_struct(env->best_task);
1339 if (p)
1340 get_task_struct(p);
1308 1341
1309 env->best_task = p; 1342 env->best_task = p;
1310 env->best_imp = imp; 1343 env->best_imp = imp;
@@ -1372,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env,
1372 long imp = env->p->numa_group ? groupimp : taskimp; 1405 long imp = env->p->numa_group ? groupimp : taskimp;
1373 long moveimp = imp; 1406 long moveimp = imp;
1374 int dist = env->dist; 1407 int dist = env->dist;
1375 bool assigned = false;
1376 1408
1377 rcu_read_lock(); 1409 rcu_read_lock();
1378 1410 cur = task_rcu_dereference(&dst_rq->curr);
1379 raw_spin_lock_irq(&dst_rq->lock); 1411 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1380 cur = dst_rq->curr;
1381 /*
1382 * No need to move the exiting task or idle task.
1383 */
1384 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1385 cur = NULL; 1412 cur = NULL;
1386 else {
1387 /*
1388 * The task_struct must be protected here to protect the
1389 * p->numa_faults access in the task_weight since the
1390 * numa_faults could already be freed in the following path:
1391 * finish_task_switch()
1392 * --> put_task_struct()
1393 * --> __put_task_struct()
1394 * --> task_numa_free()
1395 */
1396 get_task_struct(cur);
1397 }
1398
1399 raw_spin_unlock_irq(&dst_rq->lock);
1400 1413
1401 /* 1414 /*
1402 * Because we have preemption enabled we can get migrated around and 1415 * Because we have preemption enabled we can get migrated around and
@@ -1479,7 +1492,6 @@ balance:
1479 */ 1492 */
1480 if (!load_too_imbalanced(src_load, dst_load, env)) { 1493 if (!load_too_imbalanced(src_load, dst_load, env)) {
1481 imp = moveimp - 1; 1494 imp = moveimp - 1;
1482 put_task_struct(cur);
1483 cur = NULL; 1495 cur = NULL;
1484 goto assign; 1496 goto assign;
1485 } 1497 }
@@ -1505,16 +1517,9 @@ balance:
1505 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); 1517 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1506 1518
1507assign: 1519assign:
1508 assigned = true;
1509 task_numa_assign(env, cur, imp); 1520 task_numa_assign(env, cur, imp);
1510unlock: 1521unlock:
1511 rcu_read_unlock(); 1522 rcu_read_unlock();
1512 /*
1513 * The dst_rq->curr isn't assigned. The protection for task_struct is
1514 * finished.
1515 */
1516 if (cur && !assigned)
1517 put_task_struct(cur);
1518} 1523}
1519 1524
1520static void task_numa_find_cpu(struct task_numa_env *env, 1525static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2499,28 +2504,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2499 2504
2500#ifdef CONFIG_FAIR_GROUP_SCHED 2505#ifdef CONFIG_FAIR_GROUP_SCHED
2501# ifdef CONFIG_SMP 2506# ifdef CONFIG_SMP
2502static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) 2507static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2503{ 2508{
2504 long tg_weight; 2509 long tg_weight, load, shares;
2505 2510
2506 /* 2511 /*
2507 * Use this CPU's real-time load instead of the last load contribution 2512 * This really should be: cfs_rq->avg.load_avg, but instead we use
2508 * as the updating of the contribution is delayed, and we will use the 2513 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2509 * the real-time load to calc the share. See update_tg_load_avg(). 2514 * the shares for small weight interactive tasks.
2510 */ 2515 */
2511 tg_weight = atomic_long_read(&tg->load_avg); 2516 load = scale_load_down(cfs_rq->load.weight);
2512 tg_weight -= cfs_rq->tg_load_avg_contrib;
2513 tg_weight += cfs_rq->load.weight;
2514 2517
2515 return tg_weight; 2518 tg_weight = atomic_long_read(&tg->load_avg);
2516}
2517
2518static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2519{
2520 long tg_weight, load, shares;
2521 2519
2522 tg_weight = calc_tg_weight(tg, cfs_rq); 2520 /* Ensure tg_weight >= load */
2523 load = cfs_rq->load.weight; 2521 tg_weight -= cfs_rq->tg_load_avg_contrib;
2522 tg_weight += load;
2524 2523
2525 shares = (tg->shares * load); 2524 shares = (tg->shares * load);
2526 if (tg_weight) 2525 if (tg_weight)
@@ -2539,6 +2538,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2539 return tg->shares; 2538 return tg->shares;
2540} 2539}
2541# endif /* CONFIG_SMP */ 2540# endif /* CONFIG_SMP */
2541
2542static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 2542static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2543 unsigned long weight) 2543 unsigned long weight)
2544{ 2544{
@@ -2873,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se,
2873static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} 2873static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2874#endif /* CONFIG_FAIR_GROUP_SCHED */ 2874#endif /* CONFIG_FAIR_GROUP_SCHED */
2875 2875
2876static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2877
2878static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 2876static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2879{ 2877{
2880 struct rq *rq = rq_of(cfs_rq); 2878 struct rq *rq = rq_of(cfs_rq);
@@ -2904,7 +2902,40 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2904 } 2902 }
2905} 2903}
2906 2904
2907/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ 2905/*
2906 * Unsigned subtract and clamp on underflow.
2907 *
2908 * Explicitly do a load-store to ensure the intermediate value never hits
2909 * memory. This allows lockless observations without ever seeing the negative
2910 * values.
2911 */
2912#define sub_positive(_ptr, _val) do { \
2913 typeof(_ptr) ptr = (_ptr); \
2914 typeof(*ptr) val = (_val); \
2915 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2916 res = var - val; \
2917 if (res > var) \
2918 res = 0; \
2919 WRITE_ONCE(*ptr, res); \
2920} while (0)
2921
2922/**
2923 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
2924 * @now: current time, as per cfs_rq_clock_task()
2925 * @cfs_rq: cfs_rq to update
2926 * @update_freq: should we call cfs_rq_util_change() or will the call do so
2927 *
2928 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
2929 * avg. The immediate corollary is that all (fair) tasks must be attached, see
2930 * post_init_entity_util_avg().
2931 *
2932 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
2933 *
2934 * Returns true if the load decayed or we removed utilization. It is expected
2935 * that one calls update_tg_load_avg() on this condition, but after you've
2936 * modified the cfs_rq avg (attach/detach), such that we propagate the new
2937 * avg up.
2938 */
2908static inline int 2939static inline int
2909update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) 2940update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
2910{ 2941{
@@ -2913,15 +2944,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
2913 2944
2914 if (atomic_long_read(&cfs_rq->removed_load_avg)) { 2945 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2915 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); 2946 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2916 sa->load_avg = max_t(long, sa->load_avg - r, 0); 2947 sub_positive(&sa->load_avg, r);
2917 sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); 2948 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
2918 removed_load = 1; 2949 removed_load = 1;
2919 } 2950 }
2920 2951
2921 if (atomic_long_read(&cfs_rq->removed_util_avg)) { 2952 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2922 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); 2953 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2923 sa->util_avg = max_t(long, sa->util_avg - r, 0); 2954 sub_positive(&sa->util_avg, r);
2924 sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); 2955 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
2925 removed_util = 1; 2956 removed_util = 1;
2926 } 2957 }
2927 2958
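The sub_positive() helper used above does the subtraction on a local copy, clamps unsigned underflow to zero and writes the result back once, so a lockless reader never observes a wrapped-around intermediate value. A standalone sketch of the clamping behaviour (GCC's typeof extension, as in the kernel macro, but without READ_ONCE()/WRITE_ONCE(), so only the arithmetic is shown):

#include <stdio.h>

#define sub_positive(_ptr, _val) do {			\
	typeof(_ptr) ptr = (_ptr);			\
	typeof(*ptr) val = (_val);			\
	typeof(*ptr) res, var = *ptr;			\
	res = var - val;				\
	if (res > var)		/* unsigned underflow */\
		res = 0;				\
	*ptr = res;					\
} while (0)

int main(void)
{
	unsigned long load_avg = 5;

	sub_positive(&load_avg, 3UL);	/* 5 - 3 = 2 */
	printf("%lu\n", load_avg);

	sub_positive(&load_avg, 9UL);	/* would underflow; clamps to 0 */
	printf("%lu\n", load_avg);
	return 0;
}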
@@ -2959,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
2959 update_tg_load_avg(cfs_rq, 0); 2990 update_tg_load_avg(cfs_rq, 0);
2960} 2991}
2961 2992
2993/**
2994 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
2995 * @cfs_rq: cfs_rq to attach to
2996 * @se: sched_entity to attach
2997 *
2998 * Must call update_cfs_rq_load_avg() before this, since we rely on
2999 * cfs_rq->avg.last_update_time being current.
3000 */
2962static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3001static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2963{ 3002{
2964 if (!sched_feat(ATTACH_AGE_LOAD)) 3003 if (!sched_feat(ATTACH_AGE_LOAD))
@@ -2967,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
2967 /* 3006 /*
2968 * If we got migrated (either between CPUs or between cgroups) we'll 3007 * If we got migrated (either between CPUs or between cgroups) we'll
2969 * have aged the average right before clearing @last_update_time. 3008 * have aged the average right before clearing @last_update_time.
3009 *
3010 * Or we're fresh through post_init_entity_util_avg().
2970 */ 3011 */
2971 if (se->avg.last_update_time) { 3012 if (se->avg.last_update_time) {
2972 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), 3013 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -2988,16 +3029,24 @@ skip_aging:
2988 cfs_rq_util_change(cfs_rq); 3029 cfs_rq_util_change(cfs_rq);
2989} 3030}
2990 3031
3032/**
3033 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3034 * @cfs_rq: cfs_rq to detach from
3035 * @se: sched_entity to detach
3036 *
3037 * Must call update_cfs_rq_load_avg() before this, since we rely on
3038 * cfs_rq->avg.last_update_time being current.
3039 */
2991static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3040static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2992{ 3041{
2993 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), 3042 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2994 &se->avg, se->on_rq * scale_load_down(se->load.weight), 3043 &se->avg, se->on_rq * scale_load_down(se->load.weight),
2995 cfs_rq->curr == se, NULL); 3044 cfs_rq->curr == se, NULL);
2996 3045
2997 cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); 3046 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2998 cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); 3047 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
2999 cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); 3048 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3000 cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); 3049 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3001 3050
3002 cfs_rq_util_change(cfs_rq); 3051 cfs_rq_util_change(cfs_rq);
3003} 3052}
@@ -3072,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se)
3072 u64 last_update_time; 3121 u64 last_update_time;
3073 3122
3074 /* 3123 /*
3075 * Newly created task or never used group entity should not be removed 3124 * tasks cannot exit without having gone through wake_up_new_task() ->
3076 * from its (source) cfs_rq 3125 * post_init_entity_util_avg() which will have added things to the
3126 * cfs_rq, so we can remove unconditionally.
3127 *
3128 * Similarly for groups, they will have passed through
3129 * post_init_entity_util_avg() before unregister_sched_fair_group()
3130 * calls this.
3077 */ 3131 */
3078 if (se->avg.last_update_time == 0)
3079 return;
3080 3132
3081 last_update_time = cfs_rq_last_update_time(cfs_rq); 3133 last_update_time = cfs_rq_last_update_time(cfs_rq);
3082 3134
@@ -3099,6 +3151,12 @@ static int idle_balance(struct rq *this_rq);
3099 3151
3100#else /* CONFIG_SMP */ 3152#else /* CONFIG_SMP */
3101 3153
3154static inline int
3155update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3156{
3157 return 0;
3158}
3159
3102static inline void update_load_avg(struct sched_entity *se, int not_used) 3160static inline void update_load_avg(struct sched_entity *se, int not_used)
3103{ 3161{
3104 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3162 struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -3246,7 +3304,7 @@ static inline void check_schedstat_required(void)
3246 trace_sched_stat_iowait_enabled() || 3304 trace_sched_stat_iowait_enabled() ||
3247 trace_sched_stat_blocked_enabled() || 3305 trace_sched_stat_blocked_enabled() ||
3248 trace_sched_stat_runtime_enabled()) { 3306 trace_sched_stat_runtime_enabled()) {
3249 pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " 3307 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3250 "stat_blocked and stat_runtime require the " 3308 "stat_blocked and stat_runtime require the "
3251 "kernel parameter schedstats=enabled or " 3309 "kernel parameter schedstats=enabled or "
3252 "kernel.sched_schedstats=1\n"); 3310 "kernel.sched_schedstats=1\n");
@@ -3688,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3688static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) 3746static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3689{ 3747{
3690 if (unlikely(cfs_rq->throttle_count)) 3748 if (unlikely(cfs_rq->throttle_count))
3691 return cfs_rq->throttled_clock_task; 3749 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
3692 3750
3693 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; 3751 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3694} 3752}
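With illustrative numbers for the cfs_rq_clock_task() fix above: suppose rq_clock_task() read 1000 when the cfs_rq was throttled (so throttled_clock_task = 1000) and 300 units of throttled time had already accumulated (throttled_clock_task_time = 300). Just before throttling the function returned 1000 - 300 = 700; the old throttled path then returned the raw 1000, a forward jump of 300, whereas the fixed path returns 1000 - 300 = 700, so the task clock simply freezes while throttled and resumes monotonically on unthrottle.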
@@ -3826,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
3826 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 3884 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3827 3885
3828 cfs_rq->throttle_count--; 3886 cfs_rq->throttle_count--;
3829#ifdef CONFIG_SMP
3830 if (!cfs_rq->throttle_count) { 3887 if (!cfs_rq->throttle_count) {
3831 /* adjust cfs_rq_clock_task() */ 3888 /* adjust cfs_rq_clock_task() */
3832 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - 3889 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3833 cfs_rq->throttled_clock_task; 3890 cfs_rq->throttled_clock_task;
3834 } 3891 }
3835#endif
3836 3892
3837 return 0; 3893 return 0;
3838} 3894}
@@ -4199,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4199 throttle_cfs_rq(cfs_rq); 4255 throttle_cfs_rq(cfs_rq);
4200} 4256}
4201 4257
4258static void sync_throttle(struct task_group *tg, int cpu)
4259{
4260 struct cfs_rq *pcfs_rq, *cfs_rq;
4261
4262 if (!cfs_bandwidth_used())
4263 return;
4264
4265 if (!tg->parent)
4266 return;
4267
4268 cfs_rq = tg->cfs_rq[cpu];
4269 pcfs_rq = tg->parent->cfs_rq[cpu];
4270
4271 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4272 pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4273}
4274
4202/* conditionally throttle active cfs_rq's from put_prev_entity() */ 4275/* conditionally throttle active cfs_rq's from put_prev_entity() */
4203static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4276static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4204{ 4277{
@@ -4338,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4338static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 4411static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4339static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } 4412static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4340static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 4413static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4414static inline void sync_throttle(struct task_group *tg, int cpu) {}
4341static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 4415static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4342 4416
4343static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 4417static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -4446,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4446 * 4520 *
4447 * note: in the case of encountering a throttled cfs_rq we will 4521 * note: in the case of encountering a throttled cfs_rq we will
4448 * post the final h_nr_running increment below. 4522 * post the final h_nr_running increment below.
4449 */ 4523 */
4450 if (cfs_rq_throttled(cfs_rq)) 4524 if (cfs_rq_throttled(cfs_rq))
4451 break; 4525 break;
4452 cfs_rq->h_nr_running++; 4526 cfs_rq->h_nr_running++;
@@ -4500,15 +4574,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4500 4574
4501 /* Don't dequeue parent if it has other entities besides us */ 4575 /* Don't dequeue parent if it has other entities besides us */
4502 if (cfs_rq->load.weight) { 4576 if (cfs_rq->load.weight) {
4577 /* Avoid re-evaluating load for this entity: */
4578 se = parent_entity(se);
4503 /* 4579 /*
4504 * Bias pick_next to pick a task from this cfs_rq, as 4580 * Bias pick_next to pick a task from this cfs_rq, as
4505 * p is sleeping when it is within its sched_slice. 4581 * p is sleeping when it is within its sched_slice.
4506 */ 4582 */
4507 if (task_sleep && parent_entity(se)) 4583 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
4508 set_next_buddy(parent_entity(se)); 4584 set_next_buddy(se);
4509
4510 /* avoid re-evaluating load for this entity */
4511 se = parent_entity(se);
4512 break; 4585 break;
4513 } 4586 }
4514 flags |= DEQUEUE_SLEEP; 4587 flags |= DEQUEUE_SLEEP;
@@ -4910,19 +4983,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4910 return wl; 4983 return wl;
4911 4984
4912 for_each_sched_entity(se) { 4985 for_each_sched_entity(se) {
4913 long w, W; 4986 struct cfs_rq *cfs_rq = se->my_q;
4987 long W, w = cfs_rq_load_avg(cfs_rq);
4914 4988
4915 tg = se->my_q->tg; 4989 tg = cfs_rq->tg;
4916 4990
4917 /* 4991 /*
4918 * W = @wg + \Sum rw_j 4992 * W = @wg + \Sum rw_j
4919 */ 4993 */
4920 W = wg + calc_tg_weight(tg, se->my_q); 4994 W = wg + atomic_long_read(&tg->load_avg);
4995
4996 /* Ensure \Sum rw_j >= rw_i */
4997 W -= cfs_rq->tg_load_avg_contrib;
4998 W += w;
4921 4999
4922 /* 5000 /*
4923 * w = rw_i + @wl 5001 * w = rw_i + @wl
4924 */ 5002 */
4925 w = cfs_rq_load_avg(se->my_q) + wl; 5003 w += wl;
4926 5004
4927 /* 5005 /*
4928 * wl = S * s'_i; see (2) 5006 * wl = S * s'_i; see (2)
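In the effective_load() hunk above, the group weight is now built from the same smoothed load as w: W = wg + tg->load_avg - cfs_rq->tg_load_avg_contrib + cfs_rq_load_avg(cfs_rq). With invented numbers: if tg->load_avg is 2048, this cfs_rq's last published contribution is a stale 512 and its current load average is 1024, the \Sum rw_j term becomes 2048 - 512 + 1024 = 2560, which by construction contains at least this cfs_rq's own rw_i of 1024, matching the "Ensure \Sum rw_j >= rw_i" comment.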
@@ -8283,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p)
8283{ 8361{
8284 struct cfs_rq *cfs_rq; 8362 struct cfs_rq *cfs_rq;
8285 struct sched_entity *se = &p->se, *curr; 8363 struct sched_entity *se = &p->se, *curr;
8286 int this_cpu = smp_processor_id();
8287 struct rq *rq = this_rq(); 8364 struct rq *rq = this_rq();
8288 unsigned long flags;
8289
8290 raw_spin_lock_irqsave(&rq->lock, flags);
8291 8365
8366 raw_spin_lock(&rq->lock);
8292 update_rq_clock(rq); 8367 update_rq_clock(rq);
8293 8368
8294 cfs_rq = task_cfs_rq(current); 8369 cfs_rq = task_cfs_rq(current);
8295 curr = cfs_rq->curr; 8370 curr = cfs_rq->curr;
8296 8371 if (curr) {
8297 /* 8372 update_curr(cfs_rq);
8298 * Not only the cpu but also the task_group of the parent might have
8299 * been changed after parent->se.parent,cfs_rq were copied to
8300 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
8301 * of child point to valid ones.
8302 */
8303 rcu_read_lock();
8304 __set_task_cpu(p, this_cpu);
8305 rcu_read_unlock();
8306
8307 update_curr(cfs_rq);
8308
8309 if (curr)
8310 se->vruntime = curr->vruntime; 8373 se->vruntime = curr->vruntime;
8374 }
8311 place_entity(cfs_rq, se, 1); 8375 place_entity(cfs_rq, se, 1);
8312 8376
8313 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { 8377 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8320,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p)
8320 } 8384 }
8321 8385
8322 se->vruntime -= cfs_rq->min_vruntime; 8386 se->vruntime -= cfs_rq->min_vruntime;
8323 8387 raw_spin_unlock(&rq->lock);
8324 raw_spin_unlock_irqrestore(&rq->lock, flags);
8325} 8388}
8326 8389
8327/* 8390/*
@@ -8377,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
8377{ 8440{
8378 struct sched_entity *se = &p->se; 8441 struct sched_entity *se = &p->se;
8379 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8443 u64 now = cfs_rq_clock_task(cfs_rq);
8444 int tg_update;
8380 8445
8381 if (!vruntime_normalized(p)) { 8446 if (!vruntime_normalized(p)) {
8382 /* 8447 /*
@@ -8388,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p)
8388 } 8453 }
8389 8454
8390 /* Catch up with the cfs_rq and remove our load when we leave */ 8455 /* Catch up with the cfs_rq and remove our load when we leave */
8456 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
8391 detach_entity_load_avg(cfs_rq, se); 8457 detach_entity_load_avg(cfs_rq, se);
8458 if (tg_update)
8459 update_tg_load_avg(cfs_rq, false);
8392} 8460}
8393 8461
8394static void attach_task_cfs_rq(struct task_struct *p) 8462static void attach_task_cfs_rq(struct task_struct *p)
8395{ 8463{
8396 struct sched_entity *se = &p->se; 8464 struct sched_entity *se = &p->se;
8397 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8465 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8466 u64 now = cfs_rq_clock_task(cfs_rq);
8467 int tg_update;
8398 8468
8399#ifdef CONFIG_FAIR_GROUP_SCHED 8469#ifdef CONFIG_FAIR_GROUP_SCHED
8400 /* 8470 /*
@@ -8405,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p)
8405#endif 8475#endif
8406 8476
8407 /* Synchronize task with its cfs_rq */ 8477 /* Synchronize task with its cfs_rq */
8478 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
8408 attach_entity_load_avg(cfs_rq, se); 8479 attach_entity_load_avg(cfs_rq, se);
8480 if (tg_update)
8481 update_tg_load_avg(cfs_rq, false);
8409 8482
8410 if (!vruntime_normalized(p)) 8483 if (!vruntime_normalized(p))
8411 se->vruntime += cfs_rq->min_vruntime; 8484 se->vruntime += cfs_rq->min_vruntime;
@@ -8465,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
8465} 8538}
8466 8539
8467#ifdef CONFIG_FAIR_GROUP_SCHED 8540#ifdef CONFIG_FAIR_GROUP_SCHED
8541static void task_set_group_fair(struct task_struct *p)
8542{
8543 struct sched_entity *se = &p->se;
8544
8545 set_task_rq(p, task_cpu(p));
8546 se->depth = se->parent ? se->parent->depth + 1 : 0;
8547}
8548
8468static void task_move_group_fair(struct task_struct *p) 8549static void task_move_group_fair(struct task_struct *p)
8469{ 8550{
8470 detach_task_cfs_rq(p); 8551 detach_task_cfs_rq(p);
@@ -8477,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p)
8477 attach_task_cfs_rq(p); 8558 attach_task_cfs_rq(p);
8478} 8559}
8479 8560
8561static void task_change_group_fair(struct task_struct *p, int type)
8562{
8563 switch (type) {
8564 case TASK_SET_GROUP:
8565 task_set_group_fair(p);
8566 break;
8567
8568 case TASK_MOVE_GROUP:
8569 task_move_group_fair(p);
8570 break;
8571 }
8572}
8573
8480void free_fair_sched_group(struct task_group *tg) 8574void free_fair_sched_group(struct task_group *tg)
8481{ 8575{
8482 int i; 8576 int i;
@@ -8496,8 +8590,9 @@ void free_fair_sched_group(struct task_group *tg)
8496 8590
8497int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8591int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8498{ 8592{
8499 struct cfs_rq *cfs_rq;
8500 struct sched_entity *se; 8593 struct sched_entity *se;
8594 struct cfs_rq *cfs_rq;
8595 struct rq *rq;
8501 int i; 8596 int i;
8502 8597
8503 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8598 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8512,6 +8607,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8512 init_cfs_bandwidth(tg_cfs_bandwidth(tg)); 8607 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8513 8608
8514 for_each_possible_cpu(i) { 8609 for_each_possible_cpu(i) {
8610 rq = cpu_rq(i);
8611
8515 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8612 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8516 GFP_KERNEL, cpu_to_node(i)); 8613 GFP_KERNEL, cpu_to_node(i));
8517 if (!cfs_rq) 8614 if (!cfs_rq)
@@ -8525,7 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8525 init_cfs_rq(cfs_rq); 8622 init_cfs_rq(cfs_rq);
8526 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8623 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8527 init_entity_runnable_average(se); 8624 init_entity_runnable_average(se);
8528 post_init_entity_util_avg(se);
8529 } 8625 }
8530 8626
8531 return 1; 8627 return 1;
@@ -8536,6 +8632,23 @@ err:
8536 return 0; 8632 return 0;
8537} 8633}
8538 8634
8635void online_fair_sched_group(struct task_group *tg)
8636{
8637 struct sched_entity *se;
8638 struct rq *rq;
8639 int i;
8640
8641 for_each_possible_cpu(i) {
8642 rq = cpu_rq(i);
8643 se = tg->se[i];
8644
8645 raw_spin_lock_irq(&rq->lock);
8646 post_init_entity_util_avg(se);
8647 sync_throttle(tg, i);
8648 raw_spin_unlock_irq(&rq->lock);
8649 }
8650}
8651
8539void unregister_fair_sched_group(struct task_group *tg) 8652void unregister_fair_sched_group(struct task_group *tg)
8540{ 8653{
8541 unsigned long flags; 8654 unsigned long flags;
@@ -8640,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8640 return 1; 8753 return 1;
8641} 8754}
8642 8755
8756void online_fair_sched_group(struct task_group *tg) { }
8757
8643void unregister_fair_sched_group(struct task_group *tg) { } 8758void unregister_fair_sched_group(struct task_group *tg) { }
8644 8759
8645#endif /* CONFIG_FAIR_GROUP_SCHED */ 8760#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8699,7 +8814,7 @@ const struct sched_class fair_sched_class = {
8699 .update_curr = update_curr_fair, 8814 .update_curr = update_curr_fair,
8700 8815
8701#ifdef CONFIG_FAIR_GROUP_SCHED 8816#ifdef CONFIG_FAIR_GROUP_SCHED
8702 .task_move_group = task_move_group_fair, 8817 .task_change_group = task_change_group_fair,
8703#endif 8818#endif
8704}; 8819};
8705 8820
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index bd12c6c714ec..9fb873cfc75c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -127,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
127 */ 127 */
128static void cpuidle_idle_call(void) 128static void cpuidle_idle_call(void)
129{ 129{
130 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 130 struct cpuidle_device *dev = cpuidle_get_device();
131 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 131 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
132 int next_state, entered_state; 132 int next_state, entered_state;
133 133
@@ -201,6 +201,8 @@ exit_idle:
201 */ 201 */
202static void cpu_idle_loop(void) 202static void cpu_idle_loop(void)
203{ 203{
204 int cpu = smp_processor_id();
205
204 while (1) { 206 while (1) {
205 /* 207 /*
206 * If the arch has a polling bit, we maintain an invariant: 208 * If the arch has a polling bit, we maintain an invariant:
@@ -219,7 +221,7 @@ static void cpu_idle_loop(void)
219 check_pgt_cache(); 221 check_pgt_cache();
220 rmb(); 222 rmb();
221 223
222 if (cpu_is_offline(smp_processor_id())) { 224 if (cpu_is_offline(cpu)) {
223 cpuhp_report_idle_dead(); 225 cpuhp_report_idle_dead();
224 arch_cpu_idle_dead(); 226 arch_cpu_idle_dead();
225 } 227 }
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index b0b93fd33af9..a2d6eb71f06b 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
78 loads[2] = (avenrun[2] + offset) << shift; 78 loads[2] = (avenrun[2] + offset) << shift;
79} 79}
80 80
81long calc_load_fold_active(struct rq *this_rq) 81long calc_load_fold_active(struct rq *this_rq, long adjust)
82{ 82{
83 long nr_active, delta = 0; 83 long nr_active, delta = 0;
84 84
85 nr_active = this_rq->nr_running; 85 nr_active = this_rq->nr_running - adjust;
86 nr_active += (long)this_rq->nr_uninterruptible; 86 nr_active += (long)this_rq->nr_uninterruptible;
87 87
88 if (nr_active != this_rq->calc_load_active) { 88 if (nr_active != this_rq->calc_load_active) {
@@ -188,7 +188,7 @@ void calc_load_enter_idle(void)
188 * We're going into NOHZ mode, if there's any pending delta, fold it 188 * We're going into NOHZ mode, if there's any pending delta, fold it
189 * into the pending idle delta. 189 * into the pending idle delta.
190 */ 190 */
191 delta = calc_load_fold_active(this_rq); 191 delta = calc_load_fold_active(this_rq, 0);
192 if (delta) { 192 if (delta) {
193 int idx = calc_load_write_idx(); 193 int idx = calc_load_write_idx();
194 194
@@ -389,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq)
389 if (time_before(jiffies, this_rq->calc_load_update)) 389 if (time_before(jiffies, this_rq->calc_load_update))
390 return; 390 return;
391 391
392 delta = calc_load_fold_active(this_rq); 392 delta = calc_load_fold_active(this_rq, 0);
393 if (delta) 393 if (delta)
394 atomic_long_add(delta, &calc_load_tasks); 394 atomic_long_add(delta, &calc_load_tasks);
395 395
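
calc_load_fold_active() now takes an `adjust` count that is subtracted from nr_running before the fold, so a caller can leave out tasks that are still technically on the runqueue but should not be sampled. The call site that passes a non-zero adjust is not part of this hunk; the snippet below is only a hedged sketch of how a CPU-offline path might use it to discount the task performing the teardown:

/* Sketch (assumption, not from this diff): fold the outgoing CPU's load
 * while ignoring the one task that is still running the hotplug work. */
static void fold_load_for_dead_cpu(struct rq *rq)
{
	long delta = calc_load_fold_active(rq, 1);

	if (delta)
		atomic_long_add(delta, &calc_load_tasks);
}
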
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f3087b04..c64fc5114004 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -28,7 +28,7 @@ extern unsigned long calc_load_update;
28extern atomic_long_t calc_load_tasks; 28extern atomic_long_t calc_load_tasks;
29 29
30extern void calc_global_load_tick(struct rq *this_rq); 30extern void calc_global_load_tick(struct rq *this_rq);
31extern long calc_load_fold_active(struct rq *this_rq); 31extern long calc_load_fold_active(struct rq *this_rq, long adjust);
32 32
33#ifdef CONFIG_SMP 33#ifdef CONFIG_SMP
34extern void cpu_load_update_active(struct rq *this_rq); 34extern void cpu_load_update_active(struct rq *this_rq);
@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data);
321 321
322extern void free_fair_sched_group(struct task_group *tg); 322extern void free_fair_sched_group(struct task_group *tg);
323extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); 323extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
324extern void online_fair_sched_group(struct task_group *tg);
324extern void unregister_fair_sched_group(struct task_group *tg); 325extern void unregister_fair_sched_group(struct task_group *tg);
325extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 326extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
326 struct sched_entity *se, int cpu, 327 struct sched_entity *se, int cpu,
@@ -1113,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1113 * In particular, the load of prev->state in finish_task_switch() must 1114 * In particular, the load of prev->state in finish_task_switch() must
1114 * happen before this. 1115 * happen before this.
1115 * 1116 *
1116 * Pairs with the smp_cond_acquire() in try_to_wake_up(). 1117 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
1117 */ 1118 */
1118 smp_store_release(&prev->on_cpu, 0); 1119 smp_store_release(&prev->on_cpu, 0);
1119#endif 1120#endif
@@ -1246,8 +1247,11 @@ struct sched_class {
1246 1247
1247 void (*update_curr) (struct rq *rq); 1248 void (*update_curr) (struct rq *rq);
1248 1249
1250#define TASK_SET_GROUP 0
1251#define TASK_MOVE_GROUP 1
1252
1249#ifdef CONFIG_FAIR_GROUP_SCHED 1253#ifdef CONFIG_FAIR_GROUP_SCHED
1250 void (*task_move_group) (struct task_struct *p); 1254 void (*task_change_group) (struct task_struct *p, int type);
1251#endif 1255#endif
1252}; 1256};
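
The single task_move_group hook becomes a typed task_change_group callback, with TASK_SET_GROUP used when a task is first placed into its group and TASK_MOVE_GROUP for cgroup migration. A minimal sketch of how the scheduler core can drive it; the wrapper name and the fallback branch are assumptions, since the core.c side of the change is not shown in this hunk:

/* Assumed caller: use the class hook if present, else just re-home the task. */
static void sched_change_group(struct task_struct *tsk, int type)
{
	if (tsk->sched_class->task_change_group)
		tsk->sched_class->task_change_group(tsk, type);
	else
		set_task_rq(tsk, task_cpu(tsk));
}
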
1253 1257
@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
1809#else /* arch_scale_freq_capacity */ 1813#else /* arch_scale_freq_capacity */
1810#define arch_scale_freq_invariant() (false) 1814#define arch_scale_freq_invariant() (false)
1811#endif 1815#endif
1812
1813static inline void account_reset_rq(struct rq *rq)
1814{
1815#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1816 rq->prev_irq_time = 0;
1817#endif
1818#ifdef CONFIG_PARAVIRT
1819 rq->prev_steal_time = 0;
1820#endif
1821#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1822 rq->prev_steal_time_rq = 0;
1823#endif
1824}
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 70b3b6a20fb0..78955cbea31c 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -33,6 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
33# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) 33# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
34# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) 34# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
35# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 35# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
36# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0)
37
36#else /* !CONFIG_SCHEDSTATS */ 38#else /* !CONFIG_SCHEDSTATS */
37static inline void 39static inline void
38rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 40rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -47,6 +49,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
47# define schedstat_inc(rq, field) do { } while (0) 49# define schedstat_inc(rq, field) do { } while (0)
48# define schedstat_add(rq, field, amt) do { } while (0) 50# define schedstat_add(rq, field, amt) do { } while (0)
49# define schedstat_set(var, val) do { } while (0) 51# define schedstat_set(var, val) do { } while (0)
52# define schedstat_val(rq, field) 0
50#endif 53#endif
51 54
52#ifdef CONFIG_SCHED_INFO 55#ifdef CONFIG_SCHED_INFO
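
schedstat_val() evaluates to the field when CONFIG_SCHEDSTATS is enabled and to 0 otherwise, so callers can read a statistic without wrapping it in #ifdefs. A tiny illustrative sketch; the field used here is only an example:

/* Illustrative only: read a per-rq schedstat, 0 when schedstats are off. */
static inline unsigned long rq_yield_count(struct rq *rq)
{
	return schedstat_val(rq, yld_count);
}
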
diff --git a/kernel/signal.c b/kernel/signal.c
index 96e9bc40667f..af21afc00d08 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2751 * @ts: upper bound on process time suspension 2751 * @ts: upper bound on process time suspension
2752 */ 2752 */
2753int do_sigtimedwait(const sigset_t *which, siginfo_t *info, 2753int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2754 const struct timespec *ts) 2754 const struct timespec *ts)
2755{ 2755{
2756 ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX };
2756 struct task_struct *tsk = current; 2757 struct task_struct *tsk = current;
2757 long timeout = MAX_SCHEDULE_TIMEOUT;
2758 sigset_t mask = *which; 2758 sigset_t mask = *which;
2759 int sig; 2759 int sig, ret = 0;
2760 2760
2761 if (ts) { 2761 if (ts) {
2762 if (!timespec_valid(ts)) 2762 if (!timespec_valid(ts))
2763 return -EINVAL; 2763 return -EINVAL;
2764 timeout = timespec_to_jiffies(ts); 2764 timeout = timespec_to_ktime(*ts);
2765 /* 2765 to = &timeout;
2766 * We can be close to the next tick, add another one
2767 * to ensure we will wait at least the time asked for.
2768 */
2769 if (ts->tv_sec || ts->tv_nsec)
2770 timeout++;
2771 } 2766 }
2772 2767
2773 /* 2768 /*
@@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2778 2773
2779 spin_lock_irq(&tsk->sighand->siglock); 2774 spin_lock_irq(&tsk->sighand->siglock);
2780 sig = dequeue_signal(tsk, &mask, info); 2775 sig = dequeue_signal(tsk, &mask, info);
2781 if (!sig && timeout) { 2776 if (!sig && timeout.tv64) {
2782 /* 2777 /*
2783 * None ready, temporarily unblock those we're interested 2778 * None ready, temporarily unblock those we're interested
2784 * while we are sleeping in so that we'll be awakened when 2779 * while we are sleeping in so that we'll be awakened when
@@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2790 recalc_sigpending(); 2785 recalc_sigpending();
2791 spin_unlock_irq(&tsk->sighand->siglock); 2786 spin_unlock_irq(&tsk->sighand->siglock);
2792 2787
2793 timeout = freezable_schedule_timeout_interruptible(timeout); 2788 __set_current_state(TASK_INTERRUPTIBLE);
2794 2789 ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
2790 HRTIMER_MODE_REL);
2795 spin_lock_irq(&tsk->sighand->siglock); 2791 spin_lock_irq(&tsk->sighand->siglock);
2796 __set_task_blocked(tsk, &tsk->real_blocked); 2792 __set_task_blocked(tsk, &tsk->real_blocked);
2797 sigemptyset(&tsk->real_blocked); 2793 sigemptyset(&tsk->real_blocked);
@@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2801 2797
2802 if (sig) 2798 if (sig)
2803 return sig; 2799 return sig;
2804 return timeout ? -EINTR : -EAGAIN; 2800 return ret ? -EINTR : -EAGAIN;
2805} 2801}
2806 2802
2807/** 2803/**
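
The jiffies-based sleep is replaced by an hrtimer sleep that honours the task's timer_slack_ns, which is why the tail becomes `return ret ? -EINTR : -EAGAIN`: the schedule_hrtimeout_range() family returns 0 when the timeout expired and -EINTR when the task was woken earlier. A distilled sketch of that idiom, not a drop-in replacement for the function above:

/* Sketch of the hrtimer-based bounded sleep: 0 = timed out, -EINTR = woken. */
static long sleep_upto(const struct timespec *ts)
{
	ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX };

	if (ts) {
		if (!timespec_valid(ts))
			return -EINVAL;
		timeout = timespec_to_ktime(*ts);
		to = &timeout;
	}

	__set_current_state(TASK_INTERRUPTIBLE);
	return freezable_schedule_hrtimeout_range(to, current->timer_slack_ns,
						  HRTIMER_MODE_REL);
}
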
diff --git a/kernel/smp.c b/kernel/smp.c
index 74165443c240..3aa642d39c03 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
33 33
34static void flush_smp_call_function_queue(bool warn_cpu_offline); 34static void flush_smp_call_function_queue(bool warn_cpu_offline);
35 35
36static int 36int smpcfd_prepare_cpu(unsigned int cpu)
37hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
38{ 37{
39 long cpu = (long)hcpu;
40 struct call_function_data *cfd = &per_cpu(cfd_data, cpu); 38 struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
41 39
42 switch (action) { 40 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
43 case CPU_UP_PREPARE: 41 cpu_to_node(cpu)))
44 case CPU_UP_PREPARE_FROZEN: 42 return -ENOMEM;
45 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 43 cfd->csd = alloc_percpu(struct call_single_data);
46 cpu_to_node(cpu))) 44 if (!cfd->csd) {
47 return notifier_from_errno(-ENOMEM);
48 cfd->csd = alloc_percpu(struct call_single_data);
49 if (!cfd->csd) {
50 free_cpumask_var(cfd->cpumask);
51 return notifier_from_errno(-ENOMEM);
52 }
53 break;
54
55#ifdef CONFIG_HOTPLUG_CPU
56 case CPU_UP_CANCELED:
57 case CPU_UP_CANCELED_FROZEN:
58 /* Fall-through to the CPU_DEAD[_FROZEN] case. */
59
60 case CPU_DEAD:
61 case CPU_DEAD_FROZEN:
62 free_cpumask_var(cfd->cpumask); 45 free_cpumask_var(cfd->cpumask);
63 free_percpu(cfd->csd); 46 return -ENOMEM;
64 break; 47 }
65 48
66 case CPU_DYING: 49 return 0;
67 case CPU_DYING_FROZEN: 50}
68 /* 51
69 * The IPIs for the smp-call-function callbacks queued by other 52int smpcfd_dead_cpu(unsigned int cpu)
70 * CPUs might arrive late, either due to hardware latencies or 53{
71 * because this CPU disabled interrupts (inside stop-machine) 54 struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
72 * before the IPIs were sent. So flush out any pending callbacks
73 * explicitly (without waiting for the IPIs to arrive), to
74 * ensure that the outgoing CPU doesn't go offline with work
75 * still pending.
76 */
77 flush_smp_call_function_queue(false);
78 break;
79#endif
80 };
81 55
82 return NOTIFY_OK; 56 free_cpumask_var(cfd->cpumask);
57 free_percpu(cfd->csd);
58 return 0;
83} 59}
84 60
85static struct notifier_block hotplug_cfd_notifier = { 61int smpcfd_dying_cpu(unsigned int cpu)
86 .notifier_call = hotplug_cfd, 62{
87}; 63 /*
64 * The IPIs for the smp-call-function callbacks queued by other
65 * CPUs might arrive late, either due to hardware latencies or
66 * because this CPU disabled interrupts (inside stop-machine)
67 * before the IPIs were sent. So flush out any pending callbacks
68 * explicitly (without waiting for the IPIs to arrive), to
69 * ensure that the outgoing CPU doesn't go offline with work
70 * still pending.
71 */
72 flush_smp_call_function_queue(false);
73 return 0;
74}
88 75
89void __init call_function_init(void) 76void __init call_function_init(void)
90{ 77{
91 void *cpu = (void *)(long)smp_processor_id();
92 int i; 78 int i;
93 79
94 for_each_possible_cpu(i) 80 for_each_possible_cpu(i)
95 init_llist_head(&per_cpu(call_single_queue, i)); 81 init_llist_head(&per_cpu(call_single_queue, i));
96 82
97 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); 83 smpcfd_prepare_cpu(smp_processor_id());
98 register_cpu_notifier(&hotplug_cfd_notifier);
99} 84}
100 85
101/* 86/*
@@ -107,7 +92,7 @@ void __init call_function_init(void)
107 */ 92 */
108static __always_inline void csd_lock_wait(struct call_single_data *csd) 93static __always_inline void csd_lock_wait(struct call_single_data *csd)
109{ 94{
110 smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); 95 smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
111} 96}
112 97
113static __always_inline void csd_lock(struct call_single_data *csd) 98static __always_inline void csd_lock(struct call_single_data *csd)
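
The hotplug notifier is split into three plain callbacks that the CPU hotplug state machine can invoke directly. How they are wired up is not visible in this hunk; the sketch below is an assumption (the real registration lives in the hotplug core's state tables and the state names may differ):

/* Assumed wiring, for illustration only. */
static int __init smpcfd_hotplug_init(void)
{
	/* smpcfd_dying_cpu() is invoked from the dying step on the outgoing
	 * CPU; only the prepare/dead pair is shown being registered here. */
	return cpuhp_setup_state_nocalls(CPUHP_SMPCFD_PREPARE, "smpcfd:prepare",
					 smpcfd_prepare_cpu, smpcfd_dead_cpu);
}
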
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 87b2fc38398b..53954631a4e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1205,6 +1205,17 @@ static struct ctl_table kern_table[] = {
1205 .extra2 = &one, 1205 .extra2 = &one,
1206 }, 1206 },
1207#endif 1207#endif
1208#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
1209 {
1210 .procname = "panic_on_rcu_stall",
1211 .data = &sysctl_panic_on_rcu_stall,
1212 .maxlen = sizeof(sysctl_panic_on_rcu_stall),
1213 .mode = 0644,
1214 .proc_handler = proc_dointvec_minmax,
1215 .extra1 = &zero,
1216 .extra2 = &one,
1217 },
1218#endif
1208 { } 1219 { }
1209}; 1220};
1210 1221
@@ -1497,8 +1508,8 @@ static struct ctl_table vm_table[] = {
1497#ifdef CONFIG_NUMA 1508#ifdef CONFIG_NUMA
1498 { 1509 {
1499 .procname = "zone_reclaim_mode", 1510 .procname = "zone_reclaim_mode",
1500 .data = &zone_reclaim_mode, 1511 .data = &node_reclaim_mode,
1501 .maxlen = sizeof(zone_reclaim_mode), 1512 .maxlen = sizeof(node_reclaim_mode),
1502 .mode = 0644, 1513 .mode = 0644,
1503 .proc_handler = proc_dointvec, 1514 .proc_handler = proc_dointvec,
1504 .extra1 = &zero, 1515 .extra1 = &zero,
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 53fa971d000d..6ab4842b00e8 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -108,7 +108,6 @@ void task_work_run(void)
108 * fail, but it can play with *work and other entries. 108 * fail, but it can play with *work and other entries.
109 */ 109 */
110 raw_spin_unlock_wait(&task->pi_lock); 110 raw_spin_unlock_wait(&task->pi_lock);
111 smp_mb();
112 111
113 do { 112 do {
114 next = work->next; 113 next = work->next;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e840ed867a5d..c3aad685bbc0 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -30,7 +30,6 @@
30 * struct alarm_base - Alarm timer bases 30 * struct alarm_base - Alarm timer bases
 31 * @lock: Lock for synchronized access to the base 31 * @lock: Lock for synchronized access to the base
32 * @timerqueue: Timerqueue head managing the list of events 32 * @timerqueue: Timerqueue head managing the list of events
33 * @timer: hrtimer used to schedule events while running
34 * @gettime: Function to read the time correlating to the base 33 * @gettime: Function to read the time correlating to the base
35 * @base_clockid: clockid for the base 34 * @base_clockid: clockid for the base
36 */ 35 */
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index a9b76a40319e..2c5bc77c0bb0 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu)
645#endif 645#endif
646 646
647#ifdef CONFIG_SYSFS 647#ifdef CONFIG_SYSFS
648struct bus_type clockevents_subsys = { 648static struct bus_type clockevents_subsys = {
649 .name = "clockevents", 649 .name = "clockevents",
650 .dev_name = "clockevent", 650 .dev_name = "clockevent",
651}; 651};
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 56ece145a814..6a5a310a1a53 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -669,10 +669,12 @@ static void clocksource_enqueue(struct clocksource *cs)
669 struct list_head *entry = &clocksource_list; 669 struct list_head *entry = &clocksource_list;
670 struct clocksource *tmp; 670 struct clocksource *tmp;
671 671
672 list_for_each_entry(tmp, &clocksource_list, list) 672 list_for_each_entry(tmp, &clocksource_list, list) {
673 /* Keep track of the place, where to insert */ 673 /* Keep track of the place, where to insert */
674 if (tmp->rating >= cs->rating) 674 if (tmp->rating < cs->rating)
675 entry = &tmp->list; 675 break;
676 entry = &tmp->list;
677 }
676 list_add(&cs->list, entry); 678 list_add(&cs->list, entry);
677} 679}
678 680
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e99df0ff1d42..9ba7c820fc23 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
177#endif 177#endif
178} 178}
179 179
180#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 180#ifdef CONFIG_NO_HZ_COMMON
181static inline 181static inline
182struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, 182struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
183 int pinned) 183 int pinned)
@@ -1590,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1590/* 1590/*
1591 * Functions related to boot-time initialization: 1591 * Functions related to boot-time initialization:
1592 */ 1592 */
1593static void init_hrtimers_cpu(int cpu) 1593int hrtimers_prepare_cpu(unsigned int cpu)
1594{ 1594{
1595 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1595 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1596 int i; 1596 int i;
@@ -1602,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu)
1602 1602
1603 cpu_base->cpu = cpu; 1603 cpu_base->cpu = cpu;
1604 hrtimer_init_hres(cpu_base); 1604 hrtimer_init_hres(cpu_base);
1605 return 0;
1605} 1606}
1606 1607
1607#ifdef CONFIG_HOTPLUG_CPU 1608#ifdef CONFIG_HOTPLUG_CPU
@@ -1636,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1636 } 1637 }
1637} 1638}
1638 1639
1639static void migrate_hrtimers(int scpu) 1640int hrtimers_dead_cpu(unsigned int scpu)
1640{ 1641{
1641 struct hrtimer_cpu_base *old_base, *new_base; 1642 struct hrtimer_cpu_base *old_base, *new_base;
1642 int i; 1643 int i;
@@ -1665,45 +1666,14 @@ static void migrate_hrtimers(int scpu)
1665 /* Check, if we got expired work to do */ 1666 /* Check, if we got expired work to do */
1666 __hrtimer_peek_ahead_timers(); 1667 __hrtimer_peek_ahead_timers();
1667 local_irq_enable(); 1668 local_irq_enable();
1669 return 0;
1668} 1670}
1669 1671
1670#endif /* CONFIG_HOTPLUG_CPU */ 1672#endif /* CONFIG_HOTPLUG_CPU */
1671 1673
1672static int hrtimer_cpu_notify(struct notifier_block *self,
1673 unsigned long action, void *hcpu)
1674{
1675 int scpu = (long)hcpu;
1676
1677 switch (action) {
1678
1679 case CPU_UP_PREPARE:
1680 case CPU_UP_PREPARE_FROZEN:
1681 init_hrtimers_cpu(scpu);
1682 break;
1683
1684#ifdef CONFIG_HOTPLUG_CPU
1685 case CPU_DEAD:
1686 case CPU_DEAD_FROZEN:
1687 migrate_hrtimers(scpu);
1688 break;
1689#endif
1690
1691 default:
1692 break;
1693 }
1694
1695 return NOTIFY_OK;
1696}
1697
1698static struct notifier_block hrtimers_nb = {
1699 .notifier_call = hrtimer_cpu_notify,
1700};
1701
1702void __init hrtimers_init(void) 1674void __init hrtimers_init(void)
1703{ 1675{
1704 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1676 hrtimers_prepare_cpu(smp_processor_id());
1705 (void *)(long)smp_processor_id());
1706 register_cpu_notifier(&hrtimers_nb);
1707} 1677}
1708 1678
1709/** 1679/**
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1cafba860b08..39008d78927a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -777,6 +777,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
777 timer->it.cpu.expires = 0; 777 timer->it.cpu.expires = 0;
778 sample_to_timespec(timer->it_clock, timer->it.cpu.expires, 778 sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
779 &itp->it_value); 779 &itp->it_value);
780 return;
780 } else { 781 } else {
781 cpu_timer_sample_group(timer->it_clock, p, &now); 782 cpu_timer_sample_group(timer->it_clock, p, &now);
782 unlock_task_sighand(p, &flags); 783 unlock_task_sighand(p, &flags);
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index e622ba365a13..b0928ab3270f 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
43 int allowed_error_ns = usecs * 5; 43 int allowed_error_ns = usecs * 5;
44 44
45 for (i = 0; i < iters; ++i) { 45 for (i = 0; i < iters; ++i) {
46 struct timespec ts1, ts2; 46 s64 kt1, kt2;
47 int time_passed; 47 int time_passed;
48 48
49 ktime_get_ts(&ts1); 49 kt1 = ktime_get_ns();
50 udelay(usecs); 50 udelay(usecs);
51 ktime_get_ts(&ts2); 51 kt2 = ktime_get_ns();
52 time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); 52 time_passed = kt2 - kt1;
53 53
54 if (i == 0 || time_passed < min) 54 if (i == 0 || time_passed < min)
55 min = time_passed; 55 min = time_passed;
@@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v)
87 if (usecs > 0 && iters > 0) { 87 if (usecs > 0 && iters > 0) {
88 return udelay_test_single(s, usecs, iters); 88 return udelay_test_single(s, usecs, iters);
89 } else if (usecs == 0) { 89 } else if (usecs == 0) {
90 struct timespec ts; 90 struct timespec64 ts;
91 91
92 ktime_get_ts(&ts); 92 ktime_get_ts64(&ts);
93 seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", 93 seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n",
94 loops_per_jiffy, ts.tv_sec, ts.tv_nsec); 94 loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec);
95 seq_puts(s, "usage:\n"); 95 seq_puts(s, "usage:\n");
96 seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); 96 seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
97 seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); 97 seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 53d7184da0be..690b797f522e 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
75} 75}
76 76
77static struct clock_event_device ce_broadcast_hrtimer = { 77static struct clock_event_device ce_broadcast_hrtimer = {
78 .name = "bc_hrtimer",
78 .set_state_shutdown = bc_shutdown, 79 .set_state_shutdown = bc_shutdown,
79 .set_next_ktime = bc_set_next, 80 .set_next_ktime = bc_set_next,
80 .features = CLOCK_EVT_FEAT_ONESHOT | 81 .features = CLOCK_EVT_FEAT_ONESHOT |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 966a5a6fdd0a..f738251000fe 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { }
164DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); 164DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
165 165
166extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); 166extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
167void timer_clear_idle(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 536ada80f6dd..204fdc86863d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -31,7 +31,7 @@
31#include <trace/events/timer.h> 31#include <trace/events/timer.h>
32 32
33/* 33/*
34 * Per cpu nohz control structure 34 * Per-CPU nohz control structure
35 */ 35 */
36static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 36static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
37 37
@@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now)
61 if (delta.tv64 < tick_period.tv64) 61 if (delta.tv64 < tick_period.tv64)
62 return; 62 return;
63 63
64 /* Reevalute with jiffies_lock held */ 64 /* Reevaluate with jiffies_lock held */
65 write_seqlock(&jiffies_lock); 65 write_seqlock(&jiffies_lock);
66 66
67 delta = ktime_sub(now, last_jiffies_update); 67 delta = ktime_sub(now, last_jiffies_update);
@@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
116#ifdef CONFIG_NO_HZ_COMMON 116#ifdef CONFIG_NO_HZ_COMMON
117 /* 117 /*
118 * Check if the do_timer duty was dropped. We don't care about 118 * Check if the do_timer duty was dropped. We don't care about
119 * concurrency: This happens only when the cpu in charge went 119 * concurrency: This happens only when the CPU in charge went
120 * into a long sleep. If two cpus happen to assign themself to 120 * into a long sleep. If two CPUs happen to assign themselves to
121 * this duty, then the jiffies update is still serialized by 121 * this duty, then the jiffies update is still serialized by
122 * jiffies_lock. 122 * jiffies_lock.
123 */ 123 */
@@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi
349/* 349/*
350 * Re-evaluate the need for the tick as we switch the current task. 350 * Re-evaluate the need for the tick as we switch the current task.
351 * It might need the tick due to per task/process properties: 351 * It might need the tick due to per task/process properties:
352 * perf events, posix cpu timers, ... 352 * perf events, posix CPU timers, ...
353 */ 353 */
354void __tick_nohz_task_switch(void) 354void __tick_nohz_task_switch(void)
355{ 355{
@@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void)
509 * 509 *
510 * In case the sched_tick was stopped on this CPU, we have to check if jiffies 510 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
511 * must be updated. Otherwise an interrupt handler could use a stale jiffy 511 * must be updated. Otherwise an interrupt handler could use a stale jiffy
512 * value. We do this unconditionally on any cpu, as we don't know whether the 512 * value. We do this unconditionally on any CPU, as we don't know whether the
513 * cpu, which has the update task assigned is in a long sleep. 513 * CPU, which has the update task assigned is in a long sleep.
514 */ 514 */
515static void tick_nohz_update_jiffies(ktime_t now) 515static void tick_nohz_update_jiffies(ktime_t now)
516{ 516{
@@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
526} 526}
527 527
528/* 528/*
529 * Updates the per cpu time idle statistics counters 529 * Updates the per-CPU time idle statistics counters
530 */ 530 */
531static void 531static void
532update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) 532update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
@@ -566,12 +566,12 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
566} 566}
567 567
568/** 568/**
569 * get_cpu_idle_time_us - get the total idle time of a cpu 569 * get_cpu_idle_time_us - get the total idle time of a CPU
570 * @cpu: CPU number to query 570 * @cpu: CPU number to query
571 * @last_update_time: variable to store update time in. Do not update 571 * @last_update_time: variable to store update time in. Do not update
572 * counters if NULL. 572 * counters if NULL.
573 * 573 *
574 * Return the cummulative idle time (since boot) for a given 574 * Return the cumulative idle time (since boot) for a given
575 * CPU, in microseconds. 575 * CPU, in microseconds.
576 * 576 *
577 * This time is measured via accounting rather than sampling, 577 * This time is measured via accounting rather than sampling,
@@ -607,12 +607,12 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
607EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 607EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
608 608
609/** 609/**
610 * get_cpu_iowait_time_us - get the total iowait time of a cpu 610 * get_cpu_iowait_time_us - get the total iowait time of a CPU
611 * @cpu: CPU number to query 611 * @cpu: CPU number to query
612 * @last_update_time: variable to store update time in. Do not update 612 * @last_update_time: variable to store update time in. Do not update
613 * counters if NULL. 613 * counters if NULL.
614 * 614 *
615 * Return the cummulative iowait time (since boot) for a given 615 * Return the cumulative iowait time (since boot) for a given
616 * CPU, in microseconds. 616 * CPU, in microseconds.
617 * 617 *
618 * This time is measured via accounting rather than sampling, 618 * This time is measured via accounting rather than sampling,
@@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
700 delta = next_tick - basemono; 700 delta = next_tick - basemono;
701 if (delta <= (u64)TICK_NSEC) { 701 if (delta <= (u64)TICK_NSEC) {
702 tick.tv64 = 0; 702 tick.tv64 = 0;
703
704 /*
705 * Tell the timer code that the base is not idle, i.e. undo
706 * the effect of get_next_timer_interrupt():
707 */
708 timer_clear_idle();
703 /* 709 /*
704 * We've not stopped the tick yet, and there's a timer in the 710 * We've not stopped the tick yet, and there's a timer in the
705 * next period, so no point in stopping it either, bail. 711 * next period, so no point in stopping it either, bail.
@@ -726,14 +732,14 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
726 } 732 }
727 733
728 /* 734 /*
729 * If this cpu is the one which updates jiffies, then give up 735 * If this CPU is the one which updates jiffies, then give up
730 * the assignment and let it be taken by the cpu which runs 736 * the assignment and let it be taken by the CPU which runs
731 * the tick timer next, which might be this cpu as well. If we 737 * the tick timer next, which might be this CPU as well. If we
732 * don't drop this here the jiffies might be stale and 738 * don't drop this here the jiffies might be stale and
733 * do_timer() never invoked. Keep track of the fact that it 739 * do_timer() never invoked. Keep track of the fact that it
734 * was the one which had the do_timer() duty last. If this cpu 740 * was the one which had the do_timer() duty last. If this CPU
735 * is the one which had the do_timer() duty last, we limit the 741 * is the one which had the do_timer() duty last, we limit the
736 * sleep time to the timekeeping max_deferement value. 742 * sleep time to the timekeeping max_deferment value.
737 * Otherwise we can sleep as long as we want. 743 * Otherwise we can sleep as long as we want.
738 */ 744 */
739 delta = timekeeping_max_deferment(); 745 delta = timekeeping_max_deferment();
@@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
809 tick_do_update_jiffies64(now); 815 tick_do_update_jiffies64(now);
810 cpu_load_update_nohz_stop(); 816 cpu_load_update_nohz_stop();
811 817
818 /*
819 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
820 * the clock forward checks in the enqueue path:
821 */
822 timer_clear_idle();
823
812 calc_load_exit_idle(); 824 calc_load_exit_idle();
813 touch_softlockup_watchdog_sched(); 825 touch_softlockup_watchdog_sched();
814 /* 826 /*
@@ -841,9 +853,9 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
841static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) 853static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
842{ 854{
843 /* 855 /*
844 * If this cpu is offline and it is the one which updates 856 * If this CPU is offline and it is the one which updates
845 * jiffies, then give up the assignment and let it be taken by 857 * jiffies, then give up the assignment and let it be taken by
846 * the cpu which runs the tick timer next. If we don't drop 858 * the CPU which runs the tick timer next. If we don't drop
847 * this here the jiffies might be stale and do_timer() never 859 * this here the jiffies might be stale and do_timer() never
848 * invoked. 860 * invoked.
849 */ 861 */
@@ -896,11 +908,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
896 ktime_t now, expires; 908 ktime_t now, expires;
897 int cpu = smp_processor_id(); 909 int cpu = smp_processor_id();
898 910
899 now = tick_nohz_start_idle(ts);
900
901 if (can_stop_idle_tick(cpu, ts)) { 911 if (can_stop_idle_tick(cpu, ts)) {
902 int was_stopped = ts->tick_stopped; 912 int was_stopped = ts->tick_stopped;
903 913
914 now = tick_nohz_start_idle(ts);
904 ts->idle_calls++; 915 ts->idle_calls++;
905 916
906 expires = tick_nohz_stop_sched_tick(ts, now, cpu); 917 expires = tick_nohz_stop_sched_tick(ts, now, cpu);
@@ -933,11 +944,11 @@ void tick_nohz_idle_enter(void)
933 WARN_ON_ONCE(irqs_disabled()); 944 WARN_ON_ONCE(irqs_disabled());
934 945
935 /* 946 /*
936 * Update the idle state in the scheduler domain hierarchy 947 * Update the idle state in the scheduler domain hierarchy
937 * when tick_nohz_stop_sched_tick() is called from the idle loop. 948 * when tick_nohz_stop_sched_tick() is called from the idle loop.
938 * State will be updated to busy during the first busy tick after 949 * State will be updated to busy during the first busy tick after
939 * exiting idle. 950 * exiting idle.
940 */ 951 */
941 set_cpu_sd_state_idle(); 952 set_cpu_sd_state_idle();
942 953
943 local_irq_disable(); 954 local_irq_disable();
@@ -1092,35 +1103,6 @@ static void tick_nohz_switch_to_nohz(void)
1092 tick_nohz_activate(ts, NOHZ_MODE_LOWRES); 1103 tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
1093} 1104}
1094 1105
1095/*
1096 * When NOHZ is enabled and the tick is stopped, we need to kick the
1097 * tick timer from irq_enter() so that the jiffies update is kept
1098 * alive during long running softirqs. That's ugly as hell, but
1099 * correctness is key even if we need to fix the offending softirq in
1100 * the first place.
1101 *
1102 * Note, this is different to tick_nohz_restart. We just kick the
1103 * timer and do not touch the other magic bits which need to be done
1104 * when idle is left.
1105 */
1106static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
1107{
1108#if 0
1109 /* Switch back to 2.6.27 behaviour */
1110 ktime_t delta;
1111
1112 /*
1113 * Do not touch the tick device, when the next expiry is either
1114 * already reached or less/equal than the tick period.
1115 */
1116 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
1117 if (delta.tv64 <= tick_period.tv64)
1118 return;
1119
1120 tick_nohz_restart(ts, now);
1121#endif
1122}
1123
1124static inline void tick_nohz_irq_enter(void) 1106static inline void tick_nohz_irq_enter(void)
1125{ 1107{
1126 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); 1108 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
@@ -1131,10 +1113,8 @@ static inline void tick_nohz_irq_enter(void)
1131 now = ktime_get(); 1113 now = ktime_get();
1132 if (ts->idle_active) 1114 if (ts->idle_active)
1133 tick_nohz_stop_idle(ts, now); 1115 tick_nohz_stop_idle(ts, now);
1134 if (ts->tick_stopped) { 1116 if (ts->tick_stopped)
1135 tick_nohz_update_jiffies(now); 1117 tick_nohz_update_jiffies(now);
1136 tick_nohz_kick_tick(ts, now);
1137 }
1138} 1118}
1139 1119
1140#else 1120#else
@@ -1211,7 +1191,7 @@ void tick_setup_sched_timer(void)
1211 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1191 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1212 ts->sched_timer.function = tick_sched_timer; 1192 ts->sched_timer.function = tick_sched_timer;
1213 1193
1214 /* Get the next period (per cpu) */ 1194 /* Get the next period (per-CPU) */
1215 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 1195 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
1216 1196
1217 /* Offset the tick to avert jiffies_lock contention. */ 1197 /* Offset the tick to avert jiffies_lock contention. */
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
index 86628e755f38..7142580ad94f 100644
--- a/kernel/time/timeconv.c
+++ b/kernel/time/timeconv.c
@@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = {
67#define SECS_PER_DAY (SECS_PER_HOUR * 24) 67#define SECS_PER_DAY (SECS_PER_HOUR * 24)
68 68
69/** 69/**
70 * time_to_tm - converts the calendar time to local broken-down time 70 * time64_to_tm - converts the calendar time to local broken-down time
71 * 71 *
72 * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, 72 * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
73 * Coordinated Universal Time (UTC). 73 * Coordinated Universal Time (UTC).
74 * @offset offset seconds adding to totalsecs. 74 * @offset offset seconds adding to totalsecs.
75 * @result pointer to struct tm variable to receive broken-down time 75 * @result pointer to struct tm variable to receive broken-down time
76 */ 76 */
77void time_to_tm(time_t totalsecs, int offset, struct tm *result) 77void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)
78{ 78{
79 long days, rem, y; 79 long days, rem, y;
80 int remainder;
80 const unsigned short *ip; 81 const unsigned short *ip;
81 82
82 days = totalsecs / SECS_PER_DAY; 83 days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder);
83 rem = totalsecs % SECS_PER_DAY; 84 rem = remainder;
84 rem += offset; 85 rem += offset;
85 while (rem < 0) { 86 while (rem < 0) {
86 rem += SECS_PER_DAY; 87 rem += SECS_PER_DAY;
@@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result)
124 result->tm_mon = y; 125 result->tm_mon = y;
125 result->tm_mday = days + 1; 126 result->tm_mday = days + 1;
126} 127}
127EXPORT_SYMBOL(time_to_tm); 128EXPORT_SYMBOL(time64_to_tm);
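
The open-coded 64-bit '/' and '%' are replaced by div_s64_rem(), which also works on 32-bit architectures and, like C division, truncates toward zero; the existing while loop then normalizes a negative remainder. A small self-contained userspace check of that split, folding the two steps together (plain C, no kernel headers):

#include <stdio.h>

#define SECS_PER_DAY (24 * 60 * 60)

/* Same day/second split as above: quotient truncated toward zero, then the
 * remainder is normalized into [0, SECS_PER_DAY). */
static long long split_days(long long totalsecs, int *remainder)
{
	long long days = totalsecs / SECS_PER_DAY;
	int rem = (int)(totalsecs % SECS_PER_DAY);

	if (rem < 0) {
		rem += SECS_PER_DAY;
		days--;
	}
	*remainder = rem;
	return days;
}

int main(void)
{
	int rem;
	long long days = split_days(-1, &rem);

	printf("days=%lld rem=%d\n", days, rem);	/* days=-1 rem=86399 */
	return 0;
}
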
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 479d25cd3d4f..3b65746c7f15 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -480,10 +480,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
480 * users are removed, this can be killed. 480 * users are removed, this can be killed.
481 */ 481 */
482 remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); 482 remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
483 tk->tkr_mono.xtime_nsec -= remainder; 483 if (remainder != 0) {
484 tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; 484 tk->tkr_mono.xtime_nsec -= remainder;
485 tk->ntp_error += remainder << tk->ntp_error_shift; 485 tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
486 tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; 486 tk->ntp_error += remainder << tk->ntp_error_shift;
487 tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
488 }
487} 489}
488#else 490#else
489#define old_vsyscall_fixup(tk) 491#define old_vsyscall_fixup(tk)
@@ -2186,6 +2188,7 @@ struct timespec64 get_monotonic_coarse64(void)
2186 2188
2187 return now; 2189 return now;
2188} 2190}
2191EXPORT_SYMBOL(get_monotonic_coarse64);
2189 2192
2190/* 2193/*
2191 * Must hold jiffies_lock 2194 * Must hold jiffies_lock
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3a95f9728778..555670a5143c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -59,43 +59,153 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
59EXPORT_SYMBOL(jiffies_64); 59EXPORT_SYMBOL(jiffies_64);
60 60
61/* 61/*
62 * per-CPU timer vector definitions: 62 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
 63 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
64 * level has a different granularity.
65 *
66 * The level granularity is: LVL_CLK_DIV ^ lvl
67 * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
68 *
69 * The array level of a newly armed timer depends on the relative expiry
70 * time. The farther the expiry time is away the higher the array level and
71 * therefor the granularity becomes.
72 *
73 * Contrary to the original timer wheel implementation, which aims for 'exact'
74 * expiry of the timers, this implementation removes the need for recascading
75 * the timers into the lower array levels. The previous 'classic' timer wheel
76 * implementation of the kernel already violated the 'exact' expiry by adding
77 * slack to the expiry time to provide batched expiration. The granularity
78 * levels provide implicit batching.
79 *
80 * This is an optimization of the original timer wheel implementation for the
81 * majority of the timer wheel use cases: timeouts. The vast majority of
82 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
83 * the timeout expires it indicates that normal operation is disturbed, so it
84 * does not matter much whether the timeout comes with a slight delay.
85 *
 86 * The only exceptions to this are networking timers with a small expiry
87 * time. They rely on the granularity. Those fit into the first wheel level,
88 * which has HZ granularity.
89 *
 90 * We don't have cascading anymore. Timers with an expiry time above the
91 * capacity of the last wheel level are force expired at the maximum timeout
92 * value of the last wheel level. From data sampling we know that the maximum
93 * value observed is 5 days (network connection tracking), so this should not
94 * be an issue.
95 *
 96 * The currently chosen array constants are a good compromise between
97 * array size and granularity.
98 *
99 * This results in the following granularity and range levels:
100 *
101 * HZ 1000 steps
102 * Level Offset Granularity Range
103 * 0 0 1 ms 0 ms - 63 ms
104 * 1 64 8 ms 64 ms - 511 ms
105 * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s)
106 * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s)
107 * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m)
108 * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m)
109 * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h)
110 * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d)
111 * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d)
112 *
113 * HZ 300
114 * Level Offset Granularity Range
115 * 0 0 3 ms 0 ms - 210 ms
116 * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s)
117 * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s)
118 * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m)
119 * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m)
120 * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h)
121 * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h)
122 * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d)
123 * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
124 *
125 * HZ 250
126 * Level Offset Granularity Range
127 * 0 0 4 ms 0 ms - 255 ms
128 * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s)
129 * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s)
130 * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m)
131 * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m)
132 * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h)
133 * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
134 * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
135 * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
136 *
137 * HZ 100
138 * Level Offset Granularity Range
139 * 0 0 10 ms 0 ms - 630 ms
140 * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s)
141 * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s)
142 * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m)
143 * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m)
144 * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h)
145 * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d)
146 * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
63 */ 147 */
64#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
65#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
66#define TVN_SIZE (1 << TVN_BITS)
67#define TVR_SIZE (1 << TVR_BITS)
68#define TVN_MASK (TVN_SIZE - 1)
69#define TVR_MASK (TVR_SIZE - 1)
70#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
71
72struct tvec {
73 struct hlist_head vec[TVN_SIZE];
74};
75 148
76struct tvec_root { 149/* Clock divisor for the next level */
77 struct hlist_head vec[TVR_SIZE]; 150#define LVL_CLK_SHIFT 3
78}; 151#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT)
152#define LVL_CLK_MASK (LVL_CLK_DIV - 1)
153#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT)
154#define LVL_GRAN(n) (1UL << LVL_SHIFT(n))
79 155
80struct tvec_base { 156/*
81 spinlock_t lock; 157 * The time start value for each level to select the bucket at enqueue
82 struct timer_list *running_timer; 158 * time.
83 unsigned long timer_jiffies; 159 */
84 unsigned long next_timer; 160#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
85 unsigned long active_timers; 161
86 unsigned long all_timers; 162/* Size of each clock level */
87 int cpu; 163#define LVL_BITS 6
88 bool migration_enabled; 164#define LVL_SIZE (1UL << LVL_BITS)
89 bool nohz_active; 165#define LVL_MASK (LVL_SIZE - 1)
90 struct tvec_root tv1; 166#define LVL_OFFS(n) ((n) * LVL_SIZE)
91 struct tvec tv2; 167
92 struct tvec tv3; 168/* Level depth */
93 struct tvec tv4; 169#if HZ > 100
94 struct tvec tv5; 170# define LVL_DEPTH 9
95} ____cacheline_aligned; 171# else
172# define LVL_DEPTH 8
173#endif
174
175/* The cutoff (max. capacity of the wheel) */
176#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH))
177#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
178
179/*
180 * The resulting wheel size. If NOHZ is configured we allocate two
181 * wheels so we have a separate storage for the deferrable timers.
182 */
183#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
184
185#ifdef CONFIG_NO_HZ_COMMON
186# define NR_BASES 2
187# define BASE_STD 0
188# define BASE_DEF 1
189#else
190# define NR_BASES 1
191# define BASE_STD 0
192# define BASE_DEF 0
193#endif
96 194
195struct timer_base {
196 spinlock_t lock;
197 struct timer_list *running_timer;
198 unsigned long clk;
199 unsigned long next_expiry;
200 unsigned int cpu;
201 bool migration_enabled;
202 bool nohz_active;
203 bool is_idle;
204 DECLARE_BITMAP(pending_map, WHEEL_SIZE);
205 struct hlist_head vectors[WHEEL_SIZE];
206} ____cacheline_aligned;
97 207
98static DEFINE_PER_CPU(struct tvec_base, tvec_bases); 208static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
99 209
100#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 210#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
101unsigned int sysctl_timer_migration = 1; 211unsigned int sysctl_timer_migration = 1;
@@ -106,15 +216,17 @@ void timers_update_migration(bool update_nohz)
106 unsigned int cpu; 216 unsigned int cpu;
107 217
108 /* Avoid the loop, if nothing to update */ 218 /* Avoid the loop, if nothing to update */
109 if (this_cpu_read(tvec_bases.migration_enabled) == on) 219 if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
110 return; 220 return;
111 221
112 for_each_possible_cpu(cpu) { 222 for_each_possible_cpu(cpu) {
113 per_cpu(tvec_bases.migration_enabled, cpu) = on; 223 per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
224 per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
114 per_cpu(hrtimer_bases.migration_enabled, cpu) = on; 225 per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
115 if (!update_nohz) 226 if (!update_nohz)
116 continue; 227 continue;
117 per_cpu(tvec_bases.nohz_active, cpu) = true; 228 per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
229 per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
118 per_cpu(hrtimer_bases.nohz_active, cpu) = true; 230 per_cpu(hrtimer_bases.nohz_active, cpu) = true;
119 } 231 }
120} 232}
@@ -133,20 +245,6 @@ int timer_migration_handler(struct ctl_table *table, int write,
133 mutex_unlock(&mutex); 245 mutex_unlock(&mutex);
134 return ret; 246 return ret;
135} 247}
136
137static inline struct tvec_base *get_target_base(struct tvec_base *base,
138 int pinned)
139{
140 if (pinned || !base->migration_enabled)
141 return this_cpu_ptr(&tvec_bases);
142 return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
143}
144#else
145static inline struct tvec_base *get_target_base(struct tvec_base *base,
146 int pinned)
147{
148 return this_cpu_ptr(&tvec_bases);
149}
150#endif 248#endif
151 249
152static unsigned long round_jiffies_common(unsigned long j, int cpu, 250static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -351,101 +449,126 @@ unsigned long round_jiffies_up_relative(unsigned long j)
351} 449}
352EXPORT_SYMBOL_GPL(round_jiffies_up_relative); 450EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
353 451
354/** 452
355 * set_timer_slack - set the allowed slack for a timer 453static inline unsigned int timer_get_idx(struct timer_list *timer)
356 * @timer: the timer to be modified
357 * @slack_hz: the amount of time (in jiffies) allowed for rounding
358 *
359 * Set the amount of time, in jiffies, that a certain timer has
360 * in terms of slack. By setting this value, the timer subsystem
361 * will schedule the actual timer somewhere between
362 * the time mod_timer() asks for, and that time plus the slack.
363 *
364 * By setting the slack to -1, a percentage of the delay is used
365 * instead.
366 */
367void set_timer_slack(struct timer_list *timer, int slack_hz)
368{ 454{
369 timer->slack = slack_hz; 455 return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
370} 456}
371EXPORT_SYMBOL_GPL(set_timer_slack);
372 457
373static void 458static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
374__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
375{ 459{
376 unsigned long expires = timer->expires; 460 timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
377 unsigned long idx = expires - base->timer_jiffies; 461 idx << TIMER_ARRAYSHIFT;
378 struct hlist_head *vec; 462}
379 463
380 if (idx < TVR_SIZE) { 464/*
381 int i = expires & TVR_MASK; 465 * Helper function to calculate the array index for a given expiry
382 vec = base->tv1.vec + i; 466 * time.
383 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { 467 */
384 int i = (expires >> TVR_BITS) & TVN_MASK; 468static inline unsigned calc_index(unsigned expires, unsigned lvl)
385 vec = base->tv2.vec + i; 469{
386 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { 470 expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
387 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; 471 return LVL_OFFS(lvl) + (expires & LVL_MASK);
388 vec = base->tv3.vec + i; 472}
389 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { 473
390 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; 474static int calc_wheel_index(unsigned long expires, unsigned long clk)
391 vec = base->tv4.vec + i; 475{
392 } else if ((signed long) idx < 0) { 476 unsigned long delta = expires - clk;
393 /* 477 unsigned int idx;
394 * Can happen if you add a timer with expires == jiffies, 478
395 * or you set a timer to go off in the past 479 if (delta < LVL_START(1)) {
396 */ 480 idx = calc_index(expires, 0);
397 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); 481 } else if (delta < LVL_START(2)) {
482 idx = calc_index(expires, 1);
483 } else if (delta < LVL_START(3)) {
484 idx = calc_index(expires, 2);
485 } else if (delta < LVL_START(4)) {
486 idx = calc_index(expires, 3);
487 } else if (delta < LVL_START(5)) {
488 idx = calc_index(expires, 4);
489 } else if (delta < LVL_START(6)) {
490 idx = calc_index(expires, 5);
491 } else if (delta < LVL_START(7)) {
492 idx = calc_index(expires, 6);
493 } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
494 idx = calc_index(expires, 7);
495 } else if ((long) delta < 0) {
496 idx = clk & LVL_MASK;
398 } else { 497 } else {
399 int i; 498 /*
400 /* If the timeout is larger than MAX_TVAL (on 64-bit 499 * Force expire obscene large timeouts to expire at the
401 * architectures or with CONFIG_BASE_SMALL=1) then we 500 * capacity limit of the wheel.
402 * use the maximum timeout.
403 */ 501 */
404 if (idx > MAX_TVAL) { 502 if (expires >= WHEEL_TIMEOUT_CUTOFF)
405 idx = MAX_TVAL; 503 expires = WHEEL_TIMEOUT_MAX;
406 expires = idx + base->timer_jiffies; 504
407 } 505 idx = calc_index(expires, LVL_DEPTH - 1);
408 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
409 vec = base->tv5.vec + i;
410 } 506 }
507 return idx;
508}
509
510/*
511 * Enqueue the timer into the hash bucket, mark it pending in
512 * the bitmap and store the index in the timer flags.
513 */
514static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
515 unsigned int idx)
516{
517 hlist_add_head(&timer->entry, base->vectors + idx);
518 __set_bit(idx, base->pending_map);
519 timer_set_idx(timer, idx);
520}
521
522static void
523__internal_add_timer(struct timer_base *base, struct timer_list *timer)
524{
525 unsigned int idx;
411 526
412 hlist_add_head(&timer->entry, vec); 527 idx = calc_wheel_index(timer->expires, base->clk);
528 enqueue_timer(base, timer, idx);
413} 529}
414 530
415static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 531static void
532trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
416{ 533{
417 /* Advance base->jiffies, if the base is empty */ 534 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
418 if (!base->all_timers++) 535 return;
419 base->timer_jiffies = jiffies;
420 536
421 __internal_add_timer(base, timer);
422 /* 537 /*
423 * Update base->active_timers and base->next_timer 538 * TODO: This wants some optimizing similar to the code below, but we
539 * will do that when we switch from push to pull for deferrable timers.
424 */ 540 */
425 if (!(timer->flags & TIMER_DEFERRABLE)) { 541 if (timer->flags & TIMER_DEFERRABLE) {
426 if (!base->active_timers++ || 542 if (tick_nohz_full_cpu(base->cpu))
427 time_before(timer->expires, base->next_timer)) 543 wake_up_nohz_cpu(base->cpu);
428 base->next_timer = timer->expires; 544 return;
429 } 545 }
430 546
431 /* 547 /*
432 * Check whether the other CPU is in dynticks mode and needs 548 * We might have to IPI the remote CPU if the base is idle and the
433 * to be triggered to reevaluate the timer wheel. 549 * timer is not deferrable. If the other CPU is on the way to idle
434 * We are protected against the other CPU fiddling 550 * then it can't set base->is_idle as we hold the base lock:
435 * with the timer by holding the timer base lock. This also
436 * makes sure that a CPU on the way to stop its tick can not
437 * evaluate the timer wheel.
438 *
439 * Spare the IPI for deferrable timers on idle targets though.
440 * The next busy ticks will take care of it. Except full dynticks
441 * require special care against races with idle_cpu(), lets deal
442 * with that later.
443 */ 551 */
444 if (base->nohz_active) { 552 if (!base->is_idle)
445 if (!(timer->flags & TIMER_DEFERRABLE) || 553 return;
446 tick_nohz_full_cpu(base->cpu)) 554
447 wake_up_nohz_cpu(base->cpu); 555 /* Check whether this is the new first expiring timer: */
448 } 556 if (time_after_eq(timer->expires, base->next_expiry))
557 return;
558
559 /*
560 * Set the next expiry time and kick the CPU so it can reevaluate the
561 * wheel:
562 */
563 base->next_expiry = timer->expires;
564 wake_up_nohz_cpu(base->cpu);
565}
566
567static void
568internal_add_timer(struct timer_base *base, struct timer_list *timer)
569{
570 __internal_add_timer(base, timer);
571 trigger_dyntick_cpu(base, timer);
449} 572}
450 573
451#ifdef CONFIG_TIMER_STATS 574#ifdef CONFIG_TIMER_STATS
@@ -666,7 +789,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
666{ 789{
667 timer->entry.pprev = NULL; 790 timer->entry.pprev = NULL;
668 timer->flags = flags | raw_smp_processor_id(); 791 timer->flags = flags | raw_smp_processor_id();
669 timer->slack = -1;
670#ifdef CONFIG_TIMER_STATS 792#ifdef CONFIG_TIMER_STATS
671 timer->start_site = NULL; 793 timer->start_site = NULL;
672 timer->start_pid = -1; 794 timer->start_pid = -1;
@@ -706,54 +828,125 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending)
706 entry->next = LIST_POISON2; 828 entry->next = LIST_POISON2;
707} 829}
708 830
709static inline void 831static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
710detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
711{
712 detach_timer(timer, true);
713 if (!(timer->flags & TIMER_DEFERRABLE))
714 base->active_timers--;
715 base->all_timers--;
716}
717
718static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
719 bool clear_pending) 832 bool clear_pending)
720{ 833{
834 unsigned idx = timer_get_idx(timer);
835
721 if (!timer_pending(timer)) 836 if (!timer_pending(timer))
722 return 0; 837 return 0;
723 838
839 if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
840 __clear_bit(idx, base->pending_map);
841
724 detach_timer(timer, clear_pending); 842 detach_timer(timer, clear_pending);
725 if (!(timer->flags & TIMER_DEFERRABLE)) {
726 base->active_timers--;
727 if (timer->expires == base->next_timer)
728 base->next_timer = base->timer_jiffies;
729 }
730 /* If this was the last timer, advance base->jiffies */
731 if (!--base->all_timers)
732 base->timer_jiffies = jiffies;
733 return 1; 843 return 1;
734} 844}
735 845
846static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
847{
848 struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
849
850 /*
851 * If the timer is deferrable and nohz is active then we need to use
852 * the deferrable base.
853 */
854 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
855 (tflags & TIMER_DEFERRABLE))
856 base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
857 return base;
858}
859
860static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
861{
862 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
863
864 /*
865 * If the timer is deferrable and nohz is active then we need to use
866 * the deferrable base.
867 */
868 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
869 (tflags & TIMER_DEFERRABLE))
870 base = this_cpu_ptr(&timer_bases[BASE_DEF]);
871 return base;
872}
873
874static inline struct timer_base *get_timer_base(u32 tflags)
875{
876 return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
877}
878
879#ifdef CONFIG_NO_HZ_COMMON
880static inline struct timer_base *
881__get_target_base(struct timer_base *base, unsigned tflags)
882{
883#ifdef CONFIG_SMP
884 if ((tflags & TIMER_PINNED) || !base->migration_enabled)
885 return get_timer_this_cpu_base(tflags);
886 return get_timer_cpu_base(tflags, get_nohz_timer_target());
887#else
888 return get_timer_this_cpu_base(tflags);
889#endif
890}
891
892static inline void forward_timer_base(struct timer_base *base)
893{
894 /*
895 * We only forward the base when it's idle and we have a delta between
896 * base clock and jiffies.
897 */
898 if (!base->is_idle || (long) (jiffies - base->clk) < 2)
899 return;
900
901 /*
902 * If the next expiry value is > jiffies, then we fast forward to
903 * jiffies otherwise we forward to the next expiry value.
904 */
905 if (time_after(base->next_expiry, jiffies))
906 base->clk = jiffies;
907 else
908 base->clk = base->next_expiry;
909}
910#else
911static inline struct timer_base *
912__get_target_base(struct timer_base *base, unsigned tflags)
913{
914 return get_timer_this_cpu_base(tflags);
915}
916
917static inline void forward_timer_base(struct timer_base *base) { }
918#endif
919
920static inline struct timer_base *
921get_target_base(struct timer_base *base, unsigned tflags)
922{
923 struct timer_base *target = __get_target_base(base, tflags);
924
925 forward_timer_base(target);
926 return target;
927}
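
The forwarding rule above is easy to model in userspace: only an idle base that has fallen at least two jiffies behind is forwarded, and never past its own next_expiry, because otherwise calc_wheel_index() could see a negative delta for a still-pending timer. A minimal sketch, with time_after() replaced by a plain comparison (so jiffies wraparound is ignored):

#include <stdio.h>
#include <stdbool.h>

struct toy_base {
    unsigned long clk;
    unsigned long next_expiry;
    bool is_idle;
};

static void forward_timer_base(struct toy_base *base, unsigned long jiffies)
{
    /* Not idle, or less than two jiffies behind: nothing to do. */
    if (!base->is_idle || (long)(jiffies - base->clk) < 2)
        return;

    /* time_after(next_expiry, jiffies) in the kernel version. */
    if (jiffies < base->next_expiry)
        base->clk = jiffies;
    else
        base->clk = base->next_expiry;
}

int main(void)
{
    struct toy_base b = { .clk = 1000, .next_expiry = 1050, .is_idle = true };

    forward_timer_base(&b, 1040);   /* next_expiry still ahead */
    printf("clk = %lu\n", b.clk);   /* 1040 */

    forward_timer_base(&b, 1100);   /* next_expiry already passed */
    printf("clk = %lu\n", b.clk);   /* 1050 */
    return 0;
}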
928
736/* 929/*
737 * We are using hashed locking: holding per_cpu(tvec_bases).lock 930 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
738 * means that all timers which are tied to this base via timer->base are 931 * that all timers which are tied to this base are locked, and the base itself
739 * locked, and the base itself is locked too. 932 * is locked too.
740 * 933 *
741 * So __run_timers/migrate_timers can safely modify all timers which could 934 * So __run_timers/migrate_timers can safely modify all timers which could
742 * be found on ->tvX lists. 935 * be found in the base->vectors array.
743 * 936 *
744 * When the timer's base is locked and removed from the list, the 937 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
745 * TIMER_MIGRATING flag is set, FIXME 938 * to wait until the migration is done.
746 */ 939 */
747static struct tvec_base *lock_timer_base(struct timer_list *timer, 940static struct timer_base *lock_timer_base(struct timer_list *timer,
748 unsigned long *flags) 941 unsigned long *flags)
749 __acquires(timer->base->lock) 942 __acquires(timer->base->lock)
750{ 943{
751 for (;;) { 944 for (;;) {
945 struct timer_base *base;
752 u32 tf = timer->flags; 946 u32 tf = timer->flags;
753 struct tvec_base *base;
754 947
755 if (!(tf & TIMER_MIGRATING)) { 948 if (!(tf & TIMER_MIGRATING)) {
756 base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); 949 base = get_timer_base(tf);
757 spin_lock_irqsave(&base->lock, *flags); 950 spin_lock_irqsave(&base->lock, *flags);
758 if (timer->flags == tf) 951 if (timer->flags == tf)
759 return base; 952 return base;
@@ -764,13 +957,41 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
764} 957}
765 958
766static inline int 959static inline int
767__mod_timer(struct timer_list *timer, unsigned long expires, 960__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
768 bool pending_only, int pinned)
769{ 961{
770 struct tvec_base *base, *new_base; 962 struct timer_base *base, *new_base;
771 unsigned long flags; 963 unsigned int idx = UINT_MAX;
964 unsigned long clk = 0, flags;
772 int ret = 0; 965 int ret = 0;
773 966
967 /*
968 * This is a common optimization triggered by the networking code - if
969 * the timer is re-modified to have the same timeout or ends up in the
970 * same array bucket then just return:
971 */
972 if (timer_pending(timer)) {
973 if (timer->expires == expires)
974 return 1;
975 /*
976 * Take the current timer_jiffies of base, but without holding
977 * the lock!
978 */
979 base = get_timer_base(timer->flags);
980 clk = base->clk;
981
982 idx = calc_wheel_index(expires, clk);
983
984 /*
985 * Retrieve and compare the array index of the pending
986 * timer. If it matches set the expiry to the new value so a
987 * subsequent call will exit in the expires check above.
988 */
989 if (idx == timer_get_idx(timer)) {
990 timer->expires = expires;
991 return 1;
992 }
993 }
994
774 timer_stats_timer_set_start_info(timer); 995 timer_stats_timer_set_start_info(timer);
775 BUG_ON(!timer->function); 996 BUG_ON(!timer->function);
776 997
@@ -782,15 +1003,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
782 1003
783 debug_activate(timer, expires); 1004 debug_activate(timer, expires);
784 1005
785 new_base = get_target_base(base, pinned); 1006 new_base = get_target_base(base, timer->flags);
786 1007
787 if (base != new_base) { 1008 if (base != new_base) {
788 /* 1009 /*
789 * We are trying to schedule the timer on the local CPU. 1010 * We are trying to schedule the timer on the new base.
790 * However we can't change timer's base while it is running, 1011 * However we can't change timer's base while it is running,
791 * otherwise del_timer_sync() can't detect that the timer's 1012 * otherwise del_timer_sync() can't detect that the timer's
792 * handler has not yet finished. This also guarantees that 1013 * handler has not yet finished. This also guarantees that the
792 * handler has not yet finished. This also guarantees that 1013 * handler has not yet finished. This also guarantees that the
793 * the timer is serialized wrt itself. 1014 * timer is serialized wrt itself.
794 */ 1015 */
795 if (likely(base->running_timer != timer)) { 1016 if (likely(base->running_timer != timer)) {
796 /* See the comment in lock_timer_base() */ 1017 /* See the comment in lock_timer_base() */
@@ -805,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
805 } 1026 }
806 1027
807 timer->expires = expires; 1028 timer->expires = expires;
808 internal_add_timer(base, timer); 1029 /*
1030 * If 'idx' was calculated above and the base time did not advance
1031 * between calculating 'idx' and taking the lock, only enqueue_timer()
1032 * and trigger_dyntick_cpu() is required. Otherwise we need to
1033 * (re)calculate the wheel index via internal_add_timer().
1034 */
1035 if (idx != UINT_MAX && clk == base->clk) {
1036 enqueue_timer(base, timer, idx);
1037 trigger_dyntick_cpu(base, timer);
1038 } else {
1039 internal_add_timer(base, timer);
1040 }
809 1041
810out_unlock: 1042out_unlock:
811 spin_unlock_irqrestore(&base->lock, flags); 1043 spin_unlock_irqrestore(&base->lock, flags);
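
To make the "same array bucket" shortcut in __mod_timer() concrete: with the assumed constants from the earlier sketch, a timeout roughly 5000 jiffies out sits in a level whose granularity is 512 jiffies, so re-arming it a few hundred jiffies later leaves timer_get_idx() unchanged and only timer->expires has to be updated. A tiny standalone check (the level is hard-coded here; calc_wheel_index() would pick it from the delta):

#include <stdio.h>

#define LVL_CLK_SHIFT   3
#define LVL_SHIFT(n)    ((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)     (1UL << LVL_SHIFT(n))
#define LVL_SIZE        64UL
#define LVL_MASK        (LVL_SIZE - 1)
#define LVL_OFFS(n)     ((n) * LVL_SIZE)

/* Same rounding as calc_index() in the earlier model. */
static unsigned int bucket(unsigned long expires, unsigned int lvl)
{
    expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
    return LVL_OFFS(lvl) + (expires & LVL_MASK);
}

int main(void)
{
    unsigned long clk = 100000;

    /*
     * A ~5000 jiffy delta lands in level 3 under the assumed layout
     * (granularity 512 jiffies), so nudging the timeout by 300
     * jiffies does not move the bucket and the locked requeue can
     * be skipped.
     */
    printf("%u %u\n", bucket(clk + 5000, 3), bucket(clk + 5300, 3));
    return 0;
}

The locked path still rechecks that base->clk did not advance between the unlocked index calculation and taking the lock; if it did, internal_add_timer() recomputes the index.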
@@ -825,49 +1057,10 @@ out_unlock:
825 */ 1057 */
826int mod_timer_pending(struct timer_list *timer, unsigned long expires) 1058int mod_timer_pending(struct timer_list *timer, unsigned long expires)
827{ 1059{
828 return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); 1060 return __mod_timer(timer, expires, true);
829} 1061}
830EXPORT_SYMBOL(mod_timer_pending); 1062EXPORT_SYMBOL(mod_timer_pending);
831 1063
832/*
833 * Decide where to put the timer while taking the slack into account
834 *
835 * Algorithm:
836 * 1) calculate the maximum (absolute) time
837 * 2) calculate the highest bit where the expires and new max are different
838 * 3) use this bit to make a mask
839 * 4) use the bitmask to round down the maximum time, so that all last
840 * bits are zeros
841 */
842static inline
843unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
844{
845 unsigned long expires_limit, mask;
846 int bit;
847
848 if (timer->slack >= 0) {
849 expires_limit = expires + timer->slack;
850 } else {
851 long delta = expires - jiffies;
852
853 if (delta < 256)
854 return expires;
855
856 expires_limit = expires + delta / 256;
857 }
858 mask = expires ^ expires_limit;
859 if (mask == 0)
860 return expires;
861
862 bit = __fls(mask);
863
864 mask = (1UL << bit) - 1;
865
866 expires_limit = expires_limit & ~(mask);
867
868 return expires_limit;
869}
870
871/** 1064/**
872 * mod_timer - modify a timer's timeout 1065 * mod_timer - modify a timer's timeout
873 * @timer: the timer to be modified 1066 * @timer: the timer to be modified
@@ -890,49 +1083,11 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
890 */ 1083 */
891int mod_timer(struct timer_list *timer, unsigned long expires) 1084int mod_timer(struct timer_list *timer, unsigned long expires)
892{ 1085{
893 expires = apply_slack(timer, expires); 1086 return __mod_timer(timer, expires, false);
894
895 /*
896 * This is a common optimization triggered by the
897 * networking code - if the timer is re-modified
898 * to be the same thing then just return:
899 */
900 if (timer_pending(timer) && timer->expires == expires)
901 return 1;
902
903 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
904} 1087}
905EXPORT_SYMBOL(mod_timer); 1088EXPORT_SYMBOL(mod_timer);
906 1089
907/** 1090/**
908 * mod_timer_pinned - modify a timer's timeout
909 * @timer: the timer to be modified
910 * @expires: new timeout in jiffies
911 *
912 * mod_timer_pinned() is a way to update the expire field of an
913 * active timer (if the timer is inactive it will be activated)
914 * and to ensure that the timer is scheduled on the current CPU.
915 *
916 * Note that this does not prevent the timer from being migrated
917 * when the current CPU goes offline. If this is a problem for
918 * you, use CPU-hotplug notifiers to handle it correctly, for
919 * example, cancelling the timer when the corresponding CPU goes
920 * offline.
921 *
922 * mod_timer_pinned(timer, expires) is equivalent to:
923 *
924 * del_timer(timer); timer->expires = expires; add_timer(timer);
925 */
926int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
927{
928 if (timer->expires == expires && timer_pending(timer))
929 return 1;
930
931 return __mod_timer(timer, expires, false, TIMER_PINNED);
932}
933EXPORT_SYMBOL(mod_timer_pinned);
934
935/**
936 * add_timer - start a timer 1091 * add_timer - start a timer
937 * @timer: the timer to be added 1092 * @timer: the timer to be added
938 * 1093 *
@@ -962,13 +1117,14 @@ EXPORT_SYMBOL(add_timer);
962 */ 1117 */
963void add_timer_on(struct timer_list *timer, int cpu) 1118void add_timer_on(struct timer_list *timer, int cpu)
964{ 1119{
965 struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); 1120 struct timer_base *new_base, *base;
966 struct tvec_base *base;
967 unsigned long flags; 1121 unsigned long flags;
968 1122
969 timer_stats_timer_set_start_info(timer); 1123 timer_stats_timer_set_start_info(timer);
970 BUG_ON(timer_pending(timer) || !timer->function); 1124 BUG_ON(timer_pending(timer) || !timer->function);
971 1125
1126 new_base = get_timer_cpu_base(timer->flags, cpu);
1127
972 /* 1128 /*
973 * If @timer was on a different CPU, it should be migrated with the 1129 * If @timer was on a different CPU, it should be migrated with the
974 * old base locked to prevent other operations proceeding with the 1130 * old base locked to prevent other operations proceeding with the
@@ -1004,7 +1160,7 @@ EXPORT_SYMBOL_GPL(add_timer_on);
1004 */ 1160 */
1005int del_timer(struct timer_list *timer) 1161int del_timer(struct timer_list *timer)
1006{ 1162{
1007 struct tvec_base *base; 1163 struct timer_base *base;
1008 unsigned long flags; 1164 unsigned long flags;
1009 int ret = 0; 1165 int ret = 0;
1010 1166
@@ -1030,7 +1186,7 @@ EXPORT_SYMBOL(del_timer);
1030 */ 1186 */
1031int try_to_del_timer_sync(struct timer_list *timer) 1187int try_to_del_timer_sync(struct timer_list *timer)
1032{ 1188{
1033 struct tvec_base *base; 1189 struct timer_base *base;
1034 unsigned long flags; 1190 unsigned long flags;
1035 int ret = -1; 1191 int ret = -1;
1036 1192
@@ -1114,27 +1270,6 @@ int del_timer_sync(struct timer_list *timer)
1114EXPORT_SYMBOL(del_timer_sync); 1270EXPORT_SYMBOL(del_timer_sync);
1115#endif 1271#endif
1116 1272
1117static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1118{
1119 /* cascade all the timers from tv up one level */
1120 struct timer_list *timer;
1121 struct hlist_node *tmp;
1122 struct hlist_head tv_list;
1123
1124 hlist_move_list(tv->vec + index, &tv_list);
1125
1126 /*
1127 * We are removing _all_ timers from the list, so we
1128 * don't have to detach them individually.
1129 */
1130 hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
1131 /* No accounting, while moving them */
1132 __internal_add_timer(base, timer);
1133 }
1134
1135 return index;
1136}
1137
1138static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), 1273static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1139 unsigned long data) 1274 unsigned long data)
1140{ 1275{
@@ -1178,147 +1313,141 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1178 } 1313 }
1179} 1314}
1180 1315
1181#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) 1316static void expire_timers(struct timer_base *base, struct hlist_head *head)
1182
1183/**
1184 * __run_timers - run all expired timers (if any) on this CPU.
1185 * @base: the timer vector to be processed.
1186 *
1187 * This function cascades all vectors and executes all expired timer
1188 * vectors.
1189 */
1190static inline void __run_timers(struct tvec_base *base)
1191{ 1317{
1192 struct timer_list *timer; 1318 while (!hlist_empty(head)) {
1319 struct timer_list *timer;
1320 void (*fn)(unsigned long);
1321 unsigned long data;
1193 1322
1194 spin_lock_irq(&base->lock); 1323 timer = hlist_entry(head->first, struct timer_list, entry);
1324 timer_stats_account_timer(timer);
1195 1325
1196 while (time_after_eq(jiffies, base->timer_jiffies)) { 1326 base->running_timer = timer;
1197 struct hlist_head work_list; 1327 detach_timer(timer, true);
1198 struct hlist_head *head = &work_list;
1199 int index;
1200 1328
1201 if (!base->all_timers) { 1329 fn = timer->function;
1202 base->timer_jiffies = jiffies; 1330 data = timer->data;
1203 break; 1331
1332 if (timer->flags & TIMER_IRQSAFE) {
1333 spin_unlock(&base->lock);
1334 call_timer_fn(timer, fn, data);
1335 spin_lock(&base->lock);
1336 } else {
1337 spin_unlock_irq(&base->lock);
1338 call_timer_fn(timer, fn, data);
1339 spin_lock_irq(&base->lock);
1204 } 1340 }
1341 }
1342}
1205 1343
1206 index = base->timer_jiffies & TVR_MASK; 1344static int __collect_expired_timers(struct timer_base *base,
1345 struct hlist_head *heads)
1346{
1347 unsigned long clk = base->clk;
1348 struct hlist_head *vec;
1349 int i, levels = 0;
1350 unsigned int idx;
1207 1351
1208 /* 1352 for (i = 0; i < LVL_DEPTH; i++) {
1209 * Cascade timers: 1353 idx = (clk & LVL_MASK) + i * LVL_SIZE;
1210 */ 1354
1211 if (!index && 1355 if (__test_and_clear_bit(idx, base->pending_map)) {
1212 (!cascade(base, &base->tv2, INDEX(0))) && 1356 vec = base->vectors + idx;
1213 (!cascade(base, &base->tv3, INDEX(1))) && 1357 hlist_move_list(vec, heads++);
1214 !cascade(base, &base->tv4, INDEX(2))) 1358 levels++;
1215 cascade(base, &base->tv5, INDEX(3));
1216 ++base->timer_jiffies;
1217 hlist_move_list(base->tv1.vec + index, head);
1218 while (!hlist_empty(head)) {
1219 void (*fn)(unsigned long);
1220 unsigned long data;
1221 bool irqsafe;
1222
1223 timer = hlist_entry(head->first, struct timer_list, entry);
1224 fn = timer->function;
1225 data = timer->data;
1226 irqsafe = timer->flags & TIMER_IRQSAFE;
1227
1228 timer_stats_account_timer(timer);
1229
1230 base->running_timer = timer;
1231 detach_expired_timer(timer, base);
1232
1233 if (irqsafe) {
1234 spin_unlock(&base->lock);
1235 call_timer_fn(timer, fn, data);
1236 spin_lock(&base->lock);
1237 } else {
1238 spin_unlock_irq(&base->lock);
1239 call_timer_fn(timer, fn, data);
1240 spin_lock_irq(&base->lock);
1241 }
1242 } 1359 }
1360 /* Is it time to look at the next level? */
1361 if (clk & LVL_CLK_MASK)
1362 break;
1363 /* Shift clock for the next level granularity */
1364 clk >>= LVL_CLK_SHIFT;
1243 } 1365 }
1244 base->running_timer = NULL; 1366 return levels;
1245 spin_unlock_irq(&base->lock);
1246} 1367}
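
The per-tick cost of __collect_expired_timers() comes from the "is it time to look at the next level?" cut-off: with LVL_CLK_SHIFT = 3 as assumed earlier, level N is only inspected when the lower 3*N bits of base->clk are zero, i.e. every 8^N ticks. A short userspace sketch of just that cut-off:

#include <stdio.h>

#define LVL_CLK_SHIFT   3
#define LVL_CLK_MASK    ((1UL << LVL_CLK_SHIFT) - 1)
#define LVL_DEPTH       9

/* How many wheel levels does a single tick at 'clk' have to look at? */
static int levels_scanned(unsigned long clk)
{
    int levels = 0;

    for (int i = 0; i < LVL_DEPTH; i++) {
        levels++;                   /* this level is checked          */
        if (clk & LVL_CLK_MASK)     /* lower bits set: stop here      */
            break;
        clk >>= LVL_CLK_SHIFT;      /* on to the next, coarser level  */
    }
    return levels;
}

int main(void)
{
    unsigned long clks[] = { 1, 8, 64, 512, 4096 };

    for (int i = 0; i < 5; i++)
        printf("clk %4lu -> %d level(s)\n", clks[i], levels_scanned(clks[i]));
    return 0;
}

Most ticks touch only level 0; the coarser levels are visited exponentially less often, which is what replaces the old cascade work.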
1247 1368
1248#ifdef CONFIG_NO_HZ_COMMON 1369#ifdef CONFIG_NO_HZ_COMMON
1249/* 1370/*
1250 * Find out when the next timer event is due to happen. This 1371 * Find the next pending bucket of a level. Search from level start (@offset)
1251 * is used on S/390 to stop all activity when a CPU is idle. 1372 * + @clk upwards and if nothing there, search from start of the level
1252 * This function needs to be called with interrupts disabled. 1373 * (@offset) up to @offset + clk.
1374 */
1375static int next_pending_bucket(struct timer_base *base, unsigned offset,
1376 unsigned clk)
1377{
1378 unsigned pos, start = offset + clk;
1379 unsigned end = offset + LVL_SIZE;
1380
1381 pos = find_next_bit(base->pending_map, end, start);
1382 if (pos < end)
1383 return pos - start;
1384
1385 pos = find_next_bit(base->pending_map, start, offset);
1386 return pos < start ? pos + LVL_SIZE - start : -1;
1387}
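
next_pending_bucket() is a wraparound search within one level: first from the current slot to the end of the level, then from the start of the level back up to the current slot, returning the distance in buckets or -1 if the level is empty. A userspace model, with a trivial loop standing in for find_next_bit():

#include <stdio.h>
#include <stdbool.h>

#define LVL_SIZE 64

/* Toy find_next_bit(): first set slot in [start, end), or 'end'. */
static int find_next_set(const bool *map, int end, int start)
{
    for (int i = start; i < end; i++)
        if (map[i])
            return i;
    return end;
}

static int next_pending_bucket(const bool *pending, int offset, int clk)
{
    int start = offset + clk, end = offset + LVL_SIZE, pos;

    pos = find_next_set(pending, end, start);
    if (pos < end)
        return pos - start;

    pos = find_next_set(pending, start, offset);
    return pos < start ? pos + LVL_SIZE - start : -1;
}

int main(void)
{
    bool pending[LVL_SIZE] = { false };

    pending[5] = true;      /* one armed bucket at slot 5 */
    printf("%d\n", next_pending_bucket(pending, 0, 10));   /* wraps: 59 */
    printf("%d\n", next_pending_bucket(pending, 0, 3));    /* ahead:  2 */
    return 0;
}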
1388
1389/*
1390 * Search the first expiring timer in the various clock levels. Caller must
1391 * hold base->lock.
1253 */ 1392 */
1254static unsigned long __next_timer_interrupt(struct tvec_base *base) 1393static unsigned long __next_timer_interrupt(struct timer_base *base)
1255{ 1394{
1256 unsigned long timer_jiffies = base->timer_jiffies; 1395 unsigned long clk, next, adj;
1257 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; 1396 unsigned lvl, offset = 0;
1258 int index, slot, array, found = 0; 1397
1259 struct timer_list *nte; 1398 next = base->clk + NEXT_TIMER_MAX_DELTA;
1260 struct tvec *varray[4]; 1399 clk = base->clk;
1261 1400 for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
1262 /* Look for timer events in tv1. */ 1401 int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
1263 index = slot = timer_jiffies & TVR_MASK; 1402
1264 do { 1403 if (pos >= 0) {
1265 hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { 1404 unsigned long tmp = clk + (unsigned long) pos;
1266 if (nte->flags & TIMER_DEFERRABLE) 1405
1267 continue; 1406 tmp <<= LVL_SHIFT(lvl);
1268 1407 if (time_before(tmp, next))
1269 found = 1; 1408 next = tmp;
1270 expires = nte->expires;
1271 /* Look at the cascade bucket(s)? */
1272 if (!index || slot < index)
1273 goto cascade;
1274 return expires;
1275 } 1409 }
1276 slot = (slot + 1) & TVR_MASK; 1410 /*
1277 } while (slot != index); 1411 * Clock for the next level. If the current level clock lower
1278 1412 * bits are zero, we look at the next level as is. If not we
1279cascade: 1413 * need to advance it by one because that's going to be the
1280 /* Calculate the next cascade event */ 1414 * next expiring bucket in that level. base->clk is the next
1281 if (index) 1415 * expiring jiffie. So in case of:
1282 timer_jiffies += TVR_SIZE - index; 1416 *
1283 timer_jiffies >>= TVR_BITS; 1417 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1284 1418 * 0 0 0 0 0 0
1285 /* Check tv2-tv5. */ 1419 *
1286 varray[0] = &base->tv2; 1420 * we have to look at all levels @index 0. With
1287 varray[1] = &base->tv3; 1421 *
1288 varray[2] = &base->tv4; 1422 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1289 varray[3] = &base->tv5; 1423 * 0 0 0 0 0 2
1290 1424 *
1291 for (array = 0; array < 4; array++) { 1425 * LVL0 has the next expiring bucket @index 2. The upper
1292 struct tvec *varp = varray[array]; 1426 * levels have the next expiring bucket @index 1.
1293 1427 *
1294 index = slot = timer_jiffies & TVN_MASK; 1428 * In case that the propagation wraps the next level the same
1295 do { 1429 * rules apply:
1296 hlist_for_each_entry(nte, varp->vec + slot, entry) { 1430 *
1297 if (nte->flags & TIMER_DEFERRABLE) 1431 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1298 continue; 1432 * 0 0 0 0 F 2
1299 1433 *
1300 found = 1; 1434 * So after looking at LVL0 we get:
1301 if (time_before(nte->expires, expires)) 1435 *
1302 expires = nte->expires; 1436 * LVL5 LVL4 LVL3 LVL2 LVL1
1303 } 1437 * 0 0 0 1 0
1304 /* 1438 *
1305 * Do we still search for the first timer or are 1439 * So no propagation from LVL1 to LVL2 because that happened
1306 * we looking up the cascade buckets ? 1440 * with the add already, but then we need to propagate further
1307 */ 1441 * from LVL2 to LVL3.
1308 if (found) { 1442 *
1309 /* Look at the cascade bucket(s)? */ 1443 * So the simple check whether the lower bits of the current
1310 if (!index || slot < index) 1444 * level are 0 or not is sufficient for all cases.
1311 break; 1445 */
1312 return expires; 1446 adj = clk & LVL_CLK_MASK ? 1 : 0;
1313 } 1447 clk >>= LVL_CLK_SHIFT;
1314 slot = (slot + 1) & TVN_MASK; 1448 clk += adj;
1315 } while (slot != index);
1316
1317 if (index)
1318 timer_jiffies += TVN_SIZE - index;
1319 timer_jiffies >>= TVN_BITS;
1320 } 1449 }
1321 return expires; 1450 return next;
1322} 1451}
1323 1452
1324/* 1453/*
@@ -1364,7 +1493,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
1364 */ 1493 */
1365u64 get_next_timer_interrupt(unsigned long basej, u64 basem) 1494u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
1366{ 1495{
1367 struct tvec_base *base = this_cpu_ptr(&tvec_bases); 1496 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1368 u64 expires = KTIME_MAX; 1497 u64 expires = KTIME_MAX;
1369 unsigned long nextevt; 1498 unsigned long nextevt;
1370 1499
@@ -1376,19 +1505,80 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
1376 return expires; 1505 return expires;
1377 1506
1378 spin_lock(&base->lock); 1507 spin_lock(&base->lock);
1379 if (base->active_timers) { 1508 nextevt = __next_timer_interrupt(base);
1380 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1509 base->next_expiry = nextevt;
1381 base->next_timer = __next_timer_interrupt(base); 1510 /*
1382 nextevt = base->next_timer; 1511 * We have a fresh next event. Check whether we can forward the base:
1383 if (time_before_eq(nextevt, basej)) 1512 */
1384 expires = basem; 1513 if (time_after(nextevt, jiffies))
1385 else 1514 base->clk = jiffies;
1386 expires = basem + (nextevt - basej) * TICK_NSEC; 1515 else if (time_after(nextevt, base->clk))
1516 base->clk = nextevt;
1517
1518 if (time_before_eq(nextevt, basej)) {
1519 expires = basem;
1520 base->is_idle = false;
1521 } else {
1522 expires = basem + (nextevt - basej) * TICK_NSEC;
1523 /*
1524 * If we expect to sleep more than a tick, mark the base idle:
1525 */
1526 if ((expires - basem) > TICK_NSEC)
1527 base->is_idle = true;
1387 } 1528 }
1388 spin_unlock(&base->lock); 1529 spin_unlock(&base->lock);
1389 1530
1390 return cmp_next_hrtimer_event(basem, expires); 1531 return cmp_next_hrtimer_event(basem, expires);
1391} 1532}
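
The tail of get_next_timer_interrupt() converts the next expiry (in jiffies) into an absolute nanosecond deadline and decides whether the base may be marked idle. A worked example as a userspace sketch; TICK_NSEC below assumes HZ=1000, and jiffies wraparound handling via time_before_eq() is reduced to a plain comparison:

#include <stdio.h>

#define TICK_NSEC 1000000ULL    /* 1 ms tick, i.e. HZ=1000 (assumed) */

static unsigned long long next_event_ns(unsigned long nextevt,
                                        unsigned long basej,
                                        unsigned long long basem,
                                        int *is_idle)
{
    if (nextevt <= basej) {     /* next timer already (over)due */
        *is_idle = 0;
        return basem;
    }

    unsigned long long expires =
        basem + (unsigned long long)(nextevt - basej) * TICK_NSEC;

    /* Only flag idle for sleeps longer than one tick; shorter sleeps
     * leave the flag alone, as the kernel code does. */
    if (expires - basem > TICK_NSEC)
        *is_idle = 1;
    return expires;
}

int main(void)
{
    int idle = 0;
    unsigned long long e = next_event_ns(1010, 1000, 5000000000ULL, &idle);

    printf("deadline %llu ns, is_idle=%d\n", e, idle);  /* +10 ms, idle */
    return 0;
}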
1533
1534/**
1535 * timer_clear_idle - Clear the idle state of the timer base
1536 *
1537 * Called with interrupts disabled
1538 */
1539void timer_clear_idle(void)
1540{
1541 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1542
1543 /*
1544 * We do this unlocked. The worst outcome is a remote enqueue sending
1545 * a pointless IPI, but taking the lock would just make the window for
1546 * sending the IPI a few instructions smaller for the cost of taking
1547 * the lock in the exit from idle path.
1548 */
1549 base->is_idle = false;
1550}
1551
1552static int collect_expired_timers(struct timer_base *base,
1553 struct hlist_head *heads)
1554{
1555 /*
1556 * NOHZ optimization. After a long idle sleep we need to forward the
1557 * base to current jiffies. Avoid a loop by searching the bitfield for
1558 * the next expiring timer.
1559 */
1560 if ((long)(jiffies - base->clk) > 2) {
1561 unsigned long next = __next_timer_interrupt(base);
1562
1563 /*
1564 * If the next timer is ahead of time forward to current
1565 * jiffies, otherwise forward to the next expiry time:
1566 */
1567 if (time_after(next, jiffies)) {
1568 /* The call site will increment clock! */
1569 base->clk = jiffies - 1;
1570 return 0;
1571 }
1572 base->clk = next;
1573 }
1574 return __collect_expired_timers(base, heads);
1575}
1576#else
1577static inline int collect_expired_timers(struct timer_base *base,
1578 struct hlist_head *heads)
1579{
1580 return __collect_expired_timers(base, heads);
1581}
1392#endif 1582#endif
1393 1583
1394/* 1584/*
@@ -1411,15 +1601,42 @@ void update_process_times(int user_tick)
1411 run_posix_cpu_timers(p); 1601 run_posix_cpu_timers(p);
1412} 1602}
1413 1603
1604/**
1605 * __run_timers - run all expired timers (if any) on this CPU.
1606 * @base: the timer vector to be processed.
1607 */
1608static inline void __run_timers(struct timer_base *base)
1609{
1610 struct hlist_head heads[LVL_DEPTH];
1611 int levels;
1612
1613 if (!time_after_eq(jiffies, base->clk))
1614 return;
1615
1616 spin_lock_irq(&base->lock);
1617
1618 while (time_after_eq(jiffies, base->clk)) {
1619
1620 levels = collect_expired_timers(base, heads);
1621 base->clk++;
1622
1623 while (levels--)
1624 expire_timers(base, heads + levels);
1625 }
1626 base->running_timer = NULL;
1627 spin_unlock_irq(&base->lock);
1628}
1629
1414/* 1630/*
1415 * This function runs timers and the timer-tq in bottom half context. 1631 * This function runs timers and the timer-tq in bottom half context.
1416 */ 1632 */
1417static void run_timer_softirq(struct softirq_action *h) 1633static void run_timer_softirq(struct softirq_action *h)
1418{ 1634{
1419 struct tvec_base *base = this_cpu_ptr(&tvec_bases); 1635 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1420 1636
1421 if (time_after_eq(jiffies, base->timer_jiffies)) 1637 __run_timers(base);
1422 __run_timers(base); 1638 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
1639 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
1423} 1640}
1424 1641
1425/* 1642/*
@@ -1427,7 +1644,18 @@ static void run_timer_softirq(struct softirq_action *h)
1427 */ 1644 */
1428void run_local_timers(void) 1645void run_local_timers(void)
1429{ 1646{
1647 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1648
1430 hrtimer_run_queues(); 1649 hrtimer_run_queues();
1650 /* Raise the softirq only if required. */
1651 if (time_before(jiffies, base->clk)) {
1652 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
1653 return;
1654 /* CPU is awake, so check the deferrable base. */
1655 base++;
1656 if (time_before(jiffies, base->clk))
1657 return;
1658 }
1431 raise_softirq(TIMER_SOFTIRQ); 1659 raise_softirq(TIMER_SOFTIRQ);
1432} 1660}
1433 1661
@@ -1512,7 +1740,7 @@ signed long __sched schedule_timeout(signed long timeout)
1512 expire = timeout + jiffies; 1740 expire = timeout + jiffies;
1513 1741
1514 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); 1742 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1515 __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); 1743 __mod_timer(&timer, expire, false);
1516 schedule(); 1744 schedule();
1517 del_singleshot_timer_sync(&timer); 1745 del_singleshot_timer_sync(&timer);
1518 1746
@@ -1563,87 +1791,62 @@ signed long __sched schedule_timeout_idle(signed long timeout)
1563EXPORT_SYMBOL(schedule_timeout_idle); 1791EXPORT_SYMBOL(schedule_timeout_idle);
1564 1792
1565#ifdef CONFIG_HOTPLUG_CPU 1793#ifdef CONFIG_HOTPLUG_CPU
1566static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) 1794static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
1567{ 1795{
1568 struct timer_list *timer; 1796 struct timer_list *timer;
1569 int cpu = new_base->cpu; 1797 int cpu = new_base->cpu;
1570 1798
1571 while (!hlist_empty(head)) { 1799 while (!hlist_empty(head)) {
1572 timer = hlist_entry(head->first, struct timer_list, entry); 1800 timer = hlist_entry(head->first, struct timer_list, entry);
1573 /* We ignore the accounting on the dying cpu */
1574 detach_timer(timer, false); 1801 detach_timer(timer, false);
1575 timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; 1802 timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
1576 internal_add_timer(new_base, timer); 1803 internal_add_timer(new_base, timer);
1577 } 1804 }
1578} 1805}
1579 1806
1580static void migrate_timers(int cpu) 1807int timers_dead_cpu(unsigned int cpu)
1581{ 1808{
1582 struct tvec_base *old_base; 1809 struct timer_base *old_base;
1583 struct tvec_base *new_base; 1810 struct timer_base *new_base;
1584 int i; 1811 int b, i;
1585 1812
1586 BUG_ON(cpu_online(cpu)); 1813 BUG_ON(cpu_online(cpu));
1587 old_base = per_cpu_ptr(&tvec_bases, cpu);
1588 new_base = get_cpu_ptr(&tvec_bases);
1589 /*
1590 * The caller is globally serialized and nobody else
1591 * takes two locks at once, deadlock is not possible.
1592 */
1593 spin_lock_irq(&new_base->lock);
1594 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1595
1596 BUG_ON(old_base->running_timer);
1597
1598 for (i = 0; i < TVR_SIZE; i++)
1599 migrate_timer_list(new_base, old_base->tv1.vec + i);
1600 for (i = 0; i < TVN_SIZE; i++) {
1601 migrate_timer_list(new_base, old_base->tv2.vec + i);
1602 migrate_timer_list(new_base, old_base->tv3.vec + i);
1603 migrate_timer_list(new_base, old_base->tv4.vec + i);
1604 migrate_timer_list(new_base, old_base->tv5.vec + i);
1605 }
1606 1814
1607 old_base->active_timers = 0; 1815 for (b = 0; b < NR_BASES; b++) {
1608 old_base->all_timers = 0; 1816 old_base = per_cpu_ptr(&timer_bases[b], cpu);
1817 new_base = get_cpu_ptr(&timer_bases[b]);
1818 /*
1819 * The caller is globally serialized and nobody else
1820 * takes two locks at once, deadlock is not possible.
1821 */
1822 spin_lock_irq(&new_base->lock);
1823 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1609 1824
1610 spin_unlock(&old_base->lock); 1825 BUG_ON(old_base->running_timer);
1611 spin_unlock_irq(&new_base->lock);
1612 put_cpu_ptr(&tvec_bases);
1613}
1614 1826
1615static int timer_cpu_notify(struct notifier_block *self, 1827 for (i = 0; i < WHEEL_SIZE; i++)
1616 unsigned long action, void *hcpu) 1828 migrate_timer_list(new_base, old_base->vectors + i);
1617{
1618 switch (action) {
1619 case CPU_DEAD:
1620 case CPU_DEAD_FROZEN:
1621 migrate_timers((long)hcpu);
1622 break;
1623 default:
1624 break;
1625 }
1626 1829
1627 return NOTIFY_OK; 1830 spin_unlock(&old_base->lock);
1831 spin_unlock_irq(&new_base->lock);
1832 put_cpu_ptr(&timer_bases);
1833 }
1834 return 0;
1628} 1835}
1629 1836
1630static inline void timer_register_cpu_notifier(void)
1631{
1632 cpu_notifier(timer_cpu_notify, 0);
1633}
1634#else
1635static inline void timer_register_cpu_notifier(void) { }
1636#endif /* CONFIG_HOTPLUG_CPU */ 1837#endif /* CONFIG_HOTPLUG_CPU */
1637 1838
1638static void __init init_timer_cpu(int cpu) 1839static void __init init_timer_cpu(int cpu)
1639{ 1840{
1640 struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); 1841 struct timer_base *base;
1641 1842 int i;
1642 base->cpu = cpu;
1643 spin_lock_init(&base->lock);
1644 1843
1645 base->timer_jiffies = jiffies; 1844 for (i = 0; i < NR_BASES; i++) {
1646 base->next_timer = base->timer_jiffies; 1845 base = per_cpu_ptr(&timer_bases[i], cpu);
1846 base->cpu = cpu;
1847 spin_lock_init(&base->lock);
1848 base->clk = jiffies;
1849 }
1647} 1850}
1648 1851
1649static void __init init_timer_cpus(void) 1852static void __init init_timer_cpus(void)
@@ -1658,7 +1861,6 @@ void __init init_timers(void)
1658{ 1861{
1659 init_timer_cpus(); 1862 init_timer_cpus();
1660 init_timer_stats(); 1863 init_timer_stats();
1661 timer_register_cpu_notifier();
1662 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 1864 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1663} 1865}
1664 1866
@@ -1702,9 +1904,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max)
1702} 1904}
1703 1905
1704/** 1906/**
1705 * usleep_range - Drop in replacement for udelay where wakeup is flexible 1907 * usleep_range - Sleep for an approximate time
1706 * @min: Minimum time in usecs to sleep 1908 * @min: Minimum time in usecs to sleep
1707 * @max: Maximum time in usecs to sleep 1909 * @max: Maximum time in usecs to sleep
1910 *
1911 * In non-atomic context where the exact wakeup time is flexible, use
1912 * usleep_range() instead of udelay(). The sleep improves responsiveness
1913 * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
1914 * power usage by allowing hrtimers to take advantage of an already-
1915 * scheduled interrupt instead of scheduling a new one just for this sleep.
1708 */ 1916 */
1709void __sched usleep_range(unsigned long min, unsigned long max) 1917void __sched usleep_range(unsigned long min, unsigned long max)
1710{ 1918{
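
A driver-style usage sketch of the updated kernel-doc: poll a status bit from sleepable context with a flexible wakeup window instead of busy-waiting in udelay(). Everything except usleep_range() itself is hypothetical here (the ready() callback stands in for reading some device register).

#include <linux/types.h>
#include <linux/delay.h>
#include <linux/errno.h>

/* 'ready' is a stand-in for polling a hypothetical device status bit. */
static int foo_wait_ready(bool (*ready)(void *ctx), void *ctx)
{
    int tries = 50;

    while (tries--) {
        if (ready(ctx))
            return 0;
        /*
         * Sleepable context: any wakeup between 100 and 200 usecs is
         * acceptable, so let hrtimers batch it with an already
         * scheduled interrupt instead of hogging the CPU in udelay().
         */
        usleep_range(100, 200);
    }
    return -ETIMEDOUT;
}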
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1adecb4b87c8..087204c733eb 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr)
279 279
280static int tstats_show(struct seq_file *m, void *v) 280static int tstats_show(struct seq_file *m, void *v)
281{ 281{
282 struct timespec period; 282 struct timespec64 period;
283 struct entry *entry; 283 struct entry *entry;
284 unsigned long ms; 284 unsigned long ms;
285 long events = 0; 285 long events = 0;
@@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v)
295 295
296 time = ktime_sub(time_stop, time_start); 296 time = ktime_sub(time_stop, time_start);
297 297
298 period = ktime_to_timespec(time); 298 period = ktime_to_timespec64(time);
299 ms = period.tv_nsec / 1000000; 299 ms = period.tv_nsec / 1000000;
300 300
301 seq_puts(m, "Timer Stats Version: v0.3\n"); 301 seq_puts(m, "Timer Stats Version: v0.3\n");
302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); 302 seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms);
303 if (atomic_read(&overflow_count)) 303 if (atomic_read(&overflow_count))
304 seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); 304 seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
305 seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); 305 seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
diff --git a/kernel/torture.c b/kernel/torture.c
index fa0bdeee17ac..75961b3decfe 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -82,6 +82,104 @@ static int min_online = -1;
82static int max_online; 82static int max_online;
83 83
84/* 84/*
85 * Attempt to take a CPU offline. Return false if the CPU is already
86 * offline or if it is not subject to CPU-hotplug operations. The
87 * caller can detect other failures by looking at the statistics.
88 */
89bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
90 unsigned long *sum_offl, int *min_offl, int *max_offl)
91{
92 unsigned long delta;
93 int ret;
94 unsigned long starttime;
95
96 if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
97 return false;
98
99 if (verbose)
100 pr_alert("%s" TORTURE_FLAG
101 "torture_onoff task: offlining %d\n",
102 torture_type, cpu);
103 starttime = jiffies;
104 (*n_offl_attempts)++;
105 ret = cpu_down(cpu);
106 if (ret) {
107 if (verbose)
108 pr_alert("%s" TORTURE_FLAG
109 "torture_onoff task: offline %d failed: errno %d\n",
110 torture_type, cpu, ret);
111 } else {
112 if (verbose)
113 pr_alert("%s" TORTURE_FLAG
114 "torture_onoff task: offlined %d\n",
115 torture_type, cpu);
116 (*n_offl_successes)++;
117 delta = jiffies - starttime;
118 *sum_offl += delta;
119 if (*min_offl < 0) {
120 *min_offl = delta;
121 *max_offl = delta;
122 }
123 if (*min_offl > delta)
124 *min_offl = delta;
125 if (*max_offl < delta)
126 *max_offl = delta;
127 }
128
129 return true;
130}
131EXPORT_SYMBOL_GPL(torture_offline);
132
133/*
134 * Attempt to bring a CPU online. Return false if the CPU is already
135 * online or if it is not subject to CPU-hotplug operations. The
136 * caller can detect other failures by looking at the statistics.
137 */
138bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
139 unsigned long *sum_onl, int *min_onl, int *max_onl)
140{
141 unsigned long delta;
142 int ret;
143 unsigned long starttime;
144
145 if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
146 return false;
147
148 if (verbose)
149 pr_alert("%s" TORTURE_FLAG
150 "torture_onoff task: onlining %d\n",
151 torture_type, cpu);
152 starttime = jiffies;
153 (*n_onl_attempts)++;
154 ret = cpu_up(cpu);
155 if (ret) {
156 if (verbose)
157 pr_alert("%s" TORTURE_FLAG
158 "torture_onoff task: online %d failed: errno %d\n",
159 torture_type, cpu, ret);
160 } else {
161 if (verbose)
162 pr_alert("%s" TORTURE_FLAG
163 "torture_onoff task: onlined %d\n",
164 torture_type, cpu);
165 (*n_onl_successes)++;
166 delta = jiffies - starttime;
167 *sum_onl += delta;
168 if (*min_onl < 0) {
169 *min_onl = delta;
170 *max_onl = delta;
171 }
172 if (*min_onl > delta)
173 *min_onl = delta;
174 if (*max_onl < delta)
175 *max_onl = delta;
176 }
177
178 return true;
179}
180EXPORT_SYMBOL_GPL(torture_online);
181
182/*
85 * Execute random CPU-hotplug operations at the interval specified 183 * Execute random CPU-hotplug operations at the interval specified
86 * by the onoff_interval. 184 * by the onoff_interval.
87 */ 185 */
@@ -89,16 +187,19 @@ static int
89torture_onoff(void *arg) 187torture_onoff(void *arg)
90{ 188{
91 int cpu; 189 int cpu;
92 unsigned long delta;
93 int maxcpu = -1; 190 int maxcpu = -1;
94 DEFINE_TORTURE_RANDOM(rand); 191 DEFINE_TORTURE_RANDOM(rand);
95 int ret;
96 unsigned long starttime;
97 192
98 VERBOSE_TOROUT_STRING("torture_onoff task started"); 193 VERBOSE_TOROUT_STRING("torture_onoff task started");
99 for_each_online_cpu(cpu) 194 for_each_online_cpu(cpu)
100 maxcpu = cpu; 195 maxcpu = cpu;
101 WARN_ON(maxcpu < 0); 196 WARN_ON(maxcpu < 0);
197
198 if (maxcpu == 0) {
199 VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled");
200 goto stop;
201 }
202
102 if (onoff_holdoff > 0) { 203 if (onoff_holdoff > 0) {
103 VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); 204 VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
104 schedule_timeout_interruptible(onoff_holdoff); 205 schedule_timeout_interruptible(onoff_holdoff);
@@ -106,69 +207,16 @@ torture_onoff(void *arg)
106 } 207 }
107 while (!torture_must_stop()) { 208 while (!torture_must_stop()) {
108 cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); 209 cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
109 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { 210 if (!torture_offline(cpu,
110 if (verbose) 211 &n_offline_attempts, &n_offline_successes,
111 pr_alert("%s" TORTURE_FLAG 212 &sum_offline, &min_offline, &max_offline))
112 "torture_onoff task: offlining %d\n", 213 torture_online(cpu,
113 torture_type, cpu); 214 &n_online_attempts, &n_online_successes,
114 starttime = jiffies; 215 &sum_online, &min_online, &max_online);
115 n_offline_attempts++;
116 ret = cpu_down(cpu);
117 if (ret) {
118 if (verbose)
119 pr_alert("%s" TORTURE_FLAG
120 "torture_onoff task: offline %d failed: errno %d\n",
121 torture_type, cpu, ret);
122 } else {
123 if (verbose)
124 pr_alert("%s" TORTURE_FLAG
125 "torture_onoff task: offlined %d\n",
126 torture_type, cpu);
127 n_offline_successes++;
128 delta = jiffies - starttime;
129 sum_offline += delta;
130 if (min_offline < 0) {
131 min_offline = delta;
132 max_offline = delta;
133 }
134 if (min_offline > delta)
135 min_offline = delta;
136 if (max_offline < delta)
137 max_offline = delta;
138 }
139 } else if (cpu_is_hotpluggable(cpu)) {
140 if (verbose)
141 pr_alert("%s" TORTURE_FLAG
142 "torture_onoff task: onlining %d\n",
143 torture_type, cpu);
144 starttime = jiffies;
145 n_online_attempts++;
146 ret = cpu_up(cpu);
147 if (ret) {
148 if (verbose)
149 pr_alert("%s" TORTURE_FLAG
150 "torture_onoff task: online %d failed: errno %d\n",
151 torture_type, cpu, ret);
152 } else {
153 if (verbose)
154 pr_alert("%s" TORTURE_FLAG
155 "torture_onoff task: onlined %d\n",
156 torture_type, cpu);
157 n_online_successes++;
158 delta = jiffies - starttime;
159 sum_online += delta;
160 if (min_online < 0) {
161 min_online = delta;
162 max_online = delta;
163 }
164 if (min_online > delta)
165 min_online = delta;
166 if (max_online < delta)
167 max_online = delta;
168 }
169 }
170 schedule_timeout_interruptible(onoff_interval); 216 schedule_timeout_interruptible(onoff_interval);
171 } 217 }
218
219stop:
172 torture_kthread_stopping("torture_onoff"); 220 torture_kthread_stopping("torture_onoff");
173 return 0; 221 return 0;
174} 222}
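
With the hotplug attempt factored out, other torture tests can reuse torture_offline()/torture_online() with their own statistics. A minimal caller sketch, assuming the prototypes are made visible through include/linux/torture.h as the EXPORT_SYMBOL_GPL lines suggest; all names other than the two helpers are hypothetical.

#include <linux/types.h>
#include <linux/torture.h>

/* Caller-owned statistics for a hypothetical hotplug stress loop. */
static long n_offl_attempts, n_offl_successes;
static long n_onl_attempts, n_onl_successes;
static unsigned long sum_offl, sum_onl;
static int min_offl = -1, max_offl, min_onl = -1, max_onl;

static void stress_one_cpu(int cpu)
{
    /*
     * Try to take the CPU down; if it is already offline (or not
     * hotpluggable) torture_offline() returns false and we try to
     * bring it up instead. All counters stay with the caller.
     */
    if (!torture_offline(cpu, &n_offl_attempts, &n_offl_successes,
                         &sum_offl, &min_offl, &max_offl))
        torture_online(cpu, &n_onl_attempts, &n_onl_successes,
                       &sum_onl, &min_onl, &max_onl);
}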
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fafeaf803bd0..f4b86e8ca1e7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -542,6 +542,7 @@ config HIST_TRIGGERS
542 bool "Histogram triggers" 542 bool "Histogram triggers"
543 depends on ARCH_HAVE_NMI_SAFE_CMPXCHG 543 depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
544 select TRACING_MAP 544 select TRACING_MAP
545 select TRACING
545 default n 546 default n
546 help 547 help
547 Hist triggers allow one or more arbitrary trace event fields 548 Hist triggers allow one or more arbitrary trace event fields
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9aef8654e90d..fb345cd11883 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk)
127 127
128static void trace_note_time(struct blk_trace *bt) 128static void trace_note_time(struct blk_trace *bt)
129{ 129{
130 struct timespec now; 130 struct timespec64 now;
131 unsigned long flags; 131 unsigned long flags;
132 u32 words[2]; 132 u32 words[2];
133 133
134 getnstimeofday(&now); 134 /* need to check user space to see if this breaks in y2038 or y2106 */
135 words[0] = now.tv_sec; 135 ktime_get_real_ts64(&now);
136 words[0] = (u32)now.tv_sec;
136 words[1] = now.tv_nsec; 137 words[1] = now.tv_nsec;
137 138
138 local_irq_save(flags); 139 local_irq_save(flags);
@@ -189,6 +190,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
189 BLK_TC_ACT(BLK_TC_WRITE) }; 190 BLK_TC_ACT(BLK_TC_WRITE) };
190 191
191#define BLK_TC_RAHEAD BLK_TC_AHEAD 192#define BLK_TC_RAHEAD BLK_TC_AHEAD
193#define BLK_TC_PREFLUSH BLK_TC_FLUSH
192 194
193/* The ilog2() calls fall out because they're constant */ 195/* The ilog2() calls fall out because they're constant */
194#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ 196#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
@@ -199,7 +201,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
199 * blk_io_trace structure and places it in a per-cpu subbuffer. 201 * blk_io_trace structure and places it in a per-cpu subbuffer.
200 */ 202 */
201static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, 203static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
202 int rw, u32 what, int error, int pdu_len, void *pdu_data) 204 int op, int op_flags, u32 what, int error, int pdu_len,
205 void *pdu_data)
203{ 206{
204 struct task_struct *tsk = current; 207 struct task_struct *tsk = current;
205 struct ring_buffer_event *event = NULL; 208 struct ring_buffer_event *event = NULL;
@@ -214,13 +217,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
214 if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) 217 if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
215 return; 218 return;
216 219
217 what |= ddir_act[rw & WRITE]; 220 what |= ddir_act[op_is_write(op) ? WRITE : READ];
218 what |= MASK_TC_BIT(rw, SYNC); 221 what |= MASK_TC_BIT(op_flags, SYNC);
219 what |= MASK_TC_BIT(rw, RAHEAD); 222 what |= MASK_TC_BIT(op_flags, RAHEAD);
220 what |= MASK_TC_BIT(rw, META); 223 what |= MASK_TC_BIT(op_flags, META);
221 what |= MASK_TC_BIT(rw, DISCARD); 224 what |= MASK_TC_BIT(op_flags, PREFLUSH);
222 what |= MASK_TC_BIT(rw, FLUSH); 225 what |= MASK_TC_BIT(op_flags, FUA);
223 what |= MASK_TC_BIT(rw, FUA); 226 if (op == REQ_OP_DISCARD)
227 what |= BLK_TC_ACT(BLK_TC_DISCARD);
228 if (op == REQ_OP_FLUSH)
229 what |= BLK_TC_ACT(BLK_TC_FLUSH);
224 230
225 pid = tsk->pid; 231 pid = tsk->pid;
226 if (act_log_check(bt, what, sector, pid)) 232 if (act_log_check(bt, what, sector, pid))
@@ -708,11 +714,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
708 714
709 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 715 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
710 what |= BLK_TC_ACT(BLK_TC_PC); 716 what |= BLK_TC_ACT(BLK_TC_PC);
711 __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, 717 __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
712 what, rq->errors, rq->cmd_len, rq->cmd); 718 what, rq->errors, rq->cmd_len, rq->cmd);
713 } else { 719 } else {
714 what |= BLK_TC_ACT(BLK_TC_FS); 720 what |= BLK_TC_ACT(BLK_TC_FS);
715 __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, 721 __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
716 rq->cmd_flags, what, rq->errors, 0, NULL); 722 rq->cmd_flags, what, rq->errors, 0, NULL);
717 } 723 }
718} 724}
@@ -770,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
770 return; 776 return;
771 777
772 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, 778 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
773 bio->bi_rw, what, error, 0, NULL); 779 bio_op(bio), bio->bi_rw, what, error, 0, NULL);
774} 780}
775 781
776static void blk_add_trace_bio_bounce(void *ignore, 782static void blk_add_trace_bio_bounce(void *ignore,
@@ -818,7 +824,8 @@ static void blk_add_trace_getrq(void *ignore,
818 struct blk_trace *bt = q->blk_trace; 824 struct blk_trace *bt = q->blk_trace;
819 825
820 if (bt) 826 if (bt)
821 __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); 827 __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
828 NULL);
822 } 829 }
823} 830}
824 831
@@ -833,7 +840,7 @@ static void blk_add_trace_sleeprq(void *ignore,
833 struct blk_trace *bt = q->blk_trace; 840 struct blk_trace *bt = q->blk_trace;
834 841
835 if (bt) 842 if (bt)
836 __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 843 __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
837 0, 0, NULL); 844 0, 0, NULL);
838 } 845 }
839} 846}
@@ -843,7 +850,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
843 struct blk_trace *bt = q->blk_trace; 850 struct blk_trace *bt = q->blk_trace;
844 851
845 if (bt) 852 if (bt)
846 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 853 __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
847} 854}
848 855
849static void blk_add_trace_unplug(void *ignore, struct request_queue *q, 856static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -860,7 +867,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
860 else 867 else
861 what = BLK_TA_UNPLUG_TIMER; 868 what = BLK_TA_UNPLUG_TIMER;
862 869
863 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); 870 __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
864 } 871 }
865} 872}
866 873
@@ -874,8 +881,9 @@ static void blk_add_trace_split(void *ignore,
874 __be64 rpdu = cpu_to_be64(pdu); 881 __be64 rpdu = cpu_to_be64(pdu);
875 882
876 __blk_add_trace(bt, bio->bi_iter.bi_sector, 883 __blk_add_trace(bt, bio->bi_iter.bi_sector,
877 bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, 884 bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw,
878 bio->bi_error, sizeof(rpdu), &rpdu); 885 BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
886 &rpdu);
879 } 887 }
880} 888}
881 889
@@ -907,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore,
907 r.sector_from = cpu_to_be64(from); 915 r.sector_from = cpu_to_be64(from);
908 916
909 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, 917 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
910 bio->bi_rw, BLK_TA_REMAP, bio->bi_error, 918 bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
911 sizeof(r), &r); 919 sizeof(r), &r);
912} 920}
913 921
@@ -940,7 +948,7 @@ static void blk_add_trace_rq_remap(void *ignore,
940 r.sector_from = cpu_to_be64(from); 948 r.sector_from = cpu_to_be64(from);
941 949
942 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 950 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
943 rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, 951 rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
944 sizeof(r), &r); 952 sizeof(r), &r);
945} 953}
946 954
@@ -965,10 +973,10 @@ void blk_add_driver_data(struct request_queue *q,
965 return; 973 return;
966 974
967 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) 975 if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
968 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 976 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
969 BLK_TA_DRV_DATA, rq->errors, len, data); 977 BLK_TA_DRV_DATA, rq->errors, len, data);
970 else 978 else
971 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 979 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
972 BLK_TA_DRV_DATA, rq->errors, len, data); 980 BLK_TA_DRV_DATA, rq->errors, len, data);
973} 981}
974EXPORT_SYMBOL_GPL(blk_add_driver_data); 982EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1769,21 +1777,34 @@ void blk_dump_cmd(char *buf, struct request *rq)
1769 } 1777 }
1770} 1778}
1771 1779
1772void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) 1780void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
1773{ 1781{
1774 int i = 0; 1782 int i = 0;
1775 1783
1776 if (rw & REQ_FLUSH) 1784 if (rw & REQ_PREFLUSH)
1777 rwbs[i++] = 'F'; 1785 rwbs[i++] = 'F';
1778 1786
1779 if (rw & WRITE) 1787 switch (op) {
1788 case REQ_OP_WRITE:
1789 case REQ_OP_WRITE_SAME:
1780 rwbs[i++] = 'W'; 1790 rwbs[i++] = 'W';
1781 else if (rw & REQ_DISCARD) 1791 break;
1792 case REQ_OP_DISCARD:
1793 rwbs[i++] = 'D';
1794 break;
1795 case REQ_OP_SECURE_ERASE:
1782 rwbs[i++] = 'D'; 1796 rwbs[i++] = 'D';
1783 else if (bytes) 1797 rwbs[i++] = 'E';
1798 break;
1799 case REQ_OP_FLUSH:
1800 rwbs[i++] = 'F';
1801 break;
1802 case REQ_OP_READ:
1784 rwbs[i++] = 'R'; 1803 rwbs[i++] = 'R';
1785 else 1804 break;
1805 default:
1786 rwbs[i++] = 'N'; 1806 rwbs[i++] = 'N';
1807 }
1787 1808
1788 if (rw & REQ_FUA) 1809 if (rw & REQ_FUA)
1789 rwbs[i++] = 'F'; 1810 rwbs[i++] = 'F';
@@ -1793,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1793 rwbs[i++] = 'S'; 1814 rwbs[i++] = 'S';
1794 if (rw & REQ_META) 1815 if (rw & REQ_META)
1795 rwbs[i++] = 'M'; 1816 rwbs[i++] = 'M';
1796 if (rw & REQ_SECURE)
1797 rwbs[i++] = 'E';
1798 1817
1799 rwbs[i] = '\0'; 1818 rwbs[i] = '\0';
1800} 1819}
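
The reworked blk_fill_rwbs() derives the data-direction letter from the operation code and the modifier letters from the flag word. A userspace model of that mapping; the toy op and flag values below are stand-ins for REQ_OP_* and REQ_*, not the kernel's actual bit layout.

#include <stdio.h>

enum toy_op { OP_READ, OP_WRITE, OP_DISCARD, OP_SECURE_ERASE, OP_FLUSH,
              OP_WRITE_SAME };
#define F_PREFLUSH  (1u << 0)
#define F_FUA       (1u << 1)
#define F_RAHEAD    (1u << 2)
#define F_SYNC      (1u << 3)
#define F_META      (1u << 4)

static void fill_rwbs(char *rwbs, enum toy_op op, unsigned int flags)
{
    int i = 0;

    if (flags & F_PREFLUSH)
        rwbs[i++] = 'F';

    switch (op) {                   /* direction comes from the op */
    case OP_WRITE:
    case OP_WRITE_SAME:
        rwbs[i++] = 'W';
        break;
    case OP_DISCARD:
        rwbs[i++] = 'D';
        break;
    case OP_SECURE_ERASE:
        rwbs[i++] = 'D';
        rwbs[i++] = 'E';
        break;
    case OP_FLUSH:
        rwbs[i++] = 'F';
        break;
    case OP_READ:
        rwbs[i++] = 'R';
        break;
    default:
        rwbs[i++] = 'N';
    }

    if (flags & F_FUA)              /* modifiers come from the flags */
        rwbs[i++] = 'F';
    if (flags & F_RAHEAD)
        rwbs[i++] = 'A';
    if (flags & F_SYNC)
        rwbs[i++] = 'S';
    if (flags & F_META)
        rwbs[i++] = 'M';
    rwbs[i] = '\0';
}

int main(void)
{
    char rwbs[8];

    fill_rwbs(rwbs, OP_WRITE, F_SYNC | F_FUA);
    printf("%s\n", rwbs);           /* prints "WFS" */
    return 0;
}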
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 780bcbe1d4de..b20438fdb029 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
81 .arg3_type = ARG_ANYTHING, 81 .arg3_type = ARG_ANYTHING,
82}; 82};
83 83
84static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
85{
86 void *unsafe_ptr = (void *) (long) r1;
87 void *src = (void *) (long) r2;
88 int size = (int) r3;
89
90 /*
91 * Ensure we're in user context which is safe for the helper to
92 * run. This helper has no business in a kthread.
93 *
94 * access_ok() should prevent writing to non-user memory, but in
95 * some situations (nommu, temporary switch, etc) access_ok() does
96 * not provide enough validation, hence the check on KERNEL_DS.
97 */
98
99 if (unlikely(in_interrupt() ||
100 current->flags & (PF_KTHREAD | PF_EXITING)))
101 return -EPERM;
102 if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
103 return -EPERM;
104 if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
105 return -EPERM;
106
107 return probe_kernel_write(unsafe_ptr, src, size);
108}
109
110static const struct bpf_func_proto bpf_probe_write_user_proto = {
111 .func = bpf_probe_write_user,
112 .gpl_only = true,
113 .ret_type = RET_INTEGER,
114 .arg1_type = ARG_ANYTHING,
115 .arg2_type = ARG_PTR_TO_STACK,
116 .arg3_type = ARG_CONST_STACK_SIZE,
117};
118
119static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
120{
121 pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
122 current->comm, task_pid_nr(current));
123
124 return &bpf_probe_write_user_proto;
125}
126
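
A hypothetical BPF-side sketch of the new bpf_probe_write_user() helper follows. The helper itself is what this hunk adds; the SEC() macro, bpf_helpers.h, PT_REGS_PARM2() and the kprobe attach point are assumptions in the samples/bpf style, and the program as a whole is illustrative, not part of this patch. Note the helper is GPL-only, refuses interrupt/kthread/KERNEL_DS contexts with -EPERM, and loading any program that requests it triggers the ratelimited warning above.

/* Illustrative sketch only; headers and attach point are assumptions. */
#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"

SEC("kprobe/sys_openat")
int rewrite_path(struct pt_regs *ctx)
{
        /* second syscall argument: userspace pointer to the pathname */
        void *upath = (void *)PT_REGS_PARM2(ctx);
        char newpath[] = "/tmp/redirected";

        /* Returns -EPERM when called from an unsafe context or when the
         * destination fails access_ok(). */
        bpf_probe_write_user(upath, newpath, sizeof(newpath));
        return 0;
}

char _license[] SEC("license") = "GPL";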
84/* 127/*
85 * limited trace_printk() 128 * limited trace_printk()
86 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed 129 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
@@ -188,25 +231,33 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
188 return &bpf_trace_printk_proto; 231 return &bpf_trace_printk_proto;
189} 232}
190 233
191static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) 234static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)
192{ 235{
193 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; 236 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
194 struct bpf_array *array = container_of(map, struct bpf_array, map); 237 struct bpf_array *array = container_of(map, struct bpf_array, map);
238 unsigned int cpu = smp_processor_id();
239 u64 index = flags & BPF_F_INDEX_MASK;
240 struct bpf_event_entry *ee;
195 struct perf_event *event; 241 struct perf_event *event;
196 struct file *file;
197 242
243 if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
244 return -EINVAL;
245 if (index == BPF_F_CURRENT_CPU)
246 index = cpu;
198 if (unlikely(index >= array->map.max_entries)) 247 if (unlikely(index >= array->map.max_entries))
199 return -E2BIG; 248 return -E2BIG;
200 249
201 file = (struct file *)array->ptrs[index]; 250 ee = READ_ONCE(array->ptrs[index]);
202 if (unlikely(!file)) 251 if (!ee)
203 return -ENOENT; 252 return -ENOENT;
204 253
205 event = file->private_data; 254 event = ee->event;
255 if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
256 event->attr.type != PERF_TYPE_RAW))
257 return -EINVAL;
206 258
207 /* make sure event is local and doesn't have pmu::count */ 259 /* make sure event is local and doesn't have pmu::count */
208 if (event->oncpu != smp_processor_id() || 260 if (unlikely(event->oncpu != cpu || event->pmu->count))
209 event->pmu->count)
210 return -EINVAL; 261 return -EINVAL;
211 262
212 /* 263 /*
@@ -225,47 +276,58 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
225 .arg2_type = ARG_ANYTHING, 276 .arg2_type = ARG_ANYTHING,
226}; 277};
227 278
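
With this change bpf_perf_event_read() takes a flags word instead of a raw index, so BPF_F_CURRENT_CPU can select the current CPU's slot, and the event must be a local hardware or raw event without a pmu->count override. The sketch below is illustrative only: the map definition style, SEC() macro and attach point follow samples/bpf conventions and are assumptions, not part of the patch.

/* Illustrative sketch; map/SEC conventions are assumptions. */
#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") cycles_map = {
        .type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size    = sizeof(int),
        .value_size  = sizeof(__u32),
        .max_entries = 64,      /* should cover all possible CPUs */
};

SEC("kprobe/finish_task_switch")
int on_switch(struct pt_regs *ctx)
{
        /* BPF_F_CURRENT_CPU picks this CPU's slot; raw indices still work */
        __s64 count = bpf_perf_event_read(&cycles_map, BPF_F_CURRENT_CPU);

        if (count < 0)          /* -EINVAL / -ENOENT / -E2BIG */
                return 0;
        /* ... accumulate 'count' in another map ... */
        return 0;
}

char _license[] SEC("license") = "GPL";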
228static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) 279static __always_inline u64
280__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
281 u64 flags, struct perf_raw_record *raw)
229{ 282{
230 struct pt_regs *regs = (struct pt_regs *) (long) r1;
231 struct bpf_map *map = (struct bpf_map *) (long) r2;
232 struct bpf_array *array = container_of(map, struct bpf_array, map); 283 struct bpf_array *array = container_of(map, struct bpf_array, map);
284 unsigned int cpu = smp_processor_id();
233 u64 index = flags & BPF_F_INDEX_MASK; 285 u64 index = flags & BPF_F_INDEX_MASK;
234 void *data = (void *) (long) r4;
235 struct perf_sample_data sample_data; 286 struct perf_sample_data sample_data;
287 struct bpf_event_entry *ee;
236 struct perf_event *event; 288 struct perf_event *event;
237 struct file *file;
238 struct perf_raw_record raw = {
239 .size = size,
240 .data = data,
241 };
242 289
243 if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
244 return -EINVAL;
245 if (index == BPF_F_CURRENT_CPU) 290 if (index == BPF_F_CURRENT_CPU)
246 index = raw_smp_processor_id(); 291 index = cpu;
247 if (unlikely(index >= array->map.max_entries)) 292 if (unlikely(index >= array->map.max_entries))
248 return -E2BIG; 293 return -E2BIG;
249 294
250 file = (struct file *)array->ptrs[index]; 295 ee = READ_ONCE(array->ptrs[index]);
251 if (unlikely(!file)) 296 if (!ee)
252 return -ENOENT; 297 return -ENOENT;
253 298
254 event = file->private_data; 299 event = ee->event;
255
256 if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || 300 if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
257 event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) 301 event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
258 return -EINVAL; 302 return -EINVAL;
259 303
260 if (unlikely(event->oncpu != smp_processor_id())) 304 if (unlikely(event->oncpu != cpu))
261 return -EOPNOTSUPP; 305 return -EOPNOTSUPP;
262 306
263 perf_sample_data_init(&sample_data, 0, 0); 307 perf_sample_data_init(&sample_data, 0, 0);
264 sample_data.raw = &raw; 308 sample_data.raw = raw;
265 perf_event_output(event, &sample_data, regs); 309 perf_event_output(event, &sample_data, regs);
266 return 0; 310 return 0;
267} 311}
268 312
313static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
314{
315 struct pt_regs *regs = (struct pt_regs *)(long) r1;
316 struct bpf_map *map = (struct bpf_map *)(long) r2;
317 void *data = (void *)(long) r4;
318 struct perf_raw_record raw = {
319 .frag = {
320 .size = size,
321 .data = data,
322 },
323 };
324
325 if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
326 return -EINVAL;
327
328 return __bpf_perf_event_output(regs, map, flags, &raw);
329}
330
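
The program-facing signature of bpf_perf_event_output() is unchanged by this refactor; what moves into __bpf_perf_event_output() is the perf_raw_record setup, so that other callers can pass a pre-built record with a chained fragment. A hypothetical caller might look like the sketch below; the map definition, struct layout, SEC() macro and attach point are samples/bpf-style assumptions, not part of the patch.

/* Illustrative sketch; names and conventions are assumptions. */
#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"

struct event {
        __u32 pid;
        __u64 ts;
};

struct bpf_map_def SEC("maps") events = {
        .type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size    = sizeof(int),
        .value_size  = sizeof(__u32),
        .max_entries = 64,
};

SEC("kprobe/sys_write")
int emit_event(struct pt_regs *ctx)
{
        struct event e = {
                .pid = bpf_get_current_pid_tgid() >> 32,
                .ts  = bpf_ktime_get_ns(),
        };

        /* 'e' becomes the meta fragment; the skb/xdp flavours of the
         * helper chain a second fragment for payload via bpf_event_output(). */
        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &e, sizeof(e));
        return 0;
}

char _license[] SEC("license") = "GPL";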
269static const struct bpf_func_proto bpf_perf_event_output_proto = { 331static const struct bpf_func_proto bpf_perf_event_output_proto = {
270 .func = bpf_perf_event_output, 332 .func = bpf_perf_event_output,
271 .gpl_only = true, 333 .gpl_only = true,
@@ -279,31 +341,41 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
279 341
280static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); 342static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
281 343
282static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) 344u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
345 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
283{ 346{
284 struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); 347 struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
348 struct perf_raw_frag frag = {
349 .copy = ctx_copy,
350 .size = ctx_size,
351 .data = ctx,
352 };
353 struct perf_raw_record raw = {
354 .frag = {
355 {
356 .next = ctx_size ? &frag : NULL,
357 },
358 .size = meta_size,
359 .data = meta,
360 },
361 };
285 362
286 perf_fetch_caller_regs(regs); 363 perf_fetch_caller_regs(regs);
287 364
288 return bpf_perf_event_output((long)regs, r2, flags, r4, size); 365 return __bpf_perf_event_output(regs, map, flags, &raw);
289} 366}
290 367
291static const struct bpf_func_proto bpf_event_output_proto = { 368static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
292 .func = bpf_event_output, 369{
370 return (long) current;
371}
372
373static const struct bpf_func_proto bpf_get_current_task_proto = {
374 .func = bpf_get_current_task,
293 .gpl_only = true, 375 .gpl_only = true,
294 .ret_type = RET_INTEGER, 376 .ret_type = RET_INTEGER,
295 .arg1_type = ARG_PTR_TO_CTX,
296 .arg2_type = ARG_CONST_MAP_PTR,
297 .arg3_type = ARG_ANYTHING,
298 .arg4_type = ARG_PTR_TO_STACK,
299 .arg5_type = ARG_CONST_STACK_SIZE,
300}; 377};
301 378
302const struct bpf_func_proto *bpf_get_event_output_proto(void)
303{
304 return &bpf_event_output_proto;
305}
306
307static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) 379static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
308{ 380{
309 switch (func_id) { 381 switch (func_id) {
@@ -321,6 +393,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
321 return &bpf_tail_call_proto; 393 return &bpf_tail_call_proto;
322 case BPF_FUNC_get_current_pid_tgid: 394 case BPF_FUNC_get_current_pid_tgid:
323 return &bpf_get_current_pid_tgid_proto; 395 return &bpf_get_current_pid_tgid_proto;
396 case BPF_FUNC_get_current_task:
397 return &bpf_get_current_task_proto;
324 case BPF_FUNC_get_current_uid_gid: 398 case BPF_FUNC_get_current_uid_gid:
325 return &bpf_get_current_uid_gid_proto; 399 return &bpf_get_current_uid_gid_proto;
326 case BPF_FUNC_get_current_comm: 400 case BPF_FUNC_get_current_comm:
@@ -331,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
331 return &bpf_get_smp_processor_id_proto; 405 return &bpf_get_smp_processor_id_proto;
332 case BPF_FUNC_perf_event_read: 406 case BPF_FUNC_perf_event_read:
333 return &bpf_perf_event_read_proto; 407 return &bpf_perf_event_read_proto;
408 case BPF_FUNC_probe_write_user:
409 return bpf_get_probe_write_proto();
334 default: 410 default:
335 return NULL; 411 return NULL;
336 } 412 }
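
The new bpf_get_current_task() helper simply returns the address of the current task_struct as an integer; any field access still has to go through bpf_probe_read(). A minimal, hypothetical sketch follows; the SEC() macro, bpf_helpers.h, the kernel-header include and the attach point are samples/bpf-style assumptions.

/* Illustrative sketch; headers and attach point are assumptions. */
#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include "bpf_helpers.h"

SEC("kprobe/finish_task_switch")
int snapshot_comm(struct pt_regs *ctx)
{
        struct task_struct *task = (struct task_struct *)bpf_get_current_task();
        char comm[TASK_COMM_LEN];

        /* The returned value is only a kernel address; fields must be
         * fetched with bpf_probe_read(). */
        bpf_probe_read(&comm, sizeof(comm), task->comm);
        /* ... use comm as a map key, filter on it, etc. ... */
        return 0;
}

char _license[] SEC("license") = "GPL";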
@@ -349,20 +425,15 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
349} 425}
350 426
351/* bpf+kprobe programs can access fields of 'struct pt_regs' */ 427/* bpf+kprobe programs can access fields of 'struct pt_regs' */
352static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) 428static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
429 enum bpf_reg_type *reg_type)
353{ 430{
354 /* check bounds */
355 if (off < 0 || off >= sizeof(struct pt_regs)) 431 if (off < 0 || off >= sizeof(struct pt_regs))
356 return false; 432 return false;
357
358 /* only read is allowed */
359 if (type != BPF_READ) 433 if (type != BPF_READ)
360 return false; 434 return false;
361
362 /* disallow misaligned access */
363 if (off % size != 0) 435 if (off % size != 0)
364 return false; 436 return false;
365
366 return true; 437 return true;
367} 438}
368 439
@@ -427,7 +498,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
427 } 498 }
428} 499}
429 500
430static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type) 501static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
502 enum bpf_reg_type *reg_type)
431{ 503{
432 if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) 504 if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
433 return false; 505 return false;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 900dbb1efff2..84752c8e28b5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
89/* What to set function_trace_op to */ 89/* What to set function_trace_op to */
90static struct ftrace_ops *set_function_trace_op; 90static struct ftrace_ops *set_function_trace_op;
91 91
92/* List for set_ftrace_pid's pids. */ 92static bool ftrace_pids_enabled(struct ftrace_ops *ops)
93LIST_HEAD(ftrace_pids);
94struct ftrace_pid {
95 struct list_head list;
96 struct pid *pid;
97};
98
99static bool ftrace_pids_enabled(void)
100{ 93{
101 return !list_empty(&ftrace_pids); 94 struct trace_array *tr;
95
96 if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private)
97 return false;
98
99 tr = ops->private;
100
101 return tr->function_pids != NULL;
102} 102}
103 103
104static void ftrace_update_trampoline(struct ftrace_ops *ops); 104static void ftrace_update_trampoline(struct ftrace_ops *ops);
@@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void)
179static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 179static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
180 struct ftrace_ops *op, struct pt_regs *regs) 180 struct ftrace_ops *op, struct pt_regs *regs)
181{ 181{
182 if (!test_tsk_trace_trace(current)) 182 struct trace_array *tr = op->private;
183
184 if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid))
183 return; 185 return;
184 186
185 op->saved_func(ip, parent_ip, op, regs); 187 op->saved_func(ip, parent_ip, op, regs);
@@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
417 /* Always save the function, and reset at unregistering */ 419 /* Always save the function, and reset at unregistering */
418 ops->saved_func = ops->func; 420 ops->saved_func = ops->func;
419 421
420 if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) 422 if (ftrace_pids_enabled(ops))
421 ops->func = ftrace_pid_func; 423 ops->func = ftrace_pid_func;
422 424
423 ftrace_update_trampoline(ops); 425 ftrace_update_trampoline(ops);
@@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
450 452
451static void ftrace_update_pid_func(void) 453static void ftrace_update_pid_func(void)
452{ 454{
453 bool enabled = ftrace_pids_enabled();
454 struct ftrace_ops *op; 455 struct ftrace_ops *op;
455 456
456 /* Only do something if we are tracing something */ 457 /* Only do something if we are tracing something */
@@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void)
459 460
460 do_for_each_ftrace_op(op, ftrace_ops_list) { 461 do_for_each_ftrace_op(op, ftrace_ops_list) {
461 if (op->flags & FTRACE_OPS_FL_PID) { 462 if (op->flags & FTRACE_OPS_FL_PID) {
462 op->func = enabled ? ftrace_pid_func : 463 op->func = ftrace_pids_enabled(op) ?
463 op->saved_func; 464 ftrace_pid_func : op->saved_func;
464 ftrace_update_trampoline(op); 465 ftrace_update_trampoline(op);
465 } 466 }
466 } while_for_each_ftrace_op(op); 467 } while_for_each_ftrace_op(op);
@@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
5324 return ops->func; 5325 return ops->func;
5325} 5326}
5326 5327
5327static void clear_ftrace_swapper(void) 5328static void
5329ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
5330 struct task_struct *prev, struct task_struct *next)
5328{ 5331{
5329 struct task_struct *p; 5332 struct trace_array *tr = data;
5330 int cpu; 5333 struct trace_pid_list *pid_list;
5331 5334
5332 get_online_cpus(); 5335 pid_list = rcu_dereference_sched(tr->function_pids);
5333 for_each_online_cpu(cpu) {
5334 p = idle_task(cpu);
5335 clear_tsk_trace_trace(p);
5336 }
5337 put_online_cpus();
5338}
5339
5340static void set_ftrace_swapper(void)
5341{
5342 struct task_struct *p;
5343 int cpu;
5344 5336
5345 get_online_cpus(); 5337 this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
5346 for_each_online_cpu(cpu) { 5338 trace_ignore_this_task(pid_list, next));
5347 p = idle_task(cpu);
5348 set_tsk_trace_trace(p);
5349 }
5350 put_online_cpus();
5351} 5339}
5352 5340
5353static void clear_ftrace_pid(struct pid *pid) 5341static void clear_ftrace_pids(struct trace_array *tr)
5354{ 5342{
5355 struct task_struct *p; 5343 struct trace_pid_list *pid_list;
5344 int cpu;
5356 5345
5357 rcu_read_lock(); 5346 pid_list = rcu_dereference_protected(tr->function_pids,
5358 do_each_pid_task(pid, PIDTYPE_PID, p) { 5347 lockdep_is_held(&ftrace_lock));
5359 clear_tsk_trace_trace(p); 5348 if (!pid_list)
5360 } while_each_pid_task(pid, PIDTYPE_PID, p); 5349 return;
5361 rcu_read_unlock();
5362 5350
5363 put_pid(pid); 5351 unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
5364}
5365 5352
5366static void set_ftrace_pid(struct pid *pid) 5353 for_each_possible_cpu(cpu)
5367{ 5354 per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false;
5368 struct task_struct *p;
5369 5355
5370 rcu_read_lock(); 5356 rcu_assign_pointer(tr->function_pids, NULL);
5371 do_each_pid_task(pid, PIDTYPE_PID, p) {
5372 set_tsk_trace_trace(p);
5373 } while_each_pid_task(pid, PIDTYPE_PID, p);
5374 rcu_read_unlock();
5375}
5376 5357
5377static void clear_ftrace_pid_task(struct pid *pid) 5358 /* Wait till all users are no longer using pid filtering */
5378{ 5359 synchronize_sched();
5379 if (pid == ftrace_swapper_pid)
5380 clear_ftrace_swapper();
5381 else
5382 clear_ftrace_pid(pid);
5383}
5384 5360
5385static void set_ftrace_pid_task(struct pid *pid) 5361 trace_free_pid_list(pid_list);
5386{
5387 if (pid == ftrace_swapper_pid)
5388 set_ftrace_swapper();
5389 else
5390 set_ftrace_pid(pid);
5391} 5362}
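
clear_ftrace_pids() follows the usual RCU retire pattern: unpublish the pointer, wait for existing readers, then free. The ftrace code uses synchronize_sched() because its readers run in sched/preempt-disabled context. The fragment below is a generic kernel-style sketch of the same pattern with illustrative names (my_cfg, cfg_lock) and synchronize_rcu(); it is not taken from this patch.

/* Kernel-style sketch of publish/retire; names are illustrative. */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/errno.h>

struct cfg { int val; };

static struct cfg __rcu *my_cfg;
static DEFINE_MUTEX(cfg_lock);

static int update_cfg(int val)
{
        struct cfg *new, *old;

        new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
        new->val = val;

        mutex_lock(&cfg_lock);
        old = rcu_dereference_protected(my_cfg, lockdep_is_held(&cfg_lock));
        rcu_assign_pointer(my_cfg, new);        /* publish the new list */
        mutex_unlock(&cfg_lock);

        /* No new readers can see 'old'; wait out the existing ones. */
        synchronize_rcu();
        kfree(old);
        return 0;
}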
5392 5363
5393static int ftrace_pid_add(int p) 5364static void ftrace_pid_reset(struct trace_array *tr)
5394{ 5365{
5395 struct pid *pid;
5396 struct ftrace_pid *fpid;
5397 int ret = -EINVAL;
5398
5399 mutex_lock(&ftrace_lock); 5366 mutex_lock(&ftrace_lock);
5400 5367 clear_ftrace_pids(tr);
5401 if (!p)
5402 pid = ftrace_swapper_pid;
5403 else
5404 pid = find_get_pid(p);
5405
5406 if (!pid)
5407 goto out;
5408
5409 ret = 0;
5410
5411 list_for_each_entry(fpid, &ftrace_pids, list)
5412 if (fpid->pid == pid)
5413 goto out_put;
5414
5415 ret = -ENOMEM;
5416
5417 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
5418 if (!fpid)
5419 goto out_put;
5420
5421 list_add(&fpid->list, &ftrace_pids);
5422 fpid->pid = pid;
5423
5424 set_ftrace_pid_task(pid);
5425 5368
5426 ftrace_update_pid_func(); 5369 ftrace_update_pid_func();
5427
5428 ftrace_startup_all(0); 5370 ftrace_startup_all(0);
5429 5371
5430 mutex_unlock(&ftrace_lock); 5372 mutex_unlock(&ftrace_lock);
5431 return 0;
5432
5433out_put:
5434 if (pid != ftrace_swapper_pid)
5435 put_pid(pid);
5436
5437out:
5438 mutex_unlock(&ftrace_lock);
5439 return ret;
5440} 5373}
5441 5374
5442static void ftrace_pid_reset(void) 5375/* Greater than any max PID */
5443{ 5376#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1)
5444 struct ftrace_pid *fpid, *safe;
5445
5446 mutex_lock(&ftrace_lock);
5447 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
5448 struct pid *pid = fpid->pid;
5449
5450 clear_ftrace_pid_task(pid);
5451
5452 list_del(&fpid->list);
5453 kfree(fpid);
5454 }
5455
5456 ftrace_update_pid_func();
5457 ftrace_startup_all(0);
5458
5459 mutex_unlock(&ftrace_lock);
5460}
5461 5377
5462static void *fpid_start(struct seq_file *m, loff_t *pos) 5378static void *fpid_start(struct seq_file *m, loff_t *pos)
5379 __acquires(RCU)
5463{ 5380{
5381 struct trace_pid_list *pid_list;
5382 struct trace_array *tr = m->private;
5383
5464 mutex_lock(&ftrace_lock); 5384 mutex_lock(&ftrace_lock);
5385 rcu_read_lock_sched();
5465 5386
5466 if (!ftrace_pids_enabled() && (!*pos)) 5387 pid_list = rcu_dereference_sched(tr->function_pids);
5467 return (void *) 1;
5468 5388
5469 return seq_list_start(&ftrace_pids, *pos); 5389 if (!pid_list)
5390 return !(*pos) ? FTRACE_NO_PIDS : NULL;
5391
5392 return trace_pid_start(pid_list, pos);
5470} 5393}
5471 5394
5472static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) 5395static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
5473{ 5396{
5474 if (v == (void *)1) 5397 struct trace_array *tr = m->private;
5398 struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
5399
5400 if (v == FTRACE_NO_PIDS)
5475 return NULL; 5401 return NULL;
5476 5402
5477 return seq_list_next(v, &ftrace_pids, pos); 5403 return trace_pid_next(pid_list, v, pos);
5478} 5404}
5479 5405
5480static void fpid_stop(struct seq_file *m, void *p) 5406static void fpid_stop(struct seq_file *m, void *p)
5407 __releases(RCU)
5481{ 5408{
5409 rcu_read_unlock_sched();
5482 mutex_unlock(&ftrace_lock); 5410 mutex_unlock(&ftrace_lock);
5483} 5411}
5484 5412
5485static int fpid_show(struct seq_file *m, void *v) 5413static int fpid_show(struct seq_file *m, void *v)
5486{ 5414{
5487 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); 5415 if (v == FTRACE_NO_PIDS) {
5488
5489 if (v == (void *)1) {
5490 seq_puts(m, "no pid\n"); 5416 seq_puts(m, "no pid\n");
5491 return 0; 5417 return 0;
5492 } 5418 }
5493 5419
5494 if (fpid->pid == ftrace_swapper_pid) 5420 return trace_pid_show(m, v);
5495 seq_puts(m, "swapper tasks\n");
5496 else
5497 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
5498
5499 return 0;
5500} 5421}
5501 5422
5502static const struct seq_operations ftrace_pid_sops = { 5423static const struct seq_operations ftrace_pid_sops = {
@@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = {
5509static int 5430static int
5510ftrace_pid_open(struct inode *inode, struct file *file) 5431ftrace_pid_open(struct inode *inode, struct file *file)
5511{ 5432{
5433 struct trace_array *tr = inode->i_private;
5434 struct seq_file *m;
5512 int ret = 0; 5435 int ret = 0;
5513 5436
5437 if (trace_array_get(tr) < 0)
5438 return -ENODEV;
5439
5514 if ((file->f_mode & FMODE_WRITE) && 5440 if ((file->f_mode & FMODE_WRITE) &&
5515 (file->f_flags & O_TRUNC)) 5441 (file->f_flags & O_TRUNC))
5516 ftrace_pid_reset(); 5442 ftrace_pid_reset(tr);
5517 5443
5518 if (file->f_mode & FMODE_READ) 5444 ret = seq_open(file, &ftrace_pid_sops);
5519 ret = seq_open(file, &ftrace_pid_sops); 5445 if (ret < 0) {
5446 trace_array_put(tr);
5447 } else {
5448 m = file->private_data;
5449 /* copy tr over to seq ops */
5450 m->private = tr;
5451 }
5520 5452
5521 return ret; 5453 return ret;
5522} 5454}
5523 5455
5456static void ignore_task_cpu(void *data)
5457{
5458 struct trace_array *tr = data;
5459 struct trace_pid_list *pid_list;
5460
5461 /*
5462 * This function is called by on_each_cpu() while the
5463 * event_mutex is held.
5464 */
5465 pid_list = rcu_dereference_protected(tr->function_pids,
5466 mutex_is_locked(&ftrace_lock));
5467
5468 this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
5469 trace_ignore_this_task(pid_list, current));
5470}
5471
5524static ssize_t 5472static ssize_t
5525ftrace_pid_write(struct file *filp, const char __user *ubuf, 5473ftrace_pid_write(struct file *filp, const char __user *ubuf,
5526 size_t cnt, loff_t *ppos) 5474 size_t cnt, loff_t *ppos)
5527{ 5475{
5528 char buf[64], *tmp; 5476 struct seq_file *m = filp->private_data;
5529 long val; 5477 struct trace_array *tr = m->private;
5530 int ret; 5478 struct trace_pid_list *filtered_pids = NULL;
5479 struct trace_pid_list *pid_list;
5480 ssize_t ret;
5531 5481
5532 if (cnt >= sizeof(buf)) 5482 if (!cnt)
5533 return -EINVAL; 5483 return 0;
5484
5485 mutex_lock(&ftrace_lock);
5486
5487 filtered_pids = rcu_dereference_protected(tr->function_pids,
5488 lockdep_is_held(&ftrace_lock));
5489
5490 ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
5491 if (ret < 0)
5492 goto out;
5534 5493
5535 if (copy_from_user(&buf, ubuf, cnt)) 5494 rcu_assign_pointer(tr->function_pids, pid_list);
5536 return -EFAULT;
5537 5495
5538 buf[cnt] = 0; 5496 if (filtered_pids) {
5497 synchronize_sched();
5498 trace_free_pid_list(filtered_pids);
5499 } else if (pid_list) {
5500 /* Register a probe to set whether to ignore the tracing of a task */
5501 register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
5502 }
5539 5503
5540 /* 5504 /*
5541 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" 5505 * Ignoring of pids is done at task switch. But we have to
5542 * to clean the filter quietly. 5506 * check for those tasks that are currently running.
5507 * Always do this in case a pid was appended or removed.
5543 */ 5508 */
5544 tmp = strstrip(buf); 5509 on_each_cpu(ignore_task_cpu, tr, 1);
5545 if (strlen(tmp) == 0)
5546 return 1;
5547 5510
5548 ret = kstrtol(tmp, 10, &val); 5511 ftrace_update_pid_func();
5549 if (ret < 0) 5512 ftrace_startup_all(0);
5550 return ret; 5513 out:
5514 mutex_unlock(&ftrace_lock);
5551 5515
5552 ret = ftrace_pid_add(val); 5516 if (ret > 0)
5517 *ppos += ret;
5553 5518
5554 return ret ? ret : cnt; 5519 return ret;
5555} 5520}
5556 5521
5557static int 5522static int
5558ftrace_pid_release(struct inode *inode, struct file *file) 5523ftrace_pid_release(struct inode *inode, struct file *file)
5559{ 5524{
5560 if (file->f_mode & FMODE_READ) 5525 struct trace_array *tr = inode->i_private;
5561 seq_release(inode, file);
5562 5526
5563 return 0; 5527 trace_array_put(tr);
5528
5529 return seq_release(inode, file);
5564} 5530}
5565 5531
5566static const struct file_operations ftrace_pid_fops = { 5532static const struct file_operations ftrace_pid_fops = {
@@ -5571,24 +5537,21 @@ static const struct file_operations ftrace_pid_fops = {
5571 .release = ftrace_pid_release, 5537 .release = ftrace_pid_release,
5572}; 5538};
5573 5539
5574static __init int ftrace_init_tracefs(void) 5540void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
5575{ 5541{
5576 struct dentry *d_tracer; 5542 trace_create_file("set_ftrace_pid", 0644, d_tracer,
5543 tr, &ftrace_pid_fops);
5544}
5577 5545
5578 d_tracer = tracing_init_dentry(); 5546void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
5579 if (IS_ERR(d_tracer)) 5547 struct dentry *d_tracer)
5580 return 0; 5548{
5549 /* Only the top level directory has the dyn_tracefs and profile */
5550 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
5581 5551
5582 ftrace_init_dyn_tracefs(d_tracer); 5552 ftrace_init_dyn_tracefs(d_tracer);
5583
5584 trace_create_file("set_ftrace_pid", 0644, d_tracer,
5585 NULL, &ftrace_pid_fops);
5586
5587 ftrace_profile_tracefs(d_tracer); 5553 ftrace_profile_tracefs(d_tracer);
5588
5589 return 0;
5590} 5554}
5591fs_initcall(ftrace_init_tracefs);
5592 5555
5593/** 5556/**
5594 * ftrace_kill - kill ftrace 5557 * ftrace_kill - kill ftrace
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a4bd6b68a0b..dade4c9559cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -25,7 +25,7 @@
25#include <linux/hardirq.h> 25#include <linux/hardirq.h>
26#include <linux/linkage.h> 26#include <linux/linkage.h>
27#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/kprobes.h> 28#include <linux/vmalloc.h>
29#include <linux/ftrace.h> 29#include <linux/ftrace.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/percpu.h> 31#include <linux/percpu.h>
@@ -319,6 +319,258 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
319 return 0; 319 return 0;
320} 320}
321 321
322void trace_free_pid_list(struct trace_pid_list *pid_list)
323{
324 vfree(pid_list->pids);
325 kfree(pid_list);
326}
327
328/**
329 * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
330 * @filtered_pids: The list of pids to check
331 * @search_pid: The PID to find in @filtered_pids
332 *
 333 * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
334 */
335bool
336trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
337{
338 /*
339 * If pid_max changed after filtered_pids was created, we
340 * by default ignore all pids greater than the previous pid_max.
341 */
342 if (search_pid >= filtered_pids->pid_max)
343 return false;
344
345 return test_bit(search_pid, filtered_pids->pids);
346}
347
348/**
349 * trace_ignore_this_task - should a task be ignored for tracing
350 * @filtered_pids: The list of pids to check
351 * @task: The task that should be ignored if not filtered
352 *
353 * Checks if @task should be traced or not from @filtered_pids.
354 * Returns true if @task should *NOT* be traced.
355 * Returns false if @task should be traced.
356 */
357bool
358trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
359{
360 /*
361 * Return false, because if filtered_pids does not exist,
362 * all pids are good to trace.
363 */
364 if (!filtered_pids)
365 return false;
366
367 return !trace_find_filtered_pid(filtered_pids, task->pid);
368}
369
370/**
371 * trace_pid_filter_add_remove - Add or remove a task from a pid_list
372 * @pid_list: The list to modify
373 * @self: The current task for fork or NULL for exit
374 * @task: The task to add or remove
375 *
376 * If adding a task, if @self is defined, the task is only added if @self
377 * is also included in @pid_list. This happens on fork and tasks should
378 * only be added when the parent is listed. If @self is NULL, then the
379 * @task pid will be removed from the list, which would happen on exit
380 * of a task.
381 */
382void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
383 struct task_struct *self,
384 struct task_struct *task)
385{
386 if (!pid_list)
387 return;
388
389 /* For forks, we only add if the forking task is listed */
390 if (self) {
391 if (!trace_find_filtered_pid(pid_list, self->pid))
392 return;
393 }
394
395 /* Sorry, but we don't support pid_max changing after setting */
396 if (task->pid >= pid_list->pid_max)
397 return;
398
399 /* "self" is set for forks, and NULL for exits */
400 if (self)
401 set_bit(task->pid, pid_list->pids);
402 else
403 clear_bit(task->pid, pid_list->pids);
404}
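
The fork/exit semantics above are easy to see in isolation: a forked child is added only when its parent is already filtered, and an exiting task is simply cleared. The user-space sketch below mirrors that behaviour with a flat bitmap; it uses an integer sentinel (-1) instead of a NULL task pointer and is illustrative only.

/* User-space sketch of trace_filter_add_remove_task() semantics. */
#include <stdio.h>
#include <stdbool.h>

#define PID_MAX 1024
static unsigned char pids[PID_MAX / 8];

static bool pid_test(int pid)  { return pids[pid / 8] & (1 << (pid % 8)); }
static void pid_set(int pid)   { pids[pid / 8] |= 1 << (pid % 8); }
static void pid_clear(int pid) { pids[pid / 8] &= ~(1 << (pid % 8)); }

/* self >= 0 means "self forked task"; self == -1 means "task exited" */
static void add_remove(int self, int task)
{
        if (task >= PID_MAX)
                return;
        if (self >= 0) {
                if (!pid_test(self))
                        return;         /* parent not filtered: ignore child */
                pid_set(task);
        } else {
                pid_clear(task);
        }
}

int main(void)
{
        pid_set(100);                   /* user asked to trace pid 100 */
        add_remove(100, 101);           /* fork: child of 100 is now traced */
        add_remove(200, 201);           /* fork of an unfiltered task: no-op */
        add_remove(-1, 100);            /* pid 100 exits: dropped from filter */
        printf("%d %d %d\n", pid_test(100), pid_test(101), pid_test(201));
        /* prints: 0 1 0 */
        return 0;
}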
405
406/**
407 * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
408 * @pid_list: The pid list to show
409 * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
410 * @pos: The position of the file
411 *
412 * This is used by the seq_file "next" operation to iterate the pids
413 * listed in a trace_pid_list structure.
414 *
415 * Returns the pid+1 as we want to display pid of zero, but NULL would
416 * stop the iteration.
417 */
418void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
419{
420 unsigned long pid = (unsigned long)v;
421
422 (*pos)++;
423
 424 * pid already is +1 of the actual previous bit */
425 pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
426
427 /* Return pid + 1 to allow zero to be represented */
428 if (pid < pid_list->pid_max)
429 return (void *)(pid + 1);
430
431 return NULL;
432}
433
434/**
435 * trace_pid_start - Used for seq_file to start reading pid lists
436 * @pid_list: The pid list to show
437 * @pos: The position of the file
438 *
439 * This is used by seq_file "start" operation to start the iteration
440 * of listing pids.
441 *
442 * Returns the pid+1 as we want to display pid of zero, but NULL would
443 * stop the iteration.
444 */
445void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
446{
447 unsigned long pid;
448 loff_t l = 0;
449
450 pid = find_first_bit(pid_list->pids, pid_list->pid_max);
451 if (pid >= pid_list->pid_max)
452 return NULL;
453
454 /* Return pid + 1 so that zero can be the exit value */
455 for (pid++; pid && l < *pos;
456 pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
457 ;
458 return (void *)pid;
459}
460
461/**
462 * trace_pid_show - show the current pid in seq_file processing
463 * @m: The seq_file structure to write into
464 * @v: A void pointer of the pid (+1) value to display
465 *
466 * Can be directly used by seq_file operations to display the current
467 * pid value.
468 */
469int trace_pid_show(struct seq_file *m, void *v)
470{
471 unsigned long pid = (unsigned long)v - 1;
472
473 seq_printf(m, "%lu\n", pid);
474 return 0;
475}
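
The pid+1 encoding exists because seq_file treats a NULL iterator value as end-of-iteration, so pid 0 could never be returned directly. The user-space sketch below (illustrative, with find_next_bit() replaced by a trivial scan) shows the same offset trick used by trace_pid_start()/trace_pid_next()/trace_pid_show().

/* User-space sketch of the pid+1 iterator encoding. */
#include <stdio.h>

#define PID_MAX 64
static unsigned char pids[PID_MAX / 8];

static void set_pid(int pid)  { pids[pid / 8] |= 1 << (pid % 8); }
static int  test_pid(int pid) { return pids[pid / 8] & (1 << (pid % 8)); }

static unsigned long next_bit(unsigned long from)
{
        for (; from < PID_MAX; from++)
                if (test_pid(from))
                        return from;
        return PID_MAX;
}

/* Mirrors trace_pid_next(): 'v' is the previous pid already offset by +1 */
static void *pid_next(void *v)
{
        unsigned long pid = next_bit((unsigned long)v);

        return pid < PID_MAX ? (void *)(pid + 1) : NULL;
}

int main(void)
{
        void *v;

        set_pid(0);
        set_pid(42);
        for (v = pid_next((void *)0); v; v = pid_next(v))
                printf("%lu\n", (unsigned long)v - 1);  /* prints 0, then 42 */
        return 0;
}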
476
477/* 128 should be much more than enough */
478#define PID_BUF_SIZE 127
479
480int trace_pid_write(struct trace_pid_list *filtered_pids,
481 struct trace_pid_list **new_pid_list,
482 const char __user *ubuf, size_t cnt)
483{
484 struct trace_pid_list *pid_list;
485 struct trace_parser parser;
486 unsigned long val;
487 int nr_pids = 0;
488 ssize_t read = 0;
489 ssize_t ret = 0;
490 loff_t pos;
491 pid_t pid;
492
493 if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
494 return -ENOMEM;
495
496 /*
497 * Always recreate a new array. The write is an all or nothing
498 * operation. Always create a new array when adding new pids by
499 * the user. If the operation fails, then the current list is
500 * not modified.
501 */
502 pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
503 if (!pid_list)
504 return -ENOMEM;
505
506 pid_list->pid_max = READ_ONCE(pid_max);
507
508 /* Only truncating will shrink pid_max */
509 if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
510 pid_list->pid_max = filtered_pids->pid_max;
511
512 pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
513 if (!pid_list->pids) {
514 kfree(pid_list);
515 return -ENOMEM;
516 }
517
518 if (filtered_pids) {
519 /* copy the current bits to the new max */
520 for_each_set_bit(pid, filtered_pids->pids,
521 filtered_pids->pid_max) {
522 set_bit(pid, pid_list->pids);
523 nr_pids++;
524 }
525 }
526
527 while (cnt > 0) {
528
529 pos = 0;
530
531 ret = trace_get_user(&parser, ubuf, cnt, &pos);
532 if (ret < 0 || !trace_parser_loaded(&parser))
533 break;
534
535 read += ret;
536 ubuf += ret;
537 cnt -= ret;
538
539 parser.buffer[parser.idx] = 0;
540
541 ret = -EINVAL;
542 if (kstrtoul(parser.buffer, 0, &val))
543 break;
544 if (val >= pid_list->pid_max)
545 break;
546
547 pid = (pid_t)val;
548
549 set_bit(pid, pid_list->pids);
550 nr_pids++;
551
552 trace_parser_clear(&parser);
553 ret = 0;
554 }
555 trace_parser_put(&parser);
556
557 if (ret < 0) {
558 trace_free_pid_list(pid_list);
559 return ret;
560 }
561
562 if (!nr_pids) {
563 /* Cleared the list of pids */
564 trace_free_pid_list(pid_list);
565 read = ret;
566 pid_list = NULL;
567 }
568
569 *new_pid_list = pid_list;
570
571 return read;
572}
573
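
trace_pid_write() is deliberately "all or nothing": it always builds a brand new bitmap seeded from the old one, and the caller only swaps it in if the whole parse succeeded. The user-space sketch below shows the same shape; parsing, sizing and the clear-on-empty case are simplified, and all names are illustrative.

/* User-space sketch of the all-or-nothing pid list rebuild. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#define PID_MAX 32768

struct pid_list {
        unsigned long max;
        unsigned char *bits;    /* one bit per pid */
};

static struct pid_list *pid_list_parse(const struct pid_list *old, const char *buf)
{
        struct pid_list *newlist = malloc(sizeof(*newlist));

        if (!newlist)
                return NULL;
        newlist->max = PID_MAX;                 /* sizes kept equal for simplicity */
        newlist->bits = calloc((newlist->max + 7) / 8, 1);
        if (!newlist->bits) {
                free(newlist);
                return NULL;
        }
        if (old)        /* start from a copy of the current filter */
                memcpy(newlist->bits, old->bits, (old->max + 7) / 8);

        for (const char *p = buf; *p; ) {
                char *end;
                unsigned long pid = strtoul(p, &end, 10);

                if (end == p || pid >= newlist->max) {  /* error: caller keeps old list */
                        free(newlist->bits);
                        free(newlist);
                        errno = EINVAL;
                        return NULL;
                }
                newlist->bits[pid / 8] |= 1 << (pid % 8);
                p = end + strspn(end, " \n");
        }
        return newlist;
}

int main(void)
{
        struct pid_list *pl = pid_list_parse(NULL, "1 42 100\n");

        if (pl)
                printf("pid 42 filtered: %d\n",
                       !!(pl->bits[42 / 8] & (1 << (42 % 8))));
        return 0;
}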
322static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) 574static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
323{ 575{
324 u64 ts; 576 u64 ts;
@@ -1862,7 +2114,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
1862{ 2114{
1863 __buffer_unlock_commit(buffer, event); 2115 __buffer_unlock_commit(buffer, event);
1864 2116
1865 ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); 2117 /*
2118 * If regs is not set, then skip the following callers:
2119 * trace_buffer_unlock_commit_regs
2120 * event_trigger_unlock_commit
2121 * trace_event_buffer_commit
2122 * trace_event_raw_event_sched_switch
2123 * Note, we can still get here via blktrace, wakeup tracer
2124 * and mmiotrace, but that's ok if they lose a function or
 2125 * two. They are not that meaningful.
2126 */
2127 ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
1866 ftrace_trace_userstack(buffer, flags, pc); 2128 ftrace_trace_userstack(buffer, flags, pc);
1867} 2129}
1868 2130
@@ -1913,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1913 trace.skip = skip; 2175 trace.skip = skip;
1914 2176
1915 /* 2177 /*
2178 * Add two, for this function and the call to save_stack_trace()
2179 * If regs is set, then these functions will not be in the way.
2180 */
2181 if (!regs)
2182 trace.skip += 2;
2183
2184 /*
1916 * Since events can happen in NMIs there's no safe way to 2185 * Since events can happen in NMIs there's no safe way to
1917 * use the per cpu ftrace_stacks. We reserve it and if an interrupt 2186 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
1918 * or NMI comes in, it will just have to use the default 2187 * or NMI comes in, it will just have to use the default
@@ -2083,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
2083 2352
2084/* created for use with alloc_percpu */ 2353/* created for use with alloc_percpu */
2085struct trace_buffer_struct { 2354struct trace_buffer_struct {
2086 char buffer[TRACE_BUF_SIZE]; 2355 int nesting;
2356 char buffer[4][TRACE_BUF_SIZE];
2087}; 2357};
2088 2358
2089static struct trace_buffer_struct *trace_percpu_buffer; 2359static struct trace_buffer_struct *trace_percpu_buffer;
2090static struct trace_buffer_struct *trace_percpu_sirq_buffer;
2091static struct trace_buffer_struct *trace_percpu_irq_buffer;
2092static struct trace_buffer_struct *trace_percpu_nmi_buffer;
2093 2360
2094/* 2361/*
2095 * The buffer used is dependent on the context. There is a per cpu 2362 * This allows for lockless recording. If we're nested too deeply, then
2096 * buffer for normal context, softirq contex, hard irq context and 2363 * this returns NULL.
2097 * for NMI context. Thise allows for lockless recording.
2098 *
2099 * Note, if the buffers failed to be allocated, then this returns NULL
2100 */ 2364 */
2101static char *get_trace_buf(void) 2365static char *get_trace_buf(void)
2102{ 2366{
2103 struct trace_buffer_struct *percpu_buffer; 2367 struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
2104
2105 /*
2106 * If we have allocated per cpu buffers, then we do not
2107 * need to do any locking.
2108 */
2109 if (in_nmi())
2110 percpu_buffer = trace_percpu_nmi_buffer;
2111 else if (in_irq())
2112 percpu_buffer = trace_percpu_irq_buffer;
2113 else if (in_softirq())
2114 percpu_buffer = trace_percpu_sirq_buffer;
2115 else
2116 percpu_buffer = trace_percpu_buffer;
2117 2368
2118 if (!percpu_buffer) 2369 if (!buffer || buffer->nesting >= 4)
2119 return NULL; 2370 return NULL;
2120 2371
2121 return this_cpu_ptr(&percpu_buffer->buffer[0]); 2372 return &buffer->buffer[buffer->nesting++][0];
2373}
2374
2375static void put_trace_buf(void)
2376{
2377 this_cpu_dec(trace_percpu_buffer->nesting);
2122} 2378}
2123 2379
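
The rework replaces the four per-context buffers (normal, softirq, irq, NMI) with a single per-cpu array indexed by an explicit nesting counter: each context that interrupts another simply takes the next slot and releases it in LIFO order. The user-space sketch below is illustrative only; per-cpu placement and preemption handling are omitted.

/* User-space sketch of the nesting-count buffer scheme. */
#include <stdio.h>

#define TRACE_BUF_SIZE 64

static struct {
        int  nesting;
        char buffer[4][TRACE_BUF_SIZE];
} percpu_buf;                   /* one of these per CPU in the real code */

static char *get_buf(void)
{
        if (percpu_buf.nesting >= 4)
                return NULL;    /* nested too deeply: drop the message */
        return percpu_buf.buffer[percpu_buf.nesting++];
}

static void put_buf(void)
{
        percpu_buf.nesting--;
}

int main(void)
{
        char *outer = get_buf();        /* level 0: normal context */
        char *inner = get_buf();        /* level 1: e.g. an irq interrupting it */

        snprintf(inner, TRACE_BUF_SIZE, "irq message");
        put_buf();                      /* irq returns, outer buffer untouched */
        snprintf(outer, TRACE_BUF_SIZE, "task message");
        put_buf();
        printf("%s / %s\n", outer, inner);
        return 0;
}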
2124static int alloc_percpu_trace_buffer(void) 2380static int alloc_percpu_trace_buffer(void)
2125{ 2381{
2126 struct trace_buffer_struct *buffers; 2382 struct trace_buffer_struct *buffers;
2127 struct trace_buffer_struct *sirq_buffers;
2128 struct trace_buffer_struct *irq_buffers;
2129 struct trace_buffer_struct *nmi_buffers;
2130 2383
2131 buffers = alloc_percpu(struct trace_buffer_struct); 2384 buffers = alloc_percpu(struct trace_buffer_struct);
2132 if (!buffers) 2385 if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
2133 goto err_warn; 2386 return -ENOMEM;
2134
2135 sirq_buffers = alloc_percpu(struct trace_buffer_struct);
2136 if (!sirq_buffers)
2137 goto err_sirq;
2138
2139 irq_buffers = alloc_percpu(struct trace_buffer_struct);
2140 if (!irq_buffers)
2141 goto err_irq;
2142
2143 nmi_buffers = alloc_percpu(struct trace_buffer_struct);
2144 if (!nmi_buffers)
2145 goto err_nmi;
2146 2387
2147 trace_percpu_buffer = buffers; 2388 trace_percpu_buffer = buffers;
2148 trace_percpu_sirq_buffer = sirq_buffers;
2149 trace_percpu_irq_buffer = irq_buffers;
2150 trace_percpu_nmi_buffer = nmi_buffers;
2151
2152 return 0; 2389 return 0;
2153
2154 err_nmi:
2155 free_percpu(irq_buffers);
2156 err_irq:
2157 free_percpu(sirq_buffers);
2158 err_sirq:
2159 free_percpu(buffers);
2160 err_warn:
2161 WARN(1, "Could not allocate percpu trace_printk buffer");
2162 return -ENOMEM;
2163} 2390}
2164 2391
2165static int buffers_allocated; 2392static int buffers_allocated;
@@ -2250,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
2250 tbuffer = get_trace_buf(); 2477 tbuffer = get_trace_buf();
2251 if (!tbuffer) { 2478 if (!tbuffer) {
2252 len = 0; 2479 len = 0;
2253 goto out; 2480 goto out_nobuffer;
2254 } 2481 }
2255 2482
2256 len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); 2483 len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
@@ -2276,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
2276 } 2503 }
2277 2504
2278out: 2505out:
2506 put_trace_buf();
2507
2508out_nobuffer:
2279 preempt_enable_notrace(); 2509 preempt_enable_notrace();
2280 unpause_graph_tracing(); 2510 unpause_graph_tracing();
2281 2511
@@ -2307,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2307 tbuffer = get_trace_buf(); 2537 tbuffer = get_trace_buf();
2308 if (!tbuffer) { 2538 if (!tbuffer) {
2309 len = 0; 2539 len = 0;
2310 goto out; 2540 goto out_nobuffer;
2311 } 2541 }
2312 2542
2313 len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); 2543 len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
@@ -2326,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2326 __buffer_unlock_commit(buffer, event); 2556 __buffer_unlock_commit(buffer, event);
2327 ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); 2557 ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
2328 } 2558 }
2329 out: 2559
2560out:
2561 put_trace_buf();
2562
2563out_nobuffer:
2330 preempt_enable_notrace(); 2564 preempt_enable_notrace();
2331 unpause_graph_tracing(); 2565 unpause_graph_tracing();
2332 2566
@@ -6977,6 +7211,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
6977 for_each_tracing_cpu(cpu) 7211 for_each_tracing_cpu(cpu)
6978 tracing_init_tracefs_percpu(tr, cpu); 7212 tracing_init_tracefs_percpu(tr, cpu);
6979 7213
7214 ftrace_init_tracefs(tr, d_tracer);
6980} 7215}
6981 7216
6982static struct vfsmount *trace_automount(void *ingore) 7217static struct vfsmount *trace_automount(void *ingore)
@@ -7130,6 +7365,7 @@ static __init int tracer_init_tracefs(void)
7130 return 0; 7365 return 0;
7131 7366
7132 init_tracer_tracefs(&global_trace, d_tracer); 7367 init_tracer_tracefs(&global_trace, d_tracer);
7368 ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
7133 7369
7134 trace_create_file("tracing_thresh", 0644, d_tracer, 7370 trace_create_file("tracing_thresh", 0644, d_tracer,
7135 &global_trace, &tracing_thresh_fops); 7371 &global_trace, &tracing_thresh_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5167c366d6b7..f783df416726 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -80,6 +80,12 @@ enum trace_type {
80 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ 80 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
81 filter) 81 filter)
82 82
83#undef FTRACE_ENTRY_PACKED
84#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \
85 filter) \
86 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
87 filter) __packed
88
83#include "trace_entries.h" 89#include "trace_entries.h"
84 90
85/* 91/*
@@ -156,6 +162,9 @@ struct trace_array_cpu {
156 char comm[TASK_COMM_LEN]; 162 char comm[TASK_COMM_LEN];
157 163
158 bool ignore_pid; 164 bool ignore_pid;
165#ifdef CONFIG_FUNCTION_TRACER
166 bool ftrace_ignore_pid;
167#endif
159}; 168};
160 169
161struct tracer; 170struct tracer;
@@ -247,6 +256,7 @@ struct trace_array {
247 int ref; 256 int ref;
248#ifdef CONFIG_FUNCTION_TRACER 257#ifdef CONFIG_FUNCTION_TRACER
249 struct ftrace_ops *ops; 258 struct ftrace_ops *ops;
259 struct trace_pid_list __rcu *function_pids;
250 /* function tracing enabled */ 260 /* function tracing enabled */
251 int function_enabled; 261 int function_enabled;
252#endif 262#endif
@@ -628,6 +638,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
628 638
629extern unsigned long tracing_thresh; 639extern unsigned long tracing_thresh;
630 640
641/* PID filtering */
642
643extern int pid_max;
644
645bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
646 pid_t search_pid);
647bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
648 struct task_struct *task);
649void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
650 struct task_struct *self,
651 struct task_struct *task);
652void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
653void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
654int trace_pid_show(struct seq_file *m, void *v);
655void trace_free_pid_list(struct trace_pid_list *pid_list);
656int trace_pid_write(struct trace_pid_list *filtered_pids,
657 struct trace_pid_list **new_pid_list,
658 const char __user *ubuf, size_t cnt);
659
631#ifdef CONFIG_TRACER_MAX_TRACE 660#ifdef CONFIG_TRACER_MAX_TRACE
632void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 661void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
633void update_max_tr_single(struct trace_array *tr, 662void update_max_tr_single(struct trace_array *tr,
@@ -821,12 +850,9 @@ extern struct list_head ftrace_pids;
821 850
822#ifdef CONFIG_FUNCTION_TRACER 851#ifdef CONFIG_FUNCTION_TRACER
823extern bool ftrace_filter_param __initdata; 852extern bool ftrace_filter_param __initdata;
824static inline int ftrace_trace_task(struct task_struct *task) 853static inline int ftrace_trace_task(struct trace_array *tr)
825{ 854{
826 if (list_empty(&ftrace_pids)) 855 return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid);
827 return 1;
828
829 return test_tsk_trace_trace(task);
830} 856}
831extern int ftrace_is_dead(void); 857extern int ftrace_is_dead(void);
832int ftrace_create_function_files(struct trace_array *tr, 858int ftrace_create_function_files(struct trace_array *tr,
@@ -836,8 +862,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr);
836void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); 862void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
837void ftrace_reset_array_ops(struct trace_array *tr); 863void ftrace_reset_array_ops(struct trace_array *tr);
838int using_ftrace_ops_list_func(void); 864int using_ftrace_ops_list_func(void);
865void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
866void ftrace_init_tracefs_toplevel(struct trace_array *tr,
867 struct dentry *d_tracer);
839#else 868#else
840static inline int ftrace_trace_task(struct task_struct *task) 869static inline int ftrace_trace_task(struct trace_array *tr)
841{ 870{
842 return 1; 871 return 1;
843} 872}
@@ -852,6 +881,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
852static inline __init void 881static inline __init void
853ftrace_init_global_array_ops(struct trace_array *tr) { } 882ftrace_init_global_array_ops(struct trace_array *tr) { }
854static inline void ftrace_reset_array_ops(struct trace_array *tr) { } 883static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
884static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
885static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
855/* ftace_func_t type is not defined, use macro instead of static inline */ 886/* ftace_func_t type is not defined, use macro instead of static inline */
856#define ftrace_init_array_ops(tr, func) do { } while (0) 887#define ftrace_init_array_ops(tr, func) do { } while (0)
857#endif /* CONFIG_FUNCTION_TRACER */ 888#endif /* CONFIG_FUNCTION_TRACER */
@@ -1600,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
1600#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ 1631#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
1601 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ 1632 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
1602 filter) 1633 filter)
1634#undef FTRACE_ENTRY_PACKED
1635#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
1636 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
1637 filter)
1638
1603#include "trace_entries.h" 1639#include "trace_entries.h"
1604 1640
1605#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) 1641#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ee7b94a4810a..5c30efcda5e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
72); 72);
73 73
74/* Function call entry */ 74/* Function call entry */
75FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, 75FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
76 76
77 TRACE_GRAPH_ENT, 77 TRACE_GRAPH_ENT,
78 78
@@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
88); 88);
89 89
90/* Function return entry */ 90/* Function return entry */
91FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, 91FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
92 92
93 TRACE_GRAPH_RET, 93 TRACE_GRAPH_RET,
94 94
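
The point of FTRACE_ENTRY_PACKED is that __packed stops the compiler from padding the funcgraph entries out to their natural alignment, saving ring-buffer space per event. The stand-in structs below (not the real ftrace_graph_ent_entry layout) illustrate the effect on a typical 64-bit build.

/* Illustration of __packed; the structs are stand-ins, not ftrace's. */
#include <stdio.h>
#include <stdint.h>

struct demo_ent {
        uint64_t func;
        int      depth;
};

struct demo_ent_packed {
        uint64_t func;
        int      depth;
} __attribute__((packed));

int main(void)
{
        printf("%zu vs %zu\n", sizeof(struct demo_ent),
               sizeof(struct demo_ent_packed));   /* typically 16 vs 12 */
        return 0;
}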
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3d4155892a1e..03c0a48c3ac4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,6 @@
15#include <linux/kthread.h> 15#include <linux/kthread.h>
16#include <linux/tracefs.h> 16#include <linux/tracefs.h>
17#include <linux/uaccess.h> 17#include <linux/uaccess.h>
18#include <linux/vmalloc.h>
19#include <linux/module.h> 18#include <linux/module.h>
20#include <linux/ctype.h> 19#include <linux/ctype.h>
21#include <linux/sort.h> 20#include <linux/sort.h>
@@ -262,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
262 261
263 local_save_flags(fbuffer->flags); 262 local_save_flags(fbuffer->flags);
264 fbuffer->pc = preempt_count(); 263 fbuffer->pc = preempt_count();
264 /*
265 * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
266 * preemption (adding one to the preempt_count). Since we are
267 * interested in the preempt_count at the time the tracepoint was
268 * hit, we need to subtract one to offset the increment.
269 */
270 if (IS_ENABLED(CONFIG_PREEMPT))
271 fbuffer->pc--;
265 fbuffer->trace_file = trace_file; 272 fbuffer->trace_file = trace_file;
266 273
267 fbuffer->event = 274 fbuffer->event =
@@ -499,60 +506,6 @@ static void ftrace_clear_events(struct trace_array *tr)
499 mutex_unlock(&event_mutex); 506 mutex_unlock(&event_mutex);
500} 507}
501 508
502/* Shouldn't this be in a header? */
503extern int pid_max;
504
505/* Returns true if found in filter */
506static bool
507find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
508{
509 /*
510 * If pid_max changed after filtered_pids was created, we
511 * by default ignore all pids greater than the previous pid_max.
512 */
513 if (search_pid >= filtered_pids->pid_max)
514 return false;
515
516 return test_bit(search_pid, filtered_pids->pids);
517}
518
519static bool
520ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
521{
522 /*
523 * Return false, because if filtered_pids does not exist,
524 * all pids are good to trace.
525 */
526 if (!filtered_pids)
527 return false;
528
529 return !find_filtered_pid(filtered_pids, task->pid);
530}
531
532static void filter_add_remove_task(struct trace_pid_list *pid_list,
533 struct task_struct *self,
534 struct task_struct *task)
535{
536 if (!pid_list)
537 return;
538
539 /* For forks, we only add if the forking task is listed */
540 if (self) {
541 if (!find_filtered_pid(pid_list, self->pid))
542 return;
543 }
544
545 /* Sorry, but we don't support pid_max changing after setting */
546 if (task->pid >= pid_list->pid_max)
547 return;
548
549 /* "self" is set for forks, and NULL for exits */
550 if (self)
551 set_bit(task->pid, pid_list->pids);
552 else
553 clear_bit(task->pid, pid_list->pids);
554}
555
556static void 509static void
557event_filter_pid_sched_process_exit(void *data, struct task_struct *task) 510event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
558{ 511{
@@ -560,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
560 struct trace_array *tr = data; 513 struct trace_array *tr = data;
561 514
562 pid_list = rcu_dereference_sched(tr->filtered_pids); 515 pid_list = rcu_dereference_sched(tr->filtered_pids);
563 filter_add_remove_task(pid_list, NULL, task); 516 trace_filter_add_remove_task(pid_list, NULL, task);
564} 517}
565 518
566static void 519static void
@@ -572,7 +525,7 @@ event_filter_pid_sched_process_fork(void *data,
572 struct trace_array *tr = data; 525 struct trace_array *tr = data;
573 526
574 pid_list = rcu_dereference_sched(tr->filtered_pids); 527 pid_list = rcu_dereference_sched(tr->filtered_pids);
575 filter_add_remove_task(pid_list, self, task); 528 trace_filter_add_remove_task(pid_list, self, task);
576} 529}
577 530
578void trace_event_follow_fork(struct trace_array *tr, bool enable) 531void trace_event_follow_fork(struct trace_array *tr, bool enable)
@@ -600,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
600 pid_list = rcu_dereference_sched(tr->filtered_pids); 553 pid_list = rcu_dereference_sched(tr->filtered_pids);
601 554
602 this_cpu_write(tr->trace_buffer.data->ignore_pid, 555 this_cpu_write(tr->trace_buffer.data->ignore_pid,
603 ignore_this_task(pid_list, prev) && 556 trace_ignore_this_task(pid_list, prev) &&
604 ignore_this_task(pid_list, next)); 557 trace_ignore_this_task(pid_list, next));
605} 558}
606 559
607static void 560static void
@@ -614,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
614 pid_list = rcu_dereference_sched(tr->filtered_pids); 567 pid_list = rcu_dereference_sched(tr->filtered_pids);
615 568
616 this_cpu_write(tr->trace_buffer.data->ignore_pid, 569 this_cpu_write(tr->trace_buffer.data->ignore_pid,
617 ignore_this_task(pid_list, next)); 570 trace_ignore_this_task(pid_list, next));
618} 571}
619 572
620static void 573static void
@@ -630,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
630 pid_list = rcu_dereference_sched(tr->filtered_pids); 583 pid_list = rcu_dereference_sched(tr->filtered_pids);
631 584
632 this_cpu_write(tr->trace_buffer.data->ignore_pid, 585 this_cpu_write(tr->trace_buffer.data->ignore_pid,
633 ignore_this_task(pid_list, task)); 586 trace_ignore_this_task(pid_list, task));
634} 587}
635 588
636static void 589static void
@@ -647,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
647 600
648 /* Set tracing if current is enabled */ 601 /* Set tracing if current is enabled */
649 this_cpu_write(tr->trace_buffer.data->ignore_pid, 602 this_cpu_write(tr->trace_buffer.data->ignore_pid,
650 ignore_this_task(pid_list, current)); 603 trace_ignore_this_task(pid_list, current));
651} 604}
652 605
653static void __ftrace_clear_event_pids(struct trace_array *tr) 606static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -685,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
685 /* Wait till all users are no longer using pid filtering */ 638 /* Wait till all users are no longer using pid filtering */
686 synchronize_sched(); 639 synchronize_sched();
687 640
688 vfree(pid_list->pids); 641 trace_free_pid_list(pid_list);
689 kfree(pid_list);
690} 642}
691 643
692static void ftrace_clear_event_pids(struct trace_array *tr) 644static void ftrace_clear_event_pids(struct trace_array *tr)
@@ -1034,18 +986,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos)
1034{ 986{
1035 struct trace_array *tr = m->private; 987 struct trace_array *tr = m->private;
1036 struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); 988 struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
1037 unsigned long pid = (unsigned long)v;
1038
1039 (*pos)++;
1040
1041 /* pid already is +1 of the actual prevous bit */
1042 pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
1043 989
1044 /* Return pid + 1 to allow zero to be represented */ 990 return trace_pid_next(pid_list, v, pos);
1045 if (pid < pid_list->pid_max)
1046 return (void *)(pid + 1);
1047
1048 return NULL;
1049} 991}
1050 992
1051static void *p_start(struct seq_file *m, loff_t *pos) 993static void *p_start(struct seq_file *m, loff_t *pos)
@@ -1053,8 +995,6 @@ static void *p_start(struct seq_file *m, loff_t *pos)
1053{ 995{
1054 struct trace_pid_list *pid_list; 996 struct trace_pid_list *pid_list;
1055 struct trace_array *tr = m->private; 997 struct trace_array *tr = m->private;
1056 unsigned long pid;
1057 loff_t l = 0;
1058 998
1059 /* 999 /*
1060 * Grab the mutex, to keep calls to p_next() having the same 1000 * Grab the mutex, to keep calls to p_next() having the same
@@ -1070,15 +1010,7 @@ static void *p_start(struct seq_file *m, loff_t *pos)
1070 if (!pid_list) 1010 if (!pid_list)
1071 return NULL; 1011 return NULL;
1072 1012
1073 pid = find_first_bit(pid_list->pids, pid_list->pid_max); 1013 return trace_pid_start(pid_list, pos);
1074 if (pid >= pid_list->pid_max)
1075 return NULL;
1076
1077 /* Return pid + 1 so that zero can be the exit value */
1078 for (pid++; pid && l < *pos;
1079 pid = (unsigned long)p_next(m, (void *)pid, &l))
1080 ;
1081 return (void *)pid;
1082} 1014}
1083 1015
1084static void p_stop(struct seq_file *m, void *p) 1016static void p_stop(struct seq_file *m, void *p)
@@ -1088,14 +1020,6 @@ static void p_stop(struct seq_file *m, void *p)
1088 mutex_unlock(&event_mutex); 1020 mutex_unlock(&event_mutex);
1089} 1021}
1090 1022
1091static int p_show(struct seq_file *m, void *v)
1092{
1093 unsigned long pid = (unsigned long)v - 1;
1094
1095 seq_printf(m, "%lu\n", pid);
1096 return 0;
1097}
1098
1099static ssize_t 1023static ssize_t
1100event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 1024event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
1101 loff_t *ppos) 1025 loff_t *ppos)
@@ -1654,7 +1578,7 @@ static void ignore_task_cpu(void *data)
1654 mutex_is_locked(&event_mutex)); 1578 mutex_is_locked(&event_mutex));
1655 1579
1656 this_cpu_write(tr->trace_buffer.data->ignore_pid, 1580 this_cpu_write(tr->trace_buffer.data->ignore_pid,
1657 ignore_this_task(pid_list, current)); 1581 trace_ignore_this_task(pid_list, current));
1658} 1582}
1659 1583
1660static ssize_t 1584static ssize_t
@@ -1666,13 +1590,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
1666 struct trace_pid_list *filtered_pids = NULL; 1590 struct trace_pid_list *filtered_pids = NULL;
1667 struct trace_pid_list *pid_list; 1591 struct trace_pid_list *pid_list;
1668 struct trace_event_file *file; 1592 struct trace_event_file *file;
1669 struct trace_parser parser; 1593 ssize_t ret;
1670 unsigned long val;
1671 loff_t this_pos;
1672 ssize_t read = 0;
1673 ssize_t ret = 0;
1674 pid_t pid;
1675 int nr_pids = 0;
1676 1594
1677 if (!cnt) 1595 if (!cnt)
1678 return 0; 1596 return 0;
@@ -1681,93 +1599,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
1681 if (ret < 0) 1599 if (ret < 0)
1682 return ret; 1600 return ret;
1683 1601
1684 if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
1685 return -ENOMEM;
1686
1687 mutex_lock(&event_mutex); 1602 mutex_lock(&event_mutex);
1603
1688 filtered_pids = rcu_dereference_protected(tr->filtered_pids, 1604 filtered_pids = rcu_dereference_protected(tr->filtered_pids,
1689 lockdep_is_held(&event_mutex)); 1605 lockdep_is_held(&event_mutex));
1690 1606
1691 /* 1607 ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
1692 * Always recreate a new array. The write is an all or nothing 1608 if (ret < 0)
1693 * operation. Always create a new array when adding new pids by
1694 * the user. If the operation fails, then the current list is
1695 * not modified.
1696 */
1697 pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
1698 if (!pid_list) {
1699 read = -ENOMEM;
1700 goto out;
1701 }
1702 pid_list->pid_max = READ_ONCE(pid_max);
1703 /* Only truncating will shrink pid_max */
1704 if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
1705 pid_list->pid_max = filtered_pids->pid_max;
1706 pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
1707 if (!pid_list->pids) {
1708 kfree(pid_list);
1709 read = -ENOMEM;
1710 goto out;
1711 }
1712 if (filtered_pids) {
1713 /* copy the current bits to the new max */
1714 pid = find_first_bit(filtered_pids->pids,
1715 filtered_pids->pid_max);
1716 while (pid < filtered_pids->pid_max) {
1717 set_bit(pid, pid_list->pids);
1718 pid = find_next_bit(filtered_pids->pids,
1719 filtered_pids->pid_max,
1720 pid + 1);
1721 nr_pids++;
1722 }
1723 }
1724
1725 while (cnt > 0) {
1726
1727 this_pos = 0;
1728
1729 ret = trace_get_user(&parser, ubuf, cnt, &this_pos);
1730 if (ret < 0 || !trace_parser_loaded(&parser))
1731 break;
1732
1733 read += ret;
1734 ubuf += ret;
1735 cnt -= ret;
1736
1737 parser.buffer[parser.idx] = 0;
1738
1739 ret = -EINVAL;
1740 if (kstrtoul(parser.buffer, 0, &val))
1741 break;
1742 if (val >= pid_list->pid_max)
1743 break;
1744
1745 pid = (pid_t)val;
1746
1747 set_bit(pid, pid_list->pids);
1748 nr_pids++;
1749
1750 trace_parser_clear(&parser);
1751 ret = 0;
1752 }
1753 trace_parser_put(&parser);
1754
1755 if (ret < 0) {
1756 vfree(pid_list->pids);
1757 kfree(pid_list);
1758 read = ret;
1759 goto out; 1609 goto out;
1760 }
1761 1610
1762 if (!nr_pids) {
1763 /* Cleared the list of pids */
1764 vfree(pid_list->pids);
1765 kfree(pid_list);
1766 read = ret;
1767 if (!filtered_pids)
1768 goto out;
1769 pid_list = NULL;
1770 }
1771 rcu_assign_pointer(tr->filtered_pids, pid_list); 1611 rcu_assign_pointer(tr->filtered_pids, pid_list);
1772 1612
1773 list_for_each_entry(file, &tr->events, list) { 1613 list_for_each_entry(file, &tr->events, list) {
@@ -1776,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
1776 1616
1777 if (filtered_pids) { 1617 if (filtered_pids) {
1778 synchronize_sched(); 1618 synchronize_sched();
1779 1619 trace_free_pid_list(filtered_pids);
1780 vfree(filtered_pids->pids); 1620 } else if (pid_list) {
1781 kfree(filtered_pids);
1782 } else {
1783 /* 1621 /*
1784 * Register a probe that is called before all other probes 1622 * Register a probe that is called before all other probes
1785 * to set ignore_pid if next or prev do not match. 1623 * to set ignore_pid if next or prev do not match.
@@ -1817,9 +1655,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
1817 out: 1655 out:
1818 mutex_unlock(&event_mutex); 1656 mutex_unlock(&event_mutex);
1819 1657
1820 ret = read; 1658 if (ret > 0)
1821 if (read > 0) 1659 *ppos += ret;
1822 *ppos += read;
1823 1660
1824 return ret; 1661 return ret;
1825} 1662}
@@ -1846,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = {
1846static const struct seq_operations show_set_pid_seq_ops = { 1683static const struct seq_operations show_set_pid_seq_ops = {
1847 .start = p_start, 1684 .start = p_start,
1848 .next = p_next, 1685 .next = p_next,
1849 .show = p_show, 1686 .show = trace_pid_show,
1850 .stop = p_stop, 1687 .stop = p_stop,
1851}; 1688};
1852 1689
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5a095c2e4b69..0efa00d80623 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr)
43 43
44	/* Currently only the non-stack version is supported */ 44
45 ops->func = function_trace_call; 45 ops->func = function_trace_call;
46 ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; 46 ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
47 47
48 tr->ops = ops; 48 tr->ops = ops;
49 ops->private = tr; 49 ops->private = tr;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3a0244ff7ea8..7363ccf79512 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
319 int cpu; 319 int cpu;
320 int pc; 320 int pc;
321 321
322 if (!ftrace_trace_task(current)) 322 if (!ftrace_trace_task(tr))
323 return 0; 323 return 0;
324 324
325 /* trace it when it is-nested-in or is a function enabled. */ 325 /* trace it when it is-nested-in or is a function enabled. */
@@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
338 if (ftrace_graph_notrace_addr(trace->func)) 338 if (ftrace_graph_notrace_addr(trace->func))
339 return 1; 339 return 1;
340 340
341 /*
342	 * Stop here if tracing_thresh is set. We only write function return
343 * events to the ring buffer.
344 */
345 if (tracing_thresh)
346 return 1;
347
341 local_irq_save(flags); 348 local_irq_save(flags);
342 cpu = raw_smp_processor_id(); 349 cpu = raw_smp_processor_id();
343 data = per_cpu_ptr(tr->trace_buffer.data, cpu); 350 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
355 return ret; 362 return ret;
356} 363}
357 364
358static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
359{
360 if (tracing_thresh)
361 return 1;
362 else
363 return trace_graph_entry(trace);
364}
365
366static void 365static void
367__trace_graph_function(struct trace_array *tr, 366__trace_graph_function(struct trace_array *tr,
368 unsigned long ip, unsigned long flags, int pc) 367 unsigned long ip, unsigned long flags, int pc)
@@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr)
457 set_graph_array(tr); 456 set_graph_array(tr);
458 if (tracing_thresh) 457 if (tracing_thresh)
459 ret = register_ftrace_graph(&trace_graph_thresh_return, 458 ret = register_ftrace_graph(&trace_graph_thresh_return,
460 &trace_graph_thresh_entry); 459 &trace_graph_entry);
461 else 460 else
462 ret = register_ftrace_graph(&trace_graph_return, 461 ret = register_ftrace_graph(&trace_graph_return,
463 &trace_graph_entry); 462 &trace_graph_entry);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5546eec0505f..9aedb0b06683 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv)
587 * $retval : fetch return value 587 * $retval : fetch return value
588 * $stack : fetch stack address 588 * $stack : fetch stack address
589 * $stackN : fetch Nth of stack (N:0-) 589 * $stackN : fetch Nth of stack (N:0-)
590 * $comm : fetch current task comm
590 * @ADDR : fetch memory at ADDR (ADDR should be in kernel) 591 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
591 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) 592 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
592 * %REG : fetch register REG 593 * %REG : fetch register REG
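With this addition a kprobe event definition can record the current task's comm as a string argument via $comm. As a rough usage sketch only, the program below appends such a definition through tracefs; the mount point, event name and probed symbol (do_sys_open) are assumptions and not part of this change.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Path, event name and probe target are illustrative assumptions. */
	const char *path = "/sys/kernel/debug/tracing/kprobe_events";
	const char *probe = "p:myopen do_sys_open $comm\n";
	int fd = open(path, O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, probe, strlen(probe)) < 0)
		perror("write");
	close(fd);
	return 0;
}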
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 68f376ca6d3f..cd7480d0a201 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
68 trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", 68 trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
69 dev->bus->number, dev->devfn, 69 dev->bus->number, dev->devfn,
70 dev->vendor, dev->device, dev->irq); 70 dev->vendor, dev->device, dev->irq);
71 /*
72 * XXX: is pci_resource_to_user() appropriate, since we are
73 * supposed to interpret the __ioremap() phys_addr argument based on
74 * these printed values?
75 */
76 for (i = 0; i < 7; i++) { 71 for (i = 0; i < 7; i++) {
77 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 72 start = dev->resource[i].start;
78 trace_seq_printf(s, " %llx", 73 trace_seq_printf(s, " %llx",
79 (unsigned long long)(start | 74 (unsigned long long)(start |
80 (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); 75 (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
81 } 76 }
82 for (i = 0; i < 7; i++) { 77 for (i = 0; i < 7; i++) {
83 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 78 start = dev->resource[i].start;
79 end = dev->resource[i].end;
84 trace_seq_printf(s, " %llx", 80 trace_seq_printf(s, " %llx",
85 dev->resource[i].start < dev->resource[i].end ? 81 dev->resource[i].start < dev->resource[i].end ?
86 (unsigned long long)(end - start) + 1 : 0); 82 (unsigned long long)(end - start) + 1 : 0);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index f96f0383f6c6..ad1d6164e946 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -36,6 +36,10 @@ struct trace_bprintk_fmt {
36static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) 36static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
37{ 37{
38 struct trace_bprintk_fmt *pos; 38 struct trace_bprintk_fmt *pos;
39
40 if (!fmt)
41 return ERR_PTR(-EINVAL);
42
39 list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { 43 list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
40 if (!strcmp(pos->fmt, fmt)) 44 if (!strcmp(pos->fmt, fmt))
41 return pos; 45 return pos;
@@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
57 for (iter = start; iter < end; iter++) { 61 for (iter = start; iter < end; iter++) {
58 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); 62 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
59 if (tb_fmt) { 63 if (tb_fmt) {
60 *iter = tb_fmt->fmt; 64 if (!IS_ERR(tb_fmt))
65 *iter = tb_fmt->fmt;
61 continue; 66 continue;
62 } 67 }
63 68
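lookup_format() now reports a NULL format pointer as ERR_PTR(-EINVAL) instead of treating it like any other miss, and the caller above only dereferences tb_fmt->fmt after an IS_ERR() check. The standalone sketch below restates that error-pointer idiom, with the kernel macros re-declared locally for illustration.

#include <stdio.h>

/* Local re-statements of the kernel's error-pointer helpers, for illustration. */
#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static const char *lookup(const char *fmt)
{
	if (!fmt)
		return ERR_PTR(-22);	/* -EINVAL: caller passed no format */
	/* A real lookup would search a list here; pretend nothing matched. */
	return NULL;			/* NULL still means "not found, go allocate" */
}

int main(void)
{
	const char *hit = lookup(NULL);

	if (IS_ERR(hit))
		printf("error %ld, do not dereference\n", PTR_ERR(hit));
	else if (!hit)
		printf("not found\n");
	else
		printf("found %s\n", hit);
	return 0;
}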
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 1d372fa6fefb..74e80a582c28 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
218 kfree(data); 218 kfree(data);
219} 219}
220 220
221void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs,
222 void *data, void *dest)
223{
224 int maxlen = get_rloc_len(*(u32 *)dest);
225 u8 *dst = get_rloc_data(dest);
226 long ret;
227
228 if (!maxlen)
229 return;
230
231 ret = strlcpy(dst, current->comm, maxlen);
232 *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
233}
234NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string));
235
236void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs,
237 void *data, void *dest)
238{
239 *(u32 *)dest = strlen(current->comm) + 1;
240}
241NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size));
242
221static const struct fetch_type *find_fetch_type(const char *type, 243static const struct fetch_type *find_fetch_type(const char *type,
222 const struct fetch_type *ftbl) 244 const struct fetch_type *ftbl)
223{ 245{
@@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
348 } 370 }
349 } else 371 } else
350 ret = -EINVAL; 372 ret = -EINVAL;
373 } else if (strcmp(arg, "comm") == 0) {
374 if (strcmp(t->name, "string") != 0 &&
375 strcmp(t->name, "string_size") != 0)
376 return -EINVAL;
377 f->fn = t->fetch[FETCH_MTD_comm];
351 } else 378 } else
352 ret = -EINVAL; 379 ret = -EINVAL;
353 380
@@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
522 arg[t - parg->comm] = '\0'; 549 arg[t - parg->comm] = '\0';
523 t++; 550 t++;
524 } 551 }
552 /*
553 * The default type of $comm should be "string", and it can't be
554 * dereferenced.
555 */
556 if (!t && strcmp(arg, "$comm") == 0)
557 t = "string";
525 parg->type = find_fetch_type(t, ftbl); 558 parg->type = find_fetch_type(t, ftbl);
526 if (!parg->type) { 559 if (!parg->type) {
527 pr_info("Unsupported type: %s\n", t); 560 pr_info("Unsupported type: %s\n", t);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index f6398db09114..45400ca5ded1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -102,6 +102,7 @@ enum {
102 FETCH_MTD_reg = 0, 102 FETCH_MTD_reg = 0,
103 FETCH_MTD_stack, 103 FETCH_MTD_stack,
104 FETCH_MTD_retval, 104 FETCH_MTD_retval,
105 FETCH_MTD_comm,
105 FETCH_MTD_memory, 106 FETCH_MTD_memory,
106 FETCH_MTD_symbol, 107 FETCH_MTD_symbol,
107 FETCH_MTD_deref, 108 FETCH_MTD_deref,
@@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield);
183#define fetch_bitfield_string NULL 184#define fetch_bitfield_string NULL
184#define fetch_bitfield_string_size NULL 185#define fetch_bitfield_string_size NULL
185 186
187/* comm only makes sense as a string */
188#define fetch_comm_u8 NULL
189#define fetch_comm_u16 NULL
190#define fetch_comm_u32 NULL
191#define fetch_comm_u64 NULL
192DECLARE_FETCH_FUNC(comm, string);
193DECLARE_FETCH_FUNC(comm, string_size);
194
186/* 195/*
187 * Define macro for basic types - we don't need to define s* types, because 196 * Define macro for basic types - we don't need to define s* types, because
188 * we have to care only about bitwidth at recording time. 197 * we have to care only about bitwidth at recording time.
@@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64)
213ASSIGN_FETCH_FUNC(reg, ftype), \ 222ASSIGN_FETCH_FUNC(reg, ftype), \
214ASSIGN_FETCH_FUNC(stack, ftype), \ 223ASSIGN_FETCH_FUNC(stack, ftype), \
215ASSIGN_FETCH_FUNC(retval, ftype), \ 224ASSIGN_FETCH_FUNC(retval, ftype), \
225ASSIGN_FETCH_FUNC(comm, ftype), \
216ASSIGN_FETCH_FUNC(memory, ftype), \ 226ASSIGN_FETCH_FUNC(memory, ftype), \
217ASSIGN_FETCH_FUNC(symbol, ftype), \ 227ASSIGN_FETCH_FUNC(symbol, ftype), \
218ASSIGN_FETCH_FUNC(deref, ftype), \ 228ASSIGN_FETCH_FUNC(deref, ftype), \
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9bafc211930c..68f594212759 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns)
938 return allowed; 938 return allowed;
939} 939}
940 940
941/*
942 * Returns true if the caller's user namespace is the same as or a
943 * descendant of @target_ns.
944 */
945bool current_in_userns(const struct user_namespace *target_ns)
946{
947 struct user_namespace *ns;
948 for (ns = current_user_ns(); ns; ns = ns->parent) {
949 if (ns == target_ns)
950 return true;
951 }
952 return false;
953}
954
941static inline struct user_namespace *to_user_ns(struct ns_common *ns) 955static inline struct user_namespace *to_user_ns(struct ns_common *ns)
942{ 956{
943 return container_of(ns, struct user_namespace, ns); 957 return container_of(ns, struct user_namespace, ns);
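current_in_userns() walks from the caller's user namespace up through the parent chain and reports whether @target_ns is reached, i.e. whether the caller sits in @target_ns or in a namespace nested beneath it. A small userspace model of that walk follows; the structure and names are illustrative only, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of a namespace hierarchy; not the kernel's struct. */
struct ns {
	struct ns *parent;
	const char *name;
};

/* Same shape as current_in_userns(): walk from @start up to the root. */
static bool in_ns(struct ns *start, const struct ns *target)
{
	for (struct ns *ns = start; ns; ns = ns->parent) {
		if (ns == target)
			return true;
	}
	return false;
}

int main(void)
{
	struct ns init_ns = { .parent = NULL,     .name = "init" };
	struct ns child   = { .parent = &init_ns, .name = "child" };
	struct ns grand   = { .parent = &child,   .name = "grandchild" };

	printf("%d\n", in_ns(&grand, &init_ns));	/* 1: descendant */
	printf("%d\n", in_ns(&init_ns, &child));	/* 0: ancestor, not descendant */
	return 0;
}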
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e1c0e996b5ae..ef071ca73fc3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4369,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq)
4369/** 4369/**
4370 * show_workqueue_state - dump workqueue state 4370 * show_workqueue_state - dump workqueue state
4371 * 4371 *
4372 * Called from a sysrq handler and prints out all busy workqueues and 4372 * Called from a sysrq handler or try_to_freeze_tasks() and prints out
4373 * pools. 4373 * all busy workqueues and pools.
4374 */ 4374 */
4375void show_workqueue_state(void) 4375void show_workqueue_state(void)
4376{ 4376{
@@ -4600,95 +4600,72 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4600 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) 4600 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
4601 return; 4601 return;
4602 4602
4603 /* is @cpu the only online CPU? */
4604 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); 4603 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
4605 if (cpumask_weight(&cpumask) != 1)
4606 return;
4607 4604
4608 /* as we're called from CPU_ONLINE, the following shouldn't fail */ 4605 /* as we're called from CPU_ONLINE, the following shouldn't fail */
4609 for_each_pool_worker(worker, pool) 4606 for_each_pool_worker(worker, pool)
4610 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 4607 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
4611 pool->attrs->cpumask) < 0);
4612} 4608}
4613 4609
4614/* 4610int workqueue_prepare_cpu(unsigned int cpu)
4615 * Workqueues should be brought up before normal priority CPU notifiers. 4611{
4616 * This will be registered high priority CPU notifier. 4612 struct worker_pool *pool;
4617 */ 4613
4618static int workqueue_cpu_up_callback(struct notifier_block *nfb, 4614 for_each_cpu_worker_pool(pool, cpu) {
4619 unsigned long action, 4615 if (pool->nr_workers)
4620 void *hcpu) 4616 continue;
4617 if (!create_worker(pool))
4618 return -ENOMEM;
4619 }
4620 return 0;
4621}
4622
4623int workqueue_online_cpu(unsigned int cpu)
4621{ 4624{
4622 int cpu = (unsigned long)hcpu;
4623 struct worker_pool *pool; 4625 struct worker_pool *pool;
4624 struct workqueue_struct *wq; 4626 struct workqueue_struct *wq;
4625 int pi; 4627 int pi;
4626 4628
4627 switch (action & ~CPU_TASKS_FROZEN) { 4629 mutex_lock(&wq_pool_mutex);
4628 case CPU_UP_PREPARE:
4629 for_each_cpu_worker_pool(pool, cpu) {
4630 if (pool->nr_workers)
4631 continue;
4632 if (!create_worker(pool))
4633 return NOTIFY_BAD;
4634 }
4635 break;
4636
4637 case CPU_DOWN_FAILED:
4638 case CPU_ONLINE:
4639 mutex_lock(&wq_pool_mutex);
4640 4630
4641 for_each_pool(pool, pi) { 4631 for_each_pool(pool, pi) {
4642 mutex_lock(&pool->attach_mutex); 4632 mutex_lock(&pool->attach_mutex);
4643 4633
4644 if (pool->cpu == cpu) 4634 if (pool->cpu == cpu)
4645 rebind_workers(pool); 4635 rebind_workers(pool);
4646 else if (pool->cpu < 0) 4636 else if (pool->cpu < 0)
4647 restore_unbound_workers_cpumask(pool, cpu); 4637 restore_unbound_workers_cpumask(pool, cpu);
4648 4638
4649 mutex_unlock(&pool->attach_mutex); 4639 mutex_unlock(&pool->attach_mutex);
4650 } 4640 }
4651 4641
4652 /* update NUMA affinity of unbound workqueues */ 4642 /* update NUMA affinity of unbound workqueues */
4653 list_for_each_entry(wq, &workqueues, list) 4643 list_for_each_entry(wq, &workqueues, list)
4654 wq_update_unbound_numa(wq, cpu, true); 4644 wq_update_unbound_numa(wq, cpu, true);
4655 4645
4656 mutex_unlock(&wq_pool_mutex); 4646 mutex_unlock(&wq_pool_mutex);
4657 break; 4647 return 0;
4658 }
4659 return NOTIFY_OK;
4660} 4648}
4661 4649
4662/* 4650int workqueue_offline_cpu(unsigned int cpu)
4663 * Workqueues should be brought down after normal priority CPU notifiers.
4664 * This will be registered as low priority CPU notifier.
4665 */
4666static int workqueue_cpu_down_callback(struct notifier_block *nfb,
4667 unsigned long action,
4668 void *hcpu)
4669{ 4651{
4670 int cpu = (unsigned long)hcpu;
4671 struct work_struct unbind_work; 4652 struct work_struct unbind_work;
4672 struct workqueue_struct *wq; 4653 struct workqueue_struct *wq;
4673 4654
4674 switch (action & ~CPU_TASKS_FROZEN) { 4655 /* unbinding per-cpu workers should happen on the local CPU */
4675 case CPU_DOWN_PREPARE: 4656 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
4676 /* unbinding per-cpu workers should happen on the local CPU */ 4657 queue_work_on(cpu, system_highpri_wq, &unbind_work);
4677 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); 4658
4678 queue_work_on(cpu, system_highpri_wq, &unbind_work); 4659 /* update NUMA affinity of unbound workqueues */
4679 4660 mutex_lock(&wq_pool_mutex);
4680 /* update NUMA affinity of unbound workqueues */ 4661 list_for_each_entry(wq, &workqueues, list)
4681 mutex_lock(&wq_pool_mutex); 4662 wq_update_unbound_numa(wq, cpu, false);
4682 list_for_each_entry(wq, &workqueues, list) 4663 mutex_unlock(&wq_pool_mutex);
4683 wq_update_unbound_numa(wq, cpu, false); 4664
4684 mutex_unlock(&wq_pool_mutex); 4665 /* wait for per-cpu unbinding to finish */
4685 4666 flush_work(&unbind_work);
4686 /* wait for per-cpu unbinding to finish */ 4667 destroy_work_on_stack(&unbind_work);
4687 flush_work(&unbind_work); 4668 return 0;
4688 destroy_work_on_stack(&unbind_work);
4689 break;
4690 }
4691 return NOTIFY_OK;
4692} 4669}
4693 4670
4694#ifdef CONFIG_SMP 4671#ifdef CONFIG_SMP
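The old up/down notifiers become three plain hotplug callbacks that return 0 or a negative errno instead of NOTIFY_* values: workqueue_prepare_cpu() creates per-cpu workers, workqueue_online_cpu() rebinds workers and refreshes unbound NUMA affinity, and workqueue_offline_cpu() unbinds per-cpu workers. They are presumably wired to fixed hotplug states elsewhere in this series (not shown here). As a generic illustration of the same callback style, a module can hook a dynamic state as sketched below; this assumes a kernel that already provides cpuhp_setup_state(), and all names are illustrative.

#include <linux/cpuhotplug.h>
#include <linux/module.h>

static enum cpuhp_state hp_state;

static int demo_online(unsigned int cpu)
{
	pr_info("demo: cpu%u online\n", cpu);
	return 0;			/* a negative errno would fail the bring-up */
}

static int demo_offline(unsigned int cpu)
{
	pr_info("demo: cpu%u going down\n", cpu);
	return 0;
}

static int __init demo_init(void)
{
	int ret;

	/* Dynamic state: the core picks a slot and runs demo_online() on each CPU. */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
				demo_online, demo_offline);
	if (ret < 0)
		return ret;
	hp_state = ret;
	return 0;
}

static void __exit demo_exit(void)
{
	cpuhp_remove_state(hp_state);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");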
@@ -5490,9 +5467,6 @@ static int __init init_workqueues(void)
5490 5467
5491 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5468 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
5492 5469
5493 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
5494 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
5495
5496 wq_numa_init(); 5470 wq_numa_init();
5497 5471
5498 /* initialize CPU pools */ 5472 /* initialize CPU pools */