Diffstat (limited to 'kernel'): 126 files changed, 6575 insertions(+), 2605 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e2b952..eb26e12c6c2a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \ | |||
| 9 | extable.o params.o \ | 9 | extable.o params.o \ |
| 10 | kthread.o sys_ni.o nsproxy.o \ | 10 | kthread.o sys_ni.o nsproxy.o \ |
| 11 | notifier.o ksysfs.o cred.o reboot.o \ | 11 | notifier.o ksysfs.o cred.o reboot.o \ |
| 12 | async.o range.o smpboot.o | 12 | async.o range.o smpboot.o ucount.o |
| 13 | 13 | ||
| 14 | obj-$(CONFIG_MULTIUSER) += groups.o | 14 | obj-$(CONFIG_MULTIUSER) += groups.o |
| 15 | 15 | ||
diff --git a/kernel/audit.c b/kernel/audit.c
index a8a91bd2b2a9..f1ca11613379 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -877,6 +877,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 877 | return err; | 877 | return err; |
| 878 | } | 878 | } |
| 879 | if (s.mask & AUDIT_STATUS_PID) { | 879 | if (s.mask & AUDIT_STATUS_PID) { |
| 880 | /* NOTE: we are using task_tgid_vnr() below because | ||
| 881 | * the s.pid value is relative to the namespace | ||
| 882 | * of the caller; at present this doesn't matter | ||
| 883 | * much since you can really only run auditd | ||
| 884 | * from the initial pid namespace, but something | ||
| 885 | * to keep in mind if this changes */ | ||
| 880 | int new_pid = s.pid; | 886 | int new_pid = s.pid; |
| 881 | pid_t requesting_pid = task_tgid_vnr(current); | 887 | pid_t requesting_pid = task_tgid_vnr(current); |
| 882 | 888 | ||
| @@ -1917,7 +1923,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
| 1917 | " euid=%u suid=%u fsuid=%u" | 1923 | " euid=%u suid=%u fsuid=%u" |
| 1918 | " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", | 1924 | " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", |
| 1919 | task_ppid_nr(tsk), | 1925 | task_ppid_nr(tsk), |
| 1920 | task_pid_nr(tsk), | 1926 | task_tgid_nr(tsk), |
| 1921 | from_kuid(&init_user_ns, audit_get_loginuid(tsk)), | 1927 | from_kuid(&init_user_ns, audit_get_loginuid(tsk)), |
| 1922 | from_kuid(&init_user_ns, cred->uid), | 1928 | from_kuid(&init_user_ns, cred->uid), |
| 1923 | from_kgid(&init_user_ns, cred->gid), | 1929 | from_kgid(&init_user_ns, cred->gid), |
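The audit.c hunk above (and several auditsc.c hunks below) switch the logged pid fields from task_pid_nr() to task_tgid_nr(), so a multithreaded process is reported by its thread-group (process) id instead of whichever thread happened to trigger the record. A minimal userspace sketch of the distinction, not part of the patch (the program and its names are made up for illustration):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Every thread shares one thread-group id (what getpid() returns and what
 * task_tgid_nr() reports in the kernel) but has its own thread id (what
 * gettid()/task_pid_nr() report), so logging the tgid identifies the
 * process regardless of which thread raised the event.
 */
static void *worker(void *arg)
{
	(void)arg;
	printf("worker: tgid=%d tid=%ld\n", (int)getpid(), syscall(SYS_gettid));
	return NULL;
}

int main(void)
{
	pthread_t t;

	printf("main:   tgid=%d tid=%ld\n", (int)getpid(), syscall(SYS_gettid));
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	return 0;
}

Built with cc -pthread, both lines print the same tgid but different tids.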
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d6709eb70970..0d302a87f21b 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
| @@ -19,6 +19,7 @@ | |||
| 19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 20 | */ | 20 | */ |
| 21 | 21 | ||
| 22 | #include <linux/file.h> | ||
| 22 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
| 23 | #include <linux/audit.h> | 24 | #include <linux/audit.h> |
| 24 | #include <linux/kthread.h> | 25 | #include <linux/kthread.h> |
| @@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) | |||
| 544 | unsigned long ino; | 545 | unsigned long ino; |
| 545 | dev_t dev; | 546 | dev_t dev; |
| 546 | 547 | ||
| 547 | rcu_read_lock(); | 548 | exe_file = get_task_exe_file(tsk); |
| 548 | exe_file = rcu_dereference(tsk->mm->exe_file); | 549 | if (!exe_file) |
| 550 | return 0; | ||
| 549 | ino = exe_file->f_inode->i_ino; | 551 | ino = exe_file->f_inode->i_ino; |
| 550 | dev = exe_file->f_inode->i_sb->s_dev; | 552 | dev = exe_file->f_inode->i_sb->s_dev; |
| 551 | rcu_read_unlock(); | 553 | fput(exe_file); |
| 552 | return audit_mark_compare(mark, ino, dev); | 554 | return audit_mark_compare(mark, ino, dev); |
| 553 | } | 555 | } |
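The audit_watch.c change replaces a bare rcu_dereference() of tsk->mm->exe_file, which can race with the task tearing down its mm, with get_task_exe_file(), which returns a referenced struct file (or NULL) and must be paired with fput(). A sketch of that take/use/put pattern as a kernel-style helper (assumes <linux/file.h>, <linux/mm.h> and the 4.x in-kernel API; task_exe_ino_dev() is a made-up name):

/* Sketch: look up the inode number and device of a task's executable.
 * get_task_exe_file() takes a reference on the struct file (or returns
 * NULL for kernel threads / exiting tasks), so f_inode can be read
 * safely and the reference must be dropped with fput() afterwards.
 */
static int task_exe_ino_dev(struct task_struct *tsk,
			    unsigned long *ino, dev_t *dev)
{
	struct file *exe_file;

	exe_file = get_task_exe_file(tsk);
	if (!exe_file)
		return -ENOENT;

	*ino = exe_file->f_inode->i_ino;
	*dev = exe_file->f_inode->i_sb->s_dev;
	fput(exe_file);
	return 0;
}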
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 5abf1dc1f91c..2cd5256dbff7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -457,7 +457,7 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 457 | 457 | ||
| 458 | switch (f->type) { | 458 | switch (f->type) { |
| 459 | case AUDIT_PID: | 459 | case AUDIT_PID: |
| 460 | pid = task_pid_nr(tsk); | 460 | pid = task_tgid_nr(tsk); |
| 461 | result = audit_comparator(pid, f->op, f->val); | 461 | result = audit_comparator(pid, f->op, f->val); |
| 462 | break; | 462 | break; |
| 463 | case AUDIT_PPID: | 463 | case AUDIT_PPID: |
| @@ -1993,7 +1993,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, | |||
| 1993 | loginuid = from_kuid(&init_user_ns, kloginuid), | 1993 | loginuid = from_kuid(&init_user_ns, kloginuid), |
| 1994 | tty = audit_get_tty(current); | 1994 | tty = audit_get_tty(current); |
| 1995 | 1995 | ||
| 1996 | audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); | 1996 | audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); |
| 1997 | audit_log_task_context(ab); | 1997 | audit_log_task_context(ab); |
| 1998 | audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", | 1998 | audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", |
| 1999 | oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", | 1999 | oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", |
| @@ -2220,7 +2220,7 @@ void __audit_ptrace(struct task_struct *t) | |||
| 2220 | { | 2220 | { |
| 2221 | struct audit_context *context = current->audit_context; | 2221 | struct audit_context *context = current->audit_context; |
| 2222 | 2222 | ||
| 2223 | context->target_pid = task_pid_nr(t); | 2223 | context->target_pid = task_tgid_nr(t); |
| 2224 | context->target_auid = audit_get_loginuid(t); | 2224 | context->target_auid = audit_get_loginuid(t); |
| 2225 | context->target_uid = task_uid(t); | 2225 | context->target_uid = task_uid(t); |
| 2226 | context->target_sessionid = audit_get_sessionid(t); | 2226 | context->target_sessionid = audit_get_sessionid(t); |
| @@ -2245,7 +2245,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
| 2245 | 2245 | ||
| 2246 | if (audit_pid && t->tgid == audit_pid) { | 2246 | if (audit_pid && t->tgid == audit_pid) { |
| 2247 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { | 2247 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { |
| 2248 | audit_sig_pid = task_pid_nr(tsk); | 2248 | audit_sig_pid = task_tgid_nr(tsk); |
| 2249 | if (uid_valid(tsk->loginuid)) | 2249 | if (uid_valid(tsk->loginuid)) |
| 2250 | audit_sig_uid = tsk->loginuid; | 2250 | audit_sig_uid = tsk->loginuid; |
| 2251 | else | 2251 | else |
| @@ -2345,7 +2345,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, | |||
| 2345 | void __audit_log_capset(const struct cred *new, const struct cred *old) | 2345 | void __audit_log_capset(const struct cred *new, const struct cred *old) |
| 2346 | { | 2346 | { |
| 2347 | struct audit_context *context = current->audit_context; | 2347 | struct audit_context *context = current->audit_context; |
| 2348 | context->capset.pid = task_pid_nr(current); | 2348 | context->capset.pid = task_tgid_nr(current); |
| 2349 | context->capset.cap.effective = new->cap_effective; | 2349 | context->capset.cap.effective = new->cap_effective; |
| 2350 | context->capset.cap.inheritable = new->cap_effective; | 2350 | context->capset.cap.inheritable = new->cap_effective; |
| 2351 | context->capset.cap.permitted = new->cap_permitted; | 2351 | context->capset.cap.permitted = new->cap_permitted; |
| @@ -2377,7 +2377,7 @@ static void audit_log_task(struct audit_buffer *ab) | |||
| 2377 | from_kgid(&init_user_ns, gid), | 2377 | from_kgid(&init_user_ns, gid), |
| 2378 | sessionid); | 2378 | sessionid); |
| 2379 | audit_log_task_context(ab); | 2379 | audit_log_task_context(ab); |
| 2380 | audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); | 2380 | audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); |
| 2381 | audit_log_untrustedstring(ab, get_task_comm(comm, current)); | 2381 | audit_log_untrustedstring(ab, get_task_comm(comm, current)); |
| 2382 | audit_log_d_path_exe(ab, current->mm); | 2382 | audit_log_d_path_exe(ab, current->mm); |
| 2383 | } | 2383 | } |
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650d7aeb..a2ac051c342f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
| @@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void) | |||
| 538 | } | 538 | } |
| 539 | late_initcall(register_perf_event_array_map); | 539 | late_initcall(register_perf_event_array_map); |
| 540 | 540 | ||
| 541 | #ifdef CONFIG_SOCK_CGROUP_DATA | 541 | #ifdef CONFIG_CGROUPS |
| 542 | static void *cgroup_fd_array_get_ptr(struct bpf_map *map, | 542 | static void *cgroup_fd_array_get_ptr(struct bpf_map *map, |
| 543 | struct file *map_file /* not used */, | 543 | struct file *map_file /* not used */, |
| 544 | int fd) | 544 | int fd) |
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 03fd23d4d587..aa6d98154106 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
| @@ -1018,7 +1018,7 @@ void bpf_user_rnd_init_once(void) | |||
| 1018 | prandom_init_once(&bpf_user_rnd_state); | 1018 | prandom_init_once(&bpf_user_rnd_state); |
| 1019 | } | 1019 | } |
| 1020 | 1020 | ||
| 1021 | u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 1021 | BPF_CALL_0(bpf_user_rnd_u32) |
| 1022 | { | 1022 | { |
| 1023 | /* Should someone ever have the rather unwise idea to use some | 1023 | /* Should someone ever have the rather unwise idea to use some |
| 1024 | * of the registers passed into this function, then note that | 1024 | * of the registers passed into this function, then note that |
| @@ -1031,7 +1031,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
| 1031 | 1031 | ||
| 1032 | state = &get_cpu_var(bpf_user_rnd_state); | 1032 | state = &get_cpu_var(bpf_user_rnd_state); |
| 1033 | res = prandom_u32_state(state); | 1033 | res = prandom_u32_state(state); |
| 1034 | put_cpu_var(state); | 1034 | put_cpu_var(bpf_user_rnd_state); |
| 1035 | 1035 | ||
| 1036 | return res; | 1036 | return res; |
| 1037 | } | 1037 | } |
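The second core.c hunk fixes the get_cpu_var()/put_cpu_var() pairing: both macros take the per-cpu variable itself, so the put must name bpf_user_rnd_state rather than the local pointer. A small sketch of the idiom under the same assumptions (<linux/percpu.h>, <linux/random.h>; demo_rnd_state and demo_next_random are made-up names):

/* Sketch of the get_cpu_var()/put_cpu_var() idiom: both macros name the
 * per-cpu variable (not a pointer to it).  get_cpu_var() disables
 * preemption and yields this CPU's instance; put_cpu_var() re-enables
 * preemption for that same variable.
 */
static DEFINE_PER_CPU(struct rnd_state, demo_rnd_state);

static u32 demo_next_random(void)
{
	struct rnd_state *state;
	u32 res;

	state = &get_cpu_var(demo_rnd_state);
	res = prandom_u32_state(state);
	put_cpu_var(demo_rnd_state);

	return res;
}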
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fff3650d52fc..570eeca7bdfa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
| @@ -26,11 +26,18 @@ struct bpf_htab { | |||
| 26 | struct bucket *buckets; | 26 | struct bucket *buckets; |
| 27 | void *elems; | 27 | void *elems; |
| 28 | struct pcpu_freelist freelist; | 28 | struct pcpu_freelist freelist; |
| 29 | void __percpu *extra_elems; | ||
| 29 | atomic_t count; /* number of elements in this hashtable */ | 30 | atomic_t count; /* number of elements in this hashtable */ |
| 30 | u32 n_buckets; /* number of hash buckets */ | 31 | u32 n_buckets; /* number of hash buckets */ |
| 31 | u32 elem_size; /* size of each element in bytes */ | 32 | u32 elem_size; /* size of each element in bytes */ |
| 32 | }; | 33 | }; |
| 33 | 34 | ||
| 35 | enum extra_elem_state { | ||
| 36 | HTAB_NOT_AN_EXTRA_ELEM = 0, | ||
| 37 | HTAB_EXTRA_ELEM_FREE, | ||
| 38 | HTAB_EXTRA_ELEM_USED | ||
| 39 | }; | ||
| 40 | |||
| 34 | /* each htab element is struct htab_elem + key + value */ | 41 | /* each htab element is struct htab_elem + key + value */ |
| 35 | struct htab_elem { | 42 | struct htab_elem { |
| 36 | union { | 43 | union { |
| @@ -38,7 +45,10 @@ struct htab_elem { | |||
| 38 | struct bpf_htab *htab; | 45 | struct bpf_htab *htab; |
| 39 | struct pcpu_freelist_node fnode; | 46 | struct pcpu_freelist_node fnode; |
| 40 | }; | 47 | }; |
| 41 | struct rcu_head rcu; | 48 | union { |
| 49 | struct rcu_head rcu; | ||
| 50 | enum extra_elem_state state; | ||
| 51 | }; | ||
| 42 | u32 hash; | 52 | u32 hash; |
| 43 | char key[0] __aligned(8); | 53 | char key[0] __aligned(8); |
| 44 | }; | 54 | }; |
| @@ -113,6 +123,23 @@ free_elems: | |||
| 113 | return err; | 123 | return err; |
| 114 | } | 124 | } |
| 115 | 125 | ||
| 126 | static int alloc_extra_elems(struct bpf_htab *htab) | ||
| 127 | { | ||
| 128 | void __percpu *pptr; | ||
| 129 | int cpu; | ||
| 130 | |||
| 131 | pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); | ||
| 132 | if (!pptr) | ||
| 133 | return -ENOMEM; | ||
| 134 | |||
| 135 | for_each_possible_cpu(cpu) { | ||
| 136 | ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state = | ||
| 137 | HTAB_EXTRA_ELEM_FREE; | ||
| 138 | } | ||
| 139 | htab->extra_elems = pptr; | ||
| 140 | return 0; | ||
| 141 | } | ||
| 142 | |||
| 116 | /* Called from syscall */ | 143 | /* Called from syscall */ |
| 117 | static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | 144 | static struct bpf_map *htab_map_alloc(union bpf_attr *attr) |
| 118 | { | 145 | { |
| @@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
| 185 | if (percpu) | 212 | if (percpu) |
| 186 | cost += (u64) round_up(htab->map.value_size, 8) * | 213 | cost += (u64) round_up(htab->map.value_size, 8) * |
| 187 | num_possible_cpus() * htab->map.max_entries; | 214 | num_possible_cpus() * htab->map.max_entries; |
| 215 | else | ||
| 216 | cost += (u64) htab->elem_size * num_possible_cpus(); | ||
| 188 | 217 | ||
| 189 | if (cost >= U32_MAX - PAGE_SIZE) | 218 | if (cost >= U32_MAX - PAGE_SIZE) |
| 190 | /* make sure page count doesn't overflow */ | 219 | /* make sure page count doesn't overflow */ |
| @@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
| 212 | raw_spin_lock_init(&htab->buckets[i].lock); | 241 | raw_spin_lock_init(&htab->buckets[i].lock); |
| 213 | } | 242 | } |
| 214 | 243 | ||
| 244 | if (!percpu) { | ||
| 245 | err = alloc_extra_elems(htab); | ||
| 246 | if (err) | ||
| 247 | goto free_buckets; | ||
| 248 | } | ||
| 249 | |||
| 215 | if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { | 250 | if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { |
| 216 | err = prealloc_elems_and_freelist(htab); | 251 | err = prealloc_elems_and_freelist(htab); |
| 217 | if (err) | 252 | if (err) |
| 218 | goto free_buckets; | 253 | goto free_extra_elems; |
| 219 | } | 254 | } |
| 220 | 255 | ||
| 221 | return &htab->map; | 256 | return &htab->map; |
| 222 | 257 | ||
| 258 | free_extra_elems: | ||
| 259 | free_percpu(htab->extra_elems); | ||
| 223 | free_buckets: | 260 | free_buckets: |
| 224 | kvfree(htab->buckets); | 261 | kvfree(htab->buckets); |
| 225 | free_htab: | 262 | free_htab: |
| @@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) | |||
| 349 | if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) | 386 | if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) |
| 350 | free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); | 387 | free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); |
| 351 | kfree(l); | 388 | kfree(l); |
| 352 | |||
| 353 | } | 389 | } |
| 354 | 390 | ||
| 355 | static void htab_elem_free_rcu(struct rcu_head *head) | 391 | static void htab_elem_free_rcu(struct rcu_head *head) |
| @@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head) | |||
| 370 | 406 | ||
| 371 | static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) | 407 | static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) |
| 372 | { | 408 | { |
| 409 | if (l->state == HTAB_EXTRA_ELEM_USED) { | ||
| 410 | l->state = HTAB_EXTRA_ELEM_FREE; | ||
| 411 | return; | ||
| 412 | } | ||
| 413 | |||
| 373 | if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { | 414 | if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { |
| 374 | pcpu_freelist_push(&htab->freelist, &l->fnode); | 415 | pcpu_freelist_push(&htab->freelist, &l->fnode); |
| 375 | } else { | 416 | } else { |
| @@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) | |||
| 381 | 422 | ||
| 382 | static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | 423 | static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, |
| 383 | void *value, u32 key_size, u32 hash, | 424 | void *value, u32 key_size, u32 hash, |
| 384 | bool percpu, bool onallcpus) | 425 | bool percpu, bool onallcpus, |
| 426 | bool old_elem_exists) | ||
| 385 | { | 427 | { |
| 386 | u32 size = htab->map.value_size; | 428 | u32 size = htab->map.value_size; |
| 387 | bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); | 429 | bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); |
| 388 | struct htab_elem *l_new; | 430 | struct htab_elem *l_new; |
| 389 | void __percpu *pptr; | 431 | void __percpu *pptr; |
| 432 | int err = 0; | ||
| 390 | 433 | ||
| 391 | if (prealloc) { | 434 | if (prealloc) { |
| 392 | l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); | 435 | l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); |
| 393 | if (!l_new) | 436 | if (!l_new) |
| 394 | return ERR_PTR(-E2BIG); | 437 | err = -E2BIG; |
| 395 | } else { | 438 | } else { |
| 396 | if (atomic_inc_return(&htab->count) > htab->map.max_entries) { | 439 | if (atomic_inc_return(&htab->count) > htab->map.max_entries) { |
| 397 | atomic_dec(&htab->count); | 440 | atomic_dec(&htab->count); |
| 398 | return ERR_PTR(-E2BIG); | 441 | err = -E2BIG; |
| 442 | } else { | ||
| 443 | l_new = kmalloc(htab->elem_size, | ||
| 444 | GFP_ATOMIC | __GFP_NOWARN); | ||
| 445 | if (!l_new) | ||
| 446 | return ERR_PTR(-ENOMEM); | ||
| 399 | } | 447 | } |
| 400 | l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); | 448 | } |
| 401 | if (!l_new) | 449 | |
| 402 | return ERR_PTR(-ENOMEM); | 450 | if (err) { |
| 451 | if (!old_elem_exists) | ||
| 452 | return ERR_PTR(err); | ||
| 453 | |||
| 454 | /* if we're updating the existing element and the hash table | ||
| 455 | * is full, use per-cpu extra elems | ||
| 456 | */ | ||
| 457 | l_new = this_cpu_ptr(htab->extra_elems); | ||
| 458 | if (l_new->state != HTAB_EXTRA_ELEM_FREE) | ||
| 459 | return ERR_PTR(-E2BIG); | ||
| 460 | l_new->state = HTAB_EXTRA_ELEM_USED; | ||
| 461 | } else { | ||
| 462 | l_new->state = HTAB_NOT_AN_EXTRA_ELEM; | ||
| 403 | } | 463 | } |
| 404 | 464 | ||
| 405 | memcpy(l_new->key, key, key_size); | 465 | memcpy(l_new->key, key, key_size); |
| @@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, | |||
| 489 | if (ret) | 549 | if (ret) |
| 490 | goto err; | 550 | goto err; |
| 491 | 551 | ||
| 492 | l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); | 552 | l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, |
| 553 | !!l_old); | ||
| 493 | if (IS_ERR(l_new)) { | 554 | if (IS_ERR(l_new)) { |
| 494 | /* all pre-allocated elements are in use or memory exhausted */ | 555 | /* all pre-allocated elements are in use or memory exhausted */ |
| 495 | ret = PTR_ERR(l_new); | 556 | ret = PTR_ERR(l_new); |
| @@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, | |||
| 563 | } | 624 | } |
| 564 | } else { | 625 | } else { |
| 565 | l_new = alloc_htab_elem(htab, key, value, key_size, | 626 | l_new = alloc_htab_elem(htab, key, value, key_size, |
| 566 | hash, true, onallcpus); | 627 | hash, true, onallcpus, false); |
| 567 | if (IS_ERR(l_new)) { | 628 | if (IS_ERR(l_new)) { |
| 568 | ret = PTR_ERR(l_new); | 629 | ret = PTR_ERR(l_new); |
| 569 | goto err; | 630 | goto err; |
| @@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map) | |||
| 652 | htab_free_elems(htab); | 713 | htab_free_elems(htab); |
| 653 | pcpu_freelist_destroy(&htab->freelist); | 714 | pcpu_freelist_destroy(&htab->freelist); |
| 654 | } | 715 | } |
| 716 | free_percpu(htab->extra_elems); | ||
| 655 | kvfree(htab->buckets); | 717 | kvfree(htab->buckets); |
| 656 | kfree(htab); | 718 | kfree(htab); |
| 657 | } | 719 | } |
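The hashtab.c changes add one pre-allocated extra element per CPU so that updating an existing key still succeeds when the map is already at max_entries; the replaced element is freed right after the new one is linked in, so the element count does not grow. A simplified sketch of that allocation policy for the pre-allocated case, following the names used in the diff rather than reproducing the exact kernel code:

/* Simplified sketch of the element-allocation policy for a pre-allocated
 * hash map: when the freelist is empty the map is full, but an update
 * that replaces an existing key may borrow this CPU's extra element
 * instead of failing with -E2BIG, because the replaced element is freed
 * right after the new one is inserted.
 */
static struct htab_elem *pick_new_elem(struct bpf_htab *htab,
				       bool old_elem_exists)
{
	struct htab_elem *l_new;

	l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
	if (l_new) {
		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
		return l_new;
	}

	if (!old_elem_exists)
		return ERR_PTR(-E2BIG);

	l_new = this_cpu_ptr(htab->extra_elems);
	if (l_new->state != HTAB_EXTRA_ELEM_FREE)
		return ERR_PTR(-E2BIG);
	l_new->state = HTAB_EXTRA_ELEM_USED;
	return l_new;
}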
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ea3afba1a4f..39918402e6e9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/ktime.h> | 16 | #include <linux/ktime.h> |
| 17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
| 18 | #include <linux/uidgid.h> | 18 | #include <linux/uidgid.h> |
| 19 | #include <linux/filter.h> | ||
| 19 | 20 | ||
| 20 | /* If kernel subsystem is allowing eBPF programs to call this function, | 21 | /* If kernel subsystem is allowing eBPF programs to call this function, |
| 21 | * inside its own verifier_ops->get_func_proto() callback it should return | 22 | * inside its own verifier_ops->get_func_proto() callback it should return |
| @@ -26,48 +27,32 @@ | |||
| 26 | * if program is allowed to access maps, so check rcu_read_lock_held in | 27 | * if program is allowed to access maps, so check rcu_read_lock_held in |
| 27 | * all three functions. | 28 | * all three functions. |
| 28 | */ | 29 | */ |
| 29 | static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 30 | BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) |
| 30 | { | 31 | { |
| 31 | /* verifier checked that R1 contains a valid pointer to bpf_map | ||
| 32 | * and R2 points to a program stack and map->key_size bytes were | ||
| 33 | * initialized | ||
| 34 | */ | ||
| 35 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 36 | void *key = (void *) (unsigned long) r2; | ||
| 37 | void *value; | ||
| 38 | |||
| 39 | WARN_ON_ONCE(!rcu_read_lock_held()); | 32 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 40 | 33 | return (unsigned long) map->ops->map_lookup_elem(map, key); | |
| 41 | value = map->ops->map_lookup_elem(map, key); | ||
| 42 | |||
| 43 | /* lookup() returns either pointer to element value or NULL | ||
| 44 | * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type | ||
| 45 | */ | ||
| 46 | return (unsigned long) value; | ||
| 47 | } | 34 | } |
| 48 | 35 | ||
| 49 | const struct bpf_func_proto bpf_map_lookup_elem_proto = { | 36 | const struct bpf_func_proto bpf_map_lookup_elem_proto = { |
| 50 | .func = bpf_map_lookup_elem, | 37 | .func = bpf_map_lookup_elem, |
| 51 | .gpl_only = false, | 38 | .gpl_only = false, |
| 39 | .pkt_access = true, | ||
| 52 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | 40 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, |
| 53 | .arg1_type = ARG_CONST_MAP_PTR, | 41 | .arg1_type = ARG_CONST_MAP_PTR, |
| 54 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 42 | .arg2_type = ARG_PTR_TO_MAP_KEY, |
| 55 | }; | 43 | }; |
| 56 | 44 | ||
| 57 | static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 45 | BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, |
| 46 | void *, value, u64, flags) | ||
| 58 | { | 47 | { |
| 59 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 60 | void *key = (void *) (unsigned long) r2; | ||
| 61 | void *value = (void *) (unsigned long) r3; | ||
| 62 | |||
| 63 | WARN_ON_ONCE(!rcu_read_lock_held()); | 48 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 64 | 49 | return map->ops->map_update_elem(map, key, value, flags); | |
| 65 | return map->ops->map_update_elem(map, key, value, r4); | ||
| 66 | } | 50 | } |
| 67 | 51 | ||
| 68 | const struct bpf_func_proto bpf_map_update_elem_proto = { | 52 | const struct bpf_func_proto bpf_map_update_elem_proto = { |
| 69 | .func = bpf_map_update_elem, | 53 | .func = bpf_map_update_elem, |
| 70 | .gpl_only = false, | 54 | .gpl_only = false, |
| 55 | .pkt_access = true, | ||
| 71 | .ret_type = RET_INTEGER, | 56 | .ret_type = RET_INTEGER, |
| 72 | .arg1_type = ARG_CONST_MAP_PTR, | 57 | .arg1_type = ARG_CONST_MAP_PTR, |
| 73 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 58 | .arg2_type = ARG_PTR_TO_MAP_KEY, |
| @@ -75,19 +60,16 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { | |||
| 75 | .arg4_type = ARG_ANYTHING, | 60 | .arg4_type = ARG_ANYTHING, |
| 76 | }; | 61 | }; |
| 77 | 62 | ||
| 78 | static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 63 | BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) |
| 79 | { | 64 | { |
| 80 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 81 | void *key = (void *) (unsigned long) r2; | ||
| 82 | |||
| 83 | WARN_ON_ONCE(!rcu_read_lock_held()); | 65 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 84 | |||
| 85 | return map->ops->map_delete_elem(map, key); | 66 | return map->ops->map_delete_elem(map, key); |
| 86 | } | 67 | } |
| 87 | 68 | ||
| 88 | const struct bpf_func_proto bpf_map_delete_elem_proto = { | 69 | const struct bpf_func_proto bpf_map_delete_elem_proto = { |
| 89 | .func = bpf_map_delete_elem, | 70 | .func = bpf_map_delete_elem, |
| 90 | .gpl_only = false, | 71 | .gpl_only = false, |
| 72 | .pkt_access = true, | ||
| 91 | .ret_type = RET_INTEGER, | 73 | .ret_type = RET_INTEGER, |
| 92 | .arg1_type = ARG_CONST_MAP_PTR, | 74 | .arg1_type = ARG_CONST_MAP_PTR, |
| 93 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 75 | .arg2_type = ARG_PTR_TO_MAP_KEY, |
| @@ -99,7 +81,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { | |||
| 99 | .ret_type = RET_INTEGER, | 81 | .ret_type = RET_INTEGER, |
| 100 | }; | 82 | }; |
| 101 | 83 | ||
| 102 | static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 84 | BPF_CALL_0(bpf_get_smp_processor_id) |
| 103 | { | 85 | { |
| 104 | return smp_processor_id(); | 86 | return smp_processor_id(); |
| 105 | } | 87 | } |
| @@ -110,7 +92,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { | |||
| 110 | .ret_type = RET_INTEGER, | 92 | .ret_type = RET_INTEGER, |
| 111 | }; | 93 | }; |
| 112 | 94 | ||
| 113 | static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 95 | BPF_CALL_0(bpf_ktime_get_ns) |
| 114 | { | 96 | { |
| 115 | /* NMI safe access to clock monotonic */ | 97 | /* NMI safe access to clock monotonic */ |
| 116 | return ktime_get_mono_fast_ns(); | 98 | return ktime_get_mono_fast_ns(); |
| @@ -122,11 +104,11 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { | |||
| 122 | .ret_type = RET_INTEGER, | 104 | .ret_type = RET_INTEGER, |
| 123 | }; | 105 | }; |
| 124 | 106 | ||
| 125 | static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 107 | BPF_CALL_0(bpf_get_current_pid_tgid) |
| 126 | { | 108 | { |
| 127 | struct task_struct *task = current; | 109 | struct task_struct *task = current; |
| 128 | 110 | ||
| 129 | if (!task) | 111 | if (unlikely(!task)) |
| 130 | return -EINVAL; | 112 | return -EINVAL; |
| 131 | 113 | ||
| 132 | return (u64) task->tgid << 32 | task->pid; | 114 | return (u64) task->tgid << 32 | task->pid; |
| @@ -138,18 +120,18 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { | |||
| 138 | .ret_type = RET_INTEGER, | 120 | .ret_type = RET_INTEGER, |
| 139 | }; | 121 | }; |
| 140 | 122 | ||
| 141 | static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 123 | BPF_CALL_0(bpf_get_current_uid_gid) |
| 142 | { | 124 | { |
| 143 | struct task_struct *task = current; | 125 | struct task_struct *task = current; |
| 144 | kuid_t uid; | 126 | kuid_t uid; |
| 145 | kgid_t gid; | 127 | kgid_t gid; |
| 146 | 128 | ||
| 147 | if (!task) | 129 | if (unlikely(!task)) |
| 148 | return -EINVAL; | 130 | return -EINVAL; |
| 149 | 131 | ||
| 150 | current_uid_gid(&uid, &gid); | 132 | current_uid_gid(&uid, &gid); |
| 151 | return (u64) from_kgid(&init_user_ns, gid) << 32 | | 133 | return (u64) from_kgid(&init_user_ns, gid) << 32 | |
| 152 | from_kuid(&init_user_ns, uid); | 134 | from_kuid(&init_user_ns, uid); |
| 153 | } | 135 | } |
| 154 | 136 | ||
| 155 | const struct bpf_func_proto bpf_get_current_uid_gid_proto = { | 137 | const struct bpf_func_proto bpf_get_current_uid_gid_proto = { |
| @@ -158,10 +140,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = { | |||
| 158 | .ret_type = RET_INTEGER, | 140 | .ret_type = RET_INTEGER, |
| 159 | }; | 141 | }; |
| 160 | 142 | ||
| 161 | static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) | 143 | BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) |
| 162 | { | 144 | { |
| 163 | struct task_struct *task = current; | 145 | struct task_struct *task = current; |
| 164 | char *buf = (char *) (long) r1; | ||
| 165 | 146 | ||
| 166 | if (unlikely(!task)) | 147 | if (unlikely(!task)) |
| 167 | goto err_clear; | 148 | goto err_clear; |
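The helpers.c rewrite converts the helpers from the raw u64 fn(u64 r1, ..., u64 r5) calling convention to the BPF_CALL_n() macros from <linux/filter.h>, which generate that wrapper and hand the body properly typed arguments. A sketch of how a helper is declared in this style (bpf_demo_lookup and its proto are made-up names; the body mirrors bpf_map_lookup_elem above):

/* Sketch: a two-argument helper declared with BPF_CALL_2().  The macro
 * expands to the u64 fn(u64, ..., u64) wrapper the interpreter and JITs
 * expect and casts the register values to the declared parameter types.
 */
BPF_CALL_2(bpf_demo_lookup, struct bpf_map *, map, void *, key)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return (unsigned long) map->ops->map_lookup_elem(map, key);
}

static const struct bpf_func_proto bpf_demo_lookup_proto = {
	.func		= bpf_demo_lookup,
	.gpl_only	= false,
	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_MAP_KEY,
};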
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5967b870a895..1ed8473ec537 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
| @@ -97,7 +97,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, | |||
| 97 | return ERR_PTR(-ENOSPC); | 97 | return ERR_PTR(-ENOSPC); |
| 98 | 98 | ||
| 99 | inode->i_ino = get_next_ino(); | 99 | inode->i_ino = get_next_ino(); |
| 100 | inode->i_atime = CURRENT_TIME; | 100 | inode->i_atime = current_time(inode); |
| 101 | inode->i_mtime = inode->i_atime; | 101 | inode->i_mtime = inode->i_atime; |
| 102 | inode->i_ctime = inode->i_atime; | 102 | inode->i_ctime = inode->i_atime; |
| 103 | 103 | ||
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index bf4495fcd25d..732ae16d12b7 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
| @@ -116,10 +116,9 @@ free_smap: | |||
| 116 | return ERR_PTR(err); | 116 | return ERR_PTR(err); |
| 117 | } | 117 | } |
| 118 | 118 | ||
| 119 | u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) | 119 | BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, |
| 120 | u64, flags) | ||
| 120 | { | 121 | { |
| 121 | struct pt_regs *regs = (struct pt_regs *) (long) r1; | ||
| 122 | struct bpf_map *map = (struct bpf_map *) (long) r2; | ||
| 123 | struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); | 122 | struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); |
| 124 | struct perf_callchain_entry *trace; | 123 | struct perf_callchain_entry *trace; |
| 125 | struct stack_map_bucket *bucket, *new_bucket, *old_bucket; | 124 | struct stack_map_bucket *bucket, *new_bucket, *old_bucket; |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f72f23b8fdab..99a7e5b388f2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
| 15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
| 16 | #include <linux/bpf.h> | 16 | #include <linux/bpf.h> |
| 17 | #include <linux/bpf_verifier.h> | ||
| 17 | #include <linux/filter.h> | 18 | #include <linux/filter.h> |
| 18 | #include <net/netlink.h> | 19 | #include <net/netlink.h> |
| 19 | #include <linux/file.h> | 20 | #include <linux/file.h> |
| @@ -126,75 +127,16 @@ | |||
| 126 | * are set to NOT_INIT to indicate that they are no longer readable. | 127 | * are set to NOT_INIT to indicate that they are no longer readable. |
| 127 | */ | 128 | */ |
| 128 | 129 | ||
| 129 | struct reg_state { | ||
| 130 | enum bpf_reg_type type; | ||
| 131 | union { | ||
| 132 | /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ | ||
| 133 | s64 imm; | ||
| 134 | |||
| 135 | /* valid when type == PTR_TO_PACKET* */ | ||
| 136 | struct { | ||
| 137 | u32 id; | ||
| 138 | u16 off; | ||
| 139 | u16 range; | ||
| 140 | }; | ||
| 141 | |||
| 142 | /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | | ||
| 143 | * PTR_TO_MAP_VALUE_OR_NULL | ||
| 144 | */ | ||
| 145 | struct bpf_map *map_ptr; | ||
| 146 | }; | ||
| 147 | }; | ||
| 148 | |||
| 149 | enum bpf_stack_slot_type { | ||
| 150 | STACK_INVALID, /* nothing was stored in this stack slot */ | ||
| 151 | STACK_SPILL, /* register spilled into stack */ | ||
| 152 | STACK_MISC /* BPF program wrote some data into this slot */ | ||
| 153 | }; | ||
| 154 | |||
| 155 | #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ | ||
| 156 | |||
| 157 | /* state of the program: | ||
| 158 | * type of all registers and stack info | ||
| 159 | */ | ||
| 160 | struct verifier_state { | ||
| 161 | struct reg_state regs[MAX_BPF_REG]; | ||
| 162 | u8 stack_slot_type[MAX_BPF_STACK]; | ||
| 163 | struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; | ||
| 164 | }; | ||
| 165 | |||
| 166 | /* linked list of verifier states used to prune search */ | ||
| 167 | struct verifier_state_list { | ||
| 168 | struct verifier_state state; | ||
| 169 | struct verifier_state_list *next; | ||
| 170 | }; | ||
| 171 | |||
| 172 | /* verifier_state + insn_idx are pushed to stack when branch is encountered */ | 130 | /* verifier_state + insn_idx are pushed to stack when branch is encountered */ |
| 173 | struct verifier_stack_elem { | 131 | struct bpf_verifier_stack_elem { |
| 174 | /* verifer state is 'st' | 132 | /* verifer state is 'st' |
| 175 | * before processing instruction 'insn_idx' | 133 | * before processing instruction 'insn_idx' |
| 176 | * and after processing instruction 'prev_insn_idx' | 134 | * and after processing instruction 'prev_insn_idx' |
| 177 | */ | 135 | */ |
| 178 | struct verifier_state st; | 136 | struct bpf_verifier_state st; |
| 179 | int insn_idx; | 137 | int insn_idx; |
| 180 | int prev_insn_idx; | 138 | int prev_insn_idx; |
| 181 | struct verifier_stack_elem *next; | 139 | struct bpf_verifier_stack_elem *next; |
| 182 | }; | ||
| 183 | |||
| 184 | #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ | ||
| 185 | |||
| 186 | /* single container for all structs | ||
| 187 | * one verifier_env per bpf_check() call | ||
| 188 | */ | ||
| 189 | struct verifier_env { | ||
| 190 | struct bpf_prog *prog; /* eBPF program being verified */ | ||
| 191 | struct verifier_stack_elem *head; /* stack of verifier states to be processed */ | ||
| 192 | int stack_size; /* number of states to be processed */ | ||
| 193 | struct verifier_state cur_state; /* current verifier state */ | ||
| 194 | struct verifier_state_list **explored_states; /* search pruning optimization */ | ||
| 195 | struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ | ||
| 196 | u32 used_map_cnt; /* number of used maps */ | ||
| 197 | bool allow_ptr_leaks; | ||
| 198 | }; | 140 | }; |
| 199 | 141 | ||
| 200 | #define BPF_COMPLEXITY_LIMIT_INSNS 65536 | 142 | #define BPF_COMPLEXITY_LIMIT_INSNS 65536 |
| @@ -203,6 +145,7 @@ struct verifier_env { | |||
| 203 | struct bpf_call_arg_meta { | 145 | struct bpf_call_arg_meta { |
| 204 | struct bpf_map *map_ptr; | 146 | struct bpf_map *map_ptr; |
| 205 | bool raw_mode; | 147 | bool raw_mode; |
| 148 | bool pkt_access; | ||
| 206 | int regno; | 149 | int regno; |
| 207 | int access_size; | 150 | int access_size; |
| 208 | }; | 151 | }; |
| @@ -239,6 +182,7 @@ static const char * const reg_type_str[] = { | |||
| 239 | [CONST_PTR_TO_MAP] = "map_ptr", | 182 | [CONST_PTR_TO_MAP] = "map_ptr", |
| 240 | [PTR_TO_MAP_VALUE] = "map_value", | 183 | [PTR_TO_MAP_VALUE] = "map_value", |
| 241 | [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", | 184 | [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", |
| 185 | [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj", | ||
| 242 | [FRAME_PTR] = "fp", | 186 | [FRAME_PTR] = "fp", |
| 243 | [PTR_TO_STACK] = "fp", | 187 | [PTR_TO_STACK] = "fp", |
| 244 | [CONST_IMM] = "imm", | 188 | [CONST_IMM] = "imm", |
| @@ -246,9 +190,9 @@ static const char * const reg_type_str[] = { | |||
| 246 | [PTR_TO_PACKET_END] = "pkt_end", | 190 | [PTR_TO_PACKET_END] = "pkt_end", |
| 247 | }; | 191 | }; |
| 248 | 192 | ||
| 249 | static void print_verifier_state(struct verifier_state *state) | 193 | static void print_verifier_state(struct bpf_verifier_state *state) |
| 250 | { | 194 | { |
| 251 | struct reg_state *reg; | 195 | struct bpf_reg_state *reg; |
| 252 | enum bpf_reg_type t; | 196 | enum bpf_reg_type t; |
| 253 | int i; | 197 | int i; |
| 254 | 198 | ||
| @@ -266,10 +210,17 @@ static void print_verifier_state(struct verifier_state *state) | |||
| 266 | else if (t == UNKNOWN_VALUE && reg->imm) | 210 | else if (t == UNKNOWN_VALUE && reg->imm) |
| 267 | verbose("%lld", reg->imm); | 211 | verbose("%lld", reg->imm); |
| 268 | else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || | 212 | else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || |
| 269 | t == PTR_TO_MAP_VALUE_OR_NULL) | 213 | t == PTR_TO_MAP_VALUE_OR_NULL || |
| 214 | t == PTR_TO_MAP_VALUE_ADJ) | ||
| 270 | verbose("(ks=%d,vs=%d)", | 215 | verbose("(ks=%d,vs=%d)", |
| 271 | reg->map_ptr->key_size, | 216 | reg->map_ptr->key_size, |
| 272 | reg->map_ptr->value_size); | 217 | reg->map_ptr->value_size); |
| 218 | if (reg->min_value != BPF_REGISTER_MIN_RANGE) | ||
| 219 | verbose(",min_value=%llu", | ||
| 220 | (unsigned long long)reg->min_value); | ||
| 221 | if (reg->max_value != BPF_REGISTER_MAX_RANGE) | ||
| 222 | verbose(",max_value=%llu", | ||
| 223 | (unsigned long long)reg->max_value); | ||
| 273 | } | 224 | } |
| 274 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { | 225 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { |
| 275 | if (state->stack_slot_type[i] == STACK_SPILL) | 226 | if (state->stack_slot_type[i] == STACK_SPILL) |
| @@ -424,9 +375,9 @@ static void print_bpf_insn(struct bpf_insn *insn) | |||
| 424 | } | 375 | } |
| 425 | } | 376 | } |
| 426 | 377 | ||
| 427 | static int pop_stack(struct verifier_env *env, int *prev_insn_idx) | 378 | static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) |
| 428 | { | 379 | { |
| 429 | struct verifier_stack_elem *elem; | 380 | struct bpf_verifier_stack_elem *elem; |
| 430 | int insn_idx; | 381 | int insn_idx; |
| 431 | 382 | ||
| 432 | if (env->head == NULL) | 383 | if (env->head == NULL) |
| @@ -443,12 +394,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx) | |||
| 443 | return insn_idx; | 394 | return insn_idx; |
| 444 | } | 395 | } |
| 445 | 396 | ||
| 446 | static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, | 397 | static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, |
| 447 | int prev_insn_idx) | 398 | int insn_idx, int prev_insn_idx) |
| 448 | { | 399 | { |
| 449 | struct verifier_stack_elem *elem; | 400 | struct bpf_verifier_stack_elem *elem; |
| 450 | 401 | ||
| 451 | elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); | 402 | elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); |
| 452 | if (!elem) | 403 | if (!elem) |
| 453 | goto err; | 404 | goto err; |
| 454 | 405 | ||
| @@ -474,13 +425,15 @@ static const int caller_saved[CALLER_SAVED_REGS] = { | |||
| 474 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 | 425 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 |
| 475 | }; | 426 | }; |
| 476 | 427 | ||
| 477 | static void init_reg_state(struct reg_state *regs) | 428 | static void init_reg_state(struct bpf_reg_state *regs) |
| 478 | { | 429 | { |
| 479 | int i; | 430 | int i; |
| 480 | 431 | ||
| 481 | for (i = 0; i < MAX_BPF_REG; i++) { | 432 | for (i = 0; i < MAX_BPF_REG; i++) { |
| 482 | regs[i].type = NOT_INIT; | 433 | regs[i].type = NOT_INIT; |
| 483 | regs[i].imm = 0; | 434 | regs[i].imm = 0; |
| 435 | regs[i].min_value = BPF_REGISTER_MIN_RANGE; | ||
| 436 | regs[i].max_value = BPF_REGISTER_MAX_RANGE; | ||
| 484 | } | 437 | } |
| 485 | 438 | ||
| 486 | /* frame pointer */ | 439 | /* frame pointer */ |
| @@ -490,20 +443,26 @@ static void init_reg_state(struct reg_state *regs) | |||
| 490 | regs[BPF_REG_1].type = PTR_TO_CTX; | 443 | regs[BPF_REG_1].type = PTR_TO_CTX; |
| 491 | } | 444 | } |
| 492 | 445 | ||
| 493 | static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) | 446 | static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) |
| 494 | { | 447 | { |
| 495 | BUG_ON(regno >= MAX_BPF_REG); | 448 | BUG_ON(regno >= MAX_BPF_REG); |
| 496 | regs[regno].type = UNKNOWN_VALUE; | 449 | regs[regno].type = UNKNOWN_VALUE; |
| 497 | regs[regno].imm = 0; | 450 | regs[regno].imm = 0; |
| 498 | } | 451 | } |
| 499 | 452 | ||
| 453 | static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) | ||
| 454 | { | ||
| 455 | regs[regno].min_value = BPF_REGISTER_MIN_RANGE; | ||
| 456 | regs[regno].max_value = BPF_REGISTER_MAX_RANGE; | ||
| 457 | } | ||
| 458 | |||
| 500 | enum reg_arg_type { | 459 | enum reg_arg_type { |
| 501 | SRC_OP, /* register is used as source operand */ | 460 | SRC_OP, /* register is used as source operand */ |
| 502 | DST_OP, /* register is used as destination operand */ | 461 | DST_OP, /* register is used as destination operand */ |
| 503 | DST_OP_NO_MARK /* same as above, check only, don't mark */ | 462 | DST_OP_NO_MARK /* same as above, check only, don't mark */ |
| 504 | }; | 463 | }; |
| 505 | 464 | ||
| 506 | static int check_reg_arg(struct reg_state *regs, u32 regno, | 465 | static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, |
| 507 | enum reg_arg_type t) | 466 | enum reg_arg_type t) |
| 508 | { | 467 | { |
| 509 | if (regno >= MAX_BPF_REG) { | 468 | if (regno >= MAX_BPF_REG) { |
| @@ -563,8 +522,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) | |||
| 563 | /* check_stack_read/write functions track spill/fill of registers, | 522 | /* check_stack_read/write functions track spill/fill of registers, |
| 564 | * stack boundary and alignment are checked in check_mem_access() | 523 | * stack boundary and alignment are checked in check_mem_access() |
| 565 | */ | 524 | */ |
| 566 | static int check_stack_write(struct verifier_state *state, int off, int size, | 525 | static int check_stack_write(struct bpf_verifier_state *state, int off, |
| 567 | int value_regno) | 526 | int size, int value_regno) |
| 568 | { | 527 | { |
| 569 | int i; | 528 | int i; |
| 570 | /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, | 529 | /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, |
| @@ -589,7 +548,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, | |||
| 589 | } else { | 548 | } else { |
| 590 | /* regular write of data into stack */ | 549 | /* regular write of data into stack */ |
| 591 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = | 550 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = |
| 592 | (struct reg_state) {}; | 551 | (struct bpf_reg_state) {}; |
| 593 | 552 | ||
| 594 | for (i = 0; i < size; i++) | 553 | for (i = 0; i < size; i++) |
| 595 | state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; | 554 | state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; |
| @@ -597,7 +556,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, | |||
| 597 | return 0; | 556 | return 0; |
| 598 | } | 557 | } |
| 599 | 558 | ||
| 600 | static int check_stack_read(struct verifier_state *state, int off, int size, | 559 | static int check_stack_read(struct bpf_verifier_state *state, int off, int size, |
| 601 | int value_regno) | 560 | int value_regno) |
| 602 | { | 561 | { |
| 603 | u8 *slot_type; | 562 | u8 *slot_type; |
| @@ -638,7 +597,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size, | |||
| 638 | } | 597 | } |
| 639 | 598 | ||
| 640 | /* check read/write into map element returned by bpf_map_lookup_elem() */ | 599 | /* check read/write into map element returned by bpf_map_lookup_elem() */ |
| 641 | static int check_map_access(struct verifier_env *env, u32 regno, int off, | 600 | static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, |
| 642 | int size) | 601 | int size) |
| 643 | { | 602 | { |
| 644 | struct bpf_map *map = env->cur_state.regs[regno].map_ptr; | 603 | struct bpf_map *map = env->cur_state.regs[regno].map_ptr; |
| @@ -653,24 +612,31 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, | |||
| 653 | 612 | ||
| 654 | #define MAX_PACKET_OFF 0xffff | 613 | #define MAX_PACKET_OFF 0xffff |
| 655 | 614 | ||
| 656 | static bool may_write_pkt_data(enum bpf_prog_type type) | 615 | static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, |
| 616 | const struct bpf_call_arg_meta *meta) | ||
| 657 | { | 617 | { |
| 658 | switch (type) { | 618 | switch (env->prog->type) { |
| 619 | case BPF_PROG_TYPE_SCHED_CLS: | ||
| 620 | case BPF_PROG_TYPE_SCHED_ACT: | ||
| 659 | case BPF_PROG_TYPE_XDP: | 621 | case BPF_PROG_TYPE_XDP: |
| 622 | if (meta) | ||
| 623 | return meta->pkt_access; | ||
| 624 | |||
| 625 | env->seen_direct_write = true; | ||
| 660 | return true; | 626 | return true; |
| 661 | default: | 627 | default: |
| 662 | return false; | 628 | return false; |
| 663 | } | 629 | } |
| 664 | } | 630 | } |
| 665 | 631 | ||
| 666 | static int check_packet_access(struct verifier_env *env, u32 regno, int off, | 632 | static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, |
| 667 | int size) | 633 | int size) |
| 668 | { | 634 | { |
| 669 | struct reg_state *regs = env->cur_state.regs; | 635 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 670 | struct reg_state *reg = ®s[regno]; | 636 | struct bpf_reg_state *reg = ®s[regno]; |
| 671 | 637 | ||
| 672 | off += reg->off; | 638 | off += reg->off; |
| 673 | if (off < 0 || off + size > reg->range) { | 639 | if (off < 0 || size <= 0 || off + size > reg->range) { |
| 674 | verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", | 640 | verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", |
| 675 | off, size, regno, reg->id, reg->off, reg->range); | 641 | off, size, regno, reg->id, reg->off, reg->range); |
| 676 | return -EACCES; | 642 | return -EACCES; |
| @@ -679,9 +645,13 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, | |||
| 679 | } | 645 | } |
| 680 | 646 | ||
| 681 | /* check access to 'struct bpf_context' fields */ | 647 | /* check access to 'struct bpf_context' fields */ |
| 682 | static int check_ctx_access(struct verifier_env *env, int off, int size, | 648 | static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, |
| 683 | enum bpf_access_type t, enum bpf_reg_type *reg_type) | 649 | enum bpf_access_type t, enum bpf_reg_type *reg_type) |
| 684 | { | 650 | { |
| 651 | /* for analyzer ctx accesses are already validated and converted */ | ||
| 652 | if (env->analyzer_ops) | ||
| 653 | return 0; | ||
| 654 | |||
| 685 | if (env->prog->aux->ops->is_valid_access && | 655 | if (env->prog->aux->ops->is_valid_access && |
| 686 | env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { | 656 | env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { |
| 687 | /* remember the offset of last byte accessed in ctx */ | 657 | /* remember the offset of last byte accessed in ctx */ |
| @@ -694,7 +664,7 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, | |||
| 694 | return -EACCES; | 664 | return -EACCES; |
| 695 | } | 665 | } |
| 696 | 666 | ||
| 697 | static bool is_pointer_value(struct verifier_env *env, int regno) | 667 | static bool is_pointer_value(struct bpf_verifier_env *env, int regno) |
| 698 | { | 668 | { |
| 699 | if (env->allow_ptr_leaks) | 669 | if (env->allow_ptr_leaks) |
| 700 | return false; | 670 | return false; |
| @@ -708,28 +678,19 @@ static bool is_pointer_value(struct verifier_env *env, int regno) | |||
| 708 | } | 678 | } |
| 709 | } | 679 | } |
| 710 | 680 | ||
| 711 | static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, | 681 | static int check_ptr_alignment(struct bpf_verifier_env *env, |
| 712 | int off, int size) | 682 | struct bpf_reg_state *reg, int off, int size) |
| 713 | { | 683 | { |
| 714 | if (reg->type != PTR_TO_PACKET) { | 684 | if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) { |
| 715 | if (off % size != 0) { | 685 | if (off % size != 0) { |
| 716 | verbose("misaligned access off %d size %d\n", off, size); | 686 | verbose("misaligned access off %d size %d\n", |
| 687 | off, size); | ||
| 717 | return -EACCES; | 688 | return -EACCES; |
| 718 | } else { | 689 | } else { |
| 719 | return 0; | 690 | return 0; |
| 720 | } | 691 | } |
| 721 | } | 692 | } |
| 722 | 693 | ||
| 723 | switch (env->prog->type) { | ||
| 724 | case BPF_PROG_TYPE_SCHED_CLS: | ||
| 725 | case BPF_PROG_TYPE_SCHED_ACT: | ||
| 726 | case BPF_PROG_TYPE_XDP: | ||
| 727 | break; | ||
| 728 | default: | ||
| 729 | verbose("verifier is misconfigured\n"); | ||
| 730 | return -EACCES; | ||
| 731 | } | ||
| 732 | |||
| 733 | if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) | 694 | if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) |
| 734 | /* misaligned access to packet is ok on x86,arm,arm64 */ | 695 | /* misaligned access to packet is ok on x86,arm,arm64 */ |
| 735 | return 0; | 696 | return 0; |
| @@ -740,7 +701,8 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, | |||
| 740 | } | 701 | } |
| 741 | 702 | ||
| 742 | /* skb->data is NET_IP_ALIGN-ed */ | 703 | /* skb->data is NET_IP_ALIGN-ed */ |
| 743 | if ((NET_IP_ALIGN + reg->off + off) % size != 0) { | 704 | if (reg->type == PTR_TO_PACKET && |
| 705 | (NET_IP_ALIGN + reg->off + off) % size != 0) { | ||
| 744 | verbose("misaligned packet access off %d+%d+%d size %d\n", | 706 | verbose("misaligned packet access off %d+%d+%d size %d\n", |
| 745 | NET_IP_ALIGN, reg->off, off, size); | 707 | NET_IP_ALIGN, reg->off, off, size); |
| 746 | return -EACCES; | 708 | return -EACCES; |
| @@ -754,12 +716,12 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, | |||
| 754 | * if t==write && value_regno==-1, some unknown value is stored into memory | 716 | * if t==write && value_regno==-1, some unknown value is stored into memory |
| 755 | * if t==read && value_regno==-1, don't care what we read from memory | 717 | * if t==read && value_regno==-1, don't care what we read from memory |
| 756 | */ | 718 | */ |
| 757 | static int check_mem_access(struct verifier_env *env, u32 regno, int off, | 719 | static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, |
| 758 | int bpf_size, enum bpf_access_type t, | 720 | int bpf_size, enum bpf_access_type t, |
| 759 | int value_regno) | 721 | int value_regno) |
| 760 | { | 722 | { |
| 761 | struct verifier_state *state = &env->cur_state; | 723 | struct bpf_verifier_state *state = &env->cur_state; |
| 762 | struct reg_state *reg = &state->regs[regno]; | 724 | struct bpf_reg_state *reg = &state->regs[regno]; |
| 763 | int size, err = 0; | 725 | int size, err = 0; |
| 764 | 726 | ||
| 765 | if (reg->type == PTR_TO_STACK) | 727 | if (reg->type == PTR_TO_STACK) |
| @@ -773,12 +735,52 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
| 773 | if (err) | 735 | if (err) |
| 774 | return err; | 736 | return err; |
| 775 | 737 | ||
| 776 | if (reg->type == PTR_TO_MAP_VALUE) { | 738 | if (reg->type == PTR_TO_MAP_VALUE || |
| 739 | reg->type == PTR_TO_MAP_VALUE_ADJ) { | ||
| 777 | if (t == BPF_WRITE && value_regno >= 0 && | 740 | if (t == BPF_WRITE && value_regno >= 0 && |
| 778 | is_pointer_value(env, value_regno)) { | 741 | is_pointer_value(env, value_regno)) { |
| 779 | verbose("R%d leaks addr into map\n", value_regno); | 742 | verbose("R%d leaks addr into map\n", value_regno); |
| 780 | return -EACCES; | 743 | return -EACCES; |
| 781 | } | 744 | } |
| 745 | |||
| 746 | /* If we adjusted the register to this map value at all then we | ||
| 747 | * need to change off and size to min_value and max_value | ||
| 748 | * respectively to make sure our theoretical access will be | ||
| 749 | * safe. | ||
| 750 | */ | ||
| 751 | if (reg->type == PTR_TO_MAP_VALUE_ADJ) { | ||
| 752 | if (log_level) | ||
| 753 | print_verifier_state(state); | ||
| 754 | env->varlen_map_value_access = true; | ||
| 755 | /* The minimum value is only important with signed | ||
| 756 | * comparisons where we can't assume the floor of a | ||
| 757 | * value is 0. If we are using signed variables for our | ||
| 758 | * index'es we need to make sure that whatever we use | ||
| 759 | * will have a set floor within our range. | ||
| 760 | */ | ||
| 761 | if ((s64)reg->min_value < 0) { | ||
| 762 | verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", | ||
| 763 | regno); | ||
| 764 | return -EACCES; | ||
| 765 | } | ||
| 766 | err = check_map_access(env, regno, reg->min_value + off, | ||
| 767 | size); | ||
| 768 | if (err) { | ||
| 769 | verbose("R%d min value is outside of the array range\n", | ||
| 770 | regno); | ||
| 771 | return err; | ||
| 772 | } | ||
| 773 | |||
| 774 | /* If we haven't set a max value then we need to bail | ||
| 775 | * since we can't be sure we won't do bad things. | ||
| 776 | */ | ||
| 777 | if (reg->max_value == BPF_REGISTER_MAX_RANGE) { | ||
| 778 | verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", | ||
| 779 | regno); | ||
| 780 | return -EACCES; | ||
| 781 | } | ||
| 782 | off += reg->max_value; | ||
| 783 | } | ||
| 782 | err = check_map_access(env, regno, off, size); | 784 | err = check_map_access(env, regno, off, size); |
| 783 | if (!err && t == BPF_READ && value_regno >= 0) | 785 | if (!err && t == BPF_READ && value_regno >= 0) |
| 784 | mark_reg_unknown_value(state->regs, value_regno); | 786 | mark_reg_unknown_value(state->regs, value_regno); |
| @@ -794,9 +796,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
| 794 | err = check_ctx_access(env, off, size, t, ®_type); | 796 | err = check_ctx_access(env, off, size, t, ®_type); |
| 795 | if (!err && t == BPF_READ && value_regno >= 0) { | 797 | if (!err && t == BPF_READ && value_regno >= 0) { |
| 796 | mark_reg_unknown_value(state->regs, value_regno); | 798 | mark_reg_unknown_value(state->regs, value_regno); |
| 797 | if (env->allow_ptr_leaks) | 799 | /* note that reg.[id|off|range] == 0 */ |
| 798 | /* note that reg.[id|off|range] == 0 */ | 800 | state->regs[value_regno].type = reg_type; |
| 799 | state->regs[value_regno].type = reg_type; | ||
| 800 | } | 801 | } |
| 801 | 802 | ||
| 802 | } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { | 803 | } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { |
| @@ -816,7 +817,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
| 816 | err = check_stack_read(state, off, size, value_regno); | 817 | err = check_stack_read(state, off, size, value_regno); |
| 817 | } | 818 | } |
| 818 | } else if (state->regs[regno].type == PTR_TO_PACKET) { | 819 | } else if (state->regs[regno].type == PTR_TO_PACKET) { |
| 819 | if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { | 820 | if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { |
| 820 | verbose("cannot write into packet\n"); | 821 | verbose("cannot write into packet\n"); |
| 821 | return -EACCES; | 822 | return -EACCES; |
| 822 | } | 823 | } |
| @@ -845,9 +846,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
| 845 | return err; | 846 | return err; |
| 846 | } | 847 | } |
| 847 | 848 | ||
| 848 | static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) | 849 | static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) |
| 849 | { | 850 | { |
| 850 | struct reg_state *regs = env->cur_state.regs; | 851 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 851 | int err; | 852 | int err; |
| 852 | 853 | ||
| 853 | if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || | 854 | if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || |
| @@ -881,12 +882,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) | |||
| 881 | * bytes from that pointer, make sure that it's within stack boundary | 882 | * bytes from that pointer, make sure that it's within stack boundary |
| 882 | * and all elements of stack are initialized | 883 | * and all elements of stack are initialized |
| 883 | */ | 884 | */ |
| 884 | static int check_stack_boundary(struct verifier_env *env, int regno, | 885 | static int check_stack_boundary(struct bpf_verifier_env *env, int regno, |
| 885 | int access_size, bool zero_size_allowed, | 886 | int access_size, bool zero_size_allowed, |
| 886 | struct bpf_call_arg_meta *meta) | 887 | struct bpf_call_arg_meta *meta) |
| 887 | { | 888 | { |
| 888 | struct verifier_state *state = &env->cur_state; | 889 | struct bpf_verifier_state *state = &env->cur_state; |
| 889 | struct reg_state *regs = state->regs; | 890 | struct bpf_reg_state *regs = state->regs; |
| 890 | int off, i; | 891 | int off, i; |
| 891 | 892 | ||
| 892 | if (regs[regno].type != PTR_TO_STACK) { | 893 | if (regs[regno].type != PTR_TO_STACK) { |
| @@ -925,18 +926,18 @@ static int check_stack_boundary(struct verifier_env *env, int regno, | |||
| 925 | return 0; | 926 | return 0; |
| 926 | } | 927 | } |
| 927 | 928 | ||
| 928 | static int check_func_arg(struct verifier_env *env, u32 regno, | 929 | static int check_func_arg(struct bpf_verifier_env *env, u32 regno, |
| 929 | enum bpf_arg_type arg_type, | 930 | enum bpf_arg_type arg_type, |
| 930 | struct bpf_call_arg_meta *meta) | 931 | struct bpf_call_arg_meta *meta) |
| 931 | { | 932 | { |
| 932 | struct reg_state *reg = env->cur_state.regs + regno; | 933 | struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno]; |
| 933 | enum bpf_reg_type expected_type; | 934 | enum bpf_reg_type expected_type, type = reg->type; |
| 934 | int err = 0; | 935 | int err = 0; |
| 935 | 936 | ||
| 936 | if (arg_type == ARG_DONTCARE) | 937 | if (arg_type == ARG_DONTCARE) |
| 937 | return 0; | 938 | return 0; |
| 938 | 939 | ||
| 939 | if (reg->type == NOT_INIT) { | 940 | if (type == NOT_INIT) { |
| 940 | verbose("R%d !read_ok\n", regno); | 941 | verbose("R%d !read_ok\n", regno); |
| 941 | return -EACCES; | 942 | return -EACCES; |
| 942 | } | 943 | } |
| @@ -949,16 +950,29 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 949 | return 0; | 950 | return 0; |
| 950 | } | 951 | } |
| 951 | 952 | ||
| 953 | if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { | ||
| 954 | verbose("helper access to the packet is not allowed\n"); | ||
| 955 | return -EACCES; | ||
| 956 | } | ||
| 957 | |||
| 952 | if (arg_type == ARG_PTR_TO_MAP_KEY || | 958 | if (arg_type == ARG_PTR_TO_MAP_KEY || |
| 953 | arg_type == ARG_PTR_TO_MAP_VALUE) { | 959 | arg_type == ARG_PTR_TO_MAP_VALUE) { |
| 954 | expected_type = PTR_TO_STACK; | 960 | expected_type = PTR_TO_STACK; |
| 961 | if (type != PTR_TO_PACKET && type != expected_type) | ||
| 962 | goto err_type; | ||
| 955 | } else if (arg_type == ARG_CONST_STACK_SIZE || | 963 | } else if (arg_type == ARG_CONST_STACK_SIZE || |
| 956 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { | 964 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { |
| 957 | expected_type = CONST_IMM; | 965 | expected_type = CONST_IMM; |
| 966 | if (type != expected_type) | ||
| 967 | goto err_type; | ||
| 958 | } else if (arg_type == ARG_CONST_MAP_PTR) { | 968 | } else if (arg_type == ARG_CONST_MAP_PTR) { |
| 959 | expected_type = CONST_PTR_TO_MAP; | 969 | expected_type = CONST_PTR_TO_MAP; |
| 970 | if (type != expected_type) | ||
| 971 | goto err_type; | ||
| 960 | } else if (arg_type == ARG_PTR_TO_CTX) { | 972 | } else if (arg_type == ARG_PTR_TO_CTX) { |
| 961 | expected_type = PTR_TO_CTX; | 973 | expected_type = PTR_TO_CTX; |
| 974 | if (type != expected_type) | ||
| 975 | goto err_type; | ||
| 962 | } else if (arg_type == ARG_PTR_TO_STACK || | 976 | } else if (arg_type == ARG_PTR_TO_STACK || |
| 963 | arg_type == ARG_PTR_TO_RAW_STACK) { | 977 | arg_type == ARG_PTR_TO_RAW_STACK) { |
| 964 | expected_type = PTR_TO_STACK; | 978 | expected_type = PTR_TO_STACK; |
| @@ -966,20 +980,16 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 966 | * passed in as argument, it's a CONST_IMM type. Final test | 980 | * passed in as argument, it's a CONST_IMM type. Final test |
| 967 | * happens during stack boundary checking. | 981 | * happens during stack boundary checking. |
| 968 | */ | 982 | */ |
| 969 | if (reg->type == CONST_IMM && reg->imm == 0) | 983 | if (type == CONST_IMM && reg->imm == 0) |
| 970 | expected_type = CONST_IMM; | 984 | /* final test in check_stack_boundary() */; |
| 985 | else if (type != PTR_TO_PACKET && type != expected_type) | ||
| 986 | goto err_type; | ||
| 971 | meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; | 987 | meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; |
| 972 | } else { | 988 | } else { |
| 973 | verbose("unsupported arg_type %d\n", arg_type); | 989 | verbose("unsupported arg_type %d\n", arg_type); |
| 974 | return -EFAULT; | 990 | return -EFAULT; |
| 975 | } | 991 | } |
| 976 | 992 | ||
| 977 | if (reg->type != expected_type) { | ||
| 978 | verbose("R%d type=%s expected=%s\n", regno, | ||
| 979 | reg_type_str[reg->type], reg_type_str[expected_type]); | ||
| 980 | return -EACCES; | ||
| 981 | } | ||
| 982 | |||
| 983 | if (arg_type == ARG_CONST_MAP_PTR) { | 993 | if (arg_type == ARG_CONST_MAP_PTR) { |
| 984 | /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ | 994 | /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ |
| 985 | meta->map_ptr = reg->map_ptr; | 995 | meta->map_ptr = reg->map_ptr; |
| @@ -997,8 +1007,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 997 | verbose("invalid map_ptr to access map->key\n"); | 1007 | verbose("invalid map_ptr to access map->key\n"); |
| 998 | return -EACCES; | 1008 | return -EACCES; |
| 999 | } | 1009 | } |
| 1000 | err = check_stack_boundary(env, regno, meta->map_ptr->key_size, | 1010 | if (type == PTR_TO_PACKET) |
| 1001 | false, NULL); | 1011 | err = check_packet_access(env, regno, 0, |
| 1012 | meta->map_ptr->key_size); | ||
| 1013 | else | ||
| 1014 | err = check_stack_boundary(env, regno, | ||
| 1015 | meta->map_ptr->key_size, | ||
| 1016 | false, NULL); | ||
| 1002 | } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { | 1017 | } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { |
| 1003 | /* bpf_map_xxx(..., map_ptr, ..., value) call: | 1018 | /* bpf_map_xxx(..., map_ptr, ..., value) call: |
| 1004 | * check [value, value + map->value_size) validity | 1019 | * check [value, value + map->value_size) validity |
| @@ -1008,9 +1023,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 1008 | verbose("invalid map_ptr to access map->value\n"); | 1023 | verbose("invalid map_ptr to access map->value\n"); |
| 1009 | return -EACCES; | 1024 | return -EACCES; |
| 1010 | } | 1025 | } |
| 1011 | err = check_stack_boundary(env, regno, | 1026 | if (type == PTR_TO_PACKET) |
| 1012 | meta->map_ptr->value_size, | 1027 | err = check_packet_access(env, regno, 0, |
| 1013 | false, NULL); | 1028 | meta->map_ptr->value_size); |
| 1029 | else | ||
| 1030 | err = check_stack_boundary(env, regno, | ||
| 1031 | meta->map_ptr->value_size, | ||
| 1032 | false, NULL); | ||
| 1014 | } else if (arg_type == ARG_CONST_STACK_SIZE || | 1033 | } else if (arg_type == ARG_CONST_STACK_SIZE || |
| 1015 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { | 1034 | arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { |
| 1016 | bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); | 1035 | bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); |
| @@ -1024,11 +1043,18 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 1024 | verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); | 1043 | verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); |
| 1025 | return -EACCES; | 1044 | return -EACCES; |
| 1026 | } | 1045 | } |
| 1027 | err = check_stack_boundary(env, regno - 1, reg->imm, | 1046 | if (regs[regno - 1].type == PTR_TO_PACKET) |
| 1028 | zero_size_allowed, meta); | 1047 | err = check_packet_access(env, regno - 1, 0, reg->imm); |
| 1048 | else | ||
| 1049 | err = check_stack_boundary(env, regno - 1, reg->imm, | ||
| 1050 | zero_size_allowed, meta); | ||
| 1029 | } | 1051 | } |
| 1030 | 1052 | ||
| 1031 | return err; | 1053 | return err; |
| 1054 | err_type: | ||
| 1055 | verbose("R%d type=%s expected=%s\n", regno, | ||
| 1056 | reg_type_str[type], reg_type_str[expected_type]); | ||
| 1057 | return -EACCES; | ||
| 1032 | } | 1058 | } |
| 1033 | 1059 | ||
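For illustration: the err_type rework together with the new PTR_TO_PACKET acceptance for map-key, map-value and stack arguments is what lets a helper take a verified packet pointer where a stack pointer used to be required. A minimal sketch, assuming a tc classifier with direct packet access, a map named flow_map with an 8-byte key, and that the lookup helper is flagged pkt_access as in this series (all of these names are illustrative, not from the patch):

    /* needs <linux/bpf.h>, <linux/pkt_cls.h> and the usual bpf_helpers.h */
    SEC("classifier")
    int pkt_key_lookup(struct __sk_buff *skb)
    {
        void *data     = (void *)(long)skb->data;
        void *data_end = (void *)(long)skb->data_end;
        __u64 *key = data + 14;                 /* skip the Ethernet header */
        long *value;

        /* this check gives the key pointer a PTR_TO_PACKET range of 8 bytes */
        if ((void *)(key + 1) > data_end)
            return TC_ACT_OK;

        /* key points into the packet, not the stack; routed through
         * check_packet_access() instead of check_stack_boundary() */
        value = bpf_map_lookup_elem(&flow_map, key);
        return value ? TC_ACT_SHOT : TC_ACT_OK;
    }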
| 1034 | static int check_map_func_compatibility(struct bpf_map *map, int func_id) | 1060 | static int check_map_func_compatibility(struct bpf_map *map, int func_id) |
| @@ -1052,7 +1078,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) | |||
| 1052 | goto error; | 1078 | goto error; |
| 1053 | break; | 1079 | break; |
| 1054 | case BPF_MAP_TYPE_CGROUP_ARRAY: | 1080 | case BPF_MAP_TYPE_CGROUP_ARRAY: |
| 1055 | if (func_id != BPF_FUNC_skb_in_cgroup) | 1081 | if (func_id != BPF_FUNC_skb_under_cgroup && |
| 1082 | func_id != BPF_FUNC_current_task_under_cgroup) | ||
| 1056 | goto error; | 1083 | goto error; |
| 1057 | break; | 1084 | break; |
| 1058 | default: | 1085 | default: |
| @@ -1074,7 +1101,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) | |||
| 1074 | if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) | 1101 | if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) |
| 1075 | goto error; | 1102 | goto error; |
| 1076 | break; | 1103 | break; |
| 1077 | case BPF_FUNC_skb_in_cgroup: | 1104 | case BPF_FUNC_current_task_under_cgroup: |
| 1105 | case BPF_FUNC_skb_under_cgroup: | ||
| 1078 | if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) | 1106 | if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) |
| 1079 | goto error; | 1107 | goto error; |
| 1080 | break; | 1108 | break; |
| @@ -1107,10 +1135,10 @@ static int check_raw_mode(const struct bpf_func_proto *fn) | |||
| 1107 | return count > 1 ? -EINVAL : 0; | 1135 | return count > 1 ? -EINVAL : 0; |
| 1108 | } | 1136 | } |
| 1109 | 1137 | ||
| 1110 | static void clear_all_pkt_pointers(struct verifier_env *env) | 1138 | static void clear_all_pkt_pointers(struct bpf_verifier_env *env) |
| 1111 | { | 1139 | { |
| 1112 | struct verifier_state *state = &env->cur_state; | 1140 | struct bpf_verifier_state *state = &env->cur_state; |
| 1113 | struct reg_state *regs = state->regs, *reg; | 1141 | struct bpf_reg_state *regs = state->regs, *reg; |
| 1114 | int i; | 1142 | int i; |
| 1115 | 1143 | ||
| 1116 | for (i = 0; i < MAX_BPF_REG; i++) | 1144 | for (i = 0; i < MAX_BPF_REG; i++) |
| @@ -1130,12 +1158,12 @@ static void clear_all_pkt_pointers(struct verifier_env *env) | |||
| 1130 | } | 1158 | } |
| 1131 | } | 1159 | } |
| 1132 | 1160 | ||
| 1133 | static int check_call(struct verifier_env *env, int func_id) | 1161 | static int check_call(struct bpf_verifier_env *env, int func_id) |
| 1134 | { | 1162 | { |
| 1135 | struct verifier_state *state = &env->cur_state; | 1163 | struct bpf_verifier_state *state = &env->cur_state; |
| 1136 | const struct bpf_func_proto *fn = NULL; | 1164 | const struct bpf_func_proto *fn = NULL; |
| 1137 | struct reg_state *regs = state->regs; | 1165 | struct bpf_reg_state *regs = state->regs; |
| 1138 | struct reg_state *reg; | 1166 | struct bpf_reg_state *reg; |
| 1139 | struct bpf_call_arg_meta meta; | 1167 | struct bpf_call_arg_meta meta; |
| 1140 | bool changes_data; | 1168 | bool changes_data; |
| 1141 | int i, err; | 1169 | int i, err; |
| @@ -1163,6 +1191,7 @@ static int check_call(struct verifier_env *env, int func_id) | |||
| 1163 | changes_data = bpf_helper_changes_skb_data(fn->func); | 1191 | changes_data = bpf_helper_changes_skb_data(fn->func); |
| 1164 | 1192 | ||
| 1165 | memset(&meta, 0, sizeof(meta)); | 1193 | memset(&meta, 0, sizeof(meta)); |
| 1194 | meta.pkt_access = fn->pkt_access; | ||
| 1166 | 1195 | ||
| 1167 | /* We only support one arg being in raw mode at the moment, which | 1196 | /* We only support one arg being in raw mode at the moment, which |
| 1168 | * is sufficient for the helper functions we have right now. | 1197 | * is sufficient for the helper functions we have right now. |
| @@ -1213,6 +1242,7 @@ static int check_call(struct verifier_env *env, int func_id) | |||
| 1213 | regs[BPF_REG_0].type = NOT_INIT; | 1242 | regs[BPF_REG_0].type = NOT_INIT; |
| 1214 | } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { | 1243 | } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { |
| 1215 | regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; | 1244 | regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; |
| 1245 | regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; | ||
| 1216 | /* remember map_ptr, so that check_map_access() | 1246 | /* remember map_ptr, so that check_map_access() |
| 1217 | * can check 'value_size' boundary of memory access | 1247 | * can check 'value_size' boundary of memory access |
| 1218 | * to map element returned from bpf_map_lookup_elem() | 1248 | * to map element returned from bpf_map_lookup_elem() |
| @@ -1237,12 +1267,13 @@ static int check_call(struct verifier_env *env, int func_id) | |||
| 1237 | return 0; | 1267 | return 0; |
| 1238 | } | 1268 | } |
| 1239 | 1269 | ||
| 1240 | static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) | 1270 | static int check_packet_ptr_add(struct bpf_verifier_env *env, |
| 1271 | struct bpf_insn *insn) | ||
| 1241 | { | 1272 | { |
| 1242 | struct reg_state *regs = env->cur_state.regs; | 1273 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 1243 | struct reg_state *dst_reg = &regs[insn->dst_reg]; | 1274 | struct bpf_reg_state *dst_reg = &regs[insn->dst_reg]; |
| 1244 | struct reg_state *src_reg = &regs[insn->src_reg]; | 1275 | struct bpf_reg_state *src_reg = &regs[insn->src_reg]; |
| 1245 | struct reg_state tmp_reg; | 1276 | struct bpf_reg_state tmp_reg; |
| 1246 | s32 imm; | 1277 | s32 imm; |
| 1247 | 1278 | ||
| 1248 | if (BPF_SRC(insn->code) == BPF_K) { | 1279 | if (BPF_SRC(insn->code) == BPF_K) { |
| @@ -1301,7 +1332,7 @@ add_imm: | |||
| 1301 | /* dst_reg stays as pkt_ptr type and since some positive | 1332 | /* dst_reg stays as pkt_ptr type and since some positive |
| 1302 | * integer value was added to the pointer, increment its 'id' | 1333 | * integer value was added to the pointer, increment its 'id' |
| 1303 | */ | 1334 | */ |
| 1304 | dst_reg->id++; | 1335 | dst_reg->id = ++env->id_gen; |
| 1305 | 1336 | ||
| 1306 | /* something was added to pkt_ptr, set range and off to zero */ | 1337 | /* something was added to pkt_ptr, set range and off to zero */ |
| 1307 | dst_reg->off = 0; | 1338 | dst_reg->off = 0; |
| @@ -1310,10 +1341,10 @@ add_imm: | |||
| 1310 | return 0; | 1341 | return 0; |
| 1311 | } | 1342 | } |
| 1312 | 1343 | ||
| 1313 | static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) | 1344 | static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) |
| 1314 | { | 1345 | { |
| 1315 | struct reg_state *regs = env->cur_state.regs; | 1346 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 1316 | struct reg_state *dst_reg = &regs[insn->dst_reg]; | 1347 | struct bpf_reg_state *dst_reg = &regs[insn->dst_reg]; |
| 1317 | u8 opcode = BPF_OP(insn->code); | 1348 | u8 opcode = BPF_OP(insn->code); |
| 1318 | s64 imm_log2; | 1349 | s64 imm_log2; |
| 1319 | 1350 | ||
| @@ -1323,7 +1354,7 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1323 | */ | 1354 | */ |
| 1324 | 1355 | ||
| 1325 | if (BPF_SRC(insn->code) == BPF_X) { | 1356 | if (BPF_SRC(insn->code) == BPF_X) { |
| 1326 | struct reg_state *src_reg = &regs[insn->src_reg]; | 1357 | struct bpf_reg_state *src_reg = &regs[insn->src_reg]; |
| 1327 | 1358 | ||
| 1328 | if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && | 1359 | if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && |
| 1329 | dst_reg->imm && opcode == BPF_ADD) { | 1360 | dst_reg->imm && opcode == BPF_ADD) { |
| @@ -1412,11 +1443,12 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1412 | return 0; | 1443 | return 0; |
| 1413 | } | 1444 | } |
| 1414 | 1445 | ||
| 1415 | static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) | 1446 | static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, |
| 1447 | struct bpf_insn *insn) | ||
| 1416 | { | 1448 | { |
| 1417 | struct reg_state *regs = env->cur_state.regs; | 1449 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 1418 | struct reg_state *dst_reg = &regs[insn->dst_reg]; | 1450 | struct bpf_reg_state *dst_reg = &regs[insn->dst_reg]; |
| 1419 | struct reg_state *src_reg = &regs[insn->src_reg]; | 1451 | struct bpf_reg_state *src_reg = &regs[insn->src_reg]; |
| 1420 | u8 opcode = BPF_OP(insn->code); | 1452 | u8 opcode = BPF_OP(insn->code); |
| 1421 | 1453 | ||
| 1422 | /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. | 1454 | /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. |
| @@ -1432,10 +1464,110 @@ static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1432 | return 0; | 1464 | return 0; |
| 1433 | } | 1465 | } |
| 1434 | 1466 | ||
| 1467 | static void check_reg_overflow(struct bpf_reg_state *reg) | ||
| 1468 | { | ||
| 1469 | if (reg->max_value > BPF_REGISTER_MAX_RANGE) | ||
| 1470 | reg->max_value = BPF_REGISTER_MAX_RANGE; | ||
| 1471 | if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) | ||
| 1472 | reg->min_value = BPF_REGISTER_MIN_RANGE; | ||
| 1473 | } | ||
| 1474 | |||
| 1475 | static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, | ||
| 1476 | struct bpf_insn *insn) | ||
| 1477 | { | ||
| 1478 | struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; | ||
| 1479 | u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; | ||
| 1480 | bool min_set = false, max_set = false; | ||
| 1481 | u8 opcode = BPF_OP(insn->code); | ||
| 1482 | |||
| 1483 | dst_reg = &regs[insn->dst_reg]; | ||
| 1484 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 1485 | check_reg_overflow(&regs[insn->src_reg]); | ||
| 1486 | min_val = regs[insn->src_reg].min_value; | ||
| 1487 | max_val = regs[insn->src_reg].max_value; | ||
| 1488 | |||
| 1489 | /* If the source register is a random pointer then the | ||
| 1490 | * min_value/max_value values represent the range of the known | ||
| 1491 | * accesses into that value, not the actual min/max value of the | ||
| 1492 | * register itself. In this case we have to reset the reg range | ||
| 1493 | * values so we know it is not safe to look at. | ||
| 1494 | */ | ||
| 1495 | if (regs[insn->src_reg].type != CONST_IMM && | ||
| 1496 | regs[insn->src_reg].type != UNKNOWN_VALUE) { | ||
| 1497 | min_val = BPF_REGISTER_MIN_RANGE; | ||
| 1498 | max_val = BPF_REGISTER_MAX_RANGE; | ||
| 1499 | } | ||
| 1500 | } else if (insn->imm < BPF_REGISTER_MAX_RANGE && | ||
| 1501 | (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { | ||
| 1502 | min_val = max_val = insn->imm; | ||
| 1503 | min_set = max_set = true; | ||
| 1504 | } | ||
| 1505 | |||
| 1506 | /* We don't know anything about what was done to this register, mark it | ||
| 1507 | * as unknown. | ||
| 1508 | */ | ||
| 1509 | if (min_val == BPF_REGISTER_MIN_RANGE && | ||
| 1510 | max_val == BPF_REGISTER_MAX_RANGE) { | ||
| 1511 | reset_reg_range_values(regs, insn->dst_reg); | ||
| 1512 | return; | ||
| 1513 | } | ||
| 1514 | |||
| 1515 | switch (opcode) { | ||
| 1516 | case BPF_ADD: | ||
| 1517 | dst_reg->min_value += min_val; | ||
| 1518 | dst_reg->max_value += max_val; | ||
| 1519 | break; | ||
| 1520 | case BPF_SUB: | ||
| 1521 | dst_reg->min_value -= min_val; | ||
| 1522 | dst_reg->max_value -= max_val; | ||
| 1523 | break; | ||
| 1524 | case BPF_MUL: | ||
| 1525 | dst_reg->min_value *= min_val; | ||
| 1526 | dst_reg->max_value *= max_val; | ||
| 1527 | break; | ||
| 1528 | case BPF_AND: | ||
| 1529 | /* & is special since it could end up with 0 bits set. */ | ||
| 1530 | dst_reg->min_value &= min_val; | ||
| 1531 | dst_reg->max_value = max_val; | ||
| 1532 | break; | ||
| 1533 | case BPF_LSH: | ||
| 1534 | /* Gotta have special overflow logic here, if we're shifting | ||
| 1535 | * more than MAX_RANGE then just assume we have an invalid | ||
| 1536 | * range. | ||
| 1537 | */ | ||
| 1538 | if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) | ||
| 1539 | dst_reg->min_value = BPF_REGISTER_MIN_RANGE; | ||
| 1540 | else | ||
| 1541 | dst_reg->min_value <<= min_val; | ||
| 1542 | |||
| 1543 | if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) | ||
| 1544 | dst_reg->max_value = BPF_REGISTER_MAX_RANGE; | ||
| 1545 | else | ||
| 1546 | dst_reg->max_value <<= max_val; | ||
| 1547 | break; | ||
| 1548 | case BPF_RSH: | ||
| 1549 | dst_reg->min_value >>= min_val; | ||
| 1550 | dst_reg->max_value >>= max_val; | ||
| 1551 | break; | ||
| 1552 | case BPF_MOD: | ||
| 1553 | /* % is special since it is an unsigned modulus, so the floor | ||
| 1554 | * will always be 0. | ||
| 1555 | */ | ||
| 1556 | dst_reg->min_value = 0; | ||
| 1557 | dst_reg->max_value = max_val - 1; | ||
| 1558 | break; | ||
| 1559 | default: | ||
| 1560 | reset_reg_range_values(regs, insn->dst_reg); | ||
| 1561 | break; | ||
| 1562 | } | ||
| 1563 | |||
| 1564 | check_reg_overflow(dst_reg); | ||
| 1565 | } | ||
| 1566 | |||
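To make the bookkeeping above concrete, here is how the tracked bounds evolve over a short pseudo-instruction sequence; the register names and the assumed range of r2 are purely illustrative:

    r1 = 16;        /* CONST_IMM:                 min = 16, max = 16  */
    r1 += r2;       /* r2 known to be in [0, 64]: min = 16, max = 80  */
    r1 <<= 2;       /* BPF_LSH by constant 2:     min = 64, max = 320 */
    r1 %= 100;      /* BPF_MOD (unsigned):        min = 0,  max = 99  */

A source operand that is neither CONST_IMM nor UNKNOWN_VALUE, or an opcode not handled in the switch, resets the destination's range via reset_reg_range_values().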
| 1435 | /* check validity of 32-bit and 64-bit arithmetic operations */ | 1567 | /* check validity of 32-bit and 64-bit arithmetic operations */ |
| 1436 | static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | 1568 | static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) |
| 1437 | { | 1569 | { |
| 1438 | struct reg_state *regs = env->cur_state.regs, *dst_reg; | 1570 | struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; |
| 1439 | u8 opcode = BPF_OP(insn->code); | 1571 | u8 opcode = BPF_OP(insn->code); |
| 1440 | int err; | 1572 | int err; |
| 1441 | 1573 | ||
| @@ -1495,6 +1627,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1495 | if (err) | 1627 | if (err) |
| 1496 | return err; | 1628 | return err; |
| 1497 | 1629 | ||
| 1630 | /* we are setting our register to something new, we need to | ||
| 1631 | * reset its range values. | ||
| 1632 | */ | ||
| 1633 | reset_reg_range_values(regs, insn->dst_reg); | ||
| 1634 | |||
| 1498 | if (BPF_SRC(insn->code) == BPF_X) { | 1635 | if (BPF_SRC(insn->code) == BPF_X) { |
| 1499 | if (BPF_CLASS(insn->code) == BPF_ALU64) { | 1636 | if (BPF_CLASS(insn->code) == BPF_ALU64) { |
| 1500 | /* case: R1 = R2 | 1637 | /* case: R1 = R2 |
| @@ -1516,6 +1653,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1516 | */ | 1653 | */ |
| 1517 | regs[insn->dst_reg].type = CONST_IMM; | 1654 | regs[insn->dst_reg].type = CONST_IMM; |
| 1518 | regs[insn->dst_reg].imm = insn->imm; | 1655 | regs[insn->dst_reg].imm = insn->imm; |
| 1656 | regs[insn->dst_reg].max_value = insn->imm; | ||
| 1657 | regs[insn->dst_reg].min_value = insn->imm; | ||
| 1519 | } | 1658 | } |
| 1520 | 1659 | ||
| 1521 | } else if (opcode > BPF_END) { | 1660 | } else if (opcode > BPF_END) { |
| @@ -1568,6 +1707,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1568 | 1707 | ||
| 1569 | dst_reg = ®s[insn->dst_reg]; | 1708 | dst_reg = ®s[insn->dst_reg]; |
| 1570 | 1709 | ||
| 1710 | /* first we want to adjust our ranges. */ | ||
| 1711 | adjust_reg_min_max_vals(env, insn); | ||
| 1712 | |||
| 1571 | /* pattern match 'bpf_add Rx, imm' instruction */ | 1713 | /* pattern match 'bpf_add Rx, imm' instruction */ |
| 1572 | if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && | 1714 | if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && |
| 1573 | dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { | 1715 | dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { |
| @@ -1602,28 +1744,58 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1602 | return -EACCES; | 1744 | return -EACCES; |
| 1603 | } | 1745 | } |
| 1604 | 1746 | ||
| 1605 | /* mark dest operand */ | 1747 | /* If we did pointer math on a map value then just set it to our |
| 1606 | mark_reg_unknown_value(regs, insn->dst_reg); | 1748 | * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or |
| 1749 | * loads to this register appropriately, otherwise just mark the | ||
| 1750 | * register as unknown. | ||
| 1751 | */ | ||
| 1752 | if (env->allow_ptr_leaks && | ||
| 1753 | (dst_reg->type == PTR_TO_MAP_VALUE || | ||
| 1754 | dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) | ||
| 1755 | dst_reg->type = PTR_TO_MAP_VALUE_ADJ; | ||
| 1756 | else | ||
| 1757 | mark_reg_unknown_value(regs, insn->dst_reg); | ||
| 1607 | } | 1758 | } |
| 1608 | 1759 | ||
| 1609 | return 0; | 1760 | return 0; |
| 1610 | } | 1761 | } |
| 1611 | 1762 | ||
| 1612 | static void find_good_pkt_pointers(struct verifier_env *env, | 1763 | static void find_good_pkt_pointers(struct bpf_verifier_state *state, |
| 1613 | struct reg_state *dst_reg) | 1764 | struct bpf_reg_state *dst_reg) |
| 1614 | { | 1765 | { |
| 1615 | struct verifier_state *state = &env->cur_state; | 1766 | struct bpf_reg_state *regs = state->regs, *reg; |
| 1616 | struct reg_state *regs = state->regs, *reg; | ||
| 1617 | int i; | 1767 | int i; |
| 1618 | /* r2 = r3; | 1768 | |
| 1619 | * r2 += 8 | 1769 | /* LLVM can generate two kind of checks: |
| 1620 | * if (r2 > pkt_end) goto somewhere | 1770 | * |
| 1621 | * r2 == dst_reg, pkt_end == src_reg, | 1771 | * Type 1: |
| 1622 | * r2=pkt(id=n,off=8,r=0) | 1772 | * |
| 1623 | * r3=pkt(id=n,off=0,r=0) | 1773 | * r2 = r3; |
| 1624 | * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) | 1774 | * r2 += 8; |
| 1625 | * so that range of bytes [r3, r3 + 8) is safe to access | 1775 | * if (r2 > pkt_end) goto <handle exception> |
| 1776 | * <access okay> | ||
| 1777 | * | ||
| 1778 | * Where: | ||
| 1779 | * r2 == dst_reg, pkt_end == src_reg | ||
| 1780 | * r2=pkt(id=n,off=8,r=0) | ||
| 1781 | * r3=pkt(id=n,off=0,r=0) | ||
| 1782 | * | ||
| 1783 | * Type 2: | ||
| 1784 | * | ||
| 1785 | * r2 = r3; | ||
| 1786 | * r2 += 8; | ||
| 1787 | * if (pkt_end >= r2) goto <access okay> | ||
| 1788 | * <handle exception> | ||
| 1789 | * | ||
| 1790 | * Where: | ||
| 1791 | * pkt_end == dst_reg, r2 == src_reg | ||
| 1792 | * r2=pkt(id=n,off=8,r=0) | ||
| 1793 | * r3=pkt(id=n,off=0,r=0) | ||
| 1794 | * | ||
| 1795 | * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) | ||
| 1796 | * so that range of bytes [r3, r3 + 8) is safe to access. | ||
| 1626 | */ | 1797 | */ |
| 1798 | |||
| 1627 | for (i = 0; i < MAX_BPF_REG; i++) | 1799 | for (i = 0; i < MAX_BPF_REG; i++) |
| 1628 | if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) | 1800 | if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) |
| 1629 | regs[i].range = dst_reg->off; | 1801 | regs[i].range = dst_reg->off; |
| @@ -1637,11 +1809,109 @@ static void find_good_pkt_pointers(struct verifier_env *env, | |||
| 1637 | } | 1809 | } |
| 1638 | } | 1810 | } |
| 1639 | 1811 | ||
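In restricted C the two shapes normally come from one and the same source-level bounds check; roughly, inside a sched_cls program body like the earlier sketch (Ethernet header only, error handling trimmed):

    void *data     = (void *)(long)skb->data;
    void *data_end = (void *)(long)skb->data_end;
    struct ethhdr *eth = data;

    if (data + sizeof(*eth) > data_end)   /* LLVM emits this as Type 1 or Type 2 */
        return TC_ACT_OK;

    /* [data, data + 14) is now known to be in bounds, so this load is accepted */
    __u16 proto = eth->h_proto;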
| 1640 | static int check_cond_jmp_op(struct verifier_env *env, | 1812 | /* Adjusts the register min/max values in the case that the dst_reg is the |
| 1813 | * variable register that we are working on, and src_reg is a constant or we're | ||
| 1814 | * simply doing a BPF_K check. | ||
| 1815 | */ | ||
| 1816 | static void reg_set_min_max(struct bpf_reg_state *true_reg, | ||
| 1817 | struct bpf_reg_state *false_reg, u64 val, | ||
| 1818 | u8 opcode) | ||
| 1819 | { | ||
| 1820 | switch (opcode) { | ||
| 1821 | case BPF_JEQ: | ||
| 1822 | /* If this is false then we know nothing Jon Snow, but if it is | ||
| 1823 | * true then we know for sure. | ||
| 1824 | */ | ||
| 1825 | true_reg->max_value = true_reg->min_value = val; | ||
| 1826 | break; | ||
| 1827 | case BPF_JNE: | ||
| 1828 | /* If this is true we know nothing Jon Snow, but if it is false | ||
| 1829 | * we know the value for sure; | ||
| 1830 | */ | ||
| 1831 | false_reg->max_value = false_reg->min_value = val; | ||
| 1832 | break; | ||
| 1833 | case BPF_JGT: | ||
| 1834 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 1835 | false_reg->min_value = 0; | ||
| 1836 | case BPF_JSGT: | ||
| 1837 | /* If this is false then we know the maximum val is val, | ||
| 1838 | * otherwise we know the min val is val+1. | ||
| 1839 | */ | ||
| 1840 | false_reg->max_value = val; | ||
| 1841 | true_reg->min_value = val + 1; | ||
| 1842 | break; | ||
| 1843 | case BPF_JGE: | ||
| 1844 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 1845 | false_reg->min_value = 0; | ||
| 1846 | case BPF_JSGE: | ||
| 1847 | /* If this is false then we know the maximum value is val - 1, | ||
| 1848 | * otherwise we know the mimimum value is val. | ||
| 1849 | */ | ||
| 1850 | false_reg->max_value = val - 1; | ||
| 1851 | true_reg->min_value = val; | ||
| 1852 | break; | ||
| 1853 | default: | ||
| 1854 | break; | ||
| 1855 | } | ||
| 1856 | |||
| 1857 | check_reg_overflow(false_reg); | ||
| 1858 | check_reg_overflow(true_reg); | ||
| 1859 | } | ||
| 1860 | |||
| 1861 | /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg | ||
| 1862 | * is the variable reg. | ||
| 1863 | */ | ||
| 1864 | static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, | ||
| 1865 | struct bpf_reg_state *false_reg, u64 val, | ||
| 1866 | u8 opcode) | ||
| 1867 | { | ||
| 1868 | switch (opcode) { | ||
| 1869 | case BPF_JEQ: | ||
| 1870 | /* If this is false then we know nothing Jon Snow, but if it is | ||
| 1871 | * true then we know for sure. | ||
| 1872 | */ | ||
| 1873 | true_reg->max_value = true_reg->min_value = val; | ||
| 1874 | break; | ||
| 1875 | case BPF_JNE: | ||
| 1876 | /* If this is true we know nothing Jon Snow, but if it is false | ||
| 1877 | * we know the value for sure; | ||
| 1878 | */ | ||
| 1879 | false_reg->max_value = false_reg->min_value = val; | ||
| 1880 | break; | ||
| 1881 | case BPF_JGT: | ||
| 1882 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 1883 | true_reg->min_value = 0; | ||
| 1884 | case BPF_JSGT: | ||
| 1885 | /* | ||
| 1886 | * If this is false, then the val is <= the register, if it is | ||
| 1887 | * true the register <= to the val. | ||
| 1888 | */ | ||
| 1889 | false_reg->min_value = val; | ||
| 1890 | true_reg->max_value = val - 1; | ||
| 1891 | break; | ||
| 1892 | case BPF_JGE: | ||
| 1893 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 1894 | true_reg->min_value = 0; | ||
| 1895 | case BPF_JSGE: | ||
| 1896 | /* If this is false then constant < register, if it is true then | ||
| 1897 | * the register < constant. | ||
| 1898 | */ | ||
| 1899 | false_reg->min_value = val + 1; | ||
| 1900 | true_reg->max_value = val; | ||
| 1901 | break; | ||
| 1902 | default: | ||
| 1903 | break; | ||
| 1904 | } | ||
| 1905 | |||
| 1906 | check_reg_overflow(false_reg); | ||
| 1907 | check_reg_overflow(true_reg); | ||
| 1908 | } | ||
| 1909 | |||
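As a worked case for the two helpers above: a branch like `if (r1 > 42) goto Lbig;`, with r1 as the variable register, calls reg_set_min_max() with val = 42 and opcode = BPF_JGT and splits the bounds as follows (register name arbitrary):

    /* taken branch (r1 >  42): true_reg->min_value  = 43                  */
    /* fallthrough  (r1 <= 42): false_reg->max_value = 42 and, because the */
    /*                          compare is unsigned,  false_reg->min_value = 0 */

reg_set_min_max_inv() covers the mirrored `if (42 > r1)` form, where the constant sits in dst_reg and r1 is the source register.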
| 1910 | static int check_cond_jmp_op(struct bpf_verifier_env *env, | ||
| 1641 | struct bpf_insn *insn, int *insn_idx) | 1911 | struct bpf_insn *insn, int *insn_idx) |
| 1642 | { | 1912 | { |
| 1643 | struct reg_state *regs = env->cur_state.regs, *dst_reg; | 1913 | struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; |
| 1644 | struct verifier_state *other_branch; | 1914 | struct bpf_reg_state *regs = this_branch->regs, *dst_reg; |
| 1645 | u8 opcode = BPF_OP(insn->code); | 1915 | u8 opcode = BPF_OP(insn->code); |
| 1646 | int err; | 1916 | int err; |
| 1647 | 1917 | ||
| @@ -1703,7 +1973,24 @@ static int check_cond_jmp_op(struct verifier_env *env, | |||
| 1703 | if (!other_branch) | 1973 | if (!other_branch) |
| 1704 | return -EFAULT; | 1974 | return -EFAULT; |
| 1705 | 1975 | ||
| 1706 | /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ | 1976 | /* detect if we are comparing against a constant value so we can adjust |
| 1977 | * our min/max values for our dst register. | ||
| 1978 | */ | ||
| 1979 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 1980 | if (regs[insn->src_reg].type == CONST_IMM) | ||
| 1981 | reg_set_min_max(&other_branch->regs[insn->dst_reg], | ||
| 1982 | dst_reg, regs[insn->src_reg].imm, | ||
| 1983 | opcode); | ||
| 1984 | else if (dst_reg->type == CONST_IMM) | ||
| 1985 | reg_set_min_max_inv(&other_branch->regs[insn->src_reg], | ||
| 1986 | &regs[insn->src_reg], dst_reg->imm, | ||
| 1987 | opcode); | ||
| 1988 | } else { | ||
| 1989 | reg_set_min_max(&other_branch->regs[insn->dst_reg], | ||
| 1990 | dst_reg, insn->imm, opcode); | ||
| 1991 | } | ||
| 1992 | |||
| 1993 | /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ | ||
| 1707 | if (BPF_SRC(insn->code) == BPF_K && | 1994 | if (BPF_SRC(insn->code) == BPF_K && |
| 1708 | insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && | 1995 | insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && |
| 1709 | dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { | 1996 | dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { |
| @@ -1722,13 +2009,17 @@ static int check_cond_jmp_op(struct verifier_env *env, | |||
| 1722 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && | 2009 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && |
| 1723 | dst_reg->type == PTR_TO_PACKET && | 2010 | dst_reg->type == PTR_TO_PACKET && |
| 1724 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { | 2011 | regs[insn->src_reg].type == PTR_TO_PACKET_END) { |
| 1725 | find_good_pkt_pointers(env, dst_reg); | 2012 | find_good_pkt_pointers(this_branch, dst_reg); |
| 2013 | } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && | ||
| 2014 | dst_reg->type == PTR_TO_PACKET_END && | ||
| 2015 | regs[insn->src_reg].type == PTR_TO_PACKET) { | ||
| 2016 | find_good_pkt_pointers(other_branch, &regs[insn->src_reg]); | ||
| 1726 | } else if (is_pointer_value(env, insn->dst_reg)) { | 2017 | } else if (is_pointer_value(env, insn->dst_reg)) { |
| 1727 | verbose("R%d pointer comparison prohibited\n", insn->dst_reg); | 2018 | verbose("R%d pointer comparison prohibited\n", insn->dst_reg); |
| 1728 | return -EACCES; | 2019 | return -EACCES; |
| 1729 | } | 2020 | } |
| 1730 | if (log_level) | 2021 | if (log_level) |
| 1731 | print_verifier_state(&env->cur_state); | 2022 | print_verifier_state(this_branch); |
| 1732 | return 0; | 2023 | return 0; |
| 1733 | } | 2024 | } |
| 1734 | 2025 | ||
| @@ -1741,9 +2032,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) | |||
| 1741 | } | 2032 | } |
| 1742 | 2033 | ||
| 1743 | /* verify BPF_LD_IMM64 instruction */ | 2034 | /* verify BPF_LD_IMM64 instruction */ |
| 1744 | static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | 2035 | static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) |
| 1745 | { | 2036 | { |
| 1746 | struct reg_state *regs = env->cur_state.regs; | 2037 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 1747 | int err; | 2038 | int err; |
| 1748 | 2039 | ||
| 1749 | if (BPF_SIZE(insn->code) != BPF_DW) { | 2040 | if (BPF_SIZE(insn->code) != BPF_DW) { |
| @@ -1759,9 +2050,19 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1759 | if (err) | 2050 | if (err) |
| 1760 | return err; | 2051 | return err; |
| 1761 | 2052 | ||
| 1762 | if (insn->src_reg == 0) | 2053 | if (insn->src_reg == 0) { |
| 1763 | /* generic move 64-bit immediate into a register */ | 2054 | /* generic move 64-bit immediate into a register, |
| 2055 | * only analyzer needs to collect the ld_imm value. | ||
| 2056 | */ | ||
| 2057 | u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; | ||
| 2058 | |||
| 2059 | if (!env->analyzer_ops) | ||
| 2060 | return 0; | ||
| 2061 | |||
| 2062 | regs[insn->dst_reg].type = CONST_IMM; | ||
| 2063 | regs[insn->dst_reg].imm = imm; | ||
| 1764 | return 0; | 2064 | return 0; |
| 2065 | } | ||
| 1765 | 2066 | ||
| 1766 | /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ | 2067 | /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ |
| 1767 | BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); | 2068 | BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); |
| @@ -1798,11 +2099,11 @@ static bool may_access_skb(enum bpf_prog_type type) | |||
| 1798 | * Output: | 2099 | * Output: |
| 1799 | * R0 - 8/16/32-bit skb data converted to cpu endianness | 2100 | * R0 - 8/16/32-bit skb data converted to cpu endianness |
| 1800 | */ | 2101 | */ |
| 1801 | static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) | 2102 | static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) |
| 1802 | { | 2103 | { |
| 1803 | struct reg_state *regs = env->cur_state.regs; | 2104 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 1804 | u8 mode = BPF_MODE(insn->code); | 2105 | u8 mode = BPF_MODE(insn->code); |
| 1805 | struct reg_state *reg; | 2106 | struct bpf_reg_state *reg; |
| 1806 | int i, err; | 2107 | int i, err; |
| 1807 | 2108 | ||
| 1808 | if (!may_access_skb(env->prog->type)) { | 2109 | if (!may_access_skb(env->prog->type)) { |
| @@ -1888,7 +2189,7 @@ enum { | |||
| 1888 | BRANCH = 2, | 2189 | BRANCH = 2, |
| 1889 | }; | 2190 | }; |
| 1890 | 2191 | ||
| 1891 | #define STATE_LIST_MARK ((struct verifier_state_list *) -1L) | 2192 | #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) |
| 1892 | 2193 | ||
| 1893 | static int *insn_stack; /* stack of insns to process */ | 2194 | static int *insn_stack; /* stack of insns to process */ |
| 1894 | static int cur_stack; /* current stack index */ | 2195 | static int cur_stack; /* current stack index */ |
| @@ -1899,7 +2200,7 @@ static int *insn_state; | |||
| 1899 | * w - next instruction | 2200 | * w - next instruction |
| 1900 | * e - edge | 2201 | * e - edge |
| 1901 | */ | 2202 | */ |
| 1902 | static int push_insn(int t, int w, int e, struct verifier_env *env) | 2203 | static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) |
| 1903 | { | 2204 | { |
| 1904 | if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) | 2205 | if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) |
| 1905 | return 0; | 2206 | return 0; |
| @@ -1940,7 +2241,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env) | |||
| 1940 | /* non-recursive depth-first-search to detect loops in BPF program | 2241 | /* non-recursive depth-first-search to detect loops in BPF program |
| 1941 | * loop == back-edge in directed graph | 2242 | * loop == back-edge in directed graph |
| 1942 | */ | 2243 | */ |
| 1943 | static int check_cfg(struct verifier_env *env) | 2244 | static int check_cfg(struct bpf_verifier_env *env) |
| 1944 | { | 2245 | { |
| 1945 | struct bpf_insn *insns = env->prog->insnsi; | 2246 | struct bpf_insn *insns = env->prog->insnsi; |
| 1946 | int insn_cnt = env->prog->len; | 2247 | int insn_cnt = env->prog->len; |
| @@ -2049,7 +2350,8 @@ err_free: | |||
| 2049 | /* the following conditions reduce the number of explored insns | 2350 | /* the following conditions reduce the number of explored insns |
| 2050 | * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet | 2351 | * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet |
| 2051 | */ | 2352 | */ |
| 2052 | static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) | 2353 | static bool compare_ptrs_to_packet(struct bpf_reg_state *old, |
| 2354 | struct bpf_reg_state *cur) | ||
| 2053 | { | 2355 | { |
| 2054 | if (old->id != cur->id) | 2356 | if (old->id != cur->id) |
| 2055 | return false; | 2357 | return false; |
| @@ -2124,9 +2426,11 @@ static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) | |||
| 2124 | * whereas register type in current state is meaningful, it means that | 2426 | * whereas register type in current state is meaningful, it means that |
| 2125 | * the current state will reach 'bpf_exit' instruction safely | 2427 | * the current state will reach 'bpf_exit' instruction safely |
| 2126 | */ | 2428 | */ |
| 2127 | static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | 2429 | static bool states_equal(struct bpf_verifier_env *env, |
| 2430 | struct bpf_verifier_state *old, | ||
| 2431 | struct bpf_verifier_state *cur) | ||
| 2128 | { | 2432 | { |
| 2129 | struct reg_state *rold, *rcur; | 2433 | struct bpf_reg_state *rold, *rcur; |
| 2130 | int i; | 2434 | int i; |
| 2131 | 2435 | ||
| 2132 | for (i = 0; i < MAX_BPF_REG; i++) { | 2436 | for (i = 0; i < MAX_BPF_REG; i++) { |
| @@ -2136,6 +2440,13 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | |||
| 2136 | if (memcmp(rold, rcur, sizeof(*rold)) == 0) | 2440 | if (memcmp(rold, rcur, sizeof(*rold)) == 0) |
| 2137 | continue; | 2441 | continue; |
| 2138 | 2442 | ||
| 2443 | /* If the ranges were not the same, but everything else was and | ||
| 2444 | * we didn't do a variable access into a map then we are a-ok. | ||
| 2445 | */ | ||
| 2446 | if (!env->varlen_map_value_access && | ||
| 2447 | rold->type == rcur->type && rold->imm == rcur->imm) | ||
| 2448 | continue; | ||
| 2449 | |||
| 2139 | if (rold->type == NOT_INIT || | 2450 | if (rold->type == NOT_INIT || |
| 2140 | (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) | 2451 | (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) |
| 2141 | continue; | 2452 | continue; |
| @@ -2166,9 +2477,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | |||
| 2166 | * the same, check that stored pointers types | 2477 | * the same, check that stored pointers types |
| 2167 | * are the same as well. | 2478 | * are the same as well. |
| 2168 | * Ex: explored safe path could have stored | 2479 | * Ex: explored safe path could have stored |
| 2169 | * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} | 2480 | * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8} |
| 2170 | * but current path has stored: | 2481 | * but current path has stored: |
| 2171 | * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} | 2482 | * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16} |
| 2172 | * such verifier states are not equivalent. | 2483 | * such verifier states are not equivalent. |
| 2173 | * return false to continue verification of this path | 2484 | * return false to continue verification of this path |
| 2174 | */ | 2485 | */ |
| @@ -2179,10 +2490,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | |||
| 2179 | return true; | 2490 | return true; |
| 2180 | } | 2491 | } |
| 2181 | 2492 | ||
| 2182 | static int is_state_visited(struct verifier_env *env, int insn_idx) | 2493 | static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) |
| 2183 | { | 2494 | { |
| 2184 | struct verifier_state_list *new_sl; | 2495 | struct bpf_verifier_state_list *new_sl; |
| 2185 | struct verifier_state_list *sl; | 2496 | struct bpf_verifier_state_list *sl; |
| 2186 | 2497 | ||
| 2187 | sl = env->explored_states[insn_idx]; | 2498 | sl = env->explored_states[insn_idx]; |
| 2188 | if (!sl) | 2499 | if (!sl) |
| @@ -2192,7 +2503,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) | |||
| 2192 | return 0; | 2503 | return 0; |
| 2193 | 2504 | ||
| 2194 | while (sl != STATE_LIST_MARK) { | 2505 | while (sl != STATE_LIST_MARK) { |
| 2195 | if (states_equal(&sl->state, &env->cur_state)) | 2506 | if (states_equal(env, &sl->state, &env->cur_state)) |
| 2196 | /* reached equivalent register/stack state, | 2507 | /* reached equivalent register/stack state, |
| 2197 | * prune the search | 2508 | * prune the search |
| 2198 | */ | 2509 | */ |
| @@ -2206,7 +2517,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) | |||
| 2206 | * it will be rejected. Since there are no loops, we won't be | 2517 | * it will be rejected. Since there are no loops, we won't be |
| 2207 | * seeing this 'insn_idx' instruction again on the way to bpf_exit | 2518 | * seeing this 'insn_idx' instruction again on the way to bpf_exit |
| 2208 | */ | 2519 | */ |
| 2209 | new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); | 2520 | new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); |
| 2210 | if (!new_sl) | 2521 | if (!new_sl) |
| 2211 | return -ENOMEM; | 2522 | return -ENOMEM; |
| 2212 | 2523 | ||
| @@ -2217,11 +2528,20 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) | |||
| 2217 | return 0; | 2528 | return 0; |
| 2218 | } | 2529 | } |
| 2219 | 2530 | ||
| 2220 | static int do_check(struct verifier_env *env) | 2531 | static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, |
| 2532 | int insn_idx, int prev_insn_idx) | ||
| 2533 | { | ||
| 2534 | if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) | ||
| 2535 | return 0; | ||
| 2536 | |||
| 2537 | return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); | ||
| 2538 | } | ||
| 2539 | |||
| 2540 | static int do_check(struct bpf_verifier_env *env) | ||
| 2221 | { | 2541 | { |
| 2222 | struct verifier_state *state = &env->cur_state; | 2542 | struct bpf_verifier_state *state = &env->cur_state; |
| 2223 | struct bpf_insn *insns = env->prog->insnsi; | 2543 | struct bpf_insn *insns = env->prog->insnsi; |
| 2224 | struct reg_state *regs = state->regs; | 2544 | struct bpf_reg_state *regs = state->regs; |
| 2225 | int insn_cnt = env->prog->len; | 2545 | int insn_cnt = env->prog->len; |
| 2226 | int insn_idx, prev_insn_idx = 0; | 2546 | int insn_idx, prev_insn_idx = 0; |
| 2227 | int insn_processed = 0; | 2547 | int insn_processed = 0; |
| @@ -2229,6 +2549,7 @@ static int do_check(struct verifier_env *env) | |||
| 2229 | 2549 | ||
| 2230 | init_reg_state(regs); | 2550 | init_reg_state(regs); |
| 2231 | insn_idx = 0; | 2551 | insn_idx = 0; |
| 2552 | env->varlen_map_value_access = false; | ||
| 2232 | for (;;) { | 2553 | for (;;) { |
| 2233 | struct bpf_insn *insn; | 2554 | struct bpf_insn *insn; |
| 2234 | u8 class; | 2555 | u8 class; |
| @@ -2275,13 +2596,17 @@ static int do_check(struct verifier_env *env) | |||
| 2275 | print_bpf_insn(insn); | 2596 | print_bpf_insn(insn); |
| 2276 | } | 2597 | } |
| 2277 | 2598 | ||
| 2599 | err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); | ||
| 2600 | if (err) | ||
| 2601 | return err; | ||
| 2602 | |||
| 2278 | if (class == BPF_ALU || class == BPF_ALU64) { | 2603 | if (class == BPF_ALU || class == BPF_ALU64) { |
| 2279 | err = check_alu_op(env, insn); | 2604 | err = check_alu_op(env, insn); |
| 2280 | if (err) | 2605 | if (err) |
| 2281 | return err; | 2606 | return err; |
| 2282 | 2607 | ||
| 2283 | } else if (class == BPF_LDX) { | 2608 | } else if (class == BPF_LDX) { |
| 2284 | enum bpf_reg_type src_reg_type; | 2609 | enum bpf_reg_type *prev_src_type, src_reg_type; |
| 2285 | 2610 | ||
| 2286 | /* check for reserved fields is already done */ | 2611 | /* check for reserved fields is already done */ |
| 2287 | 2612 | ||
| @@ -2305,21 +2630,25 @@ static int do_check(struct verifier_env *env) | |||
| 2305 | if (err) | 2630 | if (err) |
| 2306 | return err; | 2631 | return err; |
| 2307 | 2632 | ||
| 2308 | if (BPF_SIZE(insn->code) != BPF_W) { | 2633 | reset_reg_range_values(regs, insn->dst_reg); |
| 2634 | if (BPF_SIZE(insn->code) != BPF_W && | ||
| 2635 | BPF_SIZE(insn->code) != BPF_DW) { | ||
| 2309 | insn_idx++; | 2636 | insn_idx++; |
| 2310 | continue; | 2637 | continue; |
| 2311 | } | 2638 | } |
| 2312 | 2639 | ||
| 2313 | if (insn->imm == 0) { | 2640 | prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; |
| 2641 | |||
| 2642 | if (*prev_src_type == NOT_INIT) { | ||
| 2314 | /* saw a valid insn | 2643 | /* saw a valid insn |
| 2315 | * dst_reg = *(u32 *)(src_reg + off) | 2644 | * dst_reg = *(u32 *)(src_reg + off) |
| 2316 | * use reserved 'imm' field to mark this insn | 2645 | * save type to validate intersecting paths |
| 2317 | */ | 2646 | */ |
| 2318 | insn->imm = src_reg_type; | 2647 | *prev_src_type = src_reg_type; |
| 2319 | 2648 | ||
| 2320 | } else if (src_reg_type != insn->imm && | 2649 | } else if (src_reg_type != *prev_src_type && |
| 2321 | (src_reg_type == PTR_TO_CTX || | 2650 | (src_reg_type == PTR_TO_CTX || |
| 2322 | insn->imm == PTR_TO_CTX)) { | 2651 | *prev_src_type == PTR_TO_CTX)) { |
| 2323 | /* ABuser program is trying to use the same insn | 2652 | /* ABuser program is trying to use the same insn |
| 2324 | * dst_reg = *(u32*) (src_reg + off) | 2653 | * dst_reg = *(u32*) (src_reg + off) |
| 2325 | * with different pointer types: | 2654 | * with different pointer types: |
| @@ -2332,7 +2661,7 @@ static int do_check(struct verifier_env *env) | |||
| 2332 | } | 2661 | } |
| 2333 | 2662 | ||
| 2334 | } else if (class == BPF_STX) { | 2663 | } else if (class == BPF_STX) { |
| 2335 | enum bpf_reg_type dst_reg_type; | 2664 | enum bpf_reg_type *prev_dst_type, dst_reg_type; |
| 2336 | 2665 | ||
| 2337 | if (BPF_MODE(insn->code) == BPF_XADD) { | 2666 | if (BPF_MODE(insn->code) == BPF_XADD) { |
| 2338 | err = check_xadd(env, insn); | 2667 | err = check_xadd(env, insn); |
| @@ -2360,11 +2689,13 @@ static int do_check(struct verifier_env *env) | |||
| 2360 | if (err) | 2689 | if (err) |
| 2361 | return err; | 2690 | return err; |
| 2362 | 2691 | ||
| 2363 | if (insn->imm == 0) { | 2692 | prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; |
| 2364 | insn->imm = dst_reg_type; | 2693 | |
| 2365 | } else if (dst_reg_type != insn->imm && | 2694 | if (*prev_dst_type == NOT_INIT) { |
| 2695 | *prev_dst_type = dst_reg_type; | ||
| 2696 | } else if (dst_reg_type != *prev_dst_type && | ||
| 2366 | (dst_reg_type == PTR_TO_CTX || | 2697 | (dst_reg_type == PTR_TO_CTX || |
| 2367 | insn->imm == PTR_TO_CTX)) { | 2698 | *prev_dst_type == PTR_TO_CTX)) { |
| 2368 | verbose("same insn cannot be used with different pointers\n"); | 2699 | verbose("same insn cannot be used with different pointers\n"); |
| 2369 | return -EINVAL; | 2700 | return -EINVAL; |
| 2370 | } | 2701 | } |
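The prev_src_type/prev_dst_type pointers above index the new per-instruction side table, so the verifier no longer has to stash its marker in the reserved imm field of the load/store. Reduced to its use here (a sketch; the real definition presumably lives alongside the other bpf_verifier_* structures in a shared header), the aux data is just:

    struct bpf_insn_aux_data {
        enum bpf_reg_type ptr_type;   /* pointer type seen by this load/store */
    };

    /* one entry per instruction, allocated in bpf_check(); see the
     * vzalloc(sizeof(struct bpf_insn_aux_data) * prog->len) further down */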
| @@ -2470,6 +2801,7 @@ process_bpf_exit: | |||
| 2470 | verbose("invalid BPF_LD mode\n"); | 2801 | verbose("invalid BPF_LD mode\n"); |
| 2471 | return -EINVAL; | 2802 | return -EINVAL; |
| 2472 | } | 2803 | } |
| 2804 | reset_reg_range_values(regs, insn->dst_reg); | ||
| 2473 | } else { | 2805 | } else { |
| 2474 | verbose("unknown insn class %d\n", class); | 2806 | verbose("unknown insn class %d\n", class); |
| 2475 | return -EINVAL; | 2807 | return -EINVAL; |
| @@ -2482,14 +2814,28 @@ process_bpf_exit: | |||
| 2482 | return 0; | 2814 | return 0; |
| 2483 | } | 2815 | } |
| 2484 | 2816 | ||
| 2817 | static int check_map_prog_compatibility(struct bpf_map *map, | ||
| 2818 | struct bpf_prog *prog) | ||
| 2819 | |||
| 2820 | { | ||
| 2821 | if (prog->type == BPF_PROG_TYPE_PERF_EVENT && | ||
| 2822 | (map->map_type == BPF_MAP_TYPE_HASH || | ||
| 2823 | map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && | ||
| 2824 | (map->map_flags & BPF_F_NO_PREALLOC)) { | ||
| 2825 | verbose("perf_event programs can only use preallocated hash map\n"); | ||
| 2826 | return -EINVAL; | ||
| 2827 | } | ||
| 2828 | return 0; | ||
| 2829 | } | ||
| 2830 | |||
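Concretely, the new compatibility check refuses to load, for a BPF_PROG_TYPE_PERF_EVENT program, a map definition along these lines (sketched in the samples/bpf style; the map name and sizes are made up):

    struct bpf_map_def SEC("maps") lat_hash = {
        .type        = BPF_MAP_TYPE_HASH,
        .key_size    = sizeof(__u64),
        .value_size  = sizeof(__u64),
        .max_entries = 4096,
        .map_flags   = BPF_F_NO_PREALLOC,   /* rejected for perf_event programs */
    };

Dropping BPF_F_NO_PREALLOC (i.e. using a preallocated hash map) satisfies the check.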
| 2485 | /* look for pseudo eBPF instructions that access map FDs and | 2831 | /* look for pseudo eBPF instructions that access map FDs and |
| 2486 | * replace them with actual map pointers | 2832 | * replace them with actual map pointers |
| 2487 | */ | 2833 | */ |
| 2488 | static int replace_map_fd_with_map_ptr(struct verifier_env *env) | 2834 | static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) |
| 2489 | { | 2835 | { |
| 2490 | struct bpf_insn *insn = env->prog->insnsi; | 2836 | struct bpf_insn *insn = env->prog->insnsi; |
| 2491 | int insn_cnt = env->prog->len; | 2837 | int insn_cnt = env->prog->len; |
| 2492 | int i, j; | 2838 | int i, j, err; |
| 2493 | 2839 | ||
| 2494 | for (i = 0; i < insn_cnt; i++, insn++) { | 2840 | for (i = 0; i < insn_cnt; i++, insn++) { |
| 2495 | if (BPF_CLASS(insn->code) == BPF_LDX && | 2841 | if (BPF_CLASS(insn->code) == BPF_LDX && |
| @@ -2533,6 +2879,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) | |||
| 2533 | return PTR_ERR(map); | 2879 | return PTR_ERR(map); |
| 2534 | } | 2880 | } |
| 2535 | 2881 | ||
| 2882 | err = check_map_prog_compatibility(map, env->prog); | ||
| 2883 | if (err) { | ||
| 2884 | fdput(f); | ||
| 2885 | return err; | ||
| 2886 | } | ||
| 2887 | |||
| 2536 | /* store map pointer inside BPF_LD_IMM64 instruction */ | 2888 | /* store map pointer inside BPF_LD_IMM64 instruction */ |
| 2537 | insn[0].imm = (u32) (unsigned long) map; | 2889 | insn[0].imm = (u32) (unsigned long) map; |
| 2538 | insn[1].imm = ((u64) (unsigned long) map) >> 32; | 2890 | insn[1].imm = ((u64) (unsigned long) map) >> 32; |
| @@ -2576,7 +2928,7 @@ next_insn: | |||
| 2576 | } | 2928 | } |
| 2577 | 2929 | ||
| 2578 | /* drop refcnt of maps used by the rejected program */ | 2930 | /* drop refcnt of maps used by the rejected program */ |
| 2579 | static void release_maps(struct verifier_env *env) | 2931 | static void release_maps(struct bpf_verifier_env *env) |
| 2580 | { | 2932 | { |
| 2581 | int i; | 2933 | int i; |
| 2582 | 2934 | ||
| @@ -2585,7 +2937,7 @@ static void release_maps(struct verifier_env *env) | |||
| 2585 | } | 2937 | } |
| 2586 | 2938 | ||
| 2587 | /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ | 2939 | /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ |
| 2588 | static void convert_pseudo_ld_imm64(struct verifier_env *env) | 2940 | static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) |
| 2589 | { | 2941 | { |
| 2590 | struct bpf_insn *insn = env->prog->insnsi; | 2942 | struct bpf_insn *insn = env->prog->insnsi; |
| 2591 | int insn_cnt = env->prog->len; | 2943 | int insn_cnt = env->prog->len; |
| @@ -2599,62 +2951,74 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) | |||
| 2599 | /* convert load instructions that access fields of 'struct __sk_buff' | 2951 | /* convert load instructions that access fields of 'struct __sk_buff' |
| 2600 | * into sequence of instructions that access fields of 'struct sk_buff' | 2952 | * into sequence of instructions that access fields of 'struct sk_buff' |
| 2601 | */ | 2953 | */ |
| 2602 | static int convert_ctx_accesses(struct verifier_env *env) | 2954 | static int convert_ctx_accesses(struct bpf_verifier_env *env) |
| 2603 | { | 2955 | { |
| 2604 | struct bpf_insn *insn = env->prog->insnsi; | 2956 | const struct bpf_verifier_ops *ops = env->prog->aux->ops; |
| 2605 | int insn_cnt = env->prog->len; | 2957 | const int insn_cnt = env->prog->len; |
| 2606 | struct bpf_insn insn_buf[16]; | 2958 | struct bpf_insn insn_buf[16], *insn; |
| 2607 | struct bpf_prog *new_prog; | 2959 | struct bpf_prog *new_prog; |
| 2608 | enum bpf_access_type type; | 2960 | enum bpf_access_type type; |
| 2609 | int i; | 2961 | int i, cnt, delta = 0; |
| 2610 | 2962 | ||
| 2611 | if (!env->prog->aux->ops->convert_ctx_access) | 2963 | if (ops->gen_prologue) { |
| 2964 | cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, | ||
| 2965 | env->prog); | ||
| 2966 | if (cnt >= ARRAY_SIZE(insn_buf)) { | ||
| 2967 | verbose("bpf verifier is misconfigured\n"); | ||
| 2968 | return -EINVAL; | ||
| 2969 | } else if (cnt) { | ||
| 2970 | new_prog = bpf_patch_insn_single(env->prog, 0, | ||
| 2971 | insn_buf, cnt); | ||
| 2972 | if (!new_prog) | ||
| 2973 | return -ENOMEM; | ||
| 2974 | env->prog = new_prog; | ||
| 2975 | delta += cnt - 1; | ||
| 2976 | } | ||
| 2977 | } | ||
| 2978 | |||
| 2979 | if (!ops->convert_ctx_access) | ||
| 2612 | return 0; | 2980 | return 0; |
| 2613 | 2981 | ||
| 2614 | for (i = 0; i < insn_cnt; i++, insn++) { | 2982 | insn = env->prog->insnsi + delta; |
| 2615 | u32 insn_delta, cnt; | ||
| 2616 | 2983 | ||
| 2617 | if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) | 2984 | for (i = 0; i < insn_cnt; i++, insn++) { |
| 2985 | if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || | ||
| 2986 | insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) | ||
| 2618 | type = BPF_READ; | 2987 | type = BPF_READ; |
| 2619 | else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) | 2988 | else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || |
| 2989 | insn->code == (BPF_STX | BPF_MEM | BPF_DW)) | ||
| 2620 | type = BPF_WRITE; | 2990 | type = BPF_WRITE; |
| 2621 | else | 2991 | else |
| 2622 | continue; | 2992 | continue; |
| 2623 | 2993 | ||
| 2624 | if (insn->imm != PTR_TO_CTX) { | 2994 | if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) |
| 2625 | /* clear internal mark */ | ||
| 2626 | insn->imm = 0; | ||
| 2627 | continue; | 2995 | continue; |
| 2628 | } | ||
| 2629 | 2996 | ||
| 2630 | cnt = env->prog->aux->ops-> | 2997 | cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, |
| 2631 | convert_ctx_access(type, insn->dst_reg, insn->src_reg, | 2998 | insn->off, insn_buf, env->prog); |
| 2632 | insn->off, insn_buf, env->prog); | ||
| 2633 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { | 2999 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { |
| 2634 | verbose("bpf verifier is misconfigured\n"); | 3000 | verbose("bpf verifier is misconfigured\n"); |
| 2635 | return -EINVAL; | 3001 | return -EINVAL; |
| 2636 | } | 3002 | } |
| 2637 | 3003 | ||
| 2638 | new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); | 3004 | new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, |
| 3005 | cnt); | ||
| 2639 | if (!new_prog) | 3006 | if (!new_prog) |
| 2640 | return -ENOMEM; | 3007 | return -ENOMEM; |
| 2641 | 3008 | ||
| 2642 | insn_delta = cnt - 1; | 3009 | delta += cnt - 1; |
| 2643 | 3010 | ||
| 2644 | /* keep walking new program and skip insns we just inserted */ | 3011 | /* keep walking new program and skip insns we just inserted */ |
| 2645 | env->prog = new_prog; | 3012 | env->prog = new_prog; |
| 2646 | insn = new_prog->insnsi + i + insn_delta; | 3013 | insn = new_prog->insnsi + i + delta; |
| 2647 | |||
| 2648 | insn_cnt += insn_delta; | ||
| 2649 | i += insn_delta; | ||
| 2650 | } | 3014 | } |
| 2651 | 3015 | ||
| 2652 | return 0; | 3016 | return 0; |
| 2653 | } | 3017 | } |
| 2654 | 3018 | ||
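One detail of the rewritten loop that is easy to miss: i keeps counting instructions of the original program and indexing insn_aux_data[], while delta accumulates how many instructions the prologue and earlier context rewrites have already inserted, so the live instruction always sits at i + delta in the patched program. Schematically, for a two-instruction prologue (cnt = 2, so delta = 1):

    /* invariant inside the loop, shown for i = 5, delta = 1:
     *   insn             == env->prog->insnsi + i + delta   (patched index 6)
     *   insn_aux_data[i]  still refers to the original instruction 5
     */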
| 2655 | static void free_states(struct verifier_env *env) | 3019 | static void free_states(struct bpf_verifier_env *env) |
| 2656 | { | 3020 | { |
| 2657 | struct verifier_state_list *sl, *sln; | 3021 | struct bpf_verifier_state_list *sl, *sln; |
| 2658 | int i; | 3022 | int i; |
| 2659 | 3023 | ||
| 2660 | if (!env->explored_states) | 3024 | if (!env->explored_states) |
| @@ -2677,19 +3041,24 @@ static void free_states(struct verifier_env *env) | |||
| 2677 | int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | 3041 | int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) |
| 2678 | { | 3042 | { |
| 2679 | char __user *log_ubuf = NULL; | 3043 | char __user *log_ubuf = NULL; |
| 2680 | struct verifier_env *env; | 3044 | struct bpf_verifier_env *env; |
| 2681 | int ret = -EINVAL; | 3045 | int ret = -EINVAL; |
| 2682 | 3046 | ||
| 2683 | if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) | 3047 | if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) |
| 2684 | return -E2BIG; | 3048 | return -E2BIG; |
| 2685 | 3049 | ||
| 2686 | /* 'struct verifier_env' can be global, but since it's not small, | 3050 | /* 'struct bpf_verifier_env' can be global, but since it's not small, |
| 2687 | * allocate/free it every time bpf_check() is called | 3051 | * allocate/free it every time bpf_check() is called |
| 2688 | */ | 3052 | */ |
| 2689 | env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); | 3053 | env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); |
| 2690 | if (!env) | 3054 | if (!env) |
| 2691 | return -ENOMEM; | 3055 | return -ENOMEM; |
| 2692 | 3056 | ||
| 3057 | env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * | ||
| 3058 | (*prog)->len); | ||
| 3059 | ret = -ENOMEM; | ||
| 3060 | if (!env->insn_aux_data) | ||
| 3061 | goto err_free_env; | ||
| 2693 | env->prog = *prog; | 3062 | env->prog = *prog; |
| 2694 | 3063 | ||
| 2695 | /* grab the mutex to protect few globals used by verifier */ | 3064 | /* grab the mutex to protect few globals used by verifier */ |
| @@ -2708,12 +3077,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | |||
| 2708 | /* log_* values have to be sane */ | 3077 | /* log_* values have to be sane */ |
| 2709 | if (log_size < 128 || log_size > UINT_MAX >> 8 || | 3078 | if (log_size < 128 || log_size > UINT_MAX >> 8 || |
| 2710 | log_level == 0 || log_ubuf == NULL) | 3079 | log_level == 0 || log_ubuf == NULL) |
| 2711 | goto free_env; | 3080 | goto err_unlock; |
| 2712 | 3081 | ||
| 2713 | ret = -ENOMEM; | 3082 | ret = -ENOMEM; |
| 2714 | log_buf = vmalloc(log_size); | 3083 | log_buf = vmalloc(log_size); |
| 2715 | if (!log_buf) | 3084 | if (!log_buf) |
| 2716 | goto free_env; | 3085 | goto err_unlock; |
| 2717 | } else { | 3086 | } else { |
| 2718 | log_level = 0; | 3087 | log_level = 0; |
| 2719 | } | 3088 | } |
| @@ -2723,7 +3092,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | |||
| 2723 | goto skip_full_check; | 3092 | goto skip_full_check; |
| 2724 | 3093 | ||
| 2725 | env->explored_states = kcalloc(env->prog->len, | 3094 | env->explored_states = kcalloc(env->prog->len, |
| 2726 | sizeof(struct verifier_state_list *), | 3095 | sizeof(struct bpf_verifier_state_list *), |
| 2727 | GFP_USER); | 3096 | GFP_USER); |
| 2728 | ret = -ENOMEM; | 3097 | ret = -ENOMEM; |
| 2729 | if (!env->explored_states) | 3098 | if (!env->explored_states) |
| @@ -2782,14 +3151,67 @@ skip_full_check: | |||
| 2782 | free_log_buf: | 3151 | free_log_buf: |
| 2783 | if (log_level) | 3152 | if (log_level) |
| 2784 | vfree(log_buf); | 3153 | vfree(log_buf); |
| 2785 | free_env: | ||
| 2786 | if (!env->prog->aux->used_maps) | 3154 | if (!env->prog->aux->used_maps) |
| 2787 | /* if we didn't copy map pointers into bpf_prog_info, release | 3155 | /* if we didn't copy map pointers into bpf_prog_info, release |
| 2788 | * them now. Otherwise free_bpf_prog_info() will release them. | 3156 | * them now. Otherwise free_bpf_prog_info() will release them. |
| 2789 | */ | 3157 | */ |
| 2790 | release_maps(env); | 3158 | release_maps(env); |
| 2791 | *prog = env->prog; | 3159 | *prog = env->prog; |
| 3160 | err_unlock: | ||
| 3161 | mutex_unlock(&bpf_verifier_lock); | ||
| 3162 | vfree(env->insn_aux_data); | ||
| 3163 | err_free_env: | ||
| 2792 | kfree(env); | 3164 | kfree(env); |
| 3165 | return ret; | ||
| 3166 | } | ||
| 3167 | |||
| 3168 | int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, | ||
| 3169 | void *priv) | ||
| 3170 | { | ||
| 3171 | struct bpf_verifier_env *env; | ||
| 3172 | int ret; | ||
| 3173 | |||
| 3174 | env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); | ||
| 3175 | if (!env) | ||
| 3176 | return -ENOMEM; | ||
| 3177 | |||
| 3178 | env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * | ||
| 3179 | prog->len); | ||
| 3180 | ret = -ENOMEM; | ||
| 3181 | if (!env->insn_aux_data) | ||
| 3182 | goto err_free_env; | ||
| 3183 | env->prog = prog; | ||
| 3184 | env->analyzer_ops = ops; | ||
| 3185 | env->analyzer_priv = priv; | ||
| 3186 | |||
| 3187 | /* grab the mutex to protect few globals used by verifier */ | ||
| 3188 | mutex_lock(&bpf_verifier_lock); | ||
| 3189 | |||
| 3190 | log_level = 0; | ||
| 3191 | |||
| 3192 | env->explored_states = kcalloc(env->prog->len, | ||
| 3193 | sizeof(struct bpf_verifier_state_list *), | ||
| 3194 | GFP_KERNEL); | ||
| 3195 | ret = -ENOMEM; | ||
| 3196 | if (!env->explored_states) | ||
| 3197 | goto skip_full_check; | ||
| 3198 | |||
| 3199 | ret = check_cfg(env); | ||
| 3200 | if (ret < 0) | ||
| 3201 | goto skip_full_check; | ||
| 3202 | |||
| 3203 | env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); | ||
| 3204 | |||
| 3205 | ret = do_check(env); | ||
| 3206 | |||
| 3207 | skip_full_check: | ||
| 3208 | while (pop_stack(env, NULL) >= 0); | ||
| 3209 | free_states(env); | ||
| 3210 | |||
| 2793 | mutex_unlock(&bpf_verifier_lock); | 3211 | mutex_unlock(&bpf_verifier_lock); |
| 3212 | vfree(env->insn_aux_data); | ||
| 3213 | err_free_env: | ||
| 3214 | kfree(env); | ||
| 2794 | return ret; | 3215 | return ret; |
| 2795 | } | 3216 | } |
| 3217 | EXPORT_SYMBOL_GPL(bpf_analyzer); | ||
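bpf_analyzer() reruns the CFG check and do_check() without a log buffer or context rewriting, so an offload driver can watch the verifier walk a program it intends to translate. A hedged sketch of a caller follows; the .insn_hook member of struct bpf_ext_analyzer_ops is assumed from the in-tree offload user of this hook, and every my_* name is invented.

#include <linux/bpf.h>
#include <linux/bpf_verifier.h>

/* Called for every instruction the verifier visits (assumed callback). */
static int my_insn_hook(struct bpf_verifier_env *env,
			int insn_idx, int prev_insn_idx)
{
	/* inspect the verifier state reached at insn_idx here */
	return 0;	/* a non-zero return aborts the analysis */
}

static const struct bpf_ext_analyzer_ops my_analyzer_ops = {
	.insn_hook	= my_insn_hook,
};

static int my_offload_verify(struct bpf_prog *prog, void *priv)
{
	/* priv comes back through env->analyzer_priv in the callback */
	return bpf_analyzer(prog, &my_analyzer_ops, priv);
}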
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..85bc9beb046d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -64,6 +64,9 @@ | |||
| 64 | #include <linux/file.h> | 64 | #include <linux/file.h> |
| 65 | #include <net/sock.h> | 65 | #include <net/sock.h> |
| 66 | 66 | ||
| 67 | #define CREATE_TRACE_POINTS | ||
| 68 | #include <trace/events/cgroup.h> | ||
| 69 | |||
| 67 | /* | 70 | /* |
| 68 | * pidlists linger the following amount before being destroyed. The goal | 71 | * pidlists linger the following amount before being destroyed. The goal |
| 69 | * is avoiding frequent destruction in the middle of consecutive read calls | 72 | * is avoiding frequent destruction in the middle of consecutive read calls |
| @@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root) | |||
| 1176 | struct cgroup *cgrp = &root->cgrp; | 1179 | struct cgroup *cgrp = &root->cgrp; |
| 1177 | struct cgrp_cset_link *link, *tmp_link; | 1180 | struct cgrp_cset_link *link, *tmp_link; |
| 1178 | 1181 | ||
| 1182 | trace_cgroup_destroy_root(root); | ||
| 1183 | |||
| 1179 | cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); | 1184 | cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); |
| 1180 | 1185 | ||
| 1181 | BUG_ON(atomic_read(&root->nr_cgrps)); | 1186 | BUG_ON(atomic_read(&root->nr_cgrps)); |
| @@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
| 1874 | strcpy(root->release_agent_path, opts.release_agent); | 1879 | strcpy(root->release_agent_path, opts.release_agent); |
| 1875 | spin_unlock(&release_agent_path_lock); | 1880 | spin_unlock(&release_agent_path_lock); |
| 1876 | } | 1881 | } |
| 1882 | |||
| 1883 | trace_cgroup_remount(root); | ||
| 1884 | |||
| 1877 | out_unlock: | 1885 | out_unlock: |
| 1878 | kfree(opts.release_agent); | 1886 | kfree(opts.release_agent); |
| 1879 | kfree(opts.name); | 1887 | kfree(opts.name); |
| @@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) | |||
| 2031 | if (ret) | 2039 | if (ret) |
| 2032 | goto destroy_root; | 2040 | goto destroy_root; |
| 2033 | 2041 | ||
| 2042 | trace_cgroup_setup_root(root); | ||
| 2043 | |||
| 2034 | /* | 2044 | /* |
| 2035 | * There must be no failure case after here, since rebinding takes | 2045 | * There must be no failure case after here, since rebinding takes |
| 2036 | * care of subsystems' refcounts, which are explicitly dropped in | 2046 | * care of subsystems' refcounts, which are explicitly dropped in |
| @@ -2315,22 +2325,18 @@ static struct file_system_type cgroup2_fs_type = { | |||
| 2315 | .fs_flags = FS_USERNS_MOUNT, | 2325 | .fs_flags = FS_USERNS_MOUNT, |
| 2316 | }; | 2326 | }; |
| 2317 | 2327 | ||
| 2318 | static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, | 2328 | static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, |
| 2319 | struct cgroup_namespace *ns) | 2329 | struct cgroup_namespace *ns) |
| 2320 | { | 2330 | { |
| 2321 | struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); | 2331 | struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); |
| 2322 | int ret; | ||
| 2323 | 2332 | ||
| 2324 | ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); | 2333 | return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); |
| 2325 | if (ret < 0 || ret >= buflen) | ||
| 2326 | return NULL; | ||
| 2327 | return buf; | ||
| 2328 | } | 2334 | } |
| 2329 | 2335 | ||
| 2330 | char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, | 2336 | int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, |
| 2331 | struct cgroup_namespace *ns) | 2337 | struct cgroup_namespace *ns) |
| 2332 | { | 2338 | { |
| 2333 | char *ret; | 2339 | int ret; |
| 2334 | 2340 | ||
| 2335 | mutex_lock(&cgroup_mutex); | 2341 | mutex_lock(&cgroup_mutex); |
| 2336 | spin_lock_irq(&css_set_lock); | 2342 | spin_lock_irq(&css_set_lock); |
| @@ -2357,12 +2363,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); | |||
| 2357 | * | 2363 | * |
| 2358 | * Return value is the same as kernfs_path(). | 2364 | * Return value is the same as kernfs_path(). |
| 2359 | */ | 2365 | */ |
| 2360 | char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | 2366 | int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) |
| 2361 | { | 2367 | { |
| 2362 | struct cgroup_root *root; | 2368 | struct cgroup_root *root; |
| 2363 | struct cgroup *cgrp; | 2369 | struct cgroup *cgrp; |
| 2364 | int hierarchy_id = 1; | 2370 | int hierarchy_id = 1; |
| 2365 | char *path = NULL; | 2371 | int ret; |
| 2366 | 2372 | ||
| 2367 | mutex_lock(&cgroup_mutex); | 2373 | mutex_lock(&cgroup_mutex); |
| 2368 | spin_lock_irq(&css_set_lock); | 2374 | spin_lock_irq(&css_set_lock); |
| @@ -2371,16 +2377,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | |||
| 2371 | 2377 | ||
| 2372 | if (root) { | 2378 | if (root) { |
| 2373 | cgrp = task_cgroup_from_root(task, root); | 2379 | cgrp = task_cgroup_from_root(task, root); |
| 2374 | path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); | 2380 | ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); |
| 2375 | } else { | 2381 | } else { |
| 2376 | /* if no hierarchy exists, everyone is in "/" */ | 2382 | /* if no hierarchy exists, everyone is in "/" */ |
| 2377 | if (strlcpy(buf, "/", buflen) < buflen) | 2383 | ret = strlcpy(buf, "/", buflen); |
| 2378 | path = buf; | ||
| 2379 | } | 2384 | } |
| 2380 | 2385 | ||
| 2381 | spin_unlock_irq(&css_set_lock); | 2386 | spin_unlock_irq(&css_set_lock); |
| 2382 | mutex_unlock(&cgroup_mutex); | 2387 | mutex_unlock(&cgroup_mutex); |
| 2383 | return path; | 2388 | return ret; |
| 2384 | } | 2389 | } |
| 2385 | EXPORT_SYMBOL_GPL(task_cgroup_path); | 2390 | EXPORT_SYMBOL_GPL(task_cgroup_path); |
| 2386 | 2391 | ||
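With the switch from returning a buffer pointer to a kernfs_path()-style length, callers of cgroup_path_ns() and task_cgroup_path() must treat both a negative return and a value >= buflen as failure. A sketch of the adjusted calling pattern; the surrounding function, task pointer and logging are illustrative only:

#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/printk.h>
#include <linux/errno.h>
#include <linux/limits.h>

static int log_task_cgroup(struct task_struct *tsk)
{
	char *buf;
	int ret;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = task_cgroup_path(tsk, buf, PATH_MAX);
	if (ret >= PATH_MAX)		/* the path did not fit */
		ret = -ENAMETOOLONG;
	if (ret >= 0) {
		pr_info("%s: cgroup %s\n", tsk->comm, buf);
		ret = 0;
	}
	kfree(buf);
	return ret;
}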
| @@ -2830,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
| 2830 | ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); | 2835 | ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); |
| 2831 | 2836 | ||
| 2832 | cgroup_migrate_finish(&preloaded_csets); | 2837 | cgroup_migrate_finish(&preloaded_csets); |
| 2838 | |||
| 2839 | if (!ret) | ||
| 2840 | trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); | ||
| 2841 | |||
| 2833 | return ret; | 2842 | return ret; |
| 2834 | } | 2843 | } |
| 2835 | 2844 | ||
| @@ -3446,9 +3455,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
| 3446 | * Except for the root, subtree_control must be zero for a cgroup | 3455 | * Except for the root, subtree_control must be zero for a cgroup |
| 3447 | * with tasks so that child cgroups don't compete against tasks. | 3456 | * with tasks so that child cgroups don't compete against tasks. |
| 3448 | */ | 3457 | */ |
| 3449 | if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { | 3458 | if (enable && cgroup_parent(cgrp)) { |
| 3450 | ret = -EBUSY; | 3459 | struct cgrp_cset_link *link; |
| 3451 | goto out_unlock; | 3460 | |
| 3461 | /* | ||
| 3462 | * Because namespaces pin csets too, @cgrp->cset_links | ||
| 3463 | * might not be empty even when @cgrp is empty. Walk and | ||
| 3464 | * verify each cset. | ||
| 3465 | */ | ||
| 3466 | spin_lock_irq(&css_set_lock); | ||
| 3467 | |||
| 3468 | ret = 0; | ||
| 3469 | list_for_each_entry(link, &cgrp->cset_links, cset_link) { | ||
| 3470 | if (css_set_populated(link->cset)) { | ||
| 3471 | ret = -EBUSY; | ||
| 3472 | break; | ||
| 3473 | } | ||
| 3474 | } | ||
| 3475 | |||
| 3476 | spin_unlock_irq(&css_set_lock); | ||
| 3477 | |||
| 3478 | if (ret) | ||
| 3479 | goto out_unlock; | ||
| 3452 | } | 3480 | } |
| 3453 | 3481 | ||
| 3454 | /* save and update control masks and prepare csses */ | 3482 | /* save and update control masks and prepare csses */ |
| @@ -3592,6 +3620,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, | |||
| 3592 | mutex_lock(&cgroup_mutex); | 3620 | mutex_lock(&cgroup_mutex); |
| 3593 | 3621 | ||
| 3594 | ret = kernfs_rename(kn, new_parent, new_name_str); | 3622 | ret = kernfs_rename(kn, new_parent, new_name_str); |
| 3623 | if (!ret) | ||
| 3624 | trace_cgroup_rename(cgrp); | ||
| 3595 | 3625 | ||
| 3596 | mutex_unlock(&cgroup_mutex); | 3626 | mutex_unlock(&cgroup_mutex); |
| 3597 | 3627 | ||
| @@ -3899,7 +3929,9 @@ void cgroup_file_notify(struct cgroup_file *cfile) | |||
| 3899 | * cgroup_task_count - count the number of tasks in a cgroup. | 3929 | * cgroup_task_count - count the number of tasks in a cgroup. |
| 3900 | * @cgrp: the cgroup in question | 3930 | * @cgrp: the cgroup in question |
| 3901 | * | 3931 | * |
| 3902 | * Return the number of tasks in the cgroup. | 3932 | * Return the number of tasks in the cgroup. The returned number can be |
| 3933 | * higher than the actual number of tasks due to css_set references from | ||
| 3934 | * namespace roots and temporary usages. | ||
| 3903 | */ | 3935 | */ |
| 3904 | static int cgroup_task_count(const struct cgroup *cgrp) | 3936 | static int cgroup_task_count(const struct cgroup *cgrp) |
| 3905 | { | 3937 | { |
| @@ -4360,6 +4392,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
| 4360 | 4392 | ||
| 4361 | if (task) { | 4393 | if (task) { |
| 4362 | ret = cgroup_migrate(task, false, to->root); | 4394 | ret = cgroup_migrate(task, false, to->root); |
| 4395 | if (!ret) | ||
| 4396 | trace_cgroup_transfer_tasks(to, task, false); | ||
| 4363 | put_task_struct(task); | 4397 | put_task_struct(task); |
| 4364 | } | 4398 | } |
| 4365 | } while (task && !ret); | 4399 | } while (task && !ret); |
| @@ -5025,6 +5059,8 @@ static void css_release_work_fn(struct work_struct *work) | |||
| 5025 | ss->css_released(css); | 5059 | ss->css_released(css); |
| 5026 | } else { | 5060 | } else { |
| 5027 | /* cgroup release path */ | 5061 | /* cgroup release path */ |
| 5062 | trace_cgroup_release(cgrp); | ||
| 5063 | |||
| 5028 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | 5064 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
| 5029 | cgrp->id = -1; | 5065 | cgrp->id = -1; |
| 5030 | 5066 | ||
| @@ -5311,6 +5347,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
| 5311 | if (ret) | 5347 | if (ret) |
| 5312 | goto out_destroy; | 5348 | goto out_destroy; |
| 5313 | 5349 | ||
| 5350 | trace_cgroup_mkdir(cgrp); | ||
| 5351 | |||
| 5314 | /* let's create and online css's */ | 5352 | /* let's create and online css's */ |
| 5315 | kernfs_activate(kn); | 5353 | kernfs_activate(kn); |
| 5316 | 5354 | ||
| @@ -5486,6 +5524,9 @@ static int cgroup_rmdir(struct kernfs_node *kn) | |||
| 5486 | 5524 | ||
| 5487 | ret = cgroup_destroy_locked(cgrp); | 5525 | ret = cgroup_destroy_locked(cgrp); |
| 5488 | 5526 | ||
| 5527 | if (!ret) | ||
| 5528 | trace_cgroup_rmdir(cgrp); | ||
| 5529 | |||
| 5489 | cgroup_kn_unlock(kn); | 5530 | cgroup_kn_unlock(kn); |
| 5490 | return ret; | 5531 | return ret; |
| 5491 | } | 5532 | } |
| @@ -5606,6 +5647,12 @@ int __init cgroup_init(void) | |||
| 5606 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); | 5647 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); |
| 5607 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); | 5648 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); |
| 5608 | 5649 | ||
| 5650 | /* | ||
| 5651 | * The latency of the synchronize_sched() is too high for cgroups, | ||
| 5652 | * avoid it at the cost of forcing all readers into the slow path. | ||
| 5653 | */ | ||
| 5654 | rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); | ||
| 5655 | |||
| 5609 | get_user_ns(init_cgroup_ns.user_ns); | 5656 | get_user_ns(init_cgroup_ns.user_ns); |
| 5610 | 5657 | ||
| 5611 | mutex_lock(&cgroup_mutex); | 5658 | mutex_lock(&cgroup_mutex); |
| @@ -5716,7 +5763,7 @@ core_initcall(cgroup_wq_init); | |||
| 5716 | int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | 5763 | int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, |
| 5717 | struct pid *pid, struct task_struct *tsk) | 5764 | struct pid *pid, struct task_struct *tsk) |
| 5718 | { | 5765 | { |
| 5719 | char *buf, *path; | 5766 | char *buf; |
| 5720 | int retval; | 5767 | int retval; |
| 5721 | struct cgroup_root *root; | 5768 | struct cgroup_root *root; |
| 5722 | 5769 | ||
| @@ -5759,18 +5806,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
| 5759 | * " (deleted)" is appended to the cgroup path. | 5806 | * " (deleted)" is appended to the cgroup path. |
| 5760 | */ | 5807 | */ |
| 5761 | if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { | 5808 | if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { |
| 5762 | path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, | 5809 | retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, |
| 5763 | current->nsproxy->cgroup_ns); | 5810 | current->nsproxy->cgroup_ns); |
| 5764 | if (!path) { | 5811 | if (retval >= PATH_MAX) |
| 5765 | retval = -ENAMETOOLONG; | 5812 | retval = -ENAMETOOLONG; |
| 5813 | if (retval < 0) | ||
| 5766 | goto out_unlock; | 5814 | goto out_unlock; |
| 5767 | } | 5815 | |
| 5816 | seq_puts(m, buf); | ||
| 5768 | } else { | 5817 | } else { |
| 5769 | path = "/"; | 5818 | seq_puts(m, "/"); |
| 5770 | } | 5819 | } |
| 5771 | 5820 | ||
| 5772 | seq_puts(m, path); | ||
| 5773 | |||
| 5774 | if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) | 5821 | if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) |
| 5775 | seq_puts(m, " (deleted)\n"); | 5822 | seq_puts(m, " (deleted)\n"); |
| 5776 | else | 5823 | else |
| @@ -6035,8 +6082,9 @@ static void cgroup_release_agent(struct work_struct *work) | |||
| 6035 | { | 6082 | { |
| 6036 | struct cgroup *cgrp = | 6083 | struct cgroup *cgrp = |
| 6037 | container_of(work, struct cgroup, release_agent_work); | 6084 | container_of(work, struct cgroup, release_agent_work); |
| 6038 | char *pathbuf = NULL, *agentbuf = NULL, *path; | 6085 | char *pathbuf = NULL, *agentbuf = NULL; |
| 6039 | char *argv[3], *envp[3]; | 6086 | char *argv[3], *envp[3]; |
| 6087 | int ret; | ||
| 6040 | 6088 | ||
| 6041 | mutex_lock(&cgroup_mutex); | 6089 | mutex_lock(&cgroup_mutex); |
| 6042 | 6090 | ||
| @@ -6046,13 +6094,13 @@ static void cgroup_release_agent(struct work_struct *work) | |||
| 6046 | goto out; | 6094 | goto out; |
| 6047 | 6095 | ||
| 6048 | spin_lock_irq(&css_set_lock); | 6096 | spin_lock_irq(&css_set_lock); |
| 6049 | path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); | 6097 | ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); |
| 6050 | spin_unlock_irq(&css_set_lock); | 6098 | spin_unlock_irq(&css_set_lock); |
| 6051 | if (!path) | 6099 | if (ret < 0 || ret >= PATH_MAX) |
| 6052 | goto out; | 6100 | goto out; |
| 6053 | 6101 | ||
| 6054 | argv[0] = agentbuf; | 6102 | argv[0] = agentbuf; |
| 6055 | argv[1] = path; | 6103 | argv[1] = pathbuf; |
| 6056 | argv[2] = NULL; | 6104 | argv[2] = NULL; |
| 6057 | 6105 | ||
| 6058 | /* minimal command environment */ | 6106 | /* minimal command environment */ |
| @@ -6270,6 +6318,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) | |||
| 6270 | if (cgroup_sk_alloc_disabled) | 6318 | if (cgroup_sk_alloc_disabled) |
| 6271 | return; | 6319 | return; |
| 6272 | 6320 | ||
| 6321 | /* Socket clone path */ | ||
| 6322 | if (skcd->val) { | ||
| 6323 | cgroup_get(sock_cgroup_ptr(skcd)); | ||
| 6324 | return; | ||
| 6325 | } | ||
| 6326 | |||
| 6273 | rcu_read_lock(); | 6327 | rcu_read_lock(); |
| 6274 | 6328 | ||
| 6275 | while (true) { | 6329 | while (true) { |
| @@ -6295,6 +6349,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) | |||
| 6295 | 6349 | ||
| 6296 | /* cgroup namespaces */ | 6350 | /* cgroup namespaces */ |
| 6297 | 6351 | ||
| 6352 | static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) | ||
| 6353 | { | ||
| 6354 | return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); | ||
| 6355 | } | ||
| 6356 | |||
| 6357 | static void dec_cgroup_namespaces(struct ucounts *ucounts) | ||
| 6358 | { | ||
| 6359 | dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); | ||
| 6360 | } | ||
| 6361 | |||
| 6298 | static struct cgroup_namespace *alloc_cgroup_ns(void) | 6362 | static struct cgroup_namespace *alloc_cgroup_ns(void) |
| 6299 | { | 6363 | { |
| 6300 | struct cgroup_namespace *new_ns; | 6364 | struct cgroup_namespace *new_ns; |
| @@ -6316,6 +6380,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) | |||
| 6316 | void free_cgroup_ns(struct cgroup_namespace *ns) | 6380 | void free_cgroup_ns(struct cgroup_namespace *ns) |
| 6317 | { | 6381 | { |
| 6318 | put_css_set(ns->root_cset); | 6382 | put_css_set(ns->root_cset); |
| 6383 | dec_cgroup_namespaces(ns->ucounts); | ||
| 6319 | put_user_ns(ns->user_ns); | 6384 | put_user_ns(ns->user_ns); |
| 6320 | ns_free_inum(&ns->ns); | 6385 | ns_free_inum(&ns->ns); |
| 6321 | kfree(ns); | 6386 | kfree(ns); |
| @@ -6327,6 +6392,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, | |||
| 6327 | struct cgroup_namespace *old_ns) | 6392 | struct cgroup_namespace *old_ns) |
| 6328 | { | 6393 | { |
| 6329 | struct cgroup_namespace *new_ns; | 6394 | struct cgroup_namespace *new_ns; |
| 6395 | struct ucounts *ucounts; | ||
| 6330 | struct css_set *cset; | 6396 | struct css_set *cset; |
| 6331 | 6397 | ||
| 6332 | BUG_ON(!old_ns); | 6398 | BUG_ON(!old_ns); |
| @@ -6340,6 +6406,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, | |||
| 6340 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | 6406 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) |
| 6341 | return ERR_PTR(-EPERM); | 6407 | return ERR_PTR(-EPERM); |
| 6342 | 6408 | ||
| 6409 | ucounts = inc_cgroup_namespaces(user_ns); | ||
| 6410 | if (!ucounts) | ||
| 6411 | return ERR_PTR(-ENOSPC); | ||
| 6412 | |||
| 6343 | /* It is not safe to take cgroup_mutex here */ | 6413 | /* It is not safe to take cgroup_mutex here */ |
| 6344 | spin_lock_irq(&css_set_lock); | 6414 | spin_lock_irq(&css_set_lock); |
| 6345 | cset = task_css_set(current); | 6415 | cset = task_css_set(current); |
| @@ -6349,10 +6419,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, | |||
| 6349 | new_ns = alloc_cgroup_ns(); | 6419 | new_ns = alloc_cgroup_ns(); |
| 6350 | if (IS_ERR(new_ns)) { | 6420 | if (IS_ERR(new_ns)) { |
| 6351 | put_css_set(cset); | 6421 | put_css_set(cset); |
| 6422 | dec_cgroup_namespaces(ucounts); | ||
| 6352 | return new_ns; | 6423 | return new_ns; |
| 6353 | } | 6424 | } |
| 6354 | 6425 | ||
| 6355 | new_ns->user_ns = get_user_ns(user_ns); | 6426 | new_ns->user_ns = get_user_ns(user_ns); |
| 6427 | new_ns->ucounts = ucounts; | ||
| 6356 | new_ns->root_cset = cset; | 6428 | new_ns->root_cset = cset; |
| 6357 | 6429 | ||
| 6358 | return new_ns; | 6430 | return new_ns; |
| @@ -6403,12 +6475,18 @@ static void cgroupns_put(struct ns_common *ns) | |||
| 6403 | put_cgroup_ns(to_cg_ns(ns)); | 6475 | put_cgroup_ns(to_cg_ns(ns)); |
| 6404 | } | 6476 | } |
| 6405 | 6477 | ||
| 6478 | static struct user_namespace *cgroupns_owner(struct ns_common *ns) | ||
| 6479 | { | ||
| 6480 | return to_cg_ns(ns)->user_ns; | ||
| 6481 | } | ||
| 6482 | |||
| 6406 | const struct proc_ns_operations cgroupns_operations = { | 6483 | const struct proc_ns_operations cgroupns_operations = { |
| 6407 | .name = "cgroup", | 6484 | .name = "cgroup", |
| 6408 | .type = CLONE_NEWCGROUP, | 6485 | .type = CLONE_NEWCGROUP, |
| 6409 | .get = cgroupns_get, | 6486 | .get = cgroupns_get, |
| 6410 | .put = cgroupns_put, | 6487 | .put = cgroupns_put, |
| 6411 | .install = cgroupns_install, | 6488 | .install = cgroupns_install, |
| 6489 | .owner = cgroupns_owner, | ||
| 6412 | }; | 6490 | }; |
| 6413 | 6491 | ||
| 6414 | static __init int cgroup_namespaces_init(void) | 6492 | static __init int cgroup_namespaces_init(void) |
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 9f748ed7bea8..1a8f34f63601 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config | |||
| @@ -11,7 +11,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y | |||
| 11 | CONFIG_ARMV8_DEPRECATED=y | 11 | CONFIG_ARMV8_DEPRECATED=y |
| 12 | CONFIG_ASHMEM=y | 12 | CONFIG_ASHMEM=y |
| 13 | CONFIG_AUDIT=y | 13 | CONFIG_AUDIT=y |
| 14 | CONFIG_BLK_DEV_DM=y | ||
| 15 | CONFIG_BLK_DEV_INITRD=y | 14 | CONFIG_BLK_DEV_INITRD=y |
| 16 | CONFIG_CGROUPS=y | 15 | CONFIG_CGROUPS=y |
| 17 | CONFIG_CGROUP_CPUACCT=y | 16 | CONFIG_CGROUP_CPUACCT=y |
| @@ -19,9 +18,7 @@ CONFIG_CGROUP_DEBUG=y | |||
| 19 | CONFIG_CGROUP_FREEZER=y | 18 | CONFIG_CGROUP_FREEZER=y |
| 20 | CONFIG_CGROUP_SCHED=y | 19 | CONFIG_CGROUP_SCHED=y |
| 21 | CONFIG_CP15_BARRIER_EMULATION=y | 20 | CONFIG_CP15_BARRIER_EMULATION=y |
| 22 | CONFIG_DM_CRYPT=y | 21 | CONFIG_DEFAULT_SECURITY_SELINUX=y |
| 23 | CONFIG_DM_VERITY=y | ||
| 24 | CONFIG_DM_VERITY_FEC=y | ||
| 25 | CONFIG_EMBEDDED=y | 22 | CONFIG_EMBEDDED=y |
| 26 | CONFIG_FB=y | 23 | CONFIG_FB=y |
| 27 | CONFIG_HIGH_RES_TIMERS=y | 24 | CONFIG_HIGH_RES_TIMERS=y |
| @@ -41,7 +38,6 @@ CONFIG_IPV6=y | |||
| 41 | CONFIG_IPV6_MIP6=y | 38 | CONFIG_IPV6_MIP6=y |
| 42 | CONFIG_IPV6_MULTIPLE_TABLES=y | 39 | CONFIG_IPV6_MULTIPLE_TABLES=y |
| 43 | CONFIG_IPV6_OPTIMISTIC_DAD=y | 40 | CONFIG_IPV6_OPTIMISTIC_DAD=y |
| 44 | CONFIG_IPV6_PRIVACY=y | ||
| 45 | CONFIG_IPV6_ROUTER_PREF=y | 41 | CONFIG_IPV6_ROUTER_PREF=y |
| 46 | CONFIG_IPV6_ROUTE_INFO=y | 42 | CONFIG_IPV6_ROUTE_INFO=y |
| 47 | CONFIG_IP_ADVANCED_ROUTER=y | 43 | CONFIG_IP_ADVANCED_ROUTER=y |
| @@ -135,6 +131,7 @@ CONFIG_PREEMPT=y | |||
| 135 | CONFIG_QUOTA=y | 131 | CONFIG_QUOTA=y |
| 136 | CONFIG_RTC_CLASS=y | 132 | CONFIG_RTC_CLASS=y |
| 137 | CONFIG_RT_GROUP_SCHED=y | 133 | CONFIG_RT_GROUP_SCHED=y |
| 134 | CONFIG_SECCOMP=y | ||
| 138 | CONFIG_SECURITY=y | 135 | CONFIG_SECURITY=y |
| 139 | CONFIG_SECURITY_NETWORK=y | 136 | CONFIG_SECURITY_NETWORK=y |
| 140 | CONFIG_SECURITY_SELINUX=y | 137 | CONFIG_SECURITY_SELINUX=y |
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index e3b953e966d2..297756be369c 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config | |||
| @@ -6,12 +6,16 @@ | |||
| 6 | # CONFIG_PM_WAKELOCKS_GC is not set | 6 | # CONFIG_PM_WAKELOCKS_GC is not set |
| 7 | # CONFIG_VT is not set | 7 | # CONFIG_VT is not set |
| 8 | CONFIG_BACKLIGHT_LCD_SUPPORT=y | 8 | CONFIG_BACKLIGHT_LCD_SUPPORT=y |
| 9 | CONFIG_BLK_DEV_DM=y | ||
| 9 | CONFIG_BLK_DEV_LOOP=y | 10 | CONFIG_BLK_DEV_LOOP=y |
| 10 | CONFIG_BLK_DEV_RAM=y | 11 | CONFIG_BLK_DEV_RAM=y |
| 11 | CONFIG_BLK_DEV_RAM_SIZE=8192 | 12 | CONFIG_BLK_DEV_RAM_SIZE=8192 |
| 12 | CONFIG_COMPACTION=y | 13 | CONFIG_COMPACTION=y |
| 13 | CONFIG_DEBUG_RODATA=y | 14 | CONFIG_DEBUG_RODATA=y |
| 15 | CONFIG_DM_CRYPT=y | ||
| 14 | CONFIG_DM_UEVENT=y | 16 | CONFIG_DM_UEVENT=y |
| 17 | CONFIG_DM_VERITY=y | ||
| 18 | CONFIG_DM_VERITY_FEC=y | ||
| 15 | CONFIG_DRAGONRISE_FF=y | 19 | CONFIG_DRAGONRISE_FF=y |
| 16 | CONFIG_ENABLE_DEFAULT_TRACERS=y | 20 | CONFIG_ENABLE_DEFAULT_TRACERS=y |
| 17 | CONFIG_EXT4_FS=y | 21 | CONFIG_EXT4_FS=y |
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config new file mode 100644 index 000000000000..8d9643767142 --- /dev/null +++ b/kernel/configs/kvm_guest.config | |||
| @@ -0,0 +1,32 @@ | |||
| 1 | CONFIG_NET=y | ||
| 2 | CONFIG_NET_CORE=y | ||
| 3 | CONFIG_NETDEVICES=y | ||
| 4 | CONFIG_BLOCK=y | ||
| 5 | CONFIG_BLK_DEV=y | ||
| 6 | CONFIG_NETWORK_FILESYSTEMS=y | ||
| 7 | CONFIG_INET=y | ||
| 8 | CONFIG_TTY=y | ||
| 9 | CONFIG_SERIAL_8250=y | ||
| 10 | CONFIG_SERIAL_8250_CONSOLE=y | ||
| 11 | CONFIG_IP_PNP=y | ||
| 12 | CONFIG_IP_PNP_DHCP=y | ||
| 13 | CONFIG_BINFMT_ELF=y | ||
| 14 | CONFIG_PCI=y | ||
| 15 | CONFIG_PCI_MSI=y | ||
| 16 | CONFIG_DEBUG_KERNEL=y | ||
| 17 | CONFIG_VIRTUALIZATION=y | ||
| 18 | CONFIG_HYPERVISOR_GUEST=y | ||
| 19 | CONFIG_PARAVIRT=y | ||
| 20 | CONFIG_KVM_GUEST=y | ||
| 21 | CONFIG_VIRTIO=y | ||
| 22 | CONFIG_VIRTIO_PCI=y | ||
| 23 | CONFIG_VIRTIO_BLK=y | ||
| 24 | CONFIG_VIRTIO_CONSOLE=y | ||
| 25 | CONFIG_VIRTIO_NET=y | ||
| 26 | CONFIG_9P_FS=y | ||
| 27 | CONFIG_NET_9P=y | ||
| 28 | CONFIG_NET_9P_VIRTIO=y | ||
| 29 | CONFIG_SCSI_LOWLEVEL=y | ||
| 30 | CONFIG_SCSI_VIRTIO=y | ||
| 31 | CONFIG_VIRTIO_INPUT=y | ||
| 32 | CONFIG_DRM_VIRTIO_GPU=y | ||
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config | |||
| @@ -1,4 +1,12 @@ | |||
| 1 | # CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set | ||
| 1 | CONFIG_CC_OPTIMIZE_FOR_SIZE=y | 2 | CONFIG_CC_OPTIMIZE_FOR_SIZE=y |
| 3 | # CONFIG_KERNEL_GZIP is not set | ||
| 4 | # CONFIG_KERNEL_BZIP2 is not set | ||
| 5 | # CONFIG_KERNEL_LZMA is not set | ||
| 2 | CONFIG_KERNEL_XZ=y | 6 | CONFIG_KERNEL_XZ=y |
| 7 | # CONFIG_KERNEL_LZO is not set | ||
| 8 | # CONFIG_KERNEL_LZ4 is not set | ||
| 3 | CONFIG_OPTIMIZE_INLINING=y | 9 | CONFIG_OPTIMIZE_INLINING=y |
| 10 | # CONFIG_SLAB is not set | ||
| 11 | # CONFIG_SLUB is not set | ||
| 4 | CONFIG_SLOB=y | 12 | CONFIG_SLOB=y |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 341bf80f80bd..29de1a9352c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -23,6 +23,8 @@ | |||
| 23 | #include <linux/tick.h> | 23 | #include <linux/tick.h> |
| 24 | #include <linux/irq.h> | 24 | #include <linux/irq.h> |
| 25 | #include <linux/smpboot.h> | 25 | #include <linux/smpboot.h> |
| 26 | #include <linux/relay.h> | ||
| 27 | #include <linux/slab.h> | ||
| 26 | 28 | ||
| 27 | #include <trace/events/power.h> | 29 | #include <trace/events/power.h> |
| 28 | #define CREATE_TRACE_POINTS | 30 | #define CREATE_TRACE_POINTS |
| @@ -37,8 +39,9 @@ | |||
| 37 | * @thread: Pointer to the hotplug thread | 39 | * @thread: Pointer to the hotplug thread |
| 38 | * @should_run: Thread should execute | 40 | * @should_run: Thread should execute |
| 39 | * @rollback: Perform a rollback | 41 | * @rollback: Perform a rollback |
| 40 | * @cb_stat: The state for a single callback (install/uninstall) | 42 | * @single: Single callback invocation |
| 41 | * @cb: Single callback function (install/uninstall) | 43 | * @bringup: Single callback bringup or teardown selector |
| 44 | * @cb_state: The state for a single callback (install/uninstall) | ||
| 42 | * @result: Result of the operation | 45 | * @result: Result of the operation |
| 43 | * @done: Signal completion to the issuer of the task | 46 | * @done: Signal completion to the issuer of the task |
| 44 | */ | 47 | */ |
| @@ -49,8 +52,10 @@ struct cpuhp_cpu_state { | |||
| 49 | struct task_struct *thread; | 52 | struct task_struct *thread; |
| 50 | bool should_run; | 53 | bool should_run; |
| 51 | bool rollback; | 54 | bool rollback; |
| 55 | bool single; | ||
| 56 | bool bringup; | ||
| 57 | struct hlist_node *node; | ||
| 52 | enum cpuhp_state cb_state; | 58 | enum cpuhp_state cb_state; |
| 53 | int (*cb)(unsigned int cpu); | ||
| 54 | int result; | 59 | int result; |
| 55 | struct completion done; | 60 | struct completion done; |
| 56 | #endif | 61 | #endif |
| @@ -68,35 +73,103 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); | |||
| 68 | * @cant_stop: Bringup/teardown can't be stopped at this step | 73 | * @cant_stop: Bringup/teardown can't be stopped at this step |
| 69 | */ | 74 | */ |
| 70 | struct cpuhp_step { | 75 | struct cpuhp_step { |
| 71 | const char *name; | 76 | const char *name; |
| 72 | int (*startup)(unsigned int cpu); | 77 | union { |
| 73 | int (*teardown)(unsigned int cpu); | 78 | int (*single)(unsigned int cpu); |
| 74 | bool skip_onerr; | 79 | int (*multi)(unsigned int cpu, |
| 75 | bool cant_stop; | 80 | struct hlist_node *node); |
| 81 | } startup; | ||
| 82 | union { | ||
| 83 | int (*single)(unsigned int cpu); | ||
| 84 | int (*multi)(unsigned int cpu, | ||
| 85 | struct hlist_node *node); | ||
| 86 | } teardown; | ||
| 87 | struct hlist_head list; | ||
| 88 | bool skip_onerr; | ||
| 89 | bool cant_stop; | ||
| 90 | bool multi_instance; | ||
| 76 | }; | 91 | }; |
| 77 | 92 | ||
| 78 | static DEFINE_MUTEX(cpuhp_state_mutex); | 93 | static DEFINE_MUTEX(cpuhp_state_mutex); |
| 79 | static struct cpuhp_step cpuhp_bp_states[]; | 94 | static struct cpuhp_step cpuhp_bp_states[]; |
| 80 | static struct cpuhp_step cpuhp_ap_states[]; | 95 | static struct cpuhp_step cpuhp_ap_states[]; |
| 81 | 96 | ||
| 97 | static bool cpuhp_is_ap_state(enum cpuhp_state state) | ||
| 98 | { | ||
| 99 | /* | ||
| 100 | * The extra check for CPUHP_TEARDOWN_CPU is only for documentation | ||
| 101 | * purposes as that state is handled explicitly in cpu_down. | ||
| 102 | */ | ||
| 103 | return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; | ||
| 104 | } | ||
| 105 | |||
| 106 | static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) | ||
| 107 | { | ||
| 108 | struct cpuhp_step *sp; | ||
| 109 | |||
| 110 | sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; | ||
| 111 | return sp + state; | ||
| 112 | } | ||
| 113 | |||
| 82 | /** | 114 | /** |
| 83 | * cpuhp_invoke_callback _ Invoke the callbacks for a given state | 115 | * cpuhp_invoke_callback _ Invoke the callbacks for a given state |
| 84 | * @cpu: The cpu for which the callback should be invoked | 116 | * @cpu: The cpu for which the callback should be invoked |
| 85 | * @step: The step in the state machine | 117 | * @step: The step in the state machine |
| 86 | * @cb: The callback function to invoke | 118 | * @bringup: True if the bringup callback should be invoked |
| 87 | * | 119 | * |
| 88 | * Called from cpu hotplug and from the state register machinery | 120 | * Called from cpu hotplug and from the state register machinery. |
| 89 | */ | 121 | */ |
| 90 | static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, | 122 | static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, |
| 91 | int (*cb)(unsigned int)) | 123 | bool bringup, struct hlist_node *node) |
| 92 | { | 124 | { |
| 93 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 125 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| 94 | int ret = 0; | 126 | struct cpuhp_step *step = cpuhp_get_step(state); |
| 95 | 127 | int (*cbm)(unsigned int cpu, struct hlist_node *node); | |
| 96 | if (cb) { | 128 | int (*cb)(unsigned int cpu); |
| 97 | trace_cpuhp_enter(cpu, st->target, step, cb); | 129 | int ret, cnt; |
| 130 | |||
| 131 | if (!step->multi_instance) { | ||
| 132 | cb = bringup ? step->startup.single : step->teardown.single; | ||
| 133 | if (!cb) | ||
| 134 | return 0; | ||
| 135 | trace_cpuhp_enter(cpu, st->target, state, cb); | ||
| 98 | ret = cb(cpu); | 136 | ret = cb(cpu); |
| 99 | trace_cpuhp_exit(cpu, st->state, step, ret); | 137 | trace_cpuhp_exit(cpu, st->state, state, ret); |
| 138 | return ret; | ||
| 139 | } | ||
| 140 | cbm = bringup ? step->startup.multi : step->teardown.multi; | ||
| 141 | if (!cbm) | ||
| 142 | return 0; | ||
| 143 | |||
| 144 | /* Single invocation for instance add/remove */ | ||
| 145 | if (node) { | ||
| 146 | trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); | ||
| 147 | ret = cbm(cpu, node); | ||
| 148 | trace_cpuhp_exit(cpu, st->state, state, ret); | ||
| 149 | return ret; | ||
| 150 | } | ||
| 151 | |||
| 152 | /* State transition. Invoke on all instances */ | ||
| 153 | cnt = 0; | ||
| 154 | hlist_for_each(node, &step->list) { | ||
| 155 | trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); | ||
| 156 | ret = cbm(cpu, node); | ||
| 157 | trace_cpuhp_exit(cpu, st->state, state, ret); | ||
| 158 | if (ret) | ||
| 159 | goto err; | ||
| 160 | cnt++; | ||
| 161 | } | ||
| 162 | return 0; | ||
| 163 | err: | ||
| 164 | /* Rollback the instances if one failed */ | ||
| 165 | cbm = !bringup ? step->startup.multi : step->teardown.multi; | ||
| 166 | if (!cbm) | ||
| 167 | return ret; | ||
| 168 | |||
| 169 | hlist_for_each(node, &step->list) { | ||
| 170 | if (!cnt--) | ||
| 171 | break; | ||
| 172 | cbm(cpu, node); | ||
| 100 | } | 173 | } |
| 101 | return ret; | 174 | return ret; |
| 102 | } | 175 | } |
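The startup/teardown unions plus the per-state hlist are what back multi-instance hotplug states: a subsystem registers the state once, then attaches one hlist_node per device instance, and cpuhp_invoke_callback() above iterates (and rolls back) those instances on every state transition. A sketch under the assumption that the cpuhp_setup_state_multi()/cpuhp_state_add_instance() helpers accompany this rework; all my_* names are invented.

#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/printk.h>

struct my_dev {
	struct hlist_node node;		/* links the instance into the state */
	/* per-device data would live here */
};

static enum cpuhp_state my_online_state;

static int my_dev_online(unsigned int cpu, struct hlist_node *node)
{
	struct my_dev *dev = container_of(node, struct my_dev, node);

	pr_debug("instance %p: cpu %u came online\n", dev, cpu);
	return 0;
}

static int my_dev_prepare_down(unsigned int cpu, struct hlist_node *node)
{
	/* undo my_dev_online() for this cpu/instance pair */
	return 0;
}

static int __init my_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "mydrv:online",
				      my_dev_online, my_dev_prepare_down);
	if (ret < 0)
		return ret;
	my_online_state = ret;	/* dynamic states return the allocated id */
	return 0;
}

static int my_dev_register(struct my_dev *dev)
{
	/* runs my_dev_online() on every online CPU for this instance */
	return cpuhp_state_add_instance(my_online_state, &dev->node);
}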
| @@ -155,7 +228,7 @@ static struct { | |||
| 155 | .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), | 228 | .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), |
| 156 | .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), | 229 | .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), |
| 157 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 230 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 158 | .dep_map = {.name = "cpu_hotplug.lock" }, | 231 | .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), |
| 159 | #endif | 232 | #endif |
| 160 | }; | 233 | }; |
| 161 | 234 | ||
| @@ -260,10 +333,17 @@ void cpu_hotplug_disable(void) | |||
| 260 | } | 333 | } |
| 261 | EXPORT_SYMBOL_GPL(cpu_hotplug_disable); | 334 | EXPORT_SYMBOL_GPL(cpu_hotplug_disable); |
| 262 | 335 | ||
| 336 | static void __cpu_hotplug_enable(void) | ||
| 337 | { | ||
| 338 | if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) | ||
| 339 | return; | ||
| 340 | cpu_hotplug_disabled--; | ||
| 341 | } | ||
| 342 | |||
| 263 | void cpu_hotplug_enable(void) | 343 | void cpu_hotplug_enable(void) |
| 264 | { | 344 | { |
| 265 | cpu_maps_update_begin(); | 345 | cpu_maps_update_begin(); |
| 266 | WARN_ON(--cpu_hotplug_disabled < 0); | 346 | __cpu_hotplug_enable(); |
| 267 | cpu_maps_update_done(); | 347 | cpu_maps_update_done(); |
| 268 | } | 348 | } |
| 269 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); | 349 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); |
| @@ -330,12 +410,6 @@ static int notify_online(unsigned int cpu) | |||
| 330 | return 0; | 410 | return 0; |
| 331 | } | 411 | } |
| 332 | 412 | ||
| 333 | static int notify_starting(unsigned int cpu) | ||
| 334 | { | ||
| 335 | cpu_notify(CPU_STARTING, cpu); | ||
| 336 | return 0; | ||
| 337 | } | ||
| 338 | |||
| 339 | static int bringup_wait_for_ap(unsigned int cpu) | 413 | static int bringup_wait_for_ap(unsigned int cpu) |
| 340 | { | 414 | { |
| 341 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 415 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| @@ -349,8 +423,16 @@ static int bringup_cpu(unsigned int cpu) | |||
| 349 | struct task_struct *idle = idle_thread_get(cpu); | 423 | struct task_struct *idle = idle_thread_get(cpu); |
| 350 | int ret; | 424 | int ret; |
| 351 | 425 | ||
| 426 | /* | ||
| 427 | * Some architectures have to walk the irq descriptors to | ||
| 428 | * setup the vector space for the cpu which comes online. | ||
| 429 | * Prevent irq alloc/free across the bringup. | ||
| 430 | */ | ||
| 431 | irq_lock_sparse(); | ||
| 432 | |||
| 352 | /* Arch-specific enabling code. */ | 433 | /* Arch-specific enabling code. */ |
| 353 | ret = __cpu_up(cpu, idle); | 434 | ret = __cpu_up(cpu, idle); |
| 435 | irq_unlock_sparse(); | ||
| 354 | if (ret) { | 436 | if (ret) { |
| 355 | cpu_notify(CPU_UP_CANCELED, cpu); | 437 | cpu_notify(CPU_UP_CANCELED, cpu); |
| 356 | return ret; | 438 | return ret; |
| @@ -363,62 +445,55 @@ static int bringup_cpu(unsigned int cpu) | |||
| 363 | /* | 445 | /* |
| 364 | * Hotplug state machine related functions | 446 | * Hotplug state machine related functions |
| 365 | */ | 447 | */ |
| 366 | static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, | 448 | static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) |
| 367 | struct cpuhp_step *steps) | ||
| 368 | { | 449 | { |
| 369 | for (st->state++; st->state < st->target; st->state++) { | 450 | for (st->state++; st->state < st->target; st->state++) { |
| 370 | struct cpuhp_step *step = steps + st->state; | 451 | struct cpuhp_step *step = cpuhp_get_step(st->state); |
| 371 | 452 | ||
| 372 | if (!step->skip_onerr) | 453 | if (!step->skip_onerr) |
| 373 | cpuhp_invoke_callback(cpu, st->state, step->startup); | 454 | cpuhp_invoke_callback(cpu, st->state, true, NULL); |
| 374 | } | 455 | } |
| 375 | } | 456 | } |
| 376 | 457 | ||
| 377 | static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, | 458 | static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, |
| 378 | struct cpuhp_step *steps, enum cpuhp_state target) | 459 | enum cpuhp_state target) |
| 379 | { | 460 | { |
| 380 | enum cpuhp_state prev_state = st->state; | 461 | enum cpuhp_state prev_state = st->state; |
| 381 | int ret = 0; | 462 | int ret = 0; |
| 382 | 463 | ||
| 383 | for (; st->state > target; st->state--) { | 464 | for (; st->state > target; st->state--) { |
| 384 | struct cpuhp_step *step = steps + st->state; | 465 | ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); |
| 385 | |||
| 386 | ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); | ||
| 387 | if (ret) { | 466 | if (ret) { |
| 388 | st->target = prev_state; | 467 | st->target = prev_state; |
| 389 | undo_cpu_down(cpu, st, steps); | 468 | undo_cpu_down(cpu, st); |
| 390 | break; | 469 | break; |
| 391 | } | 470 | } |
| 392 | } | 471 | } |
| 393 | return ret; | 472 | return ret; |
| 394 | } | 473 | } |
| 395 | 474 | ||
| 396 | static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, | 475 | static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) |
| 397 | struct cpuhp_step *steps) | ||
| 398 | { | 476 | { |
| 399 | for (st->state--; st->state > st->target; st->state--) { | 477 | for (st->state--; st->state > st->target; st->state--) { |
| 400 | struct cpuhp_step *step = steps + st->state; | 478 | struct cpuhp_step *step = cpuhp_get_step(st->state); |
| 401 | 479 | ||
| 402 | if (!step->skip_onerr) | 480 | if (!step->skip_onerr) |
| 403 | cpuhp_invoke_callback(cpu, st->state, step->teardown); | 481 | cpuhp_invoke_callback(cpu, st->state, false, NULL); |
| 404 | } | 482 | } |
| 405 | } | 483 | } |
| 406 | 484 | ||
| 407 | static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, | 485 | static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, |
| 408 | struct cpuhp_step *steps, enum cpuhp_state target) | 486 | enum cpuhp_state target) |
| 409 | { | 487 | { |
| 410 | enum cpuhp_state prev_state = st->state; | 488 | enum cpuhp_state prev_state = st->state; |
| 411 | int ret = 0; | 489 | int ret = 0; |
| 412 | 490 | ||
| 413 | while (st->state < target) { | 491 | while (st->state < target) { |
| 414 | struct cpuhp_step *step; | ||
| 415 | |||
| 416 | st->state++; | 492 | st->state++; |
| 417 | step = steps + st->state; | 493 | ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); |
| 418 | ret = cpuhp_invoke_callback(cpu, st->state, step->startup); | ||
| 419 | if (ret) { | 494 | if (ret) { |
| 420 | st->target = prev_state; | 495 | st->target = prev_state; |
| 421 | undo_cpu_up(cpu, st, steps); | 496 | undo_cpu_up(cpu, st); |
| 422 | break; | 497 | break; |
| 423 | } | 498 | } |
| 424 | } | 499 | } |
| @@ -447,13 +522,13 @@ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) | |||
| 447 | { | 522 | { |
| 448 | enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); | 523 | enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); |
| 449 | 524 | ||
| 450 | return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); | 525 | return cpuhp_down_callbacks(cpu, st, target); |
| 451 | } | 526 | } |
| 452 | 527 | ||
| 453 | /* Execute the online startup callbacks. Used to be CPU_ONLINE */ | 528 | /* Execute the online startup callbacks. Used to be CPU_ONLINE */ |
| 454 | static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) | 529 | static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) |
| 455 | { | 530 | { |
| 456 | return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); | 531 | return cpuhp_up_callbacks(cpu, st, st->target); |
| 457 | } | 532 | } |
| 458 | 533 | ||
| 459 | /* | 534 | /* |
| @@ -476,18 +551,20 @@ static void cpuhp_thread_fun(unsigned int cpu) | |||
| 476 | st->should_run = false; | 551 | st->should_run = false; |
| 477 | 552 | ||
| 478 | /* Single callback invocation for [un]install ? */ | 553 | /* Single callback invocation for [un]install ? */ |
| 479 | if (st->cb) { | 554 | if (st->single) { |
| 480 | if (st->cb_state < CPUHP_AP_ONLINE) { | 555 | if (st->cb_state < CPUHP_AP_ONLINE) { |
| 481 | local_irq_disable(); | 556 | local_irq_disable(); |
| 482 | ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); | 557 | ret = cpuhp_invoke_callback(cpu, st->cb_state, |
| 558 | st->bringup, st->node); | ||
| 483 | local_irq_enable(); | 559 | local_irq_enable(); |
| 484 | } else { | 560 | } else { |
| 485 | ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); | 561 | ret = cpuhp_invoke_callback(cpu, st->cb_state, |
| 562 | st->bringup, st->node); | ||
| 486 | } | 563 | } |
| 487 | } else if (st->rollback) { | 564 | } else if (st->rollback) { |
| 488 | BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); | 565 | BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); |
| 489 | 566 | ||
| 490 | undo_cpu_down(cpu, st, cpuhp_ap_states); | 567 | undo_cpu_down(cpu, st); |
| 491 | /* | 568 | /* |
| 492 | * This is a momentary workaround to keep the notifier users | 569 | * This is a momentary workaround to keep the notifier users |
| 493 | * happy. Will go away once we got rid of the notifiers. | 570 | * happy. Will go away once we got rid of the notifiers. |
| @@ -509,8 +586,9 @@ static void cpuhp_thread_fun(unsigned int cpu) | |||
| 509 | } | 586 | } |
| 510 | 587 | ||
| 511 | /* Invoke a single callback on a remote cpu */ | 588 | /* Invoke a single callback on a remote cpu */ |
| 512 | static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, | 589 | static int |
| 513 | int (*cb)(unsigned int)) | 590 | cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, |
| 591 | struct hlist_node *node) | ||
| 514 | { | 592 | { |
| 515 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 593 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| 516 | 594 | ||
| @@ -522,10 +600,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, | |||
| 522 | * we invoke the thread function directly. | 600 | * we invoke the thread function directly. |
| 523 | */ | 601 | */ |
| 524 | if (!st->thread) | 602 | if (!st->thread) |
| 525 | return cpuhp_invoke_callback(cpu, state, cb); | 603 | return cpuhp_invoke_callback(cpu, state, bringup, node); |
| 526 | 604 | ||
| 527 | st->cb_state = state; | 605 | st->cb_state = state; |
| 528 | st->cb = cb; | 606 | st->single = true; |
| 607 | st->bringup = bringup; | ||
| 608 | st->node = node; | ||
| 609 | |||
| 529 | /* | 610 | /* |
| 530 | * Make sure the above stores are visible before should_run becomes | 611 | * Make sure the above stores are visible before should_run becomes |
| 531 | * true. Paired with the mb() above in cpuhp_thread_fun() | 612 | * true. Paired with the mb() above in cpuhp_thread_fun() |
| @@ -541,7 +622,7 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, | |||
| 541 | static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) | 622 | static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) |
| 542 | { | 623 | { |
| 543 | st->result = 0; | 624 | st->result = 0; |
| 544 | st->cb = NULL; | 625 | st->single = false; |
| 545 | /* | 626 | /* |
| 546 | * Make sure the above stores are visible before should_run becomes | 627 | * Make sure the above stores are visible before should_run becomes |
| 547 | * true. Paired with the mb() above in cpuhp_thread_fun() | 628 | * true. Paired with the mb() above in cpuhp_thread_fun() |
| @@ -674,12 +755,6 @@ static int notify_down_prepare(unsigned int cpu) | |||
| 674 | return err; | 755 | return err; |
| 675 | } | 756 | } |
| 676 | 757 | ||
| 677 | static int notify_dying(unsigned int cpu) | ||
| 678 | { | ||
| 679 | cpu_notify(CPU_DYING, cpu); | ||
| 680 | return 0; | ||
| 681 | } | ||
| 682 | |||
| 683 | /* Take this CPU down. */ | 758 | /* Take this CPU down. */ |
| 684 | static int take_cpu_down(void *_param) | 759 | static int take_cpu_down(void *_param) |
| 685 | { | 760 | { |
| @@ -692,12 +767,16 @@ static int take_cpu_down(void *_param) | |||
| 692 | if (err < 0) | 767 | if (err < 0) |
| 693 | return err; | 768 | return err; |
| 694 | 769 | ||
| 770 | /* | ||
| 771 | * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not | ||
| 772 | * do this step again. | ||
| 773 | */ | ||
| 774 | WARN_ON(st->state != CPUHP_TEARDOWN_CPU); | ||
| 775 | st->state--; | ||
| 695 | /* Invoke the former CPU_DYING callbacks */ | 776 | /* Invoke the former CPU_DYING callbacks */ |
| 696 | for (; st->state > target; st->state--) { | 777 | for (; st->state > target; st->state--) |
| 697 | struct cpuhp_step *step = cpuhp_ap_states + st->state; | 778 | cpuhp_invoke_callback(cpu, st->state, false, NULL); |
| 698 | 779 | ||
| 699 | cpuhp_invoke_callback(cpu, st->state, step->teardown); | ||
| 700 | } | ||
| 701 | /* Give up timekeeping duties */ | 780 | /* Give up timekeeping duties */ |
| 702 | tick_handover_do_timer(); | 781 | tick_handover_do_timer(); |
| 703 | /* Park the stopper thread */ | 782 | /* Park the stopper thread */ |
| @@ -734,7 +813,7 @@ static int takedown_cpu(unsigned int cpu) | |||
| 734 | BUG_ON(cpu_online(cpu)); | 813 | BUG_ON(cpu_online(cpu)); |
| 735 | 814 | ||
| 736 | /* | 815 | /* |
| 737 | * The migration_call() CPU_DYING callback will have removed all | 816 | * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all |
| 738 | * runnable tasks from the cpu, there's only the idle task left now | 817 | * runnable tasks from the cpu, there's only the idle task left now |
| 739 | * that the migration thread is done doing the stop_machine thing. | 818 | * that the migration thread is done doing the stop_machine thing. |
| 740 | * | 819 | * |
| @@ -787,7 +866,6 @@ void cpuhp_report_idle_dead(void) | |||
| 787 | #define notify_down_prepare NULL | 866 | #define notify_down_prepare NULL |
| 788 | #define takedown_cpu NULL | 867 | #define takedown_cpu NULL |
| 789 | #define notify_dead NULL | 868 | #define notify_dead NULL |
| 790 | #define notify_dying NULL | ||
| 791 | #endif | 869 | #endif |
| 792 | 870 | ||
| 793 | #ifdef CONFIG_HOTPLUG_CPU | 871 | #ifdef CONFIG_HOTPLUG_CPU |
| @@ -836,7 +914,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, | |||
| 836 | * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need | 914 | * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need |
| 837 | * to do the further cleanups. | 915 | * to do the further cleanups. |
| 838 | */ | 916 | */ |
| 839 | ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); | 917 | ret = cpuhp_down_callbacks(cpu, st, target); |
| 840 | if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { | 918 | if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { |
| 841 | st->target = prev_state; | 919 | st->target = prev_state; |
| 842 | st->rollback = true; | 920 | st->rollback = true; |
| @@ -877,10 +955,9 @@ EXPORT_SYMBOL(cpu_down); | |||
| 877 | #endif /*CONFIG_HOTPLUG_CPU*/ | 955 | #endif /*CONFIG_HOTPLUG_CPU*/ |
| 878 | 956 | ||
| 879 | /** | 957 | /** |
| 880 | * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers | 958 | * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU |
| 881 | * @cpu: cpu that just started | 959 | * @cpu: cpu that just started |
| 882 | * | 960 | * |
| 883 | * This function calls the cpu_chain notifiers with CPU_STARTING. | ||
| 884 | * It must be called by the arch code on the new cpu, before the new cpu | 961 | * It must be called by the arch code on the new cpu, before the new cpu |
| 885 | * enables interrupts and before the "boot" cpu returns from __cpu_up(). | 962 | * enables interrupts and before the "boot" cpu returns from __cpu_up(). |
| 886 | */ | 963 | */ |
| @@ -889,12 +966,10 @@ void notify_cpu_starting(unsigned int cpu) | |||
| 889 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 966 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| 890 | enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); | 967 | enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); |
| 891 | 968 | ||
| 969 | rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ | ||
| 892 | while (st->state < target) { | 970 | while (st->state < target) { |
| 893 | struct cpuhp_step *step; | ||
| 894 | |||
| 895 | st->state++; | 971 | st->state++; |
| 896 | step = cpuhp_ap_states + st->state; | 972 | cpuhp_invoke_callback(cpu, st->state, true, NULL); |
| 897 | cpuhp_invoke_callback(cpu, st->state, step->startup); | ||
| 898 | } | 973 | } |
| 899 | } | 974 | } |
| 900 | 975 | ||
| @@ -979,7 +1054,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) | |||
| 979 | * responsible for bringing it up to the target state. | 1054 | * responsible for bringing it up to the target state. |
| 980 | */ | 1055 | */ |
| 981 | target = min((int)target, CPUHP_BRINGUP_CPU); | 1056 | target = min((int)target, CPUHP_BRINGUP_CPU); |
| 982 | ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); | 1057 | ret = cpuhp_up_callbacks(cpu, st, target); |
| 983 | out: | 1058 | out: |
| 984 | cpu_hotplug_done(); | 1059 | cpu_hotplug_done(); |
| 985 | return ret; | 1060 | return ret; |
| @@ -1024,12 +1099,13 @@ EXPORT_SYMBOL_GPL(cpu_up); | |||
| 1024 | #ifdef CONFIG_PM_SLEEP_SMP | 1099 | #ifdef CONFIG_PM_SLEEP_SMP |
| 1025 | static cpumask_var_t frozen_cpus; | 1100 | static cpumask_var_t frozen_cpus; |
| 1026 | 1101 | ||
| 1027 | int disable_nonboot_cpus(void) | 1102 | int freeze_secondary_cpus(int primary) |
| 1028 | { | 1103 | { |
| 1029 | int cpu, first_cpu, error = 0; | 1104 | int cpu, error = 0; |
| 1030 | 1105 | ||
| 1031 | cpu_maps_update_begin(); | 1106 | cpu_maps_update_begin(); |
| 1032 | first_cpu = cpumask_first(cpu_online_mask); | 1107 | if (!cpu_online(primary)) |
| 1108 | primary = cpumask_first(cpu_online_mask); | ||
| 1033 | /* | 1109 | /* |
| 1034 | * We take down all of the non-boot CPUs in one shot to avoid races | 1110 | * We take down all of the non-boot CPUs in one shot to avoid races |
| 1035 | * with the userspace trying to use the CPU hotplug at the same time | 1111 | * with the userspace trying to use the CPU hotplug at the same time |
| @@ -1038,7 +1114,7 @@ int disable_nonboot_cpus(void) | |||
| 1038 | 1114 | ||
| 1039 | pr_info("Disabling non-boot CPUs ...\n"); | 1115 | pr_info("Disabling non-boot CPUs ...\n"); |
| 1040 | for_each_online_cpu(cpu) { | 1116 | for_each_online_cpu(cpu) { |
| 1041 | if (cpu == first_cpu) | 1117 | if (cpu == primary) |
| 1042 | continue; | 1118 | continue; |
| 1043 | trace_suspend_resume(TPS("CPU_OFF"), cpu, true); | 1119 | trace_suspend_resume(TPS("CPU_OFF"), cpu, true); |
| 1044 | error = _cpu_down(cpu, 1, CPUHP_OFFLINE); | 1120 | error = _cpu_down(cpu, 1, CPUHP_OFFLINE); |
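
The rename lets the suspend path nominate which CPU survives; if the requested primary is already offline it falls back to the first online CPU, which matches the old behaviour. Presumably the old entry point is kept as a thin wrapper along these lines (a sketch, not the actual header change):

/* Hedged sketch: disable_nonboot_cpus() keeps CPU 0, i.e. the old
 * "first online CPU stays up" behaviour. */
static inline int disable_nonboot_cpus(void)
{
        return freeze_secondary_cpus(0);
}
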
| @@ -1081,7 +1157,7 @@ void enable_nonboot_cpus(void) | |||
| 1081 | 1157 | ||
| 1082 | /* Allow everyone to use the CPU hotplug again */ | 1158 | /* Allow everyone to use the CPU hotplug again */ |
| 1083 | cpu_maps_update_begin(); | 1159 | cpu_maps_update_begin(); |
| 1084 | WARN_ON(--cpu_hotplug_disabled < 0); | 1160 | __cpu_hotplug_enable(); |
| 1085 | if (cpumask_empty(frozen_cpus)) | 1161 | if (cpumask_empty(frozen_cpus)) |
| 1086 | goto out; | 1162 | goto out; |
| 1087 | 1163 | ||
| @@ -1170,40 +1246,50 @@ core_initcall(cpu_hotplug_pm_sync_init); | |||
| 1170 | static struct cpuhp_step cpuhp_bp_states[] = { | 1246 | static struct cpuhp_step cpuhp_bp_states[] = { |
| 1171 | [CPUHP_OFFLINE] = { | 1247 | [CPUHP_OFFLINE] = { |
| 1172 | .name = "offline", | 1248 | .name = "offline", |
| 1173 | .startup = NULL, | 1249 | .startup.single = NULL, |
| 1174 | .teardown = NULL, | 1250 | .teardown.single = NULL, |
| 1175 | }, | 1251 | }, |
| 1176 | #ifdef CONFIG_SMP | 1252 | #ifdef CONFIG_SMP |
| 1177 | [CPUHP_CREATE_THREADS]= { | 1253 | [CPUHP_CREATE_THREADS]= { |
| 1178 | .name = "threads:create", | 1254 | .name = "threads:prepare", |
| 1179 | .startup = smpboot_create_threads, | 1255 | .startup.single = smpboot_create_threads, |
| 1180 | .teardown = NULL, | 1256 | .teardown.single = NULL, |
| 1181 | .cant_stop = true, | 1257 | .cant_stop = true, |
| 1182 | }, | 1258 | }, |
| 1183 | [CPUHP_PERF_PREPARE] = { | 1259 | [CPUHP_PERF_PREPARE] = { |
| 1184 | .name = "perf prepare", | 1260 | .name = "perf:prepare", |
| 1185 | .startup = perf_event_init_cpu, | 1261 | .startup.single = perf_event_init_cpu, |
| 1186 | .teardown = perf_event_exit_cpu, | 1262 | .teardown.single = perf_event_exit_cpu, |
| 1187 | }, | 1263 | }, |
| 1188 | [CPUHP_WORKQUEUE_PREP] = { | 1264 | [CPUHP_WORKQUEUE_PREP] = { |
| 1189 | .name = "workqueue prepare", | 1265 | .name = "workqueue:prepare", |
| 1190 | .startup = workqueue_prepare_cpu, | 1266 | .startup.single = workqueue_prepare_cpu, |
| 1191 | .teardown = NULL, | 1267 | .teardown.single = NULL, |
| 1192 | }, | 1268 | }, |
| 1193 | [CPUHP_HRTIMERS_PREPARE] = { | 1269 | [CPUHP_HRTIMERS_PREPARE] = { |
| 1194 | .name = "hrtimers prepare", | 1270 | .name = "hrtimers:prepare", |
| 1195 | .startup = hrtimers_prepare_cpu, | 1271 | .startup.single = hrtimers_prepare_cpu, |
| 1196 | .teardown = hrtimers_dead_cpu, | 1272 | .teardown.single = hrtimers_dead_cpu, |
| 1197 | }, | 1273 | }, |
| 1198 | [CPUHP_SMPCFD_PREPARE] = { | 1274 | [CPUHP_SMPCFD_PREPARE] = { |
| 1199 | .name = "SMPCFD prepare", | 1275 | .name = "smpcfd:prepare", |
| 1200 | .startup = smpcfd_prepare_cpu, | 1276 | .startup.single = smpcfd_prepare_cpu, |
| 1201 | .teardown = smpcfd_dead_cpu, | 1277 | .teardown.single = smpcfd_dead_cpu, |
| 1278 | }, | ||
| 1279 | [CPUHP_RELAY_PREPARE] = { | ||
| 1280 | .name = "relay:prepare", | ||
| 1281 | .startup.single = relay_prepare_cpu, | ||
| 1282 | .teardown.single = NULL, | ||
| 1283 | }, | ||
| 1284 | [CPUHP_SLAB_PREPARE] = { | ||
| 1285 | .name = "slab:prepare", | ||
| 1286 | .startup.single = slab_prepare_cpu, | ||
| 1287 | .teardown.single = slab_dead_cpu, | ||
| 1202 | }, | 1288 | }, |
| 1203 | [CPUHP_RCUTREE_PREP] = { | 1289 | [CPUHP_RCUTREE_PREP] = { |
| 1204 | .name = "RCU-tree prepare", | 1290 | .name = "RCU/tree:prepare", |
| 1205 | .startup = rcutree_prepare_cpu, | 1291 | .startup.single = rcutree_prepare_cpu, |
| 1206 | .teardown = rcutree_dead_cpu, | 1292 | .teardown.single = rcutree_dead_cpu, |
| 1207 | }, | 1293 | }, |
| 1208 | /* | 1294 | /* |
| 1209 | * Preparatory and dead notifiers. Will be replaced once the notifiers | 1295 | * Preparatory and dead notifiers. Will be replaced once the notifiers |
| @@ -1211,8 +1297,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
| 1211 | */ | 1297 | */ |
| 1212 | [CPUHP_NOTIFY_PREPARE] = { | 1298 | [CPUHP_NOTIFY_PREPARE] = { |
| 1213 | .name = "notify:prepare", | 1299 | .name = "notify:prepare", |
| 1214 | .startup = notify_prepare, | 1300 | .startup.single = notify_prepare, |
| 1215 | .teardown = notify_dead, | 1301 | .teardown.single = notify_dead, |
| 1216 | .skip_onerr = true, | 1302 | .skip_onerr = true, |
| 1217 | .cant_stop = true, | 1303 | .cant_stop = true, |
| 1218 | }, | 1304 | }, |
| @@ -1222,20 +1308,21 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
| 1222 | * otherwise a RCU stall occurs. | 1308 | * otherwise a RCU stall occurs. |
| 1223 | */ | 1309 | */ |
| 1224 | [CPUHP_TIMERS_DEAD] = { | 1310 | [CPUHP_TIMERS_DEAD] = { |
| 1225 | .name = "timers dead", | 1311 | .name = "timers:dead", |
| 1226 | .startup = NULL, | 1312 | .startup.single = NULL, |
| 1227 | .teardown = timers_dead_cpu, | 1313 | .teardown.single = timers_dead_cpu, |
| 1228 | }, | 1314 | }, |
| 1229 | /* Kicks the plugged cpu into life */ | 1315 | /* Kicks the plugged cpu into life */ |
| 1230 | [CPUHP_BRINGUP_CPU] = { | 1316 | [CPUHP_BRINGUP_CPU] = { |
| 1231 | .name = "cpu:bringup", | 1317 | .name = "cpu:bringup", |
| 1232 | .startup = bringup_cpu, | 1318 | .startup.single = bringup_cpu, |
| 1233 | .teardown = NULL, | 1319 | .teardown.single = NULL, |
| 1234 | .cant_stop = true, | 1320 | .cant_stop = true, |
| 1235 | }, | 1321 | }, |
| 1236 | [CPUHP_AP_SMPCFD_DYING] = { | 1322 | [CPUHP_AP_SMPCFD_DYING] = { |
| 1237 | .startup = NULL, | 1323 | .name = "smpcfd:dying", |
| 1238 | .teardown = smpcfd_dying_cpu, | 1324 | .startup.single = NULL, |
| 1325 | .teardown.single = smpcfd_dying_cpu, | ||
| 1239 | }, | 1326 | }, |
| 1240 | /* | 1327 | /* |
| 1241 | * Handled on control processor until the plugged processor manages | 1328 |
| @@ -1243,8 +1330,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
| 1243 | */ | 1330 | */ |
| 1244 | [CPUHP_TEARDOWN_CPU] = { | 1331 | [CPUHP_TEARDOWN_CPU] = { |
| 1245 | .name = "cpu:teardown", | 1332 | .name = "cpu:teardown", |
| 1246 | .startup = NULL, | 1333 | .startup.single = NULL, |
| 1247 | .teardown = takedown_cpu, | 1334 | .teardown.single = takedown_cpu, |
| 1248 | .cant_stop = true, | 1335 | .cant_stop = true, |
| 1249 | }, | 1336 | }, |
| 1250 | #else | 1337 | #else |
| @@ -1270,24 +1357,13 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1270 | /* First state is scheduler control. Interrupts are disabled */ | 1357 | /* First state is scheduler control. Interrupts are disabled */ |
| 1271 | [CPUHP_AP_SCHED_STARTING] = { | 1358 | [CPUHP_AP_SCHED_STARTING] = { |
| 1272 | .name = "sched:starting", | 1359 | .name = "sched:starting", |
| 1273 | .startup = sched_cpu_starting, | 1360 | .startup.single = sched_cpu_starting, |
| 1274 | .teardown = sched_cpu_dying, | 1361 | .teardown.single = sched_cpu_dying, |
| 1275 | }, | 1362 | }, |
| 1276 | [CPUHP_AP_RCUTREE_DYING] = { | 1363 | [CPUHP_AP_RCUTREE_DYING] = { |
| 1277 | .startup = NULL, | 1364 | .name = "RCU/tree:dying", |
| 1278 | .teardown = rcutree_dying_cpu, | 1365 | .startup.single = NULL, |
| 1279 | }, | 1366 | .teardown.single = rcutree_dying_cpu, |
| 1280 | /* | ||
| 1281 | * Low level startup/teardown notifiers. Run with interrupts | ||
| 1282 | * disabled. Will be removed once the notifiers are converted to | ||
| 1283 | * states. | ||
| 1284 | */ | ||
| 1285 | [CPUHP_AP_NOTIFY_STARTING] = { | ||
| 1286 | .name = "notify:starting", | ||
| 1287 | .startup = notify_starting, | ||
| 1288 | .teardown = notify_dying, | ||
| 1289 | .skip_onerr = true, | ||
| 1290 | .cant_stop = true, | ||
| 1291 | }, | 1367 | }, |
| 1292 | /* Entry state on starting. Interrupts enabled from here on. Transient | 1368 | /* Entry state on starting. Interrupts enabled from here on. Transient |
| 1293 | * state for synchronization */ | 1369 | * state for synchronization */ |
| @@ -1296,24 +1372,24 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1296 | }, | 1372 | }, |
| 1297 | /* Handle smpboot threads park/unpark */ | 1373 | /* Handle smpboot threads park/unpark */ |
| 1298 | [CPUHP_AP_SMPBOOT_THREADS] = { | 1374 | [CPUHP_AP_SMPBOOT_THREADS] = { |
| 1299 | .name = "smpboot:threads", | 1375 | .name = "smpboot/threads:online", |
| 1300 | .startup = smpboot_unpark_threads, | 1376 | .startup.single = smpboot_unpark_threads, |
| 1301 | .teardown = NULL, | 1377 | .teardown.single = NULL, |
| 1302 | }, | 1378 | }, |
| 1303 | [CPUHP_AP_PERF_ONLINE] = { | 1379 | [CPUHP_AP_PERF_ONLINE] = { |
| 1304 | .name = "perf online", | 1380 | .name = "perf:online", |
| 1305 | .startup = perf_event_init_cpu, | 1381 | .startup.single = perf_event_init_cpu, |
| 1306 | .teardown = perf_event_exit_cpu, | 1382 | .teardown.single = perf_event_exit_cpu, |
| 1307 | }, | 1383 | }, |
| 1308 | [CPUHP_AP_WORKQUEUE_ONLINE] = { | 1384 | [CPUHP_AP_WORKQUEUE_ONLINE] = { |
| 1309 | .name = "workqueue online", | 1385 | .name = "workqueue:online", |
| 1310 | .startup = workqueue_online_cpu, | 1386 | .startup.single = workqueue_online_cpu, |
| 1311 | .teardown = workqueue_offline_cpu, | 1387 | .teardown.single = workqueue_offline_cpu, |
| 1312 | }, | 1388 | }, |
| 1313 | [CPUHP_AP_RCUTREE_ONLINE] = { | 1389 | [CPUHP_AP_RCUTREE_ONLINE] = { |
| 1314 | .name = "RCU-tree online", | 1390 | .name = "RCU/tree:online", |
| 1315 | .startup = rcutree_online_cpu, | 1391 | .startup.single = rcutree_online_cpu, |
| 1316 | .teardown = rcutree_offline_cpu, | 1392 | .teardown.single = rcutree_offline_cpu, |
| 1317 | }, | 1393 | }, |
| 1318 | 1394 | ||
| 1319 | /* | 1395 | /* |
| @@ -1322,8 +1398,8 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1322 | */ | 1398 | */ |
| 1323 | [CPUHP_AP_NOTIFY_ONLINE] = { | 1399 | [CPUHP_AP_NOTIFY_ONLINE] = { |
| 1324 | .name = "notify:online", | 1400 | .name = "notify:online", |
| 1325 | .startup = notify_online, | 1401 | .startup.single = notify_online, |
| 1326 | .teardown = notify_down_prepare, | 1402 | .teardown.single = notify_down_prepare, |
| 1327 | .skip_onerr = true, | 1403 | .skip_onerr = true, |
| 1328 | }, | 1404 | }, |
| 1329 | #endif | 1405 | #endif |
| @@ -1335,16 +1411,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1335 | /* Last state is scheduler control setting the cpu active */ | 1411 | /* Last state is scheduler control setting the cpu active */ |
| 1336 | [CPUHP_AP_ACTIVE] = { | 1412 | [CPUHP_AP_ACTIVE] = { |
| 1337 | .name = "sched:active", | 1413 | .name = "sched:active", |
| 1338 | .startup = sched_cpu_activate, | 1414 | .startup.single = sched_cpu_activate, |
| 1339 | .teardown = sched_cpu_deactivate, | 1415 | .teardown.single = sched_cpu_deactivate, |
| 1340 | }, | 1416 | }, |
| 1341 | #endif | 1417 | #endif |
| 1342 | 1418 | ||
| 1343 | /* CPU is fully up and running. */ | 1419 | /* CPU is fully up and running. */ |
| 1344 | [CPUHP_ONLINE] = { | 1420 | [CPUHP_ONLINE] = { |
| 1345 | .name = "online", | 1421 | .name = "online", |
| 1346 | .startup = NULL, | 1422 | .startup.single = NULL, |
| 1347 | .teardown = NULL, | 1423 | .teardown.single = NULL, |
| 1348 | }, | 1424 | }, |
| 1349 | }; | 1425 | }; |
| 1350 | 1426 | ||
| @@ -1356,54 +1432,42 @@ static int cpuhp_cb_check(enum cpuhp_state state) | |||
| 1356 | return 0; | 1432 | return 0; |
| 1357 | } | 1433 | } |
| 1358 | 1434 | ||
| 1359 | static bool cpuhp_is_ap_state(enum cpuhp_state state) | ||
| 1360 | { | ||
| 1361 | /* | ||
| 1362 | * The extra check for CPUHP_TEARDOWN_CPU is only for documentation | ||
| 1363 | * purposes as that state is handled explicitely in cpu_down. | ||
| 1364 | */ | ||
| 1365 | return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; | ||
| 1366 | } | ||
| 1367 | |||
| 1368 | static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) | ||
| 1369 | { | ||
| 1370 | struct cpuhp_step *sp; | ||
| 1371 | |||
| 1372 | sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; | ||
| 1373 | return sp + state; | ||
| 1374 | } | ||
| 1375 | |||
| 1376 | static void cpuhp_store_callbacks(enum cpuhp_state state, | 1435 | static void cpuhp_store_callbacks(enum cpuhp_state state, |
| 1377 | const char *name, | 1436 | const char *name, |
| 1378 | int (*startup)(unsigned int cpu), | 1437 | int (*startup)(unsigned int cpu), |
| 1379 | int (*teardown)(unsigned int cpu)) | 1438 | int (*teardown)(unsigned int cpu), |
| 1439 | bool multi_instance) | ||
| 1380 | { | 1440 | { |
| 1381 | /* (Un)Install the callbacks for further cpu hotplug operations */ | 1441 | /* (Un)Install the callbacks for further cpu hotplug operations */ |
| 1382 | struct cpuhp_step *sp; | 1442 | struct cpuhp_step *sp; |
| 1383 | 1443 | ||
| 1384 | mutex_lock(&cpuhp_state_mutex); | 1444 | mutex_lock(&cpuhp_state_mutex); |
| 1385 | sp = cpuhp_get_step(state); | 1445 | sp = cpuhp_get_step(state); |
| 1386 | sp->startup = startup; | 1446 | sp->startup.single = startup; |
| 1387 | sp->teardown = teardown; | 1447 | sp->teardown.single = teardown; |
| 1388 | sp->name = name; | 1448 | sp->name = name; |
| 1449 | sp->multi_instance = multi_instance; | ||
| 1450 | INIT_HLIST_HEAD(&sp->list); | ||
| 1389 | mutex_unlock(&cpuhp_state_mutex); | 1451 | mutex_unlock(&cpuhp_state_mutex); |
| 1390 | } | 1452 | } |
| 1391 | 1453 | ||
| 1392 | static void *cpuhp_get_teardown_cb(enum cpuhp_state state) | 1454 | static void *cpuhp_get_teardown_cb(enum cpuhp_state state) |
| 1393 | { | 1455 | { |
| 1394 | return cpuhp_get_step(state)->teardown; | 1456 | return cpuhp_get_step(state)->teardown.single; |
| 1395 | } | 1457 | } |
| 1396 | 1458 | ||
| 1397 | /* | 1459 | /* |
| 1398 | * Call the startup/teardown function for a step either on the AP or | 1460 | * Call the startup/teardown function for a step either on the AP or |
| 1399 | * on the current CPU. | 1461 | * on the current CPU. |
| 1400 | */ | 1462 | */ |
| 1401 | static int cpuhp_issue_call(int cpu, enum cpuhp_state state, | 1463 | static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, |
| 1402 | int (*cb)(unsigned int), bool bringup) | 1464 | struct hlist_node *node) |
| 1403 | { | 1465 | { |
| 1466 | struct cpuhp_step *sp = cpuhp_get_step(state); | ||
| 1404 | int ret; | 1467 | int ret; |
| 1405 | 1468 | ||
| 1406 | if (!cb) | 1469 | if ((bringup && !sp->startup.single) || |
| 1470 | (!bringup && !sp->teardown.single)) | ||
| 1407 | return 0; | 1471 | return 0; |
| 1408 | /* | 1472 | /* |
| 1409 | * The non AP bound callbacks can fail on bringup. On teardown | 1473 | * The non AP bound callbacks can fail on bringup. On teardown |
| @@ -1411,11 +1475,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, | |||
| 1411 | */ | 1475 | */ |
| 1412 | #ifdef CONFIG_SMP | 1476 | #ifdef CONFIG_SMP |
| 1413 | if (cpuhp_is_ap_state(state)) | 1477 | if (cpuhp_is_ap_state(state)) |
| 1414 | ret = cpuhp_invoke_ap_callback(cpu, state, cb); | 1478 | ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); |
| 1415 | else | 1479 | else |
| 1416 | ret = cpuhp_invoke_callback(cpu, state, cb); | 1480 | ret = cpuhp_invoke_callback(cpu, state, bringup, node); |
| 1417 | #else | 1481 | #else |
| 1418 | ret = cpuhp_invoke_callback(cpu, state, cb); | 1482 | ret = cpuhp_invoke_callback(cpu, state, bringup, node); |
| 1419 | #endif | 1483 | #endif |
| 1420 | BUG_ON(ret && !bringup); | 1484 | BUG_ON(ret && !bringup); |
| 1421 | return ret; | 1485 | return ret; |
| @@ -1427,13 +1491,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, | |||
| 1427 | * Note: The teardown callbacks for rollback are not allowed to fail! | 1491 | * Note: The teardown callbacks for rollback are not allowed to fail! |
| 1428 | */ | 1492 | */ |
| 1429 | static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, | 1493 | static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, |
| 1430 | int (*teardown)(unsigned int cpu)) | 1494 | struct hlist_node *node) |
| 1431 | { | 1495 | { |
| 1432 | int cpu; | 1496 | int cpu; |
| 1433 | 1497 | ||
| 1434 | if (!teardown) | ||
| 1435 | return; | ||
| 1436 | |||
| 1437 | /* Roll back the already executed steps on the other cpus */ | 1498 | /* Roll back the already executed steps on the other cpus */ |
| 1438 | for_each_present_cpu(cpu) { | 1499 | for_each_present_cpu(cpu) { |
| 1439 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 1500 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| @@ -1444,7 +1505,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, | |||
| 1444 | 1505 | ||
| 1445 | /* Did we invoke the startup call on that cpu ? */ | 1506 | /* Did we invoke the startup call on that cpu ? */ |
| 1446 | if (cpustate >= state) | 1507 | if (cpustate >= state) |
| 1447 | cpuhp_issue_call(cpu, state, teardown, false); | 1508 | cpuhp_issue_call(cpu, state, false, node); |
| 1448 | } | 1509 | } |
| 1449 | } | 1510 | } |
| 1450 | 1511 | ||
| @@ -1471,6 +1532,52 @@ static int cpuhp_reserve_state(enum cpuhp_state state) | |||
| 1471 | return -ENOSPC; | 1532 | return -ENOSPC; |
| 1472 | } | 1533 | } |
| 1473 | 1534 | ||
| 1535 | int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, | ||
| 1536 | bool invoke) | ||
| 1537 | { | ||
| 1538 | struct cpuhp_step *sp; | ||
| 1539 | int cpu; | ||
| 1540 | int ret; | ||
| 1541 | |||
| 1542 | sp = cpuhp_get_step(state); | ||
| 1543 | if (sp->multi_instance == false) | ||
| 1544 | return -EINVAL; | ||
| 1545 | |||
| 1546 | get_online_cpus(); | ||
| 1547 | |||
| 1548 | if (!invoke || !sp->startup.multi) | ||
| 1549 | goto add_node; | ||
| 1550 | |||
| 1551 | /* | ||
| 1552 | * Try to call the startup callback for each present cpu | ||
| 1553 | * depending on the hotplug state of the cpu. | ||
| 1554 | */ | ||
| 1555 | for_each_present_cpu(cpu) { | ||
| 1556 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | ||
| 1557 | int cpustate = st->state; | ||
| 1558 | |||
| 1559 | if (cpustate < state) | ||
| 1560 | continue; | ||
| 1561 | |||
| 1562 | ret = cpuhp_issue_call(cpu, state, true, node); | ||
| 1563 | if (ret) { | ||
| 1564 | if (sp->teardown.multi) | ||
| 1565 | cpuhp_rollback_install(cpu, state, node); | ||
| 1566 | goto err; | ||
| 1567 | } | ||
| 1568 | } | ||
| 1569 | add_node: | ||
| 1570 | ret = 0; | ||
| 1571 | mutex_lock(&cpuhp_state_mutex); | ||
| 1572 | hlist_add_head(node, &sp->list); | ||
| 1573 | mutex_unlock(&cpuhp_state_mutex); | ||
| 1574 | |||
| 1575 | err: | ||
| 1576 | put_online_cpus(); | ||
| 1577 | return ret; | ||
| 1578 | } | ||
| 1579 | EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); | ||
| 1580 | |||
| 1474 | /** | 1581 | /** |
| 1475 | * __cpuhp_setup_state - Set up the callbacks for a hotplug machine state | 1582 |
| 1476 | * @state: The state to setup | 1583 | * @state: The state to setup |
| @@ -1484,7 +1591,8 @@ static int cpuhp_reserve_state(enum cpuhp_state state) | |||
| 1484 | int __cpuhp_setup_state(enum cpuhp_state state, | 1591 | int __cpuhp_setup_state(enum cpuhp_state state, |
| 1485 | const char *name, bool invoke, | 1592 | const char *name, bool invoke, |
| 1486 | int (*startup)(unsigned int cpu), | 1593 | int (*startup)(unsigned int cpu), |
| 1487 | int (*teardown)(unsigned int cpu)) | 1594 | int (*teardown)(unsigned int cpu), |
| 1595 | bool multi_instance) | ||
| 1488 | { | 1596 | { |
| 1489 | int cpu, ret = 0; | 1597 | int cpu, ret = 0; |
| 1490 | int dyn_state = 0; | 1598 | int dyn_state = 0; |
| @@ -1503,7 +1611,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, | |||
| 1503 | state = ret; | 1611 | state = ret; |
| 1504 | } | 1612 | } |
| 1505 | 1613 | ||
| 1506 | cpuhp_store_callbacks(state, name, startup, teardown); | 1614 | cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); |
| 1507 | 1615 | ||
| 1508 | if (!invoke || !startup) | 1616 | if (!invoke || !startup) |
| 1509 | goto out; | 1617 | goto out; |
| @@ -1519,10 +1627,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, | |||
| 1519 | if (cpustate < state) | 1627 | if (cpustate < state) |
| 1520 | continue; | 1628 | continue; |
| 1521 | 1629 | ||
| 1522 | ret = cpuhp_issue_call(cpu, state, startup, true); | 1630 | ret = cpuhp_issue_call(cpu, state, true, NULL); |
| 1523 | if (ret) { | 1631 | if (ret) { |
| 1524 | cpuhp_rollback_install(cpu, state, teardown); | 1632 | if (teardown) |
| 1525 | cpuhp_store_callbacks(state, NULL, NULL, NULL); | 1633 | cpuhp_rollback_install(cpu, state, NULL); |
| 1634 | cpuhp_store_callbacks(state, NULL, NULL, NULL, false); | ||
| 1526 | goto out; | 1635 | goto out; |
| 1527 | } | 1636 | } |
| 1528 | } | 1637 | } |
| @@ -1534,6 +1643,42 @@ out: | |||
| 1534 | } | 1643 | } |
| 1535 | EXPORT_SYMBOL(__cpuhp_setup_state); | 1644 | EXPORT_SYMBOL(__cpuhp_setup_state); |
| 1536 | 1645 | ||
| 1646 | int __cpuhp_state_remove_instance(enum cpuhp_state state, | ||
| 1647 | struct hlist_node *node, bool invoke) | ||
| 1648 | { | ||
| 1649 | struct cpuhp_step *sp = cpuhp_get_step(state); | ||
| 1650 | int cpu; | ||
| 1651 | |||
| 1652 | BUG_ON(cpuhp_cb_check(state)); | ||
| 1653 | |||
| 1654 | if (!sp->multi_instance) | ||
| 1655 | return -EINVAL; | ||
| 1656 | |||
| 1657 | get_online_cpus(); | ||
| 1658 | if (!invoke || !cpuhp_get_teardown_cb(state)) | ||
| 1659 | goto remove; | ||
| 1660 | /* | ||
| 1661 | * Call the teardown callback for each present cpu depending | ||
| 1662 | * on the hotplug state of the cpu. This function is not | ||
| 1663 | * allowed to fail currently! | ||
| 1664 | */ | ||
| 1665 | for_each_present_cpu(cpu) { | ||
| 1666 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | ||
| 1667 | int cpustate = st->state; | ||
| 1668 | |||
| 1669 | if (cpustate >= state) | ||
| 1670 | cpuhp_issue_call(cpu, state, false, node); | ||
| 1671 | } | ||
| 1672 | |||
| 1673 | remove: | ||
| 1674 | mutex_lock(&cpuhp_state_mutex); | ||
| 1675 | hlist_del(node); | ||
| 1676 | mutex_unlock(&cpuhp_state_mutex); | ||
| 1677 | put_online_cpus(); | ||
| 1678 | |||
| 1679 | return 0; | ||
| 1680 | } | ||
| 1681 | EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); | ||
| 1537 | /** | 1682 | /** |
| 1538 | * __cpuhp_remove_state - Remove the callbacks for a hotplug machine state | 1683 |
| 1539 | * @state: The state to remove | 1684 | * @state: The state to remove |
| @@ -1545,14 +1690,21 @@ EXPORT_SYMBOL(__cpuhp_setup_state); | |||
| 1545 | */ | 1690 | */ |
| 1546 | void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) | 1691 | void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) |
| 1547 | { | 1692 | { |
| 1548 | int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); | 1693 | struct cpuhp_step *sp = cpuhp_get_step(state); |
| 1549 | int cpu; | 1694 | int cpu; |
| 1550 | 1695 | ||
| 1551 | BUG_ON(cpuhp_cb_check(state)); | 1696 | BUG_ON(cpuhp_cb_check(state)); |
| 1552 | 1697 | ||
| 1553 | get_online_cpus(); | 1698 | get_online_cpus(); |
| 1554 | 1699 | ||
| 1555 | if (!invoke || !teardown) | 1700 | if (sp->multi_instance) { |
| 1701 | WARN(!hlist_empty(&sp->list), | ||
| 1702 | "Error: Removing state %d which has instances left.\n", | ||
| 1703 | state); | ||
| 1704 | goto remove; | ||
| 1705 | } | ||
| 1706 | |||
| 1707 | if (!invoke || !cpuhp_get_teardown_cb(state)) | ||
| 1556 | goto remove; | 1708 | goto remove; |
| 1557 | 1709 | ||
| 1558 | /* | 1710 | /* |
| @@ -1565,10 +1717,10 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) | |||
| 1565 | int cpustate = st->state; | 1717 | int cpustate = st->state; |
| 1566 | 1718 | ||
| 1567 | if (cpustate >= state) | 1719 | if (cpustate >= state) |
| 1568 | cpuhp_issue_call(cpu, state, teardown, false); | 1720 | cpuhp_issue_call(cpu, state, false, NULL); |
| 1569 | } | 1721 | } |
| 1570 | remove: | 1722 | remove: |
| 1571 | cpuhp_store_callbacks(state, NULL, NULL, NULL); | 1723 | cpuhp_store_callbacks(state, NULL, NULL, NULL, false); |
| 1572 | put_online_cpus(); | 1724 | put_online_cpus(); |
| 1573 | } | 1725 | } |
| 1574 | EXPORT_SYMBOL(__cpuhp_remove_state); | 1726 | EXPORT_SYMBOL(__cpuhp_remove_state); |
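
The new __cpuhp_state_add_instance()/__cpuhp_state_remove_instance() exports let multiple instances (typically per-device contexts) share one hotplug state through an embedded hlist_node, with the multi variants of the callbacks invoked once per instance. A sketch of the intended usage, assuming the cpuhp_setup_state_multi()/cpuhp_state_add_instance() convenience wrappers that accompany these exports; the "mydev" names are made up:

/* Hedged sketch of the multi-instance interface. */
struct mydev {
        struct hlist_node node;         /* handed to the cpuhp core */
        /* ... driver state ... */
};

static enum cpuhp_state mydev_online_state;

static int mydev_cpu_online(unsigned int cpu, struct hlist_node *node)
{
        struct mydev *dev = hlist_entry(node, struct mydev, node);

        /* bring this instance up for @cpu */
        (void)dev;
        return 0;
}

static int mydev_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
        return 0;
}

static int __init mydev_driver_init(void)
{
        int ret;

        ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "mydev:online",
                                      mydev_cpu_online, mydev_cpu_offline);
        if (ret < 0)
                return ret;
        mydev_online_state = ret;
        return 0;
}

/* per device probe/remove:
 *      cpuhp_state_add_instance(mydev_online_state, &dev->node);
 *      cpuhp_state_remove_instance(mydev_online_state, &dev->node);
 */
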
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c7fd2778ed50..29f815d2ef7e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = { | |||
| 325 | /* | 325 | /* |
| 326 | * Return in pmask the portion of a cpusets's cpus_allowed that | 326 | * Return in pmask the portion of a cpusets's cpus_allowed that |
| 327 | * are online. If none are online, walk up the cpuset hierarchy | 327 | * are online. If none are online, walk up the cpuset hierarchy |
| 328 | * until we find one that does have some online cpus. The top | 328 | * until we find one that does have some online cpus. |
| 329 | * cpuset always has some cpus online. | ||
| 330 | * | 329 | * |
| 331 | * One way or another, we guarantee to return some non-empty subset | 330 | * One way or another, we guarantee to return some non-empty subset |
| 332 | * of cpu_online_mask. | 331 | * of cpu_online_mask. |
| @@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = { | |||
| 335 | */ | 334 | */ |
| 336 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) | 335 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) |
| 337 | { | 336 | { |
| 338 | while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) | 337 | while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { |
| 339 | cs = parent_cs(cs); | 338 | cs = parent_cs(cs); |
| 339 | if (unlikely(!cs)) { | ||
| 340 | /* | ||
| 341 | * The top cpuset doesn't have any online cpu as a | ||
| 342 | * consequence of a race between cpuset_hotplug_work | ||
| 343 | * and cpu hotplug notifier. But we know the top | ||
| 344 | * cpuset's effective_cpus is on its way to be | ||
| 345 | * identical to cpu_online_mask. | ||
| 346 | */ | ||
| 347 | cpumask_copy(pmask, cpu_online_mask); | ||
| 348 | return; | ||
| 349 | } | ||
| 350 | } | ||
| 340 | cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); | 351 | cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); |
| 341 | } | 352 | } |
| 342 | 353 | ||
| @@ -2069,6 +2080,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) | |||
| 2069 | mutex_unlock(&cpuset_mutex); | 2080 | mutex_unlock(&cpuset_mutex); |
| 2070 | } | 2081 | } |
| 2071 | 2082 | ||
| 2083 | /* | ||
| 2084 | * Make sure the new task conform to the current state of its parent, | ||
| 2085 | * which could have been changed by cpuset just after it inherits the | ||
| 2086 | * state from the parent and before it sits on the cgroup's task list. | ||
| 2087 | */ | ||
| 2088 | static void cpuset_fork(struct task_struct *task) | ||
| 2089 | { | ||
| 2090 | if (task_css_is_root(task, cpuset_cgrp_id)) | ||
| 2091 | return; | ||
| 2092 | |||
| 2093 | set_cpus_allowed_ptr(task, ¤t->cpus_allowed); | ||
| 2094 | task->mems_allowed = current->mems_allowed; | ||
| 2095 | } | ||
| 2096 | |||
| 2072 | struct cgroup_subsys cpuset_cgrp_subsys = { | 2097 | struct cgroup_subsys cpuset_cgrp_subsys = { |
| 2073 | .css_alloc = cpuset_css_alloc, | 2098 | .css_alloc = cpuset_css_alloc, |
| 2074 | .css_online = cpuset_css_online, | 2099 | .css_online = cpuset_css_online, |
| @@ -2079,6 +2104,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { | |||
| 2079 | .attach = cpuset_attach, | 2104 | .attach = cpuset_attach, |
| 2080 | .post_attach = cpuset_post_attach, | 2105 | .post_attach = cpuset_post_attach, |
| 2081 | .bind = cpuset_bind, | 2106 | .bind = cpuset_bind, |
| 2107 | .fork = cpuset_fork, | ||
| 2082 | .legacy_cftypes = files, | 2108 | .legacy_cftypes = files, |
| 2083 | .early_init = true, | 2109 | .early_init = true, |
| 2084 | }; | 2110 | }; |
| @@ -2689,7 +2715,7 @@ void __cpuset_memory_pressure_bump(void) | |||
| 2689 | int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, | 2715 | int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, |
| 2690 | struct pid *pid, struct task_struct *tsk) | 2716 | struct pid *pid, struct task_struct *tsk) |
| 2691 | { | 2717 | { |
| 2692 | char *buf, *p; | 2718 | char *buf; |
| 2693 | struct cgroup_subsys_state *css; | 2719 | struct cgroup_subsys_state *css; |
| 2694 | int retval; | 2720 | int retval; |
| 2695 | 2721 | ||
| @@ -2698,14 +2724,15 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, | |||
| 2698 | if (!buf) | 2724 | if (!buf) |
| 2699 | goto out; | 2725 | goto out; |
| 2700 | 2726 | ||
| 2701 | retval = -ENAMETOOLONG; | ||
| 2702 | css = task_get_css(tsk, cpuset_cgrp_id); | 2727 | css = task_get_css(tsk, cpuset_cgrp_id); |
| 2703 | p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, | 2728 | retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, |
| 2704 | current->nsproxy->cgroup_ns); | 2729 | current->nsproxy->cgroup_ns); |
| 2705 | css_put(css); | 2730 | css_put(css); |
| 2706 | if (!p) | 2731 | if (retval >= PATH_MAX) |
| 2732 | retval = -ENAMETOOLONG; | ||
| 2733 | if (retval < 0) | ||
| 2707 | goto out_free; | 2734 | goto out_free; |
| 2708 | seq_puts(m, p); | 2735 | seq_puts(m, buf); |
| 2709 | seq_putc(m, '\n'); | 2736 | seq_putc(m, '\n'); |
| 2710 | retval = 0; | 2737 | retval = 0; |
| 2711 | out_free: | 2738 | out_free: |
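
proc_cpuset_show() now depends on cgroup_path_ns() returning an int length in kernfs style rather than a pointer, so truncation is detected by comparing the result against the buffer size. A condensed sketch of that convention as this caller uses it; the wrapper function is hypothetical:

/* Hedged sketch of the kernfs-style return convention checked above. */
static int show_cpuset_path(struct seq_file *m, struct cgroup *cgrp,
                            struct cgroup_namespace *ns)
{
        char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
        int len;

        if (!buf)
                return -ENOMEM;

        len = cgroup_path_ns(cgrp, buf, PATH_MAX, ns);  /* full path length or -errno */
        if (len >= PATH_MAX)
                len = -ENAMETOOLONG;                    /* buffer too small: truncated */
        if (len < 0)
                goto out;

        seq_puts(m, buf);
        seq_putc(m, '\n');
        len = 0;
out:
        kfree(buf);
        return len;
}
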
diff --git a/kernel/events/core.c b/kernel/events/core.c index 1903b8f3a705..c6e47e97b33f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -242,18 +242,6 @@ unlock: | |||
| 242 | return ret; | 242 | return ret; |
| 243 | } | 243 | } |
| 244 | 244 | ||
| 245 | static void event_function_local(struct perf_event *event, event_f func, void *data) | ||
| 246 | { | ||
| 247 | struct event_function_struct efs = { | ||
| 248 | .event = event, | ||
| 249 | .func = func, | ||
| 250 | .data = data, | ||
| 251 | }; | ||
| 252 | |||
| 253 | int ret = event_function(&efs); | ||
| 254 | WARN_ON_ONCE(ret); | ||
| 255 | } | ||
| 256 | |||
| 257 | static void event_function_call(struct perf_event *event, event_f func, void *data) | 245 | static void event_function_call(struct perf_event *event, event_f func, void *data) |
| 258 | { | 246 | { |
| 259 | struct perf_event_context *ctx = event->ctx; | 247 | struct perf_event_context *ctx = event->ctx; |
| @@ -303,6 +291,54 @@ again: | |||
| 303 | raw_spin_unlock_irq(&ctx->lock); | 291 | raw_spin_unlock_irq(&ctx->lock); |
| 304 | } | 292 | } |
| 305 | 293 | ||
| 294 | /* | ||
| 295 | * Similar to event_function_call() + event_function(), but hard assumes IRQs | ||
| 296 | * are already disabled and we're on the right CPU. | ||
| 297 | */ | ||
| 298 | static void event_function_local(struct perf_event *event, event_f func, void *data) | ||
| 299 | { | ||
| 300 | struct perf_event_context *ctx = event->ctx; | ||
| 301 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
| 302 | struct task_struct *task = READ_ONCE(ctx->task); | ||
| 303 | struct perf_event_context *task_ctx = NULL; | ||
| 304 | |||
| 305 | WARN_ON_ONCE(!irqs_disabled()); | ||
| 306 | |||
| 307 | if (task) { | ||
| 308 | if (task == TASK_TOMBSTONE) | ||
| 309 | return; | ||
| 310 | |||
| 311 | task_ctx = ctx; | ||
| 312 | } | ||
| 313 | |||
| 314 | perf_ctx_lock(cpuctx, task_ctx); | ||
| 315 | |||
| 316 | task = ctx->task; | ||
| 317 | if (task == TASK_TOMBSTONE) | ||
| 318 | goto unlock; | ||
| 319 | |||
| 320 | if (task) { | ||
| 321 | /* | ||
| 322 | * We must be either inactive or active and the right task, | ||
| 323 | * otherwise we're screwed, since we cannot IPI to somewhere | ||
| 324 | * else. | ||
| 325 | */ | ||
| 326 | if (ctx->is_active) { | ||
| 327 | if (WARN_ON_ONCE(task != current)) | ||
| 328 | goto unlock; | ||
| 329 | |||
| 330 | if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) | ||
| 331 | goto unlock; | ||
| 332 | } | ||
| 333 | } else { | ||
| 334 | WARN_ON_ONCE(&cpuctx->ctx != ctx); | ||
| 335 | } | ||
| 336 | |||
| 337 | func(event, cpuctx, ctx, data); | ||
| 338 | unlock: | ||
| 339 | perf_ctx_unlock(cpuctx, task_ctx); | ||
| 340 | } | ||
| 341 | |||
| 306 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | 342 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ |
| 307 | PERF_FLAG_FD_OUTPUT |\ | 343 | PERF_FLAG_FD_OUTPUT |\ |
| 308 | PERF_FLAG_PID_CGROUP |\ | 344 | PERF_FLAG_PID_CGROUP |\ |
| @@ -1439,8 +1475,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1439 | if (event->group_leader == event) { | 1475 | if (event->group_leader == event) { |
| 1440 | struct list_head *list; | 1476 | struct list_head *list; |
| 1441 | 1477 | ||
| 1442 | if (is_software_event(event)) | 1478 | event->group_caps = event->event_caps; |
| 1443 | event->group_flags |= PERF_GROUP_SOFTWARE; | ||
| 1444 | 1479 | ||
| 1445 | list = ctx_group_list(event, ctx); | 1480 | list = ctx_group_list(event, ctx); |
| 1446 | list_add_tail(&event->group_entry, list); | 1481 | list_add_tail(&event->group_entry, list); |
| @@ -1594,9 +1629,7 @@ static void perf_group_attach(struct perf_event *event) | |||
| 1594 | 1629 | ||
| 1595 | WARN_ON_ONCE(group_leader->ctx != event->ctx); | 1630 | WARN_ON_ONCE(group_leader->ctx != event->ctx); |
| 1596 | 1631 | ||
| 1597 | if (group_leader->group_flags & PERF_GROUP_SOFTWARE && | 1632 | group_leader->group_caps &= event->event_caps; |
| 1598 | !is_software_event(event)) | ||
| 1599 | group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; | ||
| 1600 | 1633 | ||
| 1601 | list_add_tail(&event->group_entry, &group_leader->sibling_list); | 1634 | list_add_tail(&event->group_entry, &group_leader->sibling_list); |
| 1602 | group_leader->nr_siblings++; | 1635 | group_leader->nr_siblings++; |
| @@ -1687,7 +1720,7 @@ static void perf_group_detach(struct perf_event *event) | |||
| 1687 | sibling->group_leader = sibling; | 1720 | sibling->group_leader = sibling; |
| 1688 | 1721 | ||
| 1689 | /* Inherit group flags from the previous leader */ | 1722 | /* Inherit group flags from the previous leader */ |
| 1690 | sibling->group_flags = event->group_flags; | 1723 | sibling->group_caps = event->group_caps; |
| 1691 | 1724 | ||
| 1692 | WARN_ON_ONCE(sibling->ctx != event->ctx); | 1725 | WARN_ON_ONCE(sibling->ctx != event->ctx); |
| 1693 | } | 1726 | } |
| @@ -1796,6 +1829,8 @@ group_sched_out(struct perf_event *group_event, | |||
| 1796 | struct perf_event *event; | 1829 | struct perf_event *event; |
| 1797 | int state = group_event->state; | 1830 | int state = group_event->state; |
| 1798 | 1831 | ||
| 1832 | perf_pmu_disable(ctx->pmu); | ||
| 1833 | |||
| 1799 | event_sched_out(group_event, cpuctx, ctx); | 1834 | event_sched_out(group_event, cpuctx, ctx); |
| 1800 | 1835 | ||
| 1801 | /* | 1836 | /* |
| @@ -1804,6 +1839,8 @@ group_sched_out(struct perf_event *group_event, | |||
| 1804 | list_for_each_entry(event, &group_event->sibling_list, group_entry) | 1839 | list_for_each_entry(event, &group_event->sibling_list, group_entry) |
| 1805 | event_sched_out(event, cpuctx, ctx); | 1840 | event_sched_out(event, cpuctx, ctx); |
| 1806 | 1841 | ||
| 1842 | perf_pmu_enable(ctx->pmu); | ||
| 1843 | |||
| 1807 | if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) | 1844 | if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) |
| 1808 | cpuctx->exclusive = 0; | 1845 | cpuctx->exclusive = 0; |
| 1809 | } | 1846 | } |
| @@ -2109,7 +2146,7 @@ static int group_can_go_on(struct perf_event *event, | |||
| 2109 | /* | 2146 | /* |
| 2110 | * Groups consisting entirely of software events can always go on. | 2147 | * Groups consisting entirely of software events can always go on. |
| 2111 | */ | 2148 | */ |
| 2112 | if (event->group_flags & PERF_GROUP_SOFTWARE) | 2149 | if (event->group_caps & PERF_EV_CAP_SOFTWARE) |
| 2113 | return 1; | 2150 | return 1; |
| 2114 | /* | 2151 | /* |
| 2115 | * If an exclusive group is already on, no other hardware | 2152 | * If an exclusive group is already on, no other hardware |
| @@ -2455,16 +2492,16 @@ static int __perf_event_stop(void *info) | |||
| 2455 | * while restarting. | 2492 | * while restarting. |
| 2456 | */ | 2493 | */ |
| 2457 | if (sd->restart) | 2494 | if (sd->restart) |
| 2458 | event->pmu->start(event, PERF_EF_START); | 2495 | event->pmu->start(event, 0); |
| 2459 | 2496 | ||
| 2460 | return 0; | 2497 | return 0; |
| 2461 | } | 2498 | } |
| 2462 | 2499 | ||
| 2463 | static int perf_event_restart(struct perf_event *event) | 2500 | static int perf_event_stop(struct perf_event *event, int restart) |
| 2464 | { | 2501 | { |
| 2465 | struct stop_event_data sd = { | 2502 | struct stop_event_data sd = { |
| 2466 | .event = event, | 2503 | .event = event, |
| 2467 | .restart = 1, | 2504 | .restart = restart, |
| 2468 | }; | 2505 | }; |
| 2469 | int ret = 0; | 2506 | int ret = 0; |
| 2470 | 2507 | ||
| @@ -2801,19 +2838,36 @@ unlock: | |||
| 2801 | } | 2838 | } |
| 2802 | } | 2839 | } |
| 2803 | 2840 | ||
| 2841 | static DEFINE_PER_CPU(struct list_head, sched_cb_list); | ||
| 2842 | |||
| 2804 | void perf_sched_cb_dec(struct pmu *pmu) | 2843 | void perf_sched_cb_dec(struct pmu *pmu) |
| 2805 | { | 2844 | { |
| 2845 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 2846 | |||
| 2806 | this_cpu_dec(perf_sched_cb_usages); | 2847 | this_cpu_dec(perf_sched_cb_usages); |
| 2848 | |||
| 2849 | if (!--cpuctx->sched_cb_usage) | ||
| 2850 | list_del(&cpuctx->sched_cb_entry); | ||
| 2807 | } | 2851 | } |
| 2808 | 2852 | ||
| 2853 | |||
| 2809 | void perf_sched_cb_inc(struct pmu *pmu) | 2854 | void perf_sched_cb_inc(struct pmu *pmu) |
| 2810 | { | 2855 | { |
| 2856 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 2857 | |||
| 2858 | if (!cpuctx->sched_cb_usage++) | ||
| 2859 | list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); | ||
| 2860 | |||
| 2811 | this_cpu_inc(perf_sched_cb_usages); | 2861 | this_cpu_inc(perf_sched_cb_usages); |
| 2812 | } | 2862 | } |
| 2813 | 2863 | ||
| 2814 | /* | 2864 | /* |
| 2815 | * This function provides the context switch callback to the lower code | 2865 | * This function provides the context switch callback to the lower code |
| 2816 | * layer. It is invoked ONLY when the context switch callback is enabled. | 2866 | * layer. It is invoked ONLY when the context switch callback is enabled. |
| 2867 | * | ||
| 2868 | * This callback is relevant even to per-cpu events; for example multi event | ||
| 2869 | * PEBS requires this to provide PID/TID information. This requires we flush | ||
| 2870 | * all queued PEBS records before we context switch to a new task. | ||
| 2817 | */ | 2871 | */ |
| 2818 | static void perf_pmu_sched_task(struct task_struct *prev, | 2872 | static void perf_pmu_sched_task(struct task_struct *prev, |
| 2819 | struct task_struct *next, | 2873 | struct task_struct *next, |
| @@ -2821,34 +2875,24 @@ static void perf_pmu_sched_task(struct task_struct *prev, | |||
| 2821 | { | 2875 | { |
| 2822 | struct perf_cpu_context *cpuctx; | 2876 | struct perf_cpu_context *cpuctx; |
| 2823 | struct pmu *pmu; | 2877 | struct pmu *pmu; |
| 2824 | unsigned long flags; | ||
| 2825 | 2878 | ||
| 2826 | if (prev == next) | 2879 | if (prev == next) |
| 2827 | return; | 2880 | return; |
| 2828 | 2881 | ||
| 2829 | local_irq_save(flags); | 2882 | list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { |
| 2830 | 2883 | pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ | |
| 2831 | rcu_read_lock(); | ||
| 2832 | |||
| 2833 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
| 2834 | if (pmu->sched_task) { | ||
| 2835 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 2836 | 2884 | ||
| 2837 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | 2885 | if (WARN_ON_ONCE(!pmu->sched_task)) |
| 2838 | 2886 | continue; | |
| 2839 | perf_pmu_disable(pmu); | ||
| 2840 | 2887 | ||
| 2841 | pmu->sched_task(cpuctx->task_ctx, sched_in); | 2888 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); |
| 2889 | perf_pmu_disable(pmu); | ||
| 2842 | 2890 | ||
| 2843 | perf_pmu_enable(pmu); | 2891 | pmu->sched_task(cpuctx->task_ctx, sched_in); |
| 2844 | 2892 | ||
| 2845 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | 2893 | perf_pmu_enable(pmu); |
| 2846 | } | 2894 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); |
| 2847 | } | 2895 | } |
| 2848 | |||
| 2849 | rcu_read_unlock(); | ||
| 2850 | |||
| 2851 | local_irq_restore(flags); | ||
| 2852 | } | 2896 | } |
| 2853 | 2897 | ||
| 2854 | static void perf_event_switch(struct task_struct *task, | 2898 | static void perf_event_switch(struct task_struct *task, |
| @@ -3380,6 +3424,22 @@ struct perf_read_data { | |||
| 3380 | int ret; | 3424 | int ret; |
| 3381 | }; | 3425 | }; |
| 3382 | 3426 | ||
| 3427 | static int find_cpu_to_read(struct perf_event *event, int local_cpu) | ||
| 3428 | { | ||
| 3429 | int event_cpu = event->oncpu; | ||
| 3430 | u16 local_pkg, event_pkg; | ||
| 3431 | |||
| 3432 | if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { | ||
| 3433 | event_pkg = topology_physical_package_id(event_cpu); | ||
| 3434 | local_pkg = topology_physical_package_id(local_cpu); | ||
| 3435 | |||
| 3436 | if (event_pkg == local_pkg) | ||
| 3437 | return local_cpu; | ||
| 3438 | } | ||
| 3439 | |||
| 3440 | return event_cpu; | ||
| 3441 | } | ||
| 3442 | |||
| 3383 | /* | 3443 | /* |
| 3384 | * Cross CPU call to read the hardware event | 3444 | * Cross CPU call to read the hardware event |
| 3385 | */ | 3445 | */ |
| @@ -3501,7 +3561,7 @@ u64 perf_event_read_local(struct perf_event *event) | |||
| 3501 | 3561 | ||
| 3502 | static int perf_event_read(struct perf_event *event, bool group) | 3562 | static int perf_event_read(struct perf_event *event, bool group) |
| 3503 | { | 3563 | { |
| 3504 | int ret = 0; | 3564 | int ret = 0, cpu_to_read, local_cpu; |
| 3505 | 3565 | ||
| 3506 | /* | 3566 | /* |
| 3507 | * If event is enabled and currently active on a CPU, update the | 3567 | * If event is enabled and currently active on a CPU, update the |
| @@ -3513,8 +3573,22 @@ static int perf_event_read(struct perf_event *event, bool group) | |||
| 3513 | .group = group, | 3573 | .group = group, |
| 3514 | .ret = 0, | 3574 | .ret = 0, |
| 3515 | }; | 3575 | }; |
| 3516 | smp_call_function_single(event->oncpu, | 3576 | |
| 3517 | __perf_event_read, &data, 1); | 3577 | local_cpu = get_cpu(); |
| 3578 | cpu_to_read = find_cpu_to_read(event, local_cpu); | ||
| 3579 | put_cpu(); | ||
| 3580 | |||
| 3581 | /* | ||
| 3582 | * Purposely ignore the smp_call_function_single() return | ||
| 3583 | * value. | ||
| 3584 | * | ||
| 3585 | * If event->oncpu isn't a valid CPU it means the event got | ||
| 3586 | * scheduled out and that will have updated the event count. | ||
| 3587 | * | ||
| 3588 | * Therefore, either way, we'll have an up-to-date event count | ||
| 3589 | * after this. | ||
| 3590 | */ | ||
| 3591 | (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1); | ||
| 3518 | ret = data.ret; | 3592 | ret = data.ret; |
| 3519 | } else if (event->state == PERF_EVENT_STATE_INACTIVE) { | 3593 | } else if (event->state == PERF_EVENT_STATE_INACTIVE) { |
| 3520 | struct perf_event_context *ctx = event->ctx; | 3594 | struct perf_event_context *ctx = event->ctx; |
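
find_cpu_to_read() allows a read to stay on the local CPU whenever it sits in the same physical package as event->oncpu, saving a cross-package IPI for package-scoped counters. A PMU opts in by setting the new capability bit, roughly as below (the init function is hypothetical):

/* Hedged sketch: a package-wide (e.g. uncore-style) PMU opting in. */
static int my_uncore_event_init(struct perf_event *event)
{
        /* counts are per package, so any CPU in that package may read them */
        event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
        return 0;
}
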
| @@ -3884,7 +3958,7 @@ static void exclusive_event_destroy(struct perf_event *event) | |||
| 3884 | 3958 | ||
| 3885 | static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) | 3959 | static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) |
| 3886 | { | 3960 | { |
| 3887 | if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && | 3961 | if ((e1->pmu == e2->pmu) && |
| 3888 | (e1->cpu == e2->cpu || | 3962 | (e1->cpu == e2->cpu || |
| 3889 | e1->cpu == -1 || | 3963 | e1->cpu == -1 || |
| 3890 | e2->cpu == -1)) | 3964 | e2->cpu == -1)) |
| @@ -4800,6 +4874,19 @@ static void ring_buffer_attach(struct perf_event *event, | |||
| 4800 | spin_unlock_irqrestore(&rb->event_lock, flags); | 4874 | spin_unlock_irqrestore(&rb->event_lock, flags); |
| 4801 | } | 4875 | } |
| 4802 | 4876 | ||
| 4877 | /* | ||
| 4878 | * Avoid racing with perf_mmap_close(AUX): stop the event | ||
| 4879 | * before swizzling the event::rb pointer; if it's getting | ||
| 4880 | * unmapped, its aux_mmap_count will be 0 and it won't | ||
| 4881 | * restart. See the comment in __perf_pmu_output_stop(). | ||
| 4882 | * | ||
| 4883 | * Data will inevitably be lost when set_output is done in | ||
| 4884 | * mid-air, but then again, whoever does it like this is | ||
| 4885 | * not in for the data anyway. | ||
| 4886 | */ | ||
| 4887 | if (has_aux(event)) | ||
| 4888 | perf_event_stop(event, 0); | ||
| 4889 | |||
| 4803 | rcu_assign_pointer(event->rb, rb); | 4890 | rcu_assign_pointer(event->rb, rb); |
| 4804 | 4891 | ||
| 4805 | if (old_rb) { | 4892 | if (old_rb) { |
| @@ -5292,9 +5379,10 @@ perf_output_sample_regs(struct perf_output_handle *handle, | |||
| 5292 | struct pt_regs *regs, u64 mask) | 5379 | struct pt_regs *regs, u64 mask) |
| 5293 | { | 5380 | { |
| 5294 | int bit; | 5381 | int bit; |
| 5382 | DECLARE_BITMAP(_mask, 64); | ||
| 5295 | 5383 | ||
| 5296 | for_each_set_bit(bit, (const unsigned long *) &mask, | 5384 | bitmap_from_u64(_mask, mask); |
| 5297 | sizeof(mask) * BITS_PER_BYTE) { | 5385 | for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { |
| 5298 | u64 val; | 5386 | u64 val; |
| 5299 | 5387 | ||
| 5300 | val = perf_reg_value(regs, bit); | 5388 | val = perf_reg_value(regs, bit); |
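
The DECLARE_BITMAP()/bitmap_from_u64() conversion is a word-order fix: casting the address of a u64 to unsigned long * makes for_each_set_bit() walk the 32-bit words in the wrong order on big-endian 32-bit kernels. A small illustration of the hazard:

/* Hedged illustration of the portability issue being fixed. */
static void sample_regs_mask_example(void)
{
        u64 mask = 0x00000001ULL;       /* only bit 0 set */
        DECLARE_BITMAP(_mask, 64);
        int bit;

        /* On a 32-bit big-endian kernel, (unsigned long *)&mask points at the
         * high word first, so the old loop reported bit 32 instead of bit 0.
         * bitmap_from_u64() fills the bitmap in its native word order, so the
         * walk below sees bit 0 on every configuration. */
        bitmap_from_u64(_mask, mask);
        for_each_set_bit(bit, _mask, 64)
                pr_info("set bit %d\n", bit);
}
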
| @@ -6075,7 +6163,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) | |||
| 6075 | raw_spin_unlock_irqrestore(&ifh->lock, flags); | 6163 | raw_spin_unlock_irqrestore(&ifh->lock, flags); |
| 6076 | 6164 | ||
| 6077 | if (restart) | 6165 | if (restart) |
| 6078 | perf_event_restart(event); | 6166 | perf_event_stop(event, 1); |
| 6079 | } | 6167 | } |
| 6080 | 6168 | ||
| 6081 | void perf_event_exec(void) | 6169 | void perf_event_exec(void) |
| @@ -6119,7 +6207,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) | |||
| 6119 | 6207 | ||
| 6120 | /* | 6208 | /* |
| 6121 | * In case of inheritance, it will be the parent that links to the | 6209 | * In case of inheritance, it will be the parent that links to the |
| 6122 | * ring-buffer, but it will be the child that's actually using it: | 6210 | * ring-buffer, but it will be the child that's actually using it. |
| 6211 | * | ||
| 6212 | * We are using event::rb to determine if the event should be stopped, | ||
| 6213 | * however this may race with ring_buffer_attach() (through set_output), | ||
| 6214 | * which will make us skip the event that actually needs to be stopped. | ||
| 6215 | * So ring_buffer_attach() has to stop an aux event before re-assigning | ||
| 6216 | * its rb pointer. | ||
| 6123 | */ | 6217 | */ |
| 6124 | if (rcu_dereference(parent->rb) == rb) | 6218 | if (rcu_dereference(parent->rb) == rb) |
| 6125 | ro->err = __perf_event_stop(&sd); | 6219 | ro->err = __perf_event_stop(&sd); |
| @@ -6129,7 +6223,7 @@ static int __perf_pmu_output_stop(void *info) | |||
| 6129 | { | 6223 | { |
| 6130 | struct perf_event *event = info; | 6224 | struct perf_event *event = info; |
| 6131 | struct pmu *pmu = event->pmu; | 6225 | struct pmu *pmu = event->pmu; |
| 6132 | struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 6226 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
| 6133 | struct remote_output ro = { | 6227 | struct remote_output ro = { |
| 6134 | .rb = event->rb, | 6228 | .rb = event->rb, |
| 6135 | }; | 6229 | }; |
| @@ -6584,15 +6678,6 @@ got_name: | |||
| 6584 | } | 6678 | } |
| 6585 | 6679 | ||
| 6586 | /* | 6680 | /* |
| 6587 | * Whether this @filter depends on a dynamic object which is not loaded | ||
| 6588 | * yet or its load addresses are not known. | ||
| 6589 | */ | ||
| 6590 | static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter) | ||
| 6591 | { | ||
| 6592 | return filter->filter && filter->inode; | ||
| 6593 | } | ||
| 6594 | |||
| 6595 | /* | ||
| 6596 | * Check whether inode and address range match filter criteria. | 6681 | * Check whether inode and address range match filter criteria. |
| 6597 | */ | 6682 | */ |
| 6598 | static bool perf_addr_filter_match(struct perf_addr_filter *filter, | 6683 | static bool perf_addr_filter_match(struct perf_addr_filter *filter, |
| @@ -6642,7 +6727,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) | |||
| 6642 | raw_spin_unlock_irqrestore(&ifh->lock, flags); | 6727 | raw_spin_unlock_irqrestore(&ifh->lock, flags); |
| 6643 | 6728 | ||
| 6644 | if (restart) | 6729 | if (restart) |
| 6645 | perf_event_restart(event); | 6730 | perf_event_stop(event, 1); |
| 6646 | } | 6731 | } |
| 6647 | 6732 | ||
| 6648 | /* | 6733 | /* |
| @@ -6653,6 +6738,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) | |||
| 6653 | struct perf_event_context *ctx; | 6738 | struct perf_event_context *ctx; |
| 6654 | int ctxn; | 6739 | int ctxn; |
| 6655 | 6740 | ||
| 6741 | /* | ||
| 6742 | * Data tracing isn't supported yet and as such there is no need | ||
| 6743 | * to keep track of anything that isn't related to executable code: | ||
| 6744 | */ | ||
| 6745 | if (!(vma->vm_flags & VM_EXEC)) | ||
| 6746 | return; | ||
| 6747 | |||
| 6656 | rcu_read_lock(); | 6748 | rcu_read_lock(); |
| 6657 | for_each_task_context_nr(ctxn) { | 6749 | for_each_task_context_nr(ctxn) { |
| 6658 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | 6750 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); |
| @@ -6987,7 +7079,7 @@ static int __perf_event_overflow(struct perf_event *event, | |||
| 6987 | irq_work_queue(&event->pending); | 7079 | irq_work_queue(&event->pending); |
| 6988 | } | 7080 | } |
| 6989 | 7081 | ||
| 6990 | event->overflow_handler(event, data, regs); | 7082 | READ_ONCE(event->overflow_handler)(event, data, regs); |
| 6991 | 7083 | ||
| 6992 | if (*perf_event_fasync(event) && event->pending_kill) { | 7084 | if (*perf_event_fasync(event) && event->pending_kill) { |
| 6993 | event->pending_wakeup = 1; | 7085 | event->pending_wakeup = 1; |
| @@ -7602,11 +7694,83 @@ static void perf_event_free_filter(struct perf_event *event) | |||
| 7602 | ftrace_profile_free_filter(event); | 7694 | ftrace_profile_free_filter(event); |
| 7603 | } | 7695 | } |
| 7604 | 7696 | ||
| 7697 | #ifdef CONFIG_BPF_SYSCALL | ||
| 7698 | static void bpf_overflow_handler(struct perf_event *event, | ||
| 7699 | struct perf_sample_data *data, | ||
| 7700 | struct pt_regs *regs) | ||
| 7701 | { | ||
| 7702 | struct bpf_perf_event_data_kern ctx = { | ||
| 7703 | .data = data, | ||
| 7704 | .regs = regs, | ||
| 7705 | }; | ||
| 7706 | int ret = 0; | ||
| 7707 | |||
| 7708 | preempt_disable(); | ||
| 7709 | if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) | ||
| 7710 | goto out; | ||
| 7711 | rcu_read_lock(); | ||
| 7712 | ret = BPF_PROG_RUN(event->prog, (void *)&ctx); | ||
| 7713 | rcu_read_unlock(); | ||
| 7714 | out: | ||
| 7715 | __this_cpu_dec(bpf_prog_active); | ||
| 7716 | preempt_enable(); | ||
| 7717 | if (!ret) | ||
| 7718 | return; | ||
| 7719 | |||
| 7720 | event->orig_overflow_handler(event, data, regs); | ||
| 7721 | } | ||
| 7722 | |||
| 7723 | static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) | ||
| 7724 | { | ||
| 7725 | struct bpf_prog *prog; | ||
| 7726 | |||
| 7727 | if (event->overflow_handler_context) | ||
| 7728 | /* hw breakpoint or kernel counter */ | ||
| 7729 | return -EINVAL; | ||
| 7730 | |||
| 7731 | if (event->prog) | ||
| 7732 | return -EEXIST; | ||
| 7733 | |||
| 7734 | prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); | ||
| 7735 | if (IS_ERR(prog)) | ||
| 7736 | return PTR_ERR(prog); | ||
| 7737 | |||
| 7738 | event->prog = prog; | ||
| 7739 | event->orig_overflow_handler = READ_ONCE(event->overflow_handler); | ||
| 7740 | WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); | ||
| 7741 | return 0; | ||
| 7742 | } | ||
| 7743 | |||
| 7744 | static void perf_event_free_bpf_handler(struct perf_event *event) | ||
| 7745 | { | ||
| 7746 | struct bpf_prog *prog = event->prog; | ||
| 7747 | |||
| 7748 | if (!prog) | ||
| 7749 | return; | ||
| 7750 | |||
| 7751 | WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); | ||
| 7752 | event->prog = NULL; | ||
| 7753 | bpf_prog_put(prog); | ||
| 7754 | } | ||
| 7755 | #else | ||
| 7756 | static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) | ||
| 7757 | { | ||
| 7758 | return -EOPNOTSUPP; | ||
| 7759 | } | ||
| 7760 | static void perf_event_free_bpf_handler(struct perf_event *event) | ||
| 7761 | { | ||
| 7762 | } | ||
| 7763 | #endif | ||
| 7764 | |||
| 7605 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | 7765 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) |
| 7606 | { | 7766 | { |
| 7607 | bool is_kprobe, is_tracepoint; | 7767 | bool is_kprobe, is_tracepoint; |
| 7608 | struct bpf_prog *prog; | 7768 | struct bpf_prog *prog; |
| 7609 | 7769 | ||
| 7770 | if (event->attr.type == PERF_TYPE_HARDWARE || | ||
| 7771 | event->attr.type == PERF_TYPE_SOFTWARE) | ||
| 7772 | return perf_event_set_bpf_handler(event, prog_fd); | ||
| 7773 | |||
| 7610 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 7774 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
| 7611 | return -EINVAL; | 7775 | return -EINVAL; |
| 7612 | 7776 | ||
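
With bpf_overflow_handler() wired up, a BPF_PROG_TYPE_PERF_EVENT program can be attached to a hardware or software sampling event; when the program returns 0 the sample is dropped before the original overflow handler runs, otherwise it falls through. A hedged user-space sketch of the attach path; loading of the program itself is elided and assumed to yield prog_fd:

/* Hedged user-space sketch; prog_fd is assumed to be a loaded
 * BPF_PROG_TYPE_PERF_EVENT program obtained via bpf(BPF_PROG_LOAD, ...). */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int attach_bpf_filter(int prog_fd)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_HARDWARE,
                .size           = sizeof(attr),
                .config         = PERF_COUNT_HW_CPU_CYCLES,
                .sample_period  = 100000,
        };
        int event_fd;

        event_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (event_fd < 0)
                return -1;

        /* the kernel side rejects non HW/SW/tracepoint types with -EINVAL
         * and a second attach attempt with -EEXIST */
        return ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
}
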
| @@ -7647,6 +7811,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event) | |||
| 7647 | { | 7811 | { |
| 7648 | struct bpf_prog *prog; | 7812 | struct bpf_prog *prog; |
| 7649 | 7813 | ||
| 7814 | perf_event_free_bpf_handler(event); | ||
| 7815 | |||
| 7650 | if (!event->tp_event) | 7816 | if (!event->tp_event) |
| 7651 | return; | 7817 | return; |
| 7652 | 7818 | ||
| @@ -7805,7 +7971,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event) | |||
| 7805 | list_for_each_entry(filter, &ifh->list, entry) { | 7971 | list_for_each_entry(filter, &ifh->list, entry) { |
| 7806 | event->addr_filters_offs[count] = 0; | 7972 | event->addr_filters_offs[count] = 0; |
| 7807 | 7973 | ||
| 7808 | if (perf_addr_filter_needs_mmap(filter)) | 7974 | /* |
| 7975 | * Adjust base offset if the filter is associated to a binary | ||
| 7976 | * that needs to be mapped: | ||
| 7977 | */ | ||
| 7978 | if (filter->inode) | ||
| 7809 | event->addr_filters_offs[count] = | 7979 | event->addr_filters_offs[count] = |
| 7810 | perf_addr_filter_apply(filter, mm); | 7980 | perf_addr_filter_apply(filter, mm); |
| 7811 | 7981 | ||
| @@ -7820,7 +7990,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) | |||
| 7820 | mmput(mm); | 7990 | mmput(mm); |
| 7821 | 7991 | ||
| 7822 | restart: | 7992 | restart: |
| 7823 | perf_event_restart(event); | 7993 | perf_event_stop(event, 1); |
| 7824 | } | 7994 | } |
| 7825 | 7995 | ||
| 7826 | /* | 7996 | /* |
| @@ -7936,8 +8106,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
| 7936 | goto fail; | 8106 | goto fail; |
| 7937 | } | 8107 | } |
| 7938 | 8108 | ||
| 7939 | if (token == IF_SRC_FILE) { | 8109 | if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { |
| 7940 | filename = match_strdup(&args[2]); | 8110 | int fpos = filter->range ? 2 : 1; |
| 8111 | |||
| 8112 | filename = match_strdup(&args[fpos]); | ||
| 7941 | if (!filename) { | 8113 | if (!filename) { |
| 7942 | ret = -ENOMEM; | 8114 | ret = -ENOMEM; |
| 7943 | goto fail; | 8115 | goto fail; |
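
The parser fix reflects that the file name lands in a different match_token slot depending on whether the spec carries a size: "filter <addr>/<size>@<file>" puts it in the third argument, while the newly handled "start <addr>@<file>" and "stop <addr>@<file>" forms put it in the second. Hedged examples of such filter strings; the paths are illustrative:

/* Hedged sketch: filter strings whose filename position differs. */
const char *range_filter = "filter 0x1000/0x2000@/usr/bin/myapp"; /* IF_SRC_FILE: file is args[2] */
const char *start_filter = "start 0x4000@/usr/bin/myapp";         /* IF_SRC_FILEADDR: file is args[1] */
const char *stop_filter  = "stop 0x5000@/usr/bin/myapp";

/* e.g.: ioctl(event_fd, PERF_EVENT_IOC_SET_FILTER, range_filter); */
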
| @@ -8957,6 +9129,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 8957 | if (!overflow_handler && parent_event) { | 9129 | if (!overflow_handler && parent_event) { |
| 8958 | overflow_handler = parent_event->overflow_handler; | 9130 | overflow_handler = parent_event->overflow_handler; |
| 8959 | context = parent_event->overflow_handler_context; | 9131 | context = parent_event->overflow_handler_context; |
| 9132 | #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) | ||
| 9133 | if (overflow_handler == bpf_overflow_handler) { | ||
| 9134 | struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); | ||
| 9135 | |||
| 9136 | if (IS_ERR(prog)) { | ||
| 9137 | err = PTR_ERR(prog); | ||
| 9138 | goto err_ns; | ||
| 9139 | } | ||
| 9140 | event->prog = prog; | ||
| 9141 | event->orig_overflow_handler = | ||
| 9142 | parent_event->orig_overflow_handler; | ||
| 9143 | } | ||
| 9144 | #endif | ||
| 8960 | } | 9145 | } |
| 8961 | 9146 | ||
| 8962 | if (overflow_handler) { | 9147 | if (overflow_handler) { |
| @@ -9437,6 +9622,9 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 9437 | goto err_alloc; | 9622 | goto err_alloc; |
| 9438 | } | 9623 | } |
| 9439 | 9624 | ||
| 9625 | if (pmu->task_ctx_nr == perf_sw_context) | ||
| 9626 | event->event_caps |= PERF_EV_CAP_SOFTWARE; | ||
| 9627 | |||
| 9440 | if (group_leader && | 9628 | if (group_leader && |
| 9441 | (is_software_event(event) != is_software_event(group_leader))) { | 9629 | (is_software_event(event) != is_software_event(group_leader))) { |
| 9442 | if (is_software_event(event)) { | 9630 | if (is_software_event(event)) { |
| @@ -9450,7 +9638,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 9450 | */ | 9638 | */ |
| 9451 | pmu = group_leader->pmu; | 9639 | pmu = group_leader->pmu; |
| 9452 | } else if (is_software_event(group_leader) && | 9640 | } else if (is_software_event(group_leader) && |
| 9453 | (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { | 9641 | (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { |
| 9454 | /* | 9642 | /* |
| 9455 | * In case the group is a pure software group, and we | 9643 | * In case the group is a pure software group, and we |
| 9456 | * try to add a hardware event, move the whole group to | 9644 | * try to add a hardware event, move the whole group to |
| @@ -10385,6 +10573,8 @@ static void __init perf_event_init_all_cpus(void) | |||
| 10385 | 10573 | ||
| 10386 | INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); | 10574 | INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); |
| 10387 | raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); | 10575 | raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); |
| 10576 | |||
| 10577 | INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); | ||
| 10388 | } | 10578 | } |
| 10389 | } | 10579 | } |
| 10390 | 10580 | ||
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ae9b90dc9a5a..257fa460b846 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
| @@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, | |||
| 330 | if (!rb) | 330 | if (!rb) |
| 331 | return NULL; | 331 | return NULL; |
| 332 | 332 | ||
| 333 | if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) | 333 | if (!rb_has_aux(rb)) |
| 334 | goto err; | 334 | goto err; |
| 335 | 335 | ||
| 336 | /* | 336 | /* |
| 337 | * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), | 337 | * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), |
| 338 | * the aux buffer is in perf_mmap_close(), about to get freed. | 338 | * about to get freed, so we leave immediately. |
| 339 | * | ||
| 340 | * Checking rb::aux_mmap_count and rb::refcount has to be done in | ||
| 341 | * the same order, see perf_mmap_close. Otherwise we end up freeing | ||
| 342 | * aux pages in this path, which is a bug, because in_atomic(). | ||
| 339 | */ | 343 | */ |
| 340 | if (!atomic_read(&rb->aux_mmap_count)) | 344 | if (!atomic_read(&rb->aux_mmap_count)) |
| 341 | goto err_put; | 345 | goto err; |
| 346 | |||
| 347 | if (!atomic_inc_not_zero(&rb->aux_refcount)) | ||
| 348 | goto err; | ||
| 342 | 349 | ||
| 343 | /* | 350 | /* |
| 344 | * Nesting is not supported for AUX area, make sure nested | 351 | * Nesting is not supported for AUX area, make sure nested |
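
The reordering in perf_aux_output_begin() above checks rb->aux_mmap_count before trying to pin rb->aux_refcount, in the same order perf_mmap_close() uses on the teardown side, so this (potentially atomic) path can never end up being the one that frees the AUX pages. A compact userspace sketch of the same check-then-pin pattern, with C11 atomics standing in for the kernel's atomic_t helpers:

	#include <stdatomic.h>
	#include <stdbool.h>

	struct rb_like {
		atomic_int aux_mmap_count;	/* owned by the mmap/munmap path */
		atomic_int aux_refcount;	/* pins the AUX pages for writers */
	};

	/* atomic_inc_not_zero() equivalent */
	static bool inc_not_zero(atomic_int *v)
	{
		int old = atomic_load(v);

		while (old != 0) {
			if (atomic_compare_exchange_weak(v, &old, old + 1))
				return true;
		}
		return false;
	}

	/* Mirrors the patched perf_aux_output_begin(): bail out while the
	 * buffer is being unmapped *before* taking a reference. */
	static bool aux_begin(struct rb_like *rb)
	{
		if (atomic_load(&rb->aux_mmap_count) == 0)
			return false;		/* perf_mmap_close() in progress */

		return inc_not_zero(&rb->aux_refcount);
	}

With the old order a caller could take the last aux_refcount reference while the unmap was already in flight and end up doing the final free from a context that must not sleep; the new order avoids that by refusing to take a reference once the unmap has started.
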
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b7a525ab2083..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -150,7 +150,7 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) | |||
| 150 | * Returns 0 on success, -EFAULT on failure. | 150 | * Returns 0 on success, -EFAULT on failure. |
| 151 | */ | 151 | */ |
| 152 | static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | 152 | static int __replace_page(struct vm_area_struct *vma, unsigned long addr, |
| 153 | struct page *page, struct page *kpage) | 153 | struct page *old_page, struct page *new_page) |
| 154 | { | 154 | { |
| 155 | struct mm_struct *mm = vma->vm_mm; | 155 | struct mm_struct *mm = vma->vm_mm; |
| 156 | spinlock_t *ptl; | 156 | spinlock_t *ptl; |
| @@ -161,48 +161,49 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 161 | const unsigned long mmun_end = addr + PAGE_SIZE; | 161 | const unsigned long mmun_end = addr + PAGE_SIZE; |
| 162 | struct mem_cgroup *memcg; | 162 | struct mem_cgroup *memcg; |
| 163 | 163 | ||
| 164 | err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, | 164 | err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, |
| 165 | false); | 165 | false); |
| 166 | if (err) | 166 | if (err) |
| 167 | return err; | 167 | return err; |
| 168 | 168 | ||
| 169 | /* For try_to_free_swap() and munlock_vma_page() below */ | 169 | /* For try_to_free_swap() and munlock_vma_page() below */ |
| 170 | lock_page(page); | 170 | lock_page(old_page); |
| 171 | 171 | ||
| 172 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 172 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
| 173 | err = -EAGAIN; | 173 | err = -EAGAIN; |
| 174 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 174 | ptep = page_check_address(old_page, mm, addr, &ptl, 0); |
| 175 | if (!ptep) | 175 | if (!ptep) { |
| 176 | mem_cgroup_cancel_charge(new_page, memcg, false); | ||
| 176 | goto unlock; | 177 | goto unlock; |
| 178 | } | ||
| 177 | 179 | ||
| 178 | get_page(kpage); | 180 | get_page(new_page); |
| 179 | page_add_new_anon_rmap(kpage, vma, addr, false); | 181 | page_add_new_anon_rmap(new_page, vma, addr, false); |
| 180 | mem_cgroup_commit_charge(kpage, memcg, false, false); | 182 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
| 181 | lru_cache_add_active_or_unevictable(kpage, vma); | 183 | lru_cache_add_active_or_unevictable(new_page, vma); |
| 182 | 184 | ||
| 183 | if (!PageAnon(page)) { | 185 | if (!PageAnon(old_page)) { |
| 184 | dec_mm_counter(mm, mm_counter_file(page)); | 186 | dec_mm_counter(mm, mm_counter_file(old_page)); |
| 185 | inc_mm_counter(mm, MM_ANONPAGES); | 187 | inc_mm_counter(mm, MM_ANONPAGES); |
| 186 | } | 188 | } |
| 187 | 189 | ||
| 188 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 190 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
| 189 | ptep_clear_flush_notify(vma, addr, ptep); | 191 | ptep_clear_flush_notify(vma, addr, ptep); |
| 190 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | 192 | set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot)); |
| 191 | 193 | ||
| 192 | page_remove_rmap(page, false); | 194 | page_remove_rmap(old_page, false); |
| 193 | if (!page_mapped(page)) | 195 | if (!page_mapped(old_page)) |
| 194 | try_to_free_swap(page); | 196 | try_to_free_swap(old_page); |
| 195 | pte_unmap_unlock(ptep, ptl); | 197 | pte_unmap_unlock(ptep, ptl); |
| 196 | 198 | ||
| 197 | if (vma->vm_flags & VM_LOCKED) | 199 | if (vma->vm_flags & VM_LOCKED) |
| 198 | munlock_vma_page(page); | 200 | munlock_vma_page(old_page); |
| 199 | put_page(page); | 201 | put_page(old_page); |
| 200 | 202 | ||
| 201 | err = 0; | 203 | err = 0; |
| 202 | unlock: | 204 | unlock: |
| 203 | mem_cgroup_cancel_charge(kpage, memcg, false); | ||
| 204 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 205 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
| 205 | unlock_page(page); | 206 | unlock_page(old_page); |
| 206 | return err; | 207 | return err; |
| 207 | } | 208 | } |
| 208 | 209 | ||
| @@ -299,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, | |||
| 299 | 300 | ||
| 300 | retry: | 301 | retry: |
| 301 | /* Read the page with vaddr into memory */ | 302 | /* Read the page with vaddr into memory */ |
| 302 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); | 303 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, |
| 304 | &vma); | ||
| 303 | if (ret <= 0) | 305 | if (ret <= 0) |
| 304 | return ret; | 306 | return ret; |
| 305 | 307 | ||
| @@ -1709,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) | |||
| 1709 | * but we treat this as a 'remote' access since it is | 1711 | * but we treat this as a 'remote' access since it is |
| 1710 | * essentially a kernel access to the memory. | 1712 | * essentially a kernel access to the memory. |
| 1711 | */ | 1713 | */ |
| 1712 | result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); | 1714 | result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, |
| 1715 | NULL); | ||
| 1713 | if (result < 0) | 1716 | if (result < 0) |
| 1714 | return result; | 1717 | return result; |
| 1715 | 1718 | ||
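
Both get_user_pages_remote() call sites above are adapted to the new calling convention in which the separate write/force ints collapse into a single gup_flags word; the old (0, 1) pair becomes FOLL_FORCE. A sketch of that translation; the FOLL_* values are mirrored from include/linux/mm.h only so the snippet stands alone:

	#define FOLL_WRITE	0x01	/* intend to write to the page */
	#define FOLL_FORCE	0x10	/* override access permissions */

	static unsigned int gup_flags_from_legacy(int write, int force)
	{
		unsigned int gup_flags = 0;

		if (write)
			gup_flags |= FOLL_WRITE;
		if (force)
			gup_flags |= FOLL_FORCE;

		return gup_flags;	/* uprobes: write=0, force=1 -> FOLL_FORCE */
	}
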
diff --git a/kernel/exit.c b/kernel/exit.c index 2f974ae042a6..9d68c45ebbe3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -511,7 +511,7 @@ static void exit_mm(struct task_struct *tsk) | |||
| 511 | mm_update_next_owner(mm); | 511 | mm_update_next_owner(mm); |
| 512 | mmput(mm); | 512 | mmput(mm); |
| 513 | if (test_thread_flag(TIF_MEMDIE)) | 513 | if (test_thread_flag(TIF_MEMDIE)) |
| 514 | exit_oom_victim(tsk); | 514 | exit_oom_victim(); |
| 515 | } | 515 | } |
| 516 | 516 | ||
| 517 | static struct task_struct *find_alive_thread(struct task_struct *p) | 517 | static struct task_struct *find_alive_thread(struct task_struct *p) |
| @@ -725,7 +725,7 @@ static void check_stack_usage(void) | |||
| 725 | static inline void check_stack_usage(void) {} | 725 | static inline void check_stack_usage(void) {} |
| 726 | #endif | 726 | #endif |
| 727 | 727 | ||
| 728 | void do_exit(long code) | 728 | void __noreturn do_exit(long code) |
| 729 | { | 729 | { |
| 730 | struct task_struct *tsk = current; | 730 | struct task_struct *tsk = current; |
| 731 | int group_dead; | 731 | int group_dead; |
| @@ -848,12 +848,7 @@ void do_exit(long code) | |||
| 848 | TASKS_RCU(preempt_enable()); | 848 | TASKS_RCU(preempt_enable()); |
| 849 | exit_notify(tsk, group_dead); | 849 | exit_notify(tsk, group_dead); |
| 850 | proc_exit_connector(tsk); | 850 | proc_exit_connector(tsk); |
| 851 | #ifdef CONFIG_NUMA | 851 | mpol_put_task_policy(tsk); |
| 852 | task_lock(tsk); | ||
| 853 | mpol_put(tsk->mempolicy); | ||
| 854 | tsk->mempolicy = NULL; | ||
| 855 | task_unlock(tsk); | ||
| 856 | #endif | ||
| 857 | #ifdef CONFIG_FUTEX | 852 | #ifdef CONFIG_FUTEX |
| 858 | if (unlikely(current->pi_state_cache)) | 853 | if (unlikely(current->pi_state_cache)) |
| 859 | kfree(current->pi_state_cache); | 854 | kfree(current->pi_state_cache); |
| @@ -887,29 +882,7 @@ void do_exit(long code) | |||
| 887 | exit_rcu(); | 882 | exit_rcu(); |
| 888 | TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); | 883 | TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); |
| 889 | 884 | ||
| 890 | /* | 885 | do_task_dead(); |
| 891 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed | ||
| 892 | * when the following two conditions become true. | ||
| 893 | * - There is race condition of mmap_sem (It is acquired by | ||
| 894 | * exit_mm()), and | ||
| 895 | * - SMI occurs before setting TASK_RUNINNG. | ||
| 896 | * (or hypervisor of virtual machine switches to other guest) | ||
| 897 | * As a result, we may become TASK_RUNNING after becoming TASK_DEAD | ||
| 898 | * | ||
| 899 | * To avoid it, we have to wait for releasing tsk->pi_lock which | ||
| 900 | * is held by try_to_wake_up() | ||
| 901 | */ | ||
| 902 | smp_mb(); | ||
| 903 | raw_spin_unlock_wait(&tsk->pi_lock); | ||
| 904 | |||
| 905 | /* causes final put_task_struct in finish_task_switch(). */ | ||
| 906 | tsk->state = TASK_DEAD; | ||
| 907 | tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | ||
| 908 | schedule(); | ||
| 909 | BUG(); | ||
| 910 | /* Avoid "noreturn function does return". */ | ||
| 911 | for (;;) | ||
| 912 | cpu_relax(); /* For when BUG is null */ | ||
| 913 | } | 886 | } |
| 914 | EXPORT_SYMBOL_GPL(do_exit); | 887 | EXPORT_SYMBOL_GPL(do_exit); |
| 915 | 888 | ||
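
The tail of do_exit() is folded into a new do_task_dead() helper, and do_exit() itself gains __noreturn. Reconstructed purely from the lines deleted above (the in-tree helper lives in the scheduler code and may differ in detail), the final sequence has to cover roughly the following:

	void __noreturn do_task_dead_sketch(void)
	{
		/* Wait for a concurrent try_to_wake_up() to drop pi_lock so a
		 * late TASK_RUNNING store cannot overwrite TASK_DEAD. */
		smp_mb();
		raw_spin_unlock_wait(&current->pi_lock);

		/* Final put_task_struct() happens in finish_task_switch(). */
		current->state = TASK_DEAD;
		current->flags |= PF_NOFREEZE;	/* tell the freezer to ignore us */

		schedule();
		BUG();

		/* Avoid "noreturn function does return" if BUG() is a no-op. */
		for (;;)
			cpu_relax();
	}
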
diff --git a/kernel/fork.c b/kernel/fork.c index 52e725d4a866..623259fc794d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack) | |||
| 158 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a | 158 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a |
| 159 | * kmemcache based allocator. | 159 | * kmemcache based allocator. |
| 160 | */ | 160 | */ |
| 161 | # if THREAD_SIZE >= PAGE_SIZE | 161 | # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) |
| 162 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, | 162 | |
| 163 | int node) | 163 | #ifdef CONFIG_VMAP_STACK |
| 164 | /* | ||
| 165 | * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB | ||
| 166 | * flush. Try to minimize the number of calls by caching stacks. | ||
| 167 | */ | ||
| 168 | #define NR_CACHED_STACKS 2 | ||
| 169 | static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); | ||
| 170 | #endif | ||
| 171 | |||
| 172 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | ||
| 164 | { | 173 | { |
| 174 | #ifdef CONFIG_VMAP_STACK | ||
| 175 | void *stack; | ||
| 176 | int i; | ||
| 177 | |||
| 178 | local_irq_disable(); | ||
| 179 | for (i = 0; i < NR_CACHED_STACKS; i++) { | ||
| 180 | struct vm_struct *s = this_cpu_read(cached_stacks[i]); | ||
| 181 | |||
| 182 | if (!s) | ||
| 183 | continue; | ||
| 184 | this_cpu_write(cached_stacks[i], NULL); | ||
| 185 | |||
| 186 | tsk->stack_vm_area = s; | ||
| 187 | local_irq_enable(); | ||
| 188 | return s->addr; | ||
| 189 | } | ||
| 190 | local_irq_enable(); | ||
| 191 | |||
| 192 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, | ||
| 193 | VMALLOC_START, VMALLOC_END, | ||
| 194 | THREADINFO_GFP | __GFP_HIGHMEM, | ||
| 195 | PAGE_KERNEL, | ||
| 196 | 0, node, __builtin_return_address(0)); | ||
| 197 | |||
| 198 | /* | ||
| 199 | * We can't call find_vm_area() in interrupt context, and | ||
| 200 | * free_thread_stack() can be called in interrupt context, | ||
| 201 | * so cache the vm_struct. | ||
| 202 | */ | ||
| 203 | if (stack) | ||
| 204 | tsk->stack_vm_area = find_vm_area(stack); | ||
| 205 | return stack; | ||
| 206 | #else | ||
| 165 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, | 207 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
| 166 | THREAD_SIZE_ORDER); | 208 | THREAD_SIZE_ORDER); |
| 167 | 209 | ||
| 168 | return page ? page_address(page) : NULL; | 210 | return page ? page_address(page) : NULL; |
| 211 | #endif | ||
| 169 | } | 212 | } |
| 170 | 213 | ||
| 171 | static inline void free_thread_stack(unsigned long *stack) | 214 | static inline void free_thread_stack(struct task_struct *tsk) |
| 172 | { | 215 | { |
| 173 | __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); | 216 | #ifdef CONFIG_VMAP_STACK |
| 217 | if (task_stack_vm_area(tsk)) { | ||
| 218 | unsigned long flags; | ||
| 219 | int i; | ||
| 220 | |||
| 221 | local_irq_save(flags); | ||
| 222 | for (i = 0; i < NR_CACHED_STACKS; i++) { | ||
| 223 | if (this_cpu_read(cached_stacks[i])) | ||
| 224 | continue; | ||
| 225 | |||
| 226 | this_cpu_write(cached_stacks[i], tsk->stack_vm_area); | ||
| 227 | local_irq_restore(flags); | ||
| 228 | return; | ||
| 229 | } | ||
| 230 | local_irq_restore(flags); | ||
| 231 | |||
| 232 | vfree(tsk->stack); | ||
| 233 | return; | ||
| 234 | } | ||
| 235 | #endif | ||
| 236 | |||
| 237 | __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); | ||
| 174 | } | 238 | } |
| 175 | # else | 239 | # else |
| 176 | static struct kmem_cache *thread_stack_cache; | 240 | static struct kmem_cache *thread_stack_cache; |
| @@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, | |||
| 181 | return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); | 245 | return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); |
| 182 | } | 246 | } |
| 183 | 247 | ||
| 184 | static void free_thread_stack(unsigned long *stack) | 248 | static void free_thread_stack(struct task_struct *tsk) |
| 185 | { | 249 | { |
| 186 | kmem_cache_free(thread_stack_cache, stack); | 250 | kmem_cache_free(thread_stack_cache, tsk->stack); |
| 187 | } | 251 | } |
| 188 | 252 | ||
| 189 | void thread_stack_cache_init(void) | 253 | void thread_stack_cache_init(void) |
| @@ -213,24 +277,76 @@ struct kmem_cache *vm_area_cachep; | |||
| 213 | /* SLAB cache for mm_struct structures (tsk->mm) */ | 277 | /* SLAB cache for mm_struct structures (tsk->mm) */ |
| 214 | static struct kmem_cache *mm_cachep; | 278 | static struct kmem_cache *mm_cachep; |
| 215 | 279 | ||
| 216 | static void account_kernel_stack(unsigned long *stack, int account) | 280 | static void account_kernel_stack(struct task_struct *tsk, int account) |
| 217 | { | 281 | { |
| 218 | /* All stack pages are in the same zone and belong to the same memcg. */ | 282 | void *stack = task_stack_page(tsk); |
| 219 | struct page *first_page = virt_to_page(stack); | 283 | struct vm_struct *vm = task_stack_vm_area(tsk); |
| 284 | |||
| 285 | BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); | ||
| 286 | |||
| 287 | if (vm) { | ||
| 288 | int i; | ||
| 220 | 289 | ||
| 221 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, | 290 | BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); |
| 222 | THREAD_SIZE / 1024 * account); | ||
| 223 | 291 | ||
| 224 | memcg_kmem_update_page_stat( | 292 | for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { |
| 225 | first_page, MEMCG_KERNEL_STACK_KB, | 293 | mod_zone_page_state(page_zone(vm->pages[i]), |
| 226 | account * (THREAD_SIZE / 1024)); | 294 | NR_KERNEL_STACK_KB, |
| 295 | PAGE_SIZE / 1024 * account); | ||
| 296 | } | ||
| 297 | |||
| 298 | /* All stack pages belong to the same memcg. */ | ||
| 299 | memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, | ||
| 300 | account * (THREAD_SIZE / 1024)); | ||
| 301 | } else { | ||
| 302 | /* | ||
| 303 | * All stack pages are in the same zone and belong to the | ||
| 304 | * same memcg. | ||
| 305 | */ | ||
| 306 | struct page *first_page = virt_to_page(stack); | ||
| 307 | |||
| 308 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, | ||
| 309 | THREAD_SIZE / 1024 * account); | ||
| 310 | |||
| 311 | memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, | ||
| 312 | account * (THREAD_SIZE / 1024)); | ||
| 313 | } | ||
| 227 | } | 314 | } |
| 228 | 315 | ||
| 229 | void free_task(struct task_struct *tsk) | 316 | static void release_task_stack(struct task_struct *tsk) |
| 230 | { | 317 | { |
| 231 | account_kernel_stack(tsk->stack, -1); | 318 | account_kernel_stack(tsk, -1); |
| 232 | arch_release_thread_stack(tsk->stack); | 319 | arch_release_thread_stack(tsk->stack); |
| 233 | free_thread_stack(tsk->stack); | 320 | free_thread_stack(tsk); |
| 321 | tsk->stack = NULL; | ||
| 322 | #ifdef CONFIG_VMAP_STACK | ||
| 323 | tsk->stack_vm_area = NULL; | ||
| 324 | #endif | ||
| 325 | } | ||
| 326 | |||
| 327 | #ifdef CONFIG_THREAD_INFO_IN_TASK | ||
| 328 | void put_task_stack(struct task_struct *tsk) | ||
| 329 | { | ||
| 330 | if (atomic_dec_and_test(&tsk->stack_refcount)) | ||
| 331 | release_task_stack(tsk); | ||
| 332 | } | ||
| 333 | #endif | ||
| 334 | |||
| 335 | void free_task(struct task_struct *tsk) | ||
| 336 | { | ||
| 337 | #ifndef CONFIG_THREAD_INFO_IN_TASK | ||
| 338 | /* | ||
| 339 | * The task is finally done with both the stack and thread_info, | ||
| 340 | * so free both. | ||
| 341 | */ | ||
| 342 | release_task_stack(tsk); | ||
| 343 | #else | ||
| 344 | /* | ||
| 345 | * If the task had a separate stack allocation, it should be gone | ||
| 346 | * by now. | ||
| 347 | */ | ||
| 348 | WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); | ||
| 349 | #endif | ||
| 234 | rt_mutex_debug_task_free(tsk); | 350 | rt_mutex_debug_task_free(tsk); |
| 235 | ftrace_graph_exit_task(tsk); | 351 | ftrace_graph_exit_task(tsk); |
| 236 | put_seccomp_filter(tsk); | 352 | put_seccomp_filter(tsk); |
| @@ -243,6 +359,12 @@ static inline void free_signal_struct(struct signal_struct *sig) | |||
| 243 | { | 359 | { |
| 244 | taskstats_tgid_free(sig); | 360 | taskstats_tgid_free(sig); |
| 245 | sched_autogroup_exit(sig); | 361 | sched_autogroup_exit(sig); |
| 362 | /* | ||
| 363 | * __mmdrop is not safe to call from softirq context on x86 due to | ||
| 364 | * pgd_dtor so postpone it to the async context | ||
| 365 | */ | ||
| 366 | if (sig->oom_mm) | ||
| 367 | mmdrop_async(sig->oom_mm); | ||
| 246 | kmem_cache_free(signal_cachep, sig); | 368 | kmem_cache_free(signal_cachep, sig); |
| 247 | } | 369 | } |
| 248 | 370 | ||
| @@ -302,6 +424,7 @@ int arch_task_struct_size __read_mostly; | |||
| 302 | 424 | ||
| 303 | void __init fork_init(void) | 425 | void __init fork_init(void) |
| 304 | { | 426 | { |
| 427 | int i; | ||
| 305 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR | 428 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
| 306 | #ifndef ARCH_MIN_TASKALIGN | 429 | #ifndef ARCH_MIN_TASKALIGN |
| 307 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES | 430 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES |
| @@ -321,6 +444,10 @@ void __init fork_init(void) | |||
| 321 | init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; | 444 | init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; |
| 322 | init_task.signal->rlim[RLIMIT_SIGPENDING] = | 445 | init_task.signal->rlim[RLIMIT_SIGPENDING] = |
| 323 | init_task.signal->rlim[RLIMIT_NPROC]; | 446 | init_task.signal->rlim[RLIMIT_NPROC]; |
| 447 | |||
| 448 | for (i = 0; i < UCOUNT_COUNTS; i++) { | ||
| 449 | init_user_ns.ucount_max[i] = max_threads/2; | ||
| 450 | } | ||
| 324 | } | 451 | } |
| 325 | 452 | ||
| 326 | int __weak arch_dup_task_struct(struct task_struct *dst, | 453 | int __weak arch_dup_task_struct(struct task_struct *dst, |
| @@ -342,6 +469,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 342 | { | 469 | { |
| 343 | struct task_struct *tsk; | 470 | struct task_struct *tsk; |
| 344 | unsigned long *stack; | 471 | unsigned long *stack; |
| 472 | struct vm_struct *stack_vm_area; | ||
| 345 | int err; | 473 | int err; |
| 346 | 474 | ||
| 347 | if (node == NUMA_NO_NODE) | 475 | if (node == NUMA_NO_NODE) |
| @@ -354,11 +482,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 354 | if (!stack) | 482 | if (!stack) |
| 355 | goto free_tsk; | 483 | goto free_tsk; |
| 356 | 484 | ||
| 485 | stack_vm_area = task_stack_vm_area(tsk); | ||
| 486 | |||
| 357 | err = arch_dup_task_struct(tsk, orig); | 487 | err = arch_dup_task_struct(tsk, orig); |
| 488 | |||
| 489 | /* | ||
| 490 | * arch_dup_task_struct() clobbers the stack-related fields. Make | ||
| 491 | * sure they're properly initialized before using any stack-related | ||
| 492 | * functions again. | ||
| 493 | */ | ||
| 494 | tsk->stack = stack; | ||
| 495 | #ifdef CONFIG_VMAP_STACK | ||
| 496 | tsk->stack_vm_area = stack_vm_area; | ||
| 497 | #endif | ||
| 498 | #ifdef CONFIG_THREAD_INFO_IN_TASK | ||
| 499 | atomic_set(&tsk->stack_refcount, 1); | ||
| 500 | #endif | ||
| 501 | |||
| 358 | if (err) | 502 | if (err) |
| 359 | goto free_stack; | 503 | goto free_stack; |
| 360 | 504 | ||
| 361 | tsk->stack = stack; | ||
| 362 | #ifdef CONFIG_SECCOMP | 505 | #ifdef CONFIG_SECCOMP |
| 363 | /* | 506 | /* |
| 364 | * We must handle setting up seccomp filters once we're under | 507 | * We must handle setting up seccomp filters once we're under |
| @@ -390,21 +533,22 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 390 | tsk->task_frag.page = NULL; | 533 | tsk->task_frag.page = NULL; |
| 391 | tsk->wake_q.next = NULL; | 534 | tsk->wake_q.next = NULL; |
| 392 | 535 | ||
| 393 | account_kernel_stack(stack, 1); | 536 | account_kernel_stack(tsk, 1); |
| 394 | 537 | ||
| 395 | kcov_task_init(tsk); | 538 | kcov_task_init(tsk); |
| 396 | 539 | ||
| 397 | return tsk; | 540 | return tsk; |
| 398 | 541 | ||
| 399 | free_stack: | 542 | free_stack: |
| 400 | free_thread_stack(stack); | 543 | free_thread_stack(tsk); |
| 401 | free_tsk: | 544 | free_tsk: |
| 402 | free_task_struct(tsk); | 545 | free_task_struct(tsk); |
| 403 | return NULL; | 546 | return NULL; |
| 404 | } | 547 | } |
| 405 | 548 | ||
| 406 | #ifdef CONFIG_MMU | 549 | #ifdef CONFIG_MMU |
| 407 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | 550 | static __latent_entropy int dup_mmap(struct mm_struct *mm, |
| 551 | struct mm_struct *oldmm) | ||
| 408 | { | 552 | { |
| 409 | struct vm_area_struct *mpnt, *tmp, *prev, **pprev; | 553 | struct vm_area_struct *mpnt, *tmp, *prev, **pprev; |
| 410 | struct rb_node **rb_link, *rb_parent; | 554 | struct rb_node **rb_link, *rb_parent; |
| @@ -711,6 +855,7 @@ static inline void __mmput(struct mm_struct *mm) | |||
| 711 | ksm_exit(mm); | 855 | ksm_exit(mm); |
| 712 | khugepaged_exit(mm); /* must run before exit_mmap */ | 856 | khugepaged_exit(mm); /* must run before exit_mmap */ |
| 713 | exit_mmap(mm); | 857 | exit_mmap(mm); |
| 858 | mm_put_huge_zero_page(mm); | ||
| 714 | set_mm_exe_file(mm, NULL); | 859 | set_mm_exe_file(mm, NULL); |
| 715 | if (!list_empty(&mm->mmlist)) { | 860 | if (!list_empty(&mm->mmlist)) { |
| 716 | spin_lock(&mmlist_lock); | 861 | spin_lock(&mmlist_lock); |
| @@ -719,6 +864,7 @@ static inline void __mmput(struct mm_struct *mm) | |||
| 719 | } | 864 | } |
| 720 | if (mm->binfmt) | 865 | if (mm->binfmt) |
| 721 | module_put(mm->binfmt->module); | 866 | module_put(mm->binfmt->module); |
| 867 | set_bit(MMF_OOM_SKIP, &mm->flags); | ||
| 722 | mmdrop(mm); | 868 | mmdrop(mm); |
| 723 | } | 869 | } |
| 724 | 870 | ||
| @@ -799,6 +945,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) | |||
| 799 | EXPORT_SYMBOL(get_mm_exe_file); | 945 | EXPORT_SYMBOL(get_mm_exe_file); |
| 800 | 946 | ||
| 801 | /** | 947 | /** |
| 948 | * get_task_exe_file - acquire a reference to the task's executable file | ||
| 949 | * | ||
| 950 | * Returns %NULL if task's mm (if any) has no associated executable file or | ||
| 951 | * this is a kernel thread with borrowed mm (see the comment above get_task_mm). | ||
| 952 | * User must release file via fput(). | ||
| 953 | */ | ||
| 954 | struct file *get_task_exe_file(struct task_struct *task) | ||
| 955 | { | ||
| 956 | struct file *exe_file = NULL; | ||
| 957 | struct mm_struct *mm; | ||
| 958 | |||
| 959 | task_lock(task); | ||
| 960 | mm = task->mm; | ||
| 961 | if (mm) { | ||
| 962 | if (!(task->flags & PF_KTHREAD)) | ||
| 963 | exe_file = get_mm_exe_file(mm); | ||
| 964 | } | ||
| 965 | task_unlock(task); | ||
| 966 | return exe_file; | ||
| 967 | } | ||
| 968 | EXPORT_SYMBOL(get_task_exe_file); | ||
| 969 | |||
| 970 | /** | ||
| 802 | * get_task_mm - acquire a reference to the task's mm | 971 | * get_task_mm - acquire a reference to the task's mm |
| 803 | * | 972 | * |
| 804 | * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning | 973 | * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning |
| @@ -913,14 +1082,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
| 913 | deactivate_mm(tsk, mm); | 1082 | deactivate_mm(tsk, mm); |
| 914 | 1083 | ||
| 915 | /* | 1084 | /* |
| 916 | * If we're exiting normally, clear a user-space tid field if | 1085 | * Signal userspace if we're not exiting with a core dump |
| 917 | * requested. We leave this alone when dying by signal, to leave | 1086 | * because we want to leave the value intact for debugging |
| 918 | * the value intact in a core dump, and to save the unnecessary | 1087 | * purposes. |
| 919 | * trouble, say, a killed vfork parent shouldn't touch this mm. | ||
| 920 | * Userland only wants this done for a sys_exit. | ||
| 921 | */ | 1088 | */ |
| 922 | if (tsk->clear_child_tid) { | 1089 | if (tsk->clear_child_tid) { |
| 923 | if (!(tsk->flags & PF_SIGNALED) && | 1090 | if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) && |
| 924 | atomic_read(&mm->mm_users) > 1) { | 1091 | atomic_read(&mm->mm_users) > 1) { |
| 925 | /* | 1092 | /* |
| 926 | * We don't check the error code - if userspace has | 1093 | * We don't check the error code - if userspace has |
| @@ -1275,7 +1442,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) | |||
| 1275 | * parts of the process environment (as per the clone | 1442 | * parts of the process environment (as per the clone |
| 1276 | * flags). The actual kick-off is left to the caller. | 1443 | * flags). The actual kick-off is left to the caller. |
| 1277 | */ | 1444 | */ |
| 1278 | static struct task_struct *copy_process(unsigned long clone_flags, | 1445 | static __latent_entropy struct task_struct *copy_process( |
| 1446 | unsigned long clone_flags, | ||
| 1279 | unsigned long stack_start, | 1447 | unsigned long stack_start, |
| 1280 | unsigned long stack_size, | 1448 | unsigned long stack_size, |
| 1281 | int __user *child_tidptr, | 1449 | int __user *child_tidptr, |
| @@ -1404,7 +1572,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1404 | p->real_start_time = ktime_get_boot_ns(); | 1572 | p->real_start_time = ktime_get_boot_ns(); |
| 1405 | p->io_context = NULL; | 1573 | p->io_context = NULL; |
| 1406 | p->audit_context = NULL; | 1574 | p->audit_context = NULL; |
| 1407 | threadgroup_change_begin(current); | ||
| 1408 | cgroup_fork(p); | 1575 | cgroup_fork(p); |
| 1409 | #ifdef CONFIG_NUMA | 1576 | #ifdef CONFIG_NUMA |
| 1410 | p->mempolicy = mpol_dup(p->mempolicy); | 1577 | p->mempolicy = mpol_dup(p->mempolicy); |
| @@ -1556,6 +1723,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1556 | INIT_LIST_HEAD(&p->thread_group); | 1723 | INIT_LIST_HEAD(&p->thread_group); |
| 1557 | p->task_works = NULL; | 1724 | p->task_works = NULL; |
| 1558 | 1725 | ||
| 1726 | threadgroup_change_begin(current); | ||
| 1559 | /* | 1727 | /* |
| 1560 | * Ensure that the cgroup subsystem policies allow the new process to be | 1728 | * Ensure that the cgroup subsystem policies allow the new process to be |
| 1561 | * forked. It should be noted that the new process's css_set can be changed | 1729 |
| @@ -1656,6 +1824,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1656 | bad_fork_cancel_cgroup: | 1824 | bad_fork_cancel_cgroup: |
| 1657 | cgroup_cancel_fork(p); | 1825 | cgroup_cancel_fork(p); |
| 1658 | bad_fork_free_pid: | 1826 | bad_fork_free_pid: |
| 1827 | threadgroup_change_end(current); | ||
| 1659 | if (pid != &init_struct_pid) | 1828 | if (pid != &init_struct_pid) |
| 1660 | free_pid(pid); | 1829 | free_pid(pid); |
| 1661 | bad_fork_cleanup_thread: | 1830 | bad_fork_cleanup_thread: |
| @@ -1688,12 +1857,12 @@ bad_fork_cleanup_policy: | |||
| 1688 | mpol_put(p->mempolicy); | 1857 | mpol_put(p->mempolicy); |
| 1689 | bad_fork_cleanup_threadgroup_lock: | 1858 | bad_fork_cleanup_threadgroup_lock: |
| 1690 | #endif | 1859 | #endif |
| 1691 | threadgroup_change_end(current); | ||
| 1692 | delayacct_tsk_free(p); | 1860 | delayacct_tsk_free(p); |
| 1693 | bad_fork_cleanup_count: | 1861 | bad_fork_cleanup_count: |
| 1694 | atomic_dec(&p->cred->user->processes); | 1862 | atomic_dec(&p->cred->user->processes); |
| 1695 | exit_creds(p); | 1863 | exit_creds(p); |
| 1696 | bad_fork_free: | 1864 | bad_fork_free: |
| 1865 | put_task_stack(p); | ||
| 1697 | free_task(p); | 1866 | free_task(p); |
| 1698 | fork_out: | 1867 | fork_out: |
| 1699 | return ERR_PTR(retval); | 1868 | return ERR_PTR(retval); |
| @@ -1759,6 +1928,7 @@ long _do_fork(unsigned long clone_flags, | |||
| 1759 | 1928 | ||
| 1760 | p = copy_process(clone_flags, stack_start, stack_size, | 1929 | p = copy_process(clone_flags, stack_start, stack_size, |
| 1761 | child_tidptr, NULL, trace, tls, NUMA_NO_NODE); | 1930 | child_tidptr, NULL, trace, tls, NUMA_NO_NODE); |
| 1931 | add_latent_entropy(); | ||
| 1762 | /* | 1932 | /* |
| 1763 | * Do this prior waking up the new thread - the thread pointer | 1933 | * Do this prior waking up the new thread - the thread pointer |
| 1764 | * might get invalid after that point, if the thread exits quickly. | 1934 | * might get invalid after that point, if the thread exits quickly. |
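
Most of the fork.c churn above is the CONFIG_VMAP_STACK support: thread stacks may now live in vmalloc space, and because vfree() is comparatively expensive (it can force a TLB flush), up to NR_CACHED_STACKS recently freed stacks are parked in a per-CPU cache and handed straight to the next fork. A stripped-down userspace sketch of that cache-or-allocate pattern, with a single global array instead of per-CPU storage, malloc()/free() instead of the vmap allocator, and no interrupt masking:

	#include <stdlib.h>

	#define NR_CACHED_STACKS 2
	#define STACK_SIZE	 (16 * 1024)

	static void *cached_stacks[NR_CACHED_STACKS];

	static void *alloc_stack(void)
	{
		for (int i = 0; i < NR_CACHED_STACKS; i++) {
			if (cached_stacks[i]) {
				void *s = cached_stacks[i];

				cached_stacks[i] = NULL;
				return s;		/* reuse, no allocator call */
			}
		}
		return malloc(STACK_SIZE);		/* slow path */
	}

	static void free_stack(void *stack)
	{
		for (int i = 0; i < NR_CACHED_STACKS; i++) {
			if (!cached_stacks[i]) {
				cached_stacks[i] = stack;	/* park for reuse */
				return;
			}
		}
		free(stack);				/* cache full, really free */
	}

The same hunks also add get_task_exe_file(), a task_lock()-protected way to take a reference on a task's executable file, and (under CONFIG_THREAD_INFO_IN_TASK) a stack_refcount so the stack stays alive until the last put_task_stack().
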
diff --git a/kernel/futex.c b/kernel/futex.c index 46cb3a301bc1..2c4be467fecd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -381,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) | |||
| 381 | #endif | 381 | #endif |
| 382 | } | 382 | } |
| 383 | 383 | ||
| 384 | /* | 384 | /** |
| 385 | * We hash on the keys returned from get_futex_key (see below). | 385 | * hash_futex - Return the hash bucket in the global hash |
| 386 | * @key: Pointer to the futex key for which the hash is calculated | ||
| 387 | * | ||
| 388 | * We hash on the keys returned from get_futex_key (see below) and return the | ||
| 389 | * corresponding hash bucket in the global hash. | ||
| 386 | */ | 390 | */ |
| 387 | static struct futex_hash_bucket *hash_futex(union futex_key *key) | 391 | static struct futex_hash_bucket *hash_futex(union futex_key *key) |
| 388 | { | 392 | { |
| @@ -392,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) | |||
| 392 | return &futex_queues[hash & (futex_hashsize - 1)]; | 396 | return &futex_queues[hash & (futex_hashsize - 1)]; |
| 393 | } | 397 | } |
| 394 | 398 | ||
| 395 | /* | 399 | |
| 400 | /** | ||
| 401 | * match_futex - Check whether two futex keys are equal | ||
| 402 | * @key1: Pointer to key1 | ||
| 403 | * @key2: Pointer to key2 | ||
| 404 | * | ||
| 396 | * Return 1 if two futex_keys are equal, 0 otherwise. | 405 | * Return 1 if two futex_keys are equal, 0 otherwise. |
| 397 | */ | 406 | */ |
| 398 | static inline int match_futex(union futex_key *key1, union futex_key *key2) | 407 | static inline int match_futex(union futex_key *key1, union futex_key *key2) |
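
The futex.c hunks only add kerneldoc, but the function being documented shows the standard power-of-two bucket trick: because the global table is sized to a power of two, hash & (futex_hashsize - 1) selects a bucket without a modulo. A self-contained illustration; jhash2() is replaced by a toy mixer purely so the example compiles on its own:

	#include <stdint.h>
	#include <stdio.h>

	#define HASH_SIZE 256	/* must stay a power of two for the mask trick */

	static uint32_t toy_mix(uint32_t a, uint32_t b, uint32_t c)
	{
		/* stand-in for jhash2(); not a real hash function */
		return a * 2654435761u ^ b * 40503u ^ c;
	}

	static unsigned int bucket_of(uint32_t word, uint32_t offset, uint32_t seed)
	{
		return toy_mix(word, offset, seed) & (HASH_SIZE - 1);
	}

	int main(void)
	{
		printf("bucket %u\n", bucket_of(0x1234, 0x10, 0xdeadbeef));
		return 0;
	}
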
diff --git a/kernel/groups.c b/kernel/groups.c index 74d431d25251..2fcadd66a8fd 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
| @@ -7,55 +7,31 @@ | |||
| 7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
| 8 | #include <linux/syscalls.h> | 8 | #include <linux/syscalls.h> |
| 9 | #include <linux/user_namespace.h> | 9 | #include <linux/user_namespace.h> |
| 10 | #include <linux/vmalloc.h> | ||
| 10 | #include <asm/uaccess.h> | 11 | #include <asm/uaccess.h> |
| 11 | 12 | ||
| 12 | struct group_info *groups_alloc(int gidsetsize) | 13 | struct group_info *groups_alloc(int gidsetsize) |
| 13 | { | 14 | { |
| 14 | struct group_info *group_info; | 15 | struct group_info *gi; |
| 15 | int nblocks; | 16 | unsigned int len; |
| 16 | int i; | 17 | |
| 17 | 18 | len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; | |
| 18 | nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; | 19 | gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); |
| 19 | /* Make sure we always allocate at least one indirect block pointer */ | 20 | if (!gi) |
| 20 | nblocks = nblocks ? : 1; | 21 | gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); |
| 21 | group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); | 22 | if (!gi) |
| 22 | if (!group_info) | ||
| 23 | return NULL; | 23 | return NULL; |
| 24 | group_info->ngroups = gidsetsize; | ||
| 25 | group_info->nblocks = nblocks; | ||
| 26 | atomic_set(&group_info->usage, 1); | ||
| 27 | |||
| 28 | if (gidsetsize <= NGROUPS_SMALL) | ||
| 29 | group_info->blocks[0] = group_info->small_block; | ||
| 30 | else { | ||
| 31 | for (i = 0; i < nblocks; i++) { | ||
| 32 | kgid_t *b; | ||
| 33 | b = (void *)__get_free_page(GFP_USER); | ||
| 34 | if (!b) | ||
| 35 | goto out_undo_partial_alloc; | ||
| 36 | group_info->blocks[i] = b; | ||
| 37 | } | ||
| 38 | } | ||
| 39 | return group_info; | ||
| 40 | 24 | ||
| 41 | out_undo_partial_alloc: | 25 | atomic_set(&gi->usage, 1); |
| 42 | while (--i >= 0) { | 26 | gi->ngroups = gidsetsize; |
| 43 | free_page((unsigned long)group_info->blocks[i]); | 27 | return gi; |
| 44 | } | ||
| 45 | kfree(group_info); | ||
| 46 | return NULL; | ||
| 47 | } | 28 | } |
| 48 | 29 | ||
| 49 | EXPORT_SYMBOL(groups_alloc); | 30 | EXPORT_SYMBOL(groups_alloc); |
| 50 | 31 | ||
| 51 | void groups_free(struct group_info *group_info) | 32 | void groups_free(struct group_info *group_info) |
| 52 | { | 33 | { |
| 53 | if (group_info->blocks[0] != group_info->small_block) { | 34 | kvfree(group_info); |
| 54 | int i; | ||
| 55 | for (i = 0; i < group_info->nblocks; i++) | ||
| 56 | free_page((unsigned long)group_info->blocks[i]); | ||
| 57 | } | ||
| 58 | kfree(group_info); | ||
| 59 | } | 35 | } |
| 60 | 36 | ||
| 61 | EXPORT_SYMBOL(groups_free); | 37 | EXPORT_SYMBOL(groups_free); |
| @@ -70,7 +46,7 @@ static int groups_to_user(gid_t __user *grouplist, | |||
| 70 | 46 | ||
| 71 | for (i = 0; i < count; i++) { | 47 | for (i = 0; i < count; i++) { |
| 72 | gid_t gid; | 48 | gid_t gid; |
| 73 | gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); | 49 | gid = from_kgid_munged(user_ns, group_info->gid[i]); |
| 74 | if (put_user(gid, grouplist+i)) | 50 | if (put_user(gid, grouplist+i)) |
| 75 | return -EFAULT; | 51 | return -EFAULT; |
| 76 | } | 52 | } |
| @@ -95,7 +71,7 @@ static int groups_from_user(struct group_info *group_info, | |||
| 95 | if (!gid_valid(kgid)) | 71 | if (!gid_valid(kgid)) |
| 96 | return -EINVAL; | 72 | return -EINVAL; |
| 97 | 73 | ||
| 98 | GROUP_AT(group_info, i) = kgid; | 74 | group_info->gid[i] = kgid; |
| 99 | } | 75 | } |
| 100 | return 0; | 76 | return 0; |
| 101 | } | 77 | } |
| @@ -115,15 +91,14 @@ static void groups_sort(struct group_info *group_info) | |||
| 115 | for (base = 0; base < max; base++) { | 91 | for (base = 0; base < max; base++) { |
| 116 | int left = base; | 92 | int left = base; |
| 117 | int right = left + stride; | 93 | int right = left + stride; |
| 118 | kgid_t tmp = GROUP_AT(group_info, right); | 94 | kgid_t tmp = group_info->gid[right]; |
| 119 | 95 | ||
| 120 | while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { | 96 | while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { |
| 121 | GROUP_AT(group_info, right) = | 97 | group_info->gid[right] = group_info->gid[left]; |
| 122 | GROUP_AT(group_info, left); | ||
| 123 | right = left; | 98 | right = left; |
| 124 | left -= stride; | 99 | left -= stride; |
| 125 | } | 100 | } |
| 126 | GROUP_AT(group_info, right) = tmp; | 101 | group_info->gid[right] = tmp; |
| 127 | } | 102 | } |
| 128 | stride /= 3; | 103 | stride /= 3; |
| 129 | } | 104 | } |
| @@ -141,9 +116,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp) | |||
| 141 | right = group_info->ngroups; | 116 | right = group_info->ngroups; |
| 142 | while (left < right) { | 117 | while (left < right) { |
| 143 | unsigned int mid = (left+right)/2; | 118 | unsigned int mid = (left+right)/2; |
| 144 | if (gid_gt(grp, GROUP_AT(group_info, mid))) | 119 | if (gid_gt(grp, group_info->gid[mid])) |
| 145 | left = mid + 1; | 120 | left = mid + 1; |
| 146 | else if (gid_lt(grp, GROUP_AT(group_info, mid))) | 121 | else if (gid_lt(grp, group_info->gid[mid])) |
| 147 | right = mid; | 122 | right = mid; |
| 148 | else | 123 | else |
| 149 | return 1; | 124 | return 1; |
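
groups.c drops the old two-level block array in favour of a flat gid[] array at the end of struct group_info, which is why every GROUP_AT(gi, i) access above becomes gi->gid[i], why the allocation becomes a single kmalloc-with-vmalloc-fallback, and why groups_free() collapses to kvfree(). The sort that flat array feeds is a plain Shell sort with a 3x+1 gap sequence; here is the same algorithm over an int array so it can be compiled and poked at outside the kernel:

	#include <stdio.h>

	static void shell_sort(int *a, int n)
	{
		int stride;

		/* grow the gap: 1, 4, 13, 40, ... (3x+1 sequence) */
		for (stride = 1; stride < n; stride = 3 * stride + 1)
			;

		for (stride /= 3; stride > 0; stride /= 3) {
			/* gapped insertion sort for this stride */
			for (int base = 0; base + stride < n; base++) {
				int tmp = a[base + stride];
				int left = base;

				while (left >= 0 && a[left] > tmp) {
					a[left + stride] = a[left];
					left -= stride;
				}
				a[left + stride] = tmp;
			}
		}
	}

	int main(void)
	{
		int a[] = { 42, 7, 19, 7, 3, 100, 1 };
		int n = sizeof(a) / sizeof(a[0]);

		shell_sort(a, n);
		for (int i = 0; i < n; i++)
			printf("%d ", a[i]);
		printf("\n");
		return 0;
	}
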
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d234022805dc..2b59c82cc3e1 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
| @@ -98,26 +98,26 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
| 98 | 98 | ||
| 99 | trace_sched_process_hang(t); | 99 | trace_sched_process_hang(t); |
| 100 | 100 | ||
| 101 | if (!sysctl_hung_task_warnings) | 101 | if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic) |
| 102 | return; | 102 | return; |
| 103 | 103 | ||
| 104 | if (sysctl_hung_task_warnings > 0) | ||
| 105 | sysctl_hung_task_warnings--; | ||
| 106 | |||
| 107 | /* | 104 | /* |
| 108 | * Ok, the task did not get scheduled for more than 2 minutes, | 105 | * Ok, the task did not get scheduled for more than 2 minutes, |
| 109 | * complain: | 106 | * complain: |
| 110 | */ | 107 | */ |
| 111 | pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", | 108 | if (sysctl_hung_task_warnings) { |
| 112 | t->comm, t->pid, timeout); | 109 | sysctl_hung_task_warnings--; |
| 113 | pr_err(" %s %s %.*s\n", | 110 | pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", |
| 114 | print_tainted(), init_utsname()->release, | 111 | t->comm, t->pid, timeout); |
| 115 | (int)strcspn(init_utsname()->version, " "), | 112 | pr_err(" %s %s %.*s\n", |
| 116 | init_utsname()->version); | 113 | print_tainted(), init_utsname()->release, |
| 117 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 114 | (int)strcspn(init_utsname()->version, " "), |
| 118 | " disables this message.\n"); | 115 | init_utsname()->version); |
| 119 | sched_show_task(t); | 116 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" |
| 120 | debug_show_held_locks(t); | 117 | " disables this message.\n"); |
| 118 | sched_show_task(t); | ||
| 119 | debug_show_all_locks(); | ||
| 120 | } | ||
| 121 | 121 | ||
| 122 | touch_nmi_watchdog(); | 122 | touch_nmi_watchdog(); |
| 123 | 123 | ||
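
For configurations that enable hung_task_panic but keep a finite warning budget, the restructuring above changes behaviour: the early return now fires only when both knobs are zero, so the panic handling later in check_hung_task() still runs after the warnings are used up (previously the function bailed out as soon as sysctl_hung_task_warnings reached zero). It also switches the report from the task's held locks to debug_show_all_locks(). In outline, with plain ints standing in for the sysctls and the report/panic bodies elided:

	static int warnings_left = 10;	/* sysctl_hung_task_warnings */
	static int panic_on_hang = 1;	/* sysctl_hung_task_panic */

	static void check_hung_sketch(void)
	{
		if (!warnings_left && !panic_on_hang)
			return;			/* nothing left to report or do */

		if (warnings_left) {
			warnings_left--;
			/* print the "blocked for more than N seconds" report */
		}

		/* the panic path that follows in the real function is now
		 * reachable even after the warning budget is exhausted */
	}
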
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f68959341c0f..17f51d63da56 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c | |||
| @@ -4,58 +4,151 @@ | |||
| 4 | #include <linux/slab.h> | 4 | #include <linux/slab.h> |
| 5 | #include <linux/cpu.h> | 5 | #include <linux/cpu.h> |
| 6 | 6 | ||
| 7 | static int get_first_sibling(unsigned int cpu) | 7 | static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, |
| 8 | int cpus_per_vec) | ||
| 8 | { | 9 | { |
| 9 | unsigned int ret; | 10 | const struct cpumask *siblmsk; |
| 11 | int cpu, sibl; | ||
| 10 | 12 | ||
| 11 | ret = cpumask_first(topology_sibling_cpumask(cpu)); | 13 | for ( ; cpus_per_vec > 0; ) { |
| 12 | if (ret < nr_cpu_ids) | 14 | cpu = cpumask_first(nmsk); |
| 13 | return ret; | 15 | |
| 14 | return cpu; | 16 | /* Should not happen, but I'm too lazy to think about it */ |
| 17 | if (cpu >= nr_cpu_ids) | ||
| 18 | return; | ||
| 19 | |||
| 20 | cpumask_clear_cpu(cpu, nmsk); | ||
| 21 | cpumask_set_cpu(cpu, irqmsk); | ||
| 22 | cpus_per_vec--; | ||
| 23 | |||
| 24 | /* If the cpu has siblings, use them first */ | ||
| 25 | siblmsk = topology_sibling_cpumask(cpu); | ||
| 26 | for (sibl = -1; cpus_per_vec > 0; ) { | ||
| 27 | sibl = cpumask_next(sibl, siblmsk); | ||
| 28 | if (sibl >= nr_cpu_ids) | ||
| 29 | break; | ||
| 30 | if (!cpumask_test_and_clear_cpu(sibl, nmsk)) | ||
| 31 | continue; | ||
| 32 | cpumask_set_cpu(sibl, irqmsk); | ||
| 33 | cpus_per_vec--; | ||
| 34 | } | ||
| 35 | } | ||
| 36 | } | ||
| 37 | |||
| 38 | static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) | ||
| 39 | { | ||
| 40 | int n, nodes; | ||
| 41 | |||
| 42 | /* Calculate the number of nodes in the supplied affinity mask */ | ||
| 43 | for (n = 0, nodes = 0; n < num_online_nodes(); n++) { | ||
| 44 | if (cpumask_intersects(mask, cpumask_of_node(n))) { | ||
| 45 | node_set(n, *nodemsk); | ||
| 46 | nodes++; | ||
| 47 | } | ||
| 48 | } | ||
| 49 | return nodes; | ||
| 15 | } | 50 | } |
| 16 | 51 | ||
| 17 | /* | 52 | /** |
| 18 | * Take a map of online CPUs and the number of available interrupt vectors | 53 | * irq_create_affinity_masks - Create affinity masks for multiqueue spreading |
| 19 | * and generate an output cpumask suitable for spreading MSI/MSI-X vectors | 54 | * @affinity: The affinity mask to spread. If NULL cpu_online_mask |
| 20 | * so that they are distributed as good as possible around the CPUs. If | 55 | * is used |
| 21 | * more vectors than CPUs are available we'll map one to each CPU, | 56 | * @nvecs: The number of vectors |
| 22 | * otherwise we map one to the first sibling of each socket. | ||
| 23 | * | 57 | * |
| 24 | * If there are more vectors than CPUs we will still only have one bit | 58 | * Returns the masks pointer or NULL if allocation failed. |
| 25 | * set per CPU, but interrupt code will keep on assigning the vectors from | ||
| 26 | * the start of the bitmap until we run out of vectors. | ||
| 27 | */ | 59 | */ |
| 28 | struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) | 60 | struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, |
| 61 | int nvec) | ||
| 29 | { | 62 | { |
| 30 | struct cpumask *affinity_mask; | 63 | int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0; |
| 31 | unsigned int max_vecs = *nr_vecs; | 64 | nodemask_t nodemsk = NODE_MASK_NONE; |
| 65 | struct cpumask *masks; | ||
| 66 | cpumask_var_t nmsk; | ||
| 32 | 67 | ||
| 33 | if (max_vecs == 1) | 68 | if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) |
| 34 | return NULL; | 69 | return NULL; |
| 35 | 70 | ||
| 36 | affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); | 71 | masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL); |
| 37 | if (!affinity_mask) { | 72 | if (!masks) |
| 38 | *nr_vecs = 1; | 73 | goto out; |
| 39 | return NULL; | ||
| 40 | } | ||
| 41 | 74 | ||
| 42 | if (max_vecs >= num_online_cpus()) { | 75 | /* Stabilize the cpumasks */ |
| 43 | cpumask_copy(affinity_mask, cpu_online_mask); | 76 | get_online_cpus(); |
| 44 | *nr_vecs = num_online_cpus(); | 77 | /* If the supplied affinity mask is NULL, use cpu online mask */ |
| 45 | } else { | 78 | if (!affinity) |
| 46 | unsigned int vecs = 0, cpu; | 79 | affinity = cpu_online_mask; |
| 47 | 80 | ||
| 48 | for_each_online_cpu(cpu) { | 81 | nodes = get_nodes_in_cpumask(affinity, &nodemsk); |
| 49 | if (cpu == get_first_sibling(cpu)) { | ||
| 50 | cpumask_set_cpu(cpu, affinity_mask); | ||
| 51 | vecs++; | ||
| 52 | } | ||
| 53 | 82 | ||
| 54 | if (--max_vecs == 0) | 83 | /* |
| 84 | * If the number of nodes in the mask is less than or equal the | ||
| 85 | * number of vectors we just spread the vectors across the nodes. | ||
| 86 | */ | ||
| 87 | if (nvec <= nodes) { | ||
| 88 | for_each_node_mask(n, nodemsk) { | ||
| 89 | cpumask_copy(masks + curvec, cpumask_of_node(n)); | ||
| 90 | if (++curvec == nvec) | ||
| 55 | break; | 91 | break; |
| 56 | } | 92 | } |
| 57 | *nr_vecs = vecs; | 93 | goto outonl; |
| 94 | } | ||
| 95 | |||
| 96 | /* Spread the vectors per node */ | ||
| 97 | vecs_per_node = nvec / nodes; | ||
| 98 | /* Account for rounding errors */ | ||
| 99 | extra_vecs = nvec - (nodes * vecs_per_node); | ||
| 100 | |||
| 101 | for_each_node_mask(n, nodemsk) { | ||
| 102 | int ncpus, v, vecs_to_assign = vecs_per_node; | ||
| 103 | |||
| 104 | /* Get the cpus on this node which are in the mask */ | ||
| 105 | cpumask_and(nmsk, affinity, cpumask_of_node(n)); | ||
| 106 | |||
| 107 | /* Calculate the number of cpus per vector */ | ||
| 108 | ncpus = cpumask_weight(nmsk); | ||
| 109 | |||
| 110 | for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) { | ||
| 111 | cpus_per_vec = ncpus / vecs_to_assign; | ||
| 112 | |||
| 113 | /* Account for extra vectors to compensate rounding errors */ | ||
| 114 | if (extra_vecs) { | ||
| 115 | cpus_per_vec++; | ||
| 116 | if (!--extra_vecs) | ||
| 117 | vecs_per_node++; | ||
| 118 | } | ||
| 119 | irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); | ||
| 120 | } | ||
| 121 | |||
| 122 | if (curvec >= nvec) | ||
| 123 | break; | ||
| 58 | } | 124 | } |
| 59 | 125 | ||
| 60 | return affinity_mask; | 126 | outonl: |
| 127 | put_online_cpus(); | ||
| 128 | out: | ||
| 129 | free_cpumask_var(nmsk); | ||
| 130 | return masks; | ||
| 131 | } | ||
| 132 | |||
| 133 | /** | ||
| 134 | * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask | ||
| 135 | * @affinity: The affinity mask to spread. If NULL cpu_online_mask | ||
| 136 | * is used | ||
| 137 | * @maxvec: The maximum number of vectors available | ||
| 138 | */ | ||
| 139 | int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) | ||
| 140 | { | ||
| 141 | int cpus, ret; | ||
| 142 | |||
| 143 | /* Stabilize the cpumasks */ | ||
| 144 | get_online_cpus(); | ||
| 145 | /* If the supplied affinity mask is NULL, use cpu online mask */ | ||
| 146 | if (!affinity) | ||
| 147 | affinity = cpu_online_mask; | ||
| 148 | |||
| 149 | cpus = cpumask_weight(affinity); | ||
| 150 | ret = (cpus < maxvec) ? cpus : maxvec; | ||
| 151 | |||
| 152 | put_online_cpus(); | ||
| 153 | return ret; | ||
| 61 | } | 154 | } |
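
irq_create_affinity_masks() replaces the old one-bit-per-first-sibling mask with a two-level split: when there are at least as many vectors as NUMA nodes, the vectors are first divided across the nodes and then each node's CPUs are divided across that node's vectors, with irq_spread_init_one() keeping hyperthread siblings on the same vector; with fewer vectors than nodes, each vector simply gets a whole node's mask. The arithmetic, reduced to a standalone program with made-up topology numbers (the in-tree loop also folds the extra_vecs leftovers into a vecs_per_node bump, which is skipped here for brevity):

	#include <stdio.h>

	int main(void)
	{
		int nvec = 4;			/* vectors to spread (assumed) */
		int node_cpus[] = { 6, 6 };	/* CPUs per NUMA node (assumed) */
		int nodes = 2;
		int curvec = 0;

		int vecs_per_node = nvec / nodes;		/* 2 */
		int extra_vecs = nvec - nodes * vecs_per_node;	/* 0 here */

		for (int n = 0; n < nodes && curvec < nvec; n++) {
			int vecs_to_assign = vecs_per_node;

			for (int v = 0; v < vecs_to_assign && curvec < nvec;
			     v++, curvec++) {
				int cpus_per_vec = node_cpus[n] / vecs_to_assign;

				/* the first extra_vecs vectors soak up the
				 * rounding remainder with one extra CPU each */
				if (extra_vecs) {
					cpus_per_vec++;
					extra_vecs--;
				}
				printf("vector %d: node %d, %d CPUs\n",
				       curvec, n, cpus_per_vec);
			}
		}
		return 0;
	}

With 4 vectors over two 6-CPU nodes this prints two 3-CPU vectors per node; irq_calc_affinity_vectors() is the matching helper that caps the requested vector count at the number of CPUs in the affinity mask.
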
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b4c1bc7c9ca2..be3c34e4f2ac 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -76,7 +76,6 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) | |||
| 76 | if (!desc) | 76 | if (!desc) |
| 77 | return -EINVAL; | 77 | return -EINVAL; |
| 78 | 78 | ||
| 79 | type &= IRQ_TYPE_SENSE_MASK; | ||
| 80 | ret = __irq_set_trigger(desc, type); | 79 | ret = __irq_set_trigger(desc, type); |
| 81 | irq_put_desc_busunlock(desc, flags); | 80 | irq_put_desc_busunlock(desc, flags); |
| 82 | return ret; | 81 | return ret; |
| @@ -756,7 +755,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) | |||
| 756 | { | 755 | { |
| 757 | struct irq_chip *chip = irq_desc_get_chip(desc); | 756 | struct irq_chip *chip = irq_desc_get_chip(desc); |
| 758 | struct irqaction *action = desc->action; | 757 | struct irqaction *action = desc->action; |
| 759 | void *dev_id = raw_cpu_ptr(action->percpu_dev_id); | ||
| 760 | unsigned int irq = irq_desc_get_irq(desc); | 758 | unsigned int irq = irq_desc_get_irq(desc); |
| 761 | irqreturn_t res; | 759 | irqreturn_t res; |
| 762 | 760 | ||
| @@ -765,15 +763,26 @@ void handle_percpu_devid_irq(struct irq_desc *desc) | |||
| 765 | if (chip->irq_ack) | 763 | if (chip->irq_ack) |
| 766 | chip->irq_ack(&desc->irq_data); | 764 | chip->irq_ack(&desc->irq_data); |
| 767 | 765 | ||
| 768 | trace_irq_handler_entry(irq, action); | 766 | if (likely(action)) { |
| 769 | res = action->handler(irq, dev_id); | 767 | trace_irq_handler_entry(irq, action); |
| 770 | trace_irq_handler_exit(irq, action, res); | 768 | res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); |
| 769 | trace_irq_handler_exit(irq, action, res); | ||
| 770 | } else { | ||
| 771 | unsigned int cpu = smp_processor_id(); | ||
| 772 | bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); | ||
| 773 | |||
| 774 | if (enabled) | ||
| 775 | irq_percpu_disable(desc, cpu); | ||
| 776 | |||
| 777 | pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n", | ||
| 778 | enabled ? " and unmasked" : "", irq, cpu); | ||
| 779 | } | ||
| 771 | 780 | ||
| 772 | if (chip->irq_eoi) | 781 | if (chip->irq_eoi) |
| 773 | chip->irq_eoi(&desc->irq_data); | 782 | chip->irq_eoi(&desc->irq_data); |
| 774 | } | 783 | } |
| 775 | 784 | ||
| 776 | void | 785 | static void |
| 777 | __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, | 786 | __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, |
| 778 | int is_chained, const char *name) | 787 | int is_chained, const char *name) |
| 779 | { | 788 | { |
| @@ -820,6 +829,21 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, | |||
| 820 | desc->name = name; | 829 | desc->name = name; |
| 821 | 830 | ||
| 822 | if (handle != handle_bad_irq && is_chained) { | 831 | if (handle != handle_bad_irq && is_chained) { |
| 832 | unsigned int type = irqd_get_trigger_type(&desc->irq_data); | ||
| 833 | |||
| 834 | /* | ||
| 835 | * We're about to start this interrupt immediately, | ||
| 836 | * hence the need to set the trigger configuration. | ||
| 837 | * But the .set_type callback may have overridden the | ||
| 838 | * flow handler, ignoring that we're dealing with a | ||
| 839 | * chained interrupt. Reset it immediately because we | ||
| 840 | * do know better. | ||
| 841 | */ | ||
| 842 | if (type != IRQ_TYPE_NONE) { | ||
| 843 | __irq_set_trigger(desc, type); | ||
| 844 | desc->handle_irq = handle; | ||
| 845 | } | ||
| 846 | |||
| 823 | irq_settings_set_noprobe(desc); | 847 | irq_settings_set_noprobe(desc); |
| 824 | irq_settings_set_norequest(desc); | 848 | irq_settings_set_norequest(desc); |
| 825 | irq_settings_set_nothread(desc); | 849 | irq_settings_set_nothread(desc); |
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index abd286afbd27..ee32870079c9 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
| @@ -260,9 +260,9 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) | |||
| 260 | } | 260 | } |
| 261 | 261 | ||
| 262 | /** | 262 | /** |
| 263 | * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain | 263 | * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain |
| 264 | * @d: irq domain for which to allocate chips | 264 | * @d: irq domain for which to allocate chips |
| 265 | * @irqs_per_chip: Number of interrupts each chip handles | 265 | * @irqs_per_chip: Number of interrupts each chip handles (max 32) |
| 266 | * @num_ct: Number of irq_chip_type instances associated with this | 266 | * @num_ct: Number of irq_chip_type instances associated with this |
| 267 | * @name: Name of the irq chip | 267 | * @name: Name of the irq chip |
| 268 | * @handler: Default flow handler associated with these chips | 268 | * @handler: Default flow handler associated with these chips |
| @@ -270,11 +270,11 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) | |||
| 270 | * @set: IRQ_* bits to set in the mapping function | 270 | * @set: IRQ_* bits to set in the mapping function |
| 271 | * @gcflags: Generic chip specific setup flags | 271 | * @gcflags: Generic chip specific setup flags |
| 272 | */ | 272 | */ |
| 273 | int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | 273 | int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, |
| 274 | int num_ct, const char *name, | 274 | int num_ct, const char *name, |
| 275 | irq_flow_handler_t handler, | 275 | irq_flow_handler_t handler, |
| 276 | unsigned int clr, unsigned int set, | 276 | unsigned int clr, unsigned int set, |
| 277 | enum irq_gc_flags gcflags) | 277 | enum irq_gc_flags gcflags) |
| 278 | { | 278 | { |
| 279 | struct irq_domain_chip_generic *dgc; | 279 | struct irq_domain_chip_generic *dgc; |
| 280 | struct irq_chip_generic *gc; | 280 | struct irq_chip_generic *gc; |
| @@ -326,7 +326,21 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | |||
| 326 | d->name = name; | 326 | d->name = name; |
| 327 | return 0; | 327 | return 0; |
| 328 | } | 328 | } |
| 329 | EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); | 329 | EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); |
| 330 | |||
| 331 | static struct irq_chip_generic * | ||
| 332 | __irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) | ||
| 333 | { | ||
| 334 | struct irq_domain_chip_generic *dgc = d->gc; | ||
| 335 | int idx; | ||
| 336 | |||
| 337 | if (!dgc) | ||
| 338 | return ERR_PTR(-ENODEV); | ||
| 339 | idx = hw_irq / dgc->irqs_per_chip; | ||
| 340 | if (idx >= dgc->num_chips) | ||
| 341 | return ERR_PTR(-EINVAL); | ||
| 342 | return dgc->gc[idx]; | ||
| 343 | } | ||
| 330 | 344 | ||
| 331 | /** | 345 | /** |
| 332 | * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq | 346 | * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq |
| @@ -336,15 +350,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); | |||
| 336 | struct irq_chip_generic * | 350 | struct irq_chip_generic * |
| 337 | irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) | 351 | irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) |
| 338 | { | 352 | { |
| 339 | struct irq_domain_chip_generic *dgc = d->gc; | 353 | struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq); |
| 340 | int idx; | ||
| 341 | 354 | ||
| 342 | if (!dgc) | 355 | return !IS_ERR(gc) ? gc : NULL; |
| 343 | return NULL; | ||
| 344 | idx = hw_irq / dgc->irqs_per_chip; | ||
| 345 | if (idx >= dgc->num_chips) | ||
| 346 | return NULL; | ||
| 347 | return dgc->gc[idx]; | ||
| 348 | } | 356 | } |
| 349 | EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); | 357 | EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); |
| 350 | 358 | ||
| @@ -368,13 +376,9 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | |||
| 368 | unsigned long flags; | 376 | unsigned long flags; |
| 369 | int idx; | 377 | int idx; |
| 370 | 378 | ||
| 371 | if (!d->gc) | 379 | gc = __irq_get_domain_generic_chip(d, hw_irq); |
| 372 | return -ENODEV; | 380 | if (IS_ERR(gc)) |
| 373 | 381 | return PTR_ERR(gc); | |
| 374 | idx = hw_irq / dgc->irqs_per_chip; | ||
| 375 | if (idx >= dgc->num_chips) | ||
| 376 | return -EINVAL; | ||
| 377 | gc = dgc->gc[idx]; | ||
| 378 | 382 | ||
| 379 | idx = hw_irq % dgc->irqs_per_chip; | 383 | idx = hw_irq % dgc->irqs_per_chip; |
| 380 | 384 | ||
| @@ -409,10 +413,30 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | |||
| 409 | irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); | 413 | irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); |
| 410 | return 0; | 414 | return 0; |
| 411 | } | 415 | } |
| 412 | EXPORT_SYMBOL_GPL(irq_map_generic_chip); | 416 | |
| 417 | static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) | ||
| 418 | { | ||
| 419 | struct irq_data *data = irq_domain_get_irq_data(d, virq); | ||
| 420 | struct irq_domain_chip_generic *dgc = d->gc; | ||
| 421 | unsigned int hw_irq = data->hwirq; | ||
| 422 | struct irq_chip_generic *gc; | ||
| 423 | int irq_idx; | ||
| 424 | |||
| 425 | gc = irq_get_domain_generic_chip(d, hw_irq); | ||
| 426 | if (!gc) | ||
| 427 | return; | ||
| 428 | |||
| 429 | irq_idx = hw_irq % dgc->irqs_per_chip; | ||
| 430 | |||
| 431 | clear_bit(irq_idx, &gc->installed); | ||
| 432 | irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL, | ||
| 433 | NULL); | ||
| 434 | |||
| 435 | } | ||
| 413 | 436 | ||
| 414 | struct irq_domain_ops irq_generic_chip_ops = { | 437 | struct irq_domain_ops irq_generic_chip_ops = { |
| 415 | .map = irq_map_generic_chip, | 438 | .map = irq_map_generic_chip, |
| 439 | .unmap = irq_unmap_generic_chip, | ||
| 416 | .xlate = irq_domain_xlate_onetwocell, | 440 | .xlate = irq_domain_xlate_onetwocell, |
| 417 | }; | 441 | }; |
| 418 | EXPORT_SYMBOL_GPL(irq_generic_chip_ops); | 442 | EXPORT_SYMBOL_GPL(irq_generic_chip_ops); |
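For context, the sketch below shows how a driver typically wires up a generic-chip domain so that the refactored lookup helper and the new .unmap callback above are exercised through irq_generic_chip_ops. It is only a sketch: the controller name, register offsets and the single 32-interrupt chip are assumptions, and it assumes the usual irq_alloc_domain_generic_chips() wrapper in <linux/irq.h> now forwards to the renamed __irq_alloc_domain_generic_chips().

#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int __init example_intc_init(struct device_node *np, void __iomem *base)
{
        struct irq_domain *domain;
        struct irq_chip_generic *gc;
        int ret;

        /* One linear domain, 32 hwirqs, mapped/unmapped via irq_generic_chip_ops */
        domain = irq_domain_add_linear(np, 32, &irq_generic_chip_ops, NULL);
        if (!domain)
                return -ENOMEM;

        /* One generic chip with a single chip type covering all 32 interrupts */
        ret = irq_alloc_domain_generic_chips(domain, 32, 1, "example-intc",
                                             handle_level_irq, 0, 0,
                                             IRQ_GC_INIT_MASK_CACHE);
        if (ret) {
                irq_domain_remove(domain);
                return ret;
        }

        gc = irq_get_domain_generic_chip(domain, 0);
        gc->reg_base = base;
        gc->chip_types[0].regs.mask = 0x04;     /* assumed register offsets */
        gc->chip_types[0].regs.ack = 0x08;
        gc->chip_types[0].chip.irq_mask = irq_gc_mask_set_bit;
        gc->chip_types[0].chip.irq_unmask = irq_gc_mask_clr_bit;
        gc->chip_types[0].chip.irq_ack = irq_gc_ack_set_bit;

        /*
         * Mappings created through this domain are now also torn down by the
         * new irq_unmap_generic_chip() callback when they are disposed.
         */
        return 0;
}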
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a623b44f2d4b..00bb0aeea1d0 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/radix-tree.h> | 15 | #include <linux/radix-tree.h> |
| 16 | #include <linux/bitmap.h> | 16 | #include <linux/bitmap.h> |
| 17 | #include <linux/irqdomain.h> | 17 | #include <linux/irqdomain.h> |
| 18 | #include <linux/sysfs.h> | ||
| 18 | 19 | ||
| 19 | #include "internals.h" | 20 | #include "internals.h" |
| 20 | 21 | ||
| @@ -123,6 +124,181 @@ static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); | |||
| 123 | 124 | ||
| 124 | #ifdef CONFIG_SPARSE_IRQ | 125 | #ifdef CONFIG_SPARSE_IRQ |
| 125 | 126 | ||
| 127 | static void irq_kobj_release(struct kobject *kobj); | ||
| 128 | |||
| 129 | #ifdef CONFIG_SYSFS | ||
| 130 | static struct kobject *irq_kobj_base; | ||
| 131 | |||
| 132 | #define IRQ_ATTR_RO(_name) \ | ||
| 133 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | ||
| 134 | |||
| 135 | static ssize_t per_cpu_count_show(struct kobject *kobj, | ||
| 136 | struct kobj_attribute *attr, char *buf) | ||
| 137 | { | ||
| 138 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 139 | int cpu, irq = desc->irq_data.irq; | ||
| 140 | ssize_t ret = 0; | ||
| 141 | char *p = ""; | ||
| 142 | |||
| 143 | for_each_possible_cpu(cpu) { | ||
| 144 | unsigned int c = kstat_irqs_cpu(irq, cpu); | ||
| 145 | |||
| 146 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); | ||
| 147 | p = ","; | ||
| 148 | } | ||
| 149 | |||
| 150 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); | ||
| 151 | return ret; | ||
| 152 | } | ||
| 153 | IRQ_ATTR_RO(per_cpu_count); | ||
| 154 | |||
| 155 | static ssize_t chip_name_show(struct kobject *kobj, | ||
| 156 | struct kobj_attribute *attr, char *buf) | ||
| 157 | { | ||
| 158 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 159 | ssize_t ret = 0; | ||
| 160 | |||
| 161 | raw_spin_lock_irq(&desc->lock); | ||
| 162 | if (desc->irq_data.chip && desc->irq_data.chip->name) { | ||
| 163 | ret = scnprintf(buf, PAGE_SIZE, "%s\n", | ||
| 164 | desc->irq_data.chip->name); | ||
| 165 | } | ||
| 166 | raw_spin_unlock_irq(&desc->lock); | ||
| 167 | |||
| 168 | return ret; | ||
| 169 | } | ||
| 170 | IRQ_ATTR_RO(chip_name); | ||
| 171 | |||
| 172 | static ssize_t hwirq_show(struct kobject *kobj, | ||
| 173 | struct kobj_attribute *attr, char *buf) | ||
| 174 | { | ||
| 175 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 176 | ssize_t ret = 0; | ||
| 177 | |||
| 178 | raw_spin_lock_irq(&desc->lock); | ||
| 179 | if (desc->irq_data.domain) | ||
| 180 | ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq); | ||
| 181 | raw_spin_unlock_irq(&desc->lock); | ||
| 182 | |||
| 183 | return ret; | ||
| 184 | } | ||
| 185 | IRQ_ATTR_RO(hwirq); | ||
| 186 | |||
| 187 | static ssize_t type_show(struct kobject *kobj, | ||
| 188 | struct kobj_attribute *attr, char *buf) | ||
| 189 | { | ||
| 190 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 191 | ssize_t ret = 0; | ||
| 192 | |||
| 193 | raw_spin_lock_irq(&desc->lock); | ||
| 194 | ret = sprintf(buf, "%s\n", | ||
| 195 | irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); | ||
| 196 | raw_spin_unlock_irq(&desc->lock); | ||
| 197 | |||
| 198 | return ret; | ||
| 199 | |||
| 200 | } | ||
| 201 | IRQ_ATTR_RO(type); | ||
| 202 | |||
| 203 | static ssize_t name_show(struct kobject *kobj, | ||
| 204 | struct kobj_attribute *attr, char *buf) | ||
| 205 | { | ||
| 206 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 207 | ssize_t ret = 0; | ||
| 208 | |||
| 209 | raw_spin_lock_irq(&desc->lock); | ||
| 210 | if (desc->name) | ||
| 211 | ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); | ||
| 212 | raw_spin_unlock_irq(&desc->lock); | ||
| 213 | |||
| 214 | return ret; | ||
| 215 | } | ||
| 216 | IRQ_ATTR_RO(name); | ||
| 217 | |||
| 218 | static ssize_t actions_show(struct kobject *kobj, | ||
| 219 | struct kobj_attribute *attr, char *buf) | ||
| 220 | { | ||
| 221 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); | ||
| 222 | struct irqaction *action; | ||
| 223 | ssize_t ret = 0; | ||
| 224 | char *p = ""; | ||
| 225 | |||
| 226 | raw_spin_lock_irq(&desc->lock); | ||
| 227 | for (action = desc->action; action != NULL; action = action->next) { | ||
| 228 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", | ||
| 229 | p, action->name); | ||
| 230 | p = ","; | ||
| 231 | } | ||
| 232 | raw_spin_unlock_irq(&desc->lock); | ||
| 233 | |||
| 234 | if (ret) | ||
| 235 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); | ||
| 236 | |||
| 237 | return ret; | ||
| 238 | } | ||
| 239 | IRQ_ATTR_RO(actions); | ||
| 240 | |||
| 241 | static struct attribute *irq_attrs[] = { | ||
| 242 | &per_cpu_count_attr.attr, | ||
| 243 | &chip_name_attr.attr, | ||
| 244 | &hwirq_attr.attr, | ||
| 245 | &type_attr.attr, | ||
| 246 | &name_attr.attr, | ||
| 247 | &actions_attr.attr, | ||
| 248 | NULL | ||
| 249 | }; | ||
| 250 | |||
| 251 | static struct kobj_type irq_kobj_type = { | ||
| 252 | .release = irq_kobj_release, | ||
| 253 | .sysfs_ops = &kobj_sysfs_ops, | ||
| 254 | .default_attrs = irq_attrs, | ||
| 255 | }; | ||
| 256 | |||
| 257 | static void irq_sysfs_add(int irq, struct irq_desc *desc) | ||
| 258 | { | ||
| 259 | if (irq_kobj_base) { | ||
| 260 | /* | ||
| 261 | * Continue even in case of failure, as this is not | ||
| 262 | * crucial. | ||
| 263 | */ | ||
| 264 | if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) | ||
| 265 | pr_warn("Failed to add kobject for irq %d\n", irq); | ||
| 266 | } | ||
| 267 | } | ||
| 268 | |||
| 269 | static int __init irq_sysfs_init(void) | ||
| 270 | { | ||
| 271 | struct irq_desc *desc; | ||
| 272 | int irq; | ||
| 273 | |||
| 274 | /* Prevent concurrent irq alloc/free */ | ||
| 275 | irq_lock_sparse(); | ||
| 276 | |||
| 277 | irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); | ||
| 278 | if (!irq_kobj_base) { | ||
| 279 | irq_unlock_sparse(); | ||
| 280 | return -ENOMEM; | ||
| 281 | } | ||
| 282 | |||
| 283 | /* Add the already allocated interrupts */ | ||
| 284 | for_each_irq_desc(irq, desc) | ||
| 285 | irq_sysfs_add(irq, desc); | ||
| 286 | irq_unlock_sparse(); | ||
| 287 | |||
| 288 | return 0; | ||
| 289 | } | ||
| 290 | postcore_initcall(irq_sysfs_init); | ||
| 291 | |||
| 292 | #else /* !CONFIG_SYSFS */ | ||
| 293 | |||
| 294 | static struct kobj_type irq_kobj_type = { | ||
| 295 | .release = irq_kobj_release, | ||
| 296 | }; | ||
| 297 | |||
| 298 | static void irq_sysfs_add(int irq, struct irq_desc *desc) {} | ||
| 299 | |||
| 300 | #endif /* CONFIG_SYSFS */ | ||
| 301 | |||
| 126 | static RADIX_TREE(irq_desc_tree, GFP_KERNEL); | 302 | static RADIX_TREE(irq_desc_tree, GFP_KERNEL); |
| 127 | 303 | ||
| 128 | static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) | 304 | static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) |
| @@ -187,6 +363,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, | |||
| 187 | 363 | ||
| 188 | desc_set_defaults(irq, desc, node, affinity, owner); | 364 | desc_set_defaults(irq, desc, node, affinity, owner); |
| 189 | irqd_set(&desc->irq_data, flags); | 365 | irqd_set(&desc->irq_data, flags); |
| 366 | kobject_init(&desc->kobj, &irq_kobj_type); | ||
| 190 | 367 | ||
| 191 | return desc; | 368 | return desc; |
| 192 | 369 | ||
| @@ -197,15 +374,22 @@ err_desc: | |||
| 197 | return NULL; | 374 | return NULL; |
| 198 | } | 375 | } |
| 199 | 376 | ||
| 200 | static void delayed_free_desc(struct rcu_head *rhp) | 377 | static void irq_kobj_release(struct kobject *kobj) |
| 201 | { | 378 | { |
| 202 | struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); | 379 | struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); |
| 203 | 380 | ||
| 204 | free_masks(desc); | 381 | free_masks(desc); |
| 205 | free_percpu(desc->kstat_irqs); | 382 | free_percpu(desc->kstat_irqs); |
| 206 | kfree(desc); | 383 | kfree(desc); |
| 207 | } | 384 | } |
| 208 | 385 | ||
| 386 | static void delayed_free_desc(struct rcu_head *rhp) | ||
| 387 | { | ||
| 388 | struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); | ||
| 389 | |||
| 390 | kobject_put(&desc->kobj); | ||
| 391 | } | ||
| 392 | |||
| 209 | static void free_desc(unsigned int irq) | 393 | static void free_desc(unsigned int irq) |
| 210 | { | 394 | { |
| 211 | struct irq_desc *desc = irq_to_desc(irq); | 395 | struct irq_desc *desc = irq_to_desc(irq); |
| @@ -217,8 +401,12 @@ static void free_desc(unsigned int irq) | |||
| 217 | * kstat_irq_usr(). Once we deleted the descriptor from the | 401 | * kstat_irq_usr(). Once we deleted the descriptor from the |
| 218 | * sparse tree we can free it. Access in proc will fail to | 402 | * sparse tree we can free it. Access in proc will fail to |
| 219 | * lookup the descriptor. | 403 | * lookup the descriptor. |
| 404 | * | ||
| 405 | * The sysfs entry must be serialized against a concurrent | ||
| 406 | * irq_sysfs_init() as well. | ||
| 220 | */ | 407 | */ |
| 221 | mutex_lock(&sparse_irq_lock); | 408 | mutex_lock(&sparse_irq_lock); |
| 409 | kobject_del(&desc->kobj); | ||
| 222 | delete_irq_desc(irq); | 410 | delete_irq_desc(irq); |
| 223 | mutex_unlock(&sparse_irq_lock); | 411 | mutex_unlock(&sparse_irq_lock); |
| 224 | 412 | ||
| @@ -236,31 +424,31 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, | |||
| 236 | const struct cpumask *mask = NULL; | 424 | const struct cpumask *mask = NULL; |
| 237 | struct irq_desc *desc; | 425 | struct irq_desc *desc; |
| 238 | unsigned int flags; | 426 | unsigned int flags; |
| 239 | int i, cpu = -1; | 427 | int i; |
| 240 | 428 | ||
| 241 | if (affinity && cpumask_empty(affinity)) | 429 | /* Validate affinity mask(s) */ |
| 242 | return -EINVAL; | 430 | if (affinity) { |
| 431 | for (i = 0, mask = affinity; i < cnt; i++, mask++) { | ||
| 432 | if (cpumask_empty(mask)) | ||
| 433 | return -EINVAL; | ||
| 434 | } | ||
| 435 | } | ||
| 243 | 436 | ||
| 244 | flags = affinity ? IRQD_AFFINITY_MANAGED : 0; | 437 | flags = affinity ? IRQD_AFFINITY_MANAGED : 0; |
| 438 | mask = NULL; | ||
| 245 | 439 | ||
| 246 | for (i = 0; i < cnt; i++) { | 440 | for (i = 0; i < cnt; i++) { |
| 247 | if (affinity) { | 441 | if (affinity) { |
| 248 | cpu = cpumask_next(cpu, affinity); | 442 | node = cpu_to_node(cpumask_first(affinity)); |
| 249 | if (cpu >= nr_cpu_ids) | 443 | mask = affinity; |
| 250 | cpu = cpumask_first(affinity); | 444 | affinity++; |
| 251 | node = cpu_to_node(cpu); | ||
| 252 | |||
| 253 | /* | ||
| 254 | * For single allocations we use the caller provided | ||
| 255 | * mask otherwise we use the mask of the target cpu | ||
| 256 | */ | ||
| 257 | mask = cnt == 1 ? affinity : cpumask_of(cpu); | ||
| 258 | } | 445 | } |
| 259 | desc = alloc_desc(start + i, node, flags, mask, owner); | 446 | desc = alloc_desc(start + i, node, flags, mask, owner); |
| 260 | if (!desc) | 447 | if (!desc) |
| 261 | goto err; | 448 | goto err; |
| 262 | mutex_lock(&sparse_irq_lock); | 449 | mutex_lock(&sparse_irq_lock); |
| 263 | irq_insert_desc(start + i, desc); | 450 | irq_insert_desc(start + i, desc); |
| 451 | irq_sysfs_add(start + i, desc); | ||
| 264 | mutex_unlock(&sparse_irq_lock); | 452 | mutex_unlock(&sparse_irq_lock); |
| 265 | } | 453 | } |
| 266 | return start; | 454 | return start; |
| @@ -481,9 +669,9 @@ EXPORT_SYMBOL_GPL(irq_free_descs); | |||
| 481 | * @cnt: Number of consecutive irqs to allocate. | 669 | * @cnt: Number of consecutive irqs to allocate. |
| 482 | * @node: Preferred node on which the irq descriptor should be allocated | 670 | * @node: Preferred node on which the irq descriptor should be allocated |
| 483 | * @owner: Owning module (can be NULL) | 671 | * @owner: Owning module (can be NULL) |
| 484 | * @affinity: Optional pointer to an affinity mask which hints where the | 672 | * @affinity: Optional pointer to an affinity mask array of size @cnt which |
| 485 | * irq descriptors should be allocated and which default | 673 | * hints where the irq descriptors should be allocated and which |
| 486 | * affinities to use | 674 | * default affinities to use |
| 487 | * | 675 | * |
| 488 | * Returns the first irq number or error code | 676 | * Returns the first irq number or error code |
| 489 | */ | 677 | */ |
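The attributes added above are exposed under /sys/kernel/irq/<irq>/ (the kobjects hang off kernel_kobj). Below is a hedged userspace sketch that dumps them for a single interrupt; only the attribute names come from the irq_attrs[] array in this hunk, the rest of the tool is illustrative.

#include <stdio.h>

int main(int argc, char **argv)
{
        static const char * const attrs[] = {
                "chip_name", "hwirq", "type", "name", "actions", "per_cpu_count",
        };
        char path[128], line[4096];
        unsigned int i;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <irq>\n", argv[0]);
                return 1;
        }

        for (i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
                FILE *f;

                snprintf(path, sizeof(path), "/sys/kernel/irq/%s/%s",
                         argv[1], attrs[i]);
                f = fopen(path, "r");
                if (!f)
                        continue;
                /* Empty attributes (e.g. no name set) simply print nothing. */
                if (fgets(line, sizeof(line), f))
                        printf("%-14s %s", attrs[i], line);
                fclose(f);
        }
        return 0;
}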
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4752b43662e0..8c0a0ae43521 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
| @@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); | |||
| 80 | 80 | ||
| 81 | /** | 81 | /** |
| 82 | * __irq_domain_add() - Allocate a new irq_domain data structure | 82 | * __irq_domain_add() - Allocate a new irq_domain data structure |
| 83 | * @of_node: optional device-tree node of the interrupt controller | 83 | * @fwnode: firmware node for the interrupt controller |
| 84 | * @size: Size of linear map; 0 for radix mapping only | 84 | * @size: Size of linear map; 0 for radix mapping only |
| 85 | * @hwirq_max: Maximum number of interrupts supported by controller | 85 | * @hwirq_max: Maximum number of interrupts supported by controller |
| 86 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no | 86 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no |
| @@ -96,10 +96,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, | |||
| 96 | const struct irq_domain_ops *ops, | 96 | const struct irq_domain_ops *ops, |
| 97 | void *host_data) | 97 | void *host_data) |
| 98 | { | 98 | { |
| 99 | struct device_node *of_node = to_of_node(fwnode); | ||
| 99 | struct irq_domain *domain; | 100 | struct irq_domain *domain; |
| 100 | struct device_node *of_node; | ||
| 101 | |||
| 102 | of_node = to_of_node(fwnode); | ||
| 103 | 101 | ||
| 104 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), | 102 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), |
| 105 | GFP_KERNEL, of_node_to_nid(of_node)); | 103 | GFP_KERNEL, of_node_to_nid(of_node)); |
| @@ -868,7 +866,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, | |||
| 868 | if (WARN_ON(intsize < 1)) | 866 | if (WARN_ON(intsize < 1)) |
| 869 | return -EINVAL; | 867 | return -EINVAL; |
| 870 | *out_hwirq = intspec[0]; | 868 | *out_hwirq = intspec[0]; |
| 871 | *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; | 869 | if (intsize > 1) |
| 870 | *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; | ||
| 871 | else | ||
| 872 | *out_type = IRQ_TYPE_NONE; | ||
| 872 | return 0; | 873 | return 0; |
| 873 | } | 874 | } |
| 874 | EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); | 875 | EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); |
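A minimal sketch of a driver relying on irq_domain_xlate_onetwocell(), assuming a hypothetical two-cell binding; with the change above, any bits in the second cell outside IRQ_TYPE_SENSE_MASK are now stripped before the trigger type is applied.

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int example_map(struct irq_domain *d, unsigned int virq,
                       irq_hw_number_t hwirq)
{
        irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
        return 0;
}

static const struct irq_domain_ops example_ops = {
        .map    = example_map,
        .xlate  = irq_domain_xlate_onetwocell,
};

static struct irq_domain * __init example_domain_init(struct device_node *np)
{
        /* Cell 0 of "interrupts" is the hwirq, optional cell 1 the trigger type. */
        return irq_domain_add_linear(np, 32, &example_ops, NULL);
}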
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 73a2b786b5e9..0c5f1a5db654 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -669,8 +669,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) | |||
| 669 | return 0; | 669 | return 0; |
| 670 | } | 670 | } |
| 671 | 671 | ||
| 672 | flags &= IRQ_TYPE_SENSE_MASK; | ||
| 673 | |||
| 674 | if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { | 672 | if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { |
| 675 | if (!irqd_irq_masked(&desc->irq_data)) | 673 | if (!irqd_irq_masked(&desc->irq_data)) |
| 676 | mask_irq(desc); | 674 | mask_irq(desc); |
| @@ -678,7 +676,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) | |||
| 678 | unmask = 1; | 676 | unmask = 1; |
| 679 | } | 677 | } |
| 680 | 678 | ||
| 681 | /* caller masked out all except trigger mode flags */ | 679 | /* Mask all flags except trigger mode */ |
| 680 | flags &= IRQ_TYPE_SENSE_MASK; | ||
| 682 | ret = chip->irq_set_type(&desc->irq_data, flags); | 681 | ret = chip->irq_set_type(&desc->irq_data, flags); |
| 683 | 682 | ||
| 684 | switch (ret) { | 683 | switch (ret) { |
| @@ -1681,8 +1680,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
| 1681 | action->dev_id = dev_id; | 1680 | action->dev_id = dev_id; |
| 1682 | 1681 | ||
| 1683 | retval = irq_chip_pm_get(&desc->irq_data); | 1682 | retval = irq_chip_pm_get(&desc->irq_data); |
| 1684 | if (retval < 0) | 1683 | if (retval < 0) { |
| 1684 | kfree(action); | ||
| 1685 | return retval; | 1685 | return retval; |
| 1686 | } | ||
| 1686 | 1687 | ||
| 1687 | chip_bus_lock(desc); | 1688 | chip_bus_lock(desc); |
| 1688 | retval = __setup_irq(irq, desc, action); | 1689 | retval = __setup_irq(irq, desc, action); |
| @@ -1985,8 +1986,10 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
| 1985 | action->percpu_dev_id = dev_id; | 1986 | action->percpu_dev_id = dev_id; |
| 1986 | 1987 | ||
| 1987 | retval = irq_chip_pm_get(&desc->irq_data); | 1988 | retval = irq_chip_pm_get(&desc->irq_data); |
| 1988 | if (retval < 0) | 1989 | if (retval < 0) { |
| 1990 | kfree(action); | ||
| 1989 | return retval; | 1991 | return retval; |
| 1992 | } | ||
| 1990 | 1993 | ||
| 1991 | chip_bus_lock(desc); | 1994 | chip_bus_lock(desc); |
| 1992 | retval = __setup_irq(irq, desc, action); | 1995 | retval = __setup_irq(irq, desc, action); |
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 19e9dfbe97fa..8a3e872798f3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
| @@ -18,20 +18,42 @@ | |||
| 18 | /* Temporary solution for building, will be removed later */ | 18 | /* Temporary solution for building, will be removed later */ |
| 19 | #include <linux/pci.h> | 19 | #include <linux/pci.h> |
| 20 | 20 | ||
| 21 | struct msi_desc *alloc_msi_entry(struct device *dev) | 21 | /** |
| 22 | * alloc_msi_entry - Allocate and initialize an msi_entry | ||
| 23 | * @dev: Pointer to the device for which this is allocated | ||
| 24 | * @nvec: The number of vectors used in this entry | ||
| 25 | * @affinity: Optional pointer to an affinity mask array of size @nvec | ||
| 26 | * | ||
| 27 | * If @affinity is not NULL then an affinity array[@nvec] is allocated | ||
| 28 | * and the affinity masks from @affinity are copied. | ||
| 29 | */ | ||
| 30 | struct msi_desc * | ||
| 31 | alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity) | ||
| 22 | { | 32 | { |
| 23 | struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); | 33 | struct msi_desc *desc; |
| 34 | |||
| 35 | desc = kzalloc(sizeof(*desc), GFP_KERNEL); | ||
| 24 | if (!desc) | 36 | if (!desc) |
| 25 | return NULL; | 37 | return NULL; |
| 26 | 38 | ||
| 27 | INIT_LIST_HEAD(&desc->list); | 39 | INIT_LIST_HEAD(&desc->list); |
| 28 | desc->dev = dev; | 40 | desc->dev = dev; |
| 41 | desc->nvec_used = nvec; | ||
| 42 | if (affinity) { | ||
| 43 | desc->affinity = kmemdup(affinity, | ||
| 44 | nvec * sizeof(*desc->affinity), GFP_KERNEL); | ||
| 45 | if (!desc->affinity) { | ||
| 46 | kfree(desc); | ||
| 47 | return NULL; | ||
| 48 | } | ||
| 49 | } | ||
| 29 | 50 | ||
| 30 | return desc; | 51 | return desc; |
| 31 | } | 52 | } |
| 32 | 53 | ||
| 33 | void free_msi_entry(struct msi_desc *entry) | 54 | void free_msi_entry(struct msi_desc *entry) |
| 34 | { | 55 | { |
| 56 | kfree(entry->affinity); | ||
| 35 | kfree(entry); | 57 | kfree(entry); |
| 36 | } | 58 | } |
| 37 | 59 | ||
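A hedged sketch of how a caller might use the extended alloc_msi_entry(): the helper name and the single multi-vector entry are assumptions, only the new (dev, nvec, affinity) signature and the internal duplication of the affinity array come from the patch.

#include <linux/device.h>
#include <linux/msi.h>

/*
 * Hypothetical helper: describe nvec vectors with one msi_desc and hand the
 * caller-provided per-vector affinity hints to alloc_msi_entry(), which
 * kmemdup()s the array internally.
 */
static int example_add_msi_desc(struct device *dev, int nvec,
                                const struct cpumask *affinity)
{
        struct msi_desc *desc;

        desc = alloc_msi_entry(dev, nvec, affinity);
        if (!desc)
                return -ENOMEM;

        list_add_tail(&desc->list, dev_to_msi_list(dev));
        return 0;
}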
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 503bc2d348e5..037c321c5618 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c | |||
| @@ -887,7 +887,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, | |||
| 887 | return 0; | 887 | return 0; |
| 888 | out: | 888 | out: |
| 889 | vfree(pi->sechdrs); | 889 | vfree(pi->sechdrs); |
| 890 | pi->sechdrs = NULL; | ||
| 891 | |||
| 890 | vfree(pi->purgatory_buf); | 892 | vfree(pi->purgatory_buf); |
| 893 | pi->purgatory_buf = NULL; | ||
| 891 | return ret; | 894 | return ret; |
| 892 | } | 895 | } |
| 893 | 896 | ||
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d10ab6b9b5e0..d63095472ea9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -49,7 +49,7 @@ | |||
| 49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
| 50 | #include <linux/jump_label.h> | 50 | #include <linux/jump_label.h> |
| 51 | 51 | ||
| 52 | #include <asm-generic/sections.h> | 52 | #include <asm/sections.h> |
| 53 | #include <asm/cacheflush.h> | 53 | #include <asm/cacheflush.h> |
| 54 | #include <asm/errno.h> | 54 | #include <asm/errno.h> |
| 55 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 9ff173dca1ae..be2cc1f9dd57 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -64,7 +64,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) | |||
| 64 | static struct kthread *to_live_kthread(struct task_struct *k) | 64 | static struct kthread *to_live_kthread(struct task_struct *k) |
| 65 | { | 65 | { |
| 66 | struct completion *vfork = ACCESS_ONCE(k->vfork_done); | 66 | struct completion *vfork = ACCESS_ONCE(k->vfork_done); |
| 67 | if (likely(vfork)) | 67 | if (likely(vfork) && try_get_task_stack(k)) |
| 68 | return __to_kthread(vfork); | 68 | return __to_kthread(vfork); |
| 69 | return NULL; | 69 | return NULL; |
| 70 | } | 70 | } |
| @@ -138,7 +138,7 @@ void *kthread_data(struct task_struct *task) | |||
| 138 | } | 138 | } |
| 139 | 139 | ||
| 140 | /** | 140 | /** |
| 141 | * probe_kthread_data - speculative version of kthread_data() | 141 | * kthread_probe_data - speculative version of kthread_data() |
| 142 | * @task: possible kthread task in question | 142 | * @task: possible kthread task in question |
| 143 | * | 143 | * |
| 144 | * @task could be a kthread task. Return the data value specified when it | 144 | * @task could be a kthread task. Return the data value specified when it |
| @@ -146,7 +146,7 @@ void *kthread_data(struct task_struct *task) | |||
| 146 | * inaccessible for any reason, %NULL is returned. This function requires | 146 | * inaccessible for any reason, %NULL is returned. This function requires |
| 147 | * that @task itself is safe to dereference. | 147 | * that @task itself is safe to dereference. |
| 148 | */ | 148 | */ |
| 149 | void *probe_kthread_data(struct task_struct *task) | 149 | void *kthread_probe_data(struct task_struct *task) |
| 150 | { | 150 | { |
| 151 | struct kthread *kthread = to_kthread(task); | 151 | struct kthread *kthread = to_kthread(task); |
| 152 | void *data = NULL; | 152 | void *data = NULL; |
| @@ -244,33 +244,10 @@ static void create_kthread(struct kthread_create_info *create) | |||
| 244 | } | 244 | } |
| 245 | } | 245 | } |
| 246 | 246 | ||
| 247 | /** | 247 | static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), |
| 248 | * kthread_create_on_node - create a kthread. | 248 | void *data, int node, |
| 249 | * @threadfn: the function to run until signal_pending(current). | 249 | const char namefmt[], |
| 250 | * @data: data ptr for @threadfn. | 250 | va_list args) |
| 251 | * @node: task and thread structures for the thread are allocated on this node | ||
| 252 | * @namefmt: printf-style name for the thread. | ||
| 253 | * | ||
| 254 | * Description: This helper function creates and names a kernel | ||
| 255 | * thread. The thread will be stopped: use wake_up_process() to start | ||
| 256 | * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and | ||
| 257 | * is affine to all CPUs. | ||
| 258 | * | ||
| 259 | * If thread is going to be bound on a particular cpu, give its node | ||
| 260 | * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. | ||
| 261 | * When woken, the thread will run @threadfn() with @data as its | ||
| 262 | * argument. @threadfn() can either call do_exit() directly if it is a | ||
| 263 | * standalone thread for which no one will call kthread_stop(), or | ||
| 264 | * return when 'kthread_should_stop()' is true (which means | ||
| 265 | * kthread_stop() has been called). The return value should be zero | ||
| 266 | * or a negative error number; it will be passed to kthread_stop(). | ||
| 267 | * | ||
| 268 | * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). | ||
| 269 | */ | ||
| 270 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | ||
| 271 | void *data, int node, | ||
| 272 | const char namefmt[], | ||
| 273 | ...) | ||
| 274 | { | 251 | { |
| 275 | DECLARE_COMPLETION_ONSTACK(done); | 252 | DECLARE_COMPLETION_ONSTACK(done); |
| 276 | struct task_struct *task; | 253 | struct task_struct *task; |
| @@ -311,11 +288,8 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
| 311 | task = create->result; | 288 | task = create->result; |
| 312 | if (!IS_ERR(task)) { | 289 | if (!IS_ERR(task)) { |
| 313 | static const struct sched_param param = { .sched_priority = 0 }; | 290 | static const struct sched_param param = { .sched_priority = 0 }; |
| 314 | va_list args; | ||
| 315 | 291 | ||
| 316 | va_start(args, namefmt); | ||
| 317 | vsnprintf(task->comm, sizeof(task->comm), namefmt, args); | 292 | vsnprintf(task->comm, sizeof(task->comm), namefmt, args); |
| 318 | va_end(args); | ||
| 319 | /* | 293 | /* |
| 320 | * root may have changed our (kthreadd's) priority or CPU mask. | 294 | * root may have changed our (kthreadd's) priority or CPU mask. |
| 321 | * The kernel thread should not inherit these properties. | 295 | * The kernel thread should not inherit these properties. |
| @@ -326,6 +300,44 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
| 326 | kfree(create); | 300 | kfree(create); |
| 327 | return task; | 301 | return task; |
| 328 | } | 302 | } |
| 303 | |||
| 304 | /** | ||
| 305 | * kthread_create_on_node - create a kthread. | ||
| 306 | * @threadfn: the function to run until signal_pending(current). | ||
| 307 | * @data: data ptr for @threadfn. | ||
| 308 | * @node: task and thread structures for the thread are allocated on this node | ||
| 309 | * @namefmt: printf-style name for the thread. | ||
| 310 | * | ||
| 311 | * Description: This helper function creates and names a kernel | ||
| 312 | * thread. The thread will be stopped: use wake_up_process() to start | ||
| 313 | * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and | ||
| 314 | * is affine to all CPUs. | ||
| 315 | * | ||
| 316 | * If thread is going to be bound on a particular cpu, give its node | ||
| 317 | * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. | ||
| 318 | * When woken, the thread will run @threadfn() with @data as its | ||
| 319 | * argument. @threadfn() can either call do_exit() directly if it is a | ||
| 320 | * standalone thread for which no one will call kthread_stop(), or | ||
| 321 | * return when 'kthread_should_stop()' is true (which means | ||
| 322 | * kthread_stop() has been called). The return value should be zero | ||
| 323 | * or a negative error number; it will be passed to kthread_stop(). | ||
| 324 | * | ||
| 325 | * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). | ||
| 326 | */ | ||
| 327 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | ||
| 328 | void *data, int node, | ||
| 329 | const char namefmt[], | ||
| 330 | ...) | ||
| 331 | { | ||
| 332 | struct task_struct *task; | ||
| 333 | va_list args; | ||
| 334 | |||
| 335 | va_start(args, namefmt); | ||
| 336 | task = __kthread_create_on_node(threadfn, data, node, namefmt, args); | ||
| 337 | va_end(args); | ||
| 338 | |||
| 339 | return task; | ||
| 340 | } | ||
| 329 | EXPORT_SYMBOL(kthread_create_on_node); | 341 | EXPORT_SYMBOL(kthread_create_on_node); |
| 330 | 342 | ||
| 331 | static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) | 343 | static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) |
| @@ -390,10 +402,10 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), | |||
| 390 | cpu); | 402 | cpu); |
| 391 | if (IS_ERR(p)) | 403 | if (IS_ERR(p)) |
| 392 | return p; | 404 | return p; |
| 405 | kthread_bind(p, cpu); | ||
| 406 | /* CPU hotplug need to bind once again when unparking the thread. */ | ||
| 393 | set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); | 407 | set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); |
| 394 | to_kthread(p)->cpu = cpu; | 408 | to_kthread(p)->cpu = cpu; |
| 395 | /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ | ||
| 396 | kthread_park(p); | ||
| 397 | return p; | 409 | return p; |
| 398 | } | 410 | } |
| 399 | 411 | ||
| @@ -407,6 +419,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) | |||
| 407 | * which might be about to be cleared. | 419 | * which might be about to be cleared. |
| 408 | */ | 420 | */ |
| 409 | if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { | 421 | if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { |
| 422 | /* | ||
| 423 | * Newly created kthread was parked when the CPU was offline. | ||
| 424 | * The binding was lost and we need to set it again. | ||
| 425 | */ | ||
| 410 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) | 426 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) |
| 411 | __kthread_bind(k, kthread->cpu, TASK_PARKED); | 427 | __kthread_bind(k, kthread->cpu, TASK_PARKED); |
| 412 | wake_up_state(k, TASK_PARKED); | 428 | wake_up_state(k, TASK_PARKED); |
| @@ -425,8 +441,10 @@ void kthread_unpark(struct task_struct *k) | |||
| 425 | { | 441 | { |
| 426 | struct kthread *kthread = to_live_kthread(k); | 442 | struct kthread *kthread = to_live_kthread(k); |
| 427 | 443 | ||
| 428 | if (kthread) | 444 | if (kthread) { |
| 429 | __kthread_unpark(k, kthread); | 445 | __kthread_unpark(k, kthread); |
| 446 | put_task_stack(k); | ||
| 447 | } | ||
| 430 | } | 448 | } |
| 431 | EXPORT_SYMBOL_GPL(kthread_unpark); | 449 | EXPORT_SYMBOL_GPL(kthread_unpark); |
| 432 | 450 | ||
| @@ -455,6 +473,7 @@ int kthread_park(struct task_struct *k) | |||
| 455 | wait_for_completion(&kthread->parked); | 473 | wait_for_completion(&kthread->parked); |
| 456 | } | 474 | } |
| 457 | } | 475 | } |
| 476 | put_task_stack(k); | ||
| 458 | ret = 0; | 477 | ret = 0; |
| 459 | } | 478 | } |
| 460 | return ret; | 479 | return ret; |
| @@ -490,6 +509,7 @@ int kthread_stop(struct task_struct *k) | |||
| 490 | __kthread_unpark(k, kthread); | 509 | __kthread_unpark(k, kthread); |
| 491 | wake_up_process(k); | 510 | wake_up_process(k); |
| 492 | wait_for_completion(&kthread->exited); | 511 | wait_for_completion(&kthread->exited); |
| 512 | put_task_stack(k); | ||
| 493 | } | 513 | } |
| 494 | ret = k->exit_code; | 514 | ret = k->exit_code; |
| 495 | put_task_struct(k); | 515 | put_task_struct(k); |
| @@ -536,39 +556,48 @@ int kthreadd(void *unused) | |||
| 536 | return 0; | 556 | return 0; |
| 537 | } | 557 | } |
| 538 | 558 | ||
| 539 | void __init_kthread_worker(struct kthread_worker *worker, | 559 | void __kthread_init_worker(struct kthread_worker *worker, |
| 540 | const char *name, | 560 | const char *name, |
| 541 | struct lock_class_key *key) | 561 | struct lock_class_key *key) |
| 542 | { | 562 | { |
| 563 | memset(worker, 0, sizeof(struct kthread_worker)); | ||
| 543 | spin_lock_init(&worker->lock); | 564 | spin_lock_init(&worker->lock); |
| 544 | lockdep_set_class_and_name(&worker->lock, key, name); | 565 | lockdep_set_class_and_name(&worker->lock, key, name); |
| 545 | INIT_LIST_HEAD(&worker->work_list); | 566 | INIT_LIST_HEAD(&worker->work_list); |
| 546 | worker->task = NULL; | 567 | INIT_LIST_HEAD(&worker->delayed_work_list); |
| 547 | } | 568 | } |
| 548 | EXPORT_SYMBOL_GPL(__init_kthread_worker); | 569 | EXPORT_SYMBOL_GPL(__kthread_init_worker); |
| 549 | 570 | ||
| 550 | /** | 571 | /** |
| 551 | * kthread_worker_fn - kthread function to process kthread_worker | 572 | * kthread_worker_fn - kthread function to process kthread_worker |
| 552 | * @worker_ptr: pointer to initialized kthread_worker | 573 | * @worker_ptr: pointer to initialized kthread_worker |
| 553 | * | 574 | * |
| 554 | * This function can be used as @threadfn to kthread_create() or | 575 | * This function implements the main cycle of kthread worker. It processes |
| 555 | * kthread_run() with @worker_ptr argument pointing to an initialized | 576 | * work_list until it is stopped with kthread_stop(). It sleeps when the queue |
| 556 | * kthread_worker. The started kthread will process work_list until | 577 | * is empty. |
| 557 | * the it is stopped with kthread_stop(). A kthread can also call | ||
| 558 | * this function directly after extra initialization. | ||
| 559 | * | 578 | * |
| 560 | * Different kthreads can be used for the same kthread_worker as long | 579 | * The works must not keep any locks held or leave preemption or interrupts |
| 561 | * as there's only one kthread attached to it at any given time. A | 580 | * disabled when they finish. A safe point for freezing is defined after one |
| 562 | * kthread_worker without an attached kthread simply collects queued | 581 | * work finishes and before a new one is started. |
| 563 | * kthread_works. | 582 | * |
| 583 | * Also, a work must not be handled by more than one worker at the same time; | ||
| 584 | * see kthread_queue_work(). | ||
| 564 | */ | 585 | */ |
| 565 | int kthread_worker_fn(void *worker_ptr) | 586 | int kthread_worker_fn(void *worker_ptr) |
| 566 | { | 587 | { |
| 567 | struct kthread_worker *worker = worker_ptr; | 588 | struct kthread_worker *worker = worker_ptr; |
| 568 | struct kthread_work *work; | 589 | struct kthread_work *work; |
| 569 | 590 | ||
| 570 | WARN_ON(worker->task); | 591 | /* |
| 592 | * FIXME: Update the check and remove the assignment when all kthread | ||
| 593 | * worker users are created using kthread_create_worker*() functions. | ||
| 594 | */ | ||
| 595 | WARN_ON(worker->task && worker->task != current); | ||
| 571 | worker->task = current; | 596 | worker->task = current; |
| 597 | |||
| 598 | if (worker->flags & KTW_FREEZABLE) | ||
| 599 | set_freezable(); | ||
| 600 | |||
| 572 | repeat: | 601 | repeat: |
| 573 | set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ | 602 | set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ |
| 574 | 603 | ||
| @@ -601,13 +630,132 @@ repeat: | |||
| 601 | } | 630 | } |
| 602 | EXPORT_SYMBOL_GPL(kthread_worker_fn); | 631 | EXPORT_SYMBOL_GPL(kthread_worker_fn); |
| 603 | 632 | ||
| 604 | /* insert @work before @pos in @worker */ | 633 | static struct kthread_worker * |
| 605 | static void insert_kthread_work(struct kthread_worker *worker, | 634 | __kthread_create_worker(int cpu, unsigned int flags, |
| 606 | struct kthread_work *work, | 635 | const char namefmt[], va_list args) |
| 607 | struct list_head *pos) | 636 | { |
| 637 | struct kthread_worker *worker; | ||
| 638 | struct task_struct *task; | ||
| 639 | |||
| 640 | worker = kzalloc(sizeof(*worker), GFP_KERNEL); | ||
| 641 | if (!worker) | ||
| 642 | return ERR_PTR(-ENOMEM); | ||
| 643 | |||
| 644 | kthread_init_worker(worker); | ||
| 645 | |||
| 646 | if (cpu >= 0) { | ||
| 647 | char name[TASK_COMM_LEN]; | ||
| 648 | |||
| 649 | /* | ||
| 650 | * kthread_create_worker_on_cpu() allows passing a generic | ||
| 651 | * namefmt, unlike kthread_create_on_cpu(). We need | ||
| 652 | * to format it here. | ||
| 653 | */ | ||
| 654 | vsnprintf(name, sizeof(name), namefmt, args); | ||
| 655 | task = kthread_create_on_cpu(kthread_worker_fn, worker, | ||
| 656 | cpu, name); | ||
| 657 | } else { | ||
| 658 | task = __kthread_create_on_node(kthread_worker_fn, worker, | ||
| 659 | -1, namefmt, args); | ||
| 660 | } | ||
| 661 | |||
| 662 | if (IS_ERR(task)) | ||
| 663 | goto fail_task; | ||
| 664 | |||
| 665 | worker->flags = flags; | ||
| 666 | worker->task = task; | ||
| 667 | wake_up_process(task); | ||
| 668 | return worker; | ||
| 669 | |||
| 670 | fail_task: | ||
| 671 | kfree(worker); | ||
| 672 | return ERR_CAST(task); | ||
| 673 | } | ||
| 674 | |||
| 675 | /** | ||
| 676 | * kthread_create_worker - create a kthread worker | ||
| 677 | * @flags: flags modifying the default behavior of the worker | ||
| 678 | * @namefmt: printf-style name for the kthread worker (task). | ||
| 679 | * | ||
| 680 | * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) | ||
| 681 | * when the needed structures could not get allocated, and ERR_PTR(-EINTR) | ||
| 682 | * when the worker was SIGKILLed. | ||
| 683 | */ | ||
| 684 | struct kthread_worker * | ||
| 685 | kthread_create_worker(unsigned int flags, const char namefmt[], ...) | ||
| 686 | { | ||
| 687 | struct kthread_worker *worker; | ||
| 688 | va_list args; | ||
| 689 | |||
| 690 | va_start(args, namefmt); | ||
| 691 | worker = __kthread_create_worker(-1, flags, namefmt, args); | ||
| 692 | va_end(args); | ||
| 693 | |||
| 694 | return worker; | ||
| 695 | } | ||
| 696 | EXPORT_SYMBOL(kthread_create_worker); | ||
| 697 | |||
| 698 | /** | ||
| 699 | * kthread_create_worker_on_cpu - create a kthread worker and bind it | ||
| 700 | * to a given CPU and the associated NUMA node. | ||
| 701 | * @cpu: CPU number | ||
| 702 | * @flags: flags modifying the default behavior of the worker | ||
| 703 | * @namefmt: printf-style name for the kthread worker (task). | ||
| 704 | * | ||
| 705 | * Use a valid CPU number if you want to bind the kthread worker | ||
| 706 | * to the given CPU and the associated NUMA node. | ||
| 707 | * | ||
| 708 | * A good practice is to also include the cpu number in the worker name. | ||
| 709 | * For example, use kthread_create_worker_on_cpu(cpu, 0, "helper/%d", cpu). | ||
| 710 | * | ||
| 711 | * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) | ||
| 712 | * when the needed structures could not get allocated, and ERR_PTR(-EINTR) | ||
| 713 | * when the worker was SIGKILLed. | ||
| 714 | */ | ||
| 715 | struct kthread_worker * | ||
| 716 | kthread_create_worker_on_cpu(int cpu, unsigned int flags, | ||
| 717 | const char namefmt[], ...) | ||
| 718 | { | ||
| 719 | struct kthread_worker *worker; | ||
| 720 | va_list args; | ||
| 721 | |||
| 722 | va_start(args, namefmt); | ||
| 723 | worker = __kthread_create_worker(cpu, flags, namefmt, args); | ||
| 724 | va_end(args); | ||
| 725 | |||
| 726 | return worker; | ||
| 727 | } | ||
| 728 | EXPORT_SYMBOL(kthread_create_worker_on_cpu); | ||
| 729 | |||
| 730 | /* | ||
| 731 | * Returns true when the work could not be queued at the moment. | ||
| 732 | * It happens when it is already pending in a worker list | ||
| 733 | * or when it is being cancelled. | ||
| 734 | */ | ||
| 735 | static inline bool queuing_blocked(struct kthread_worker *worker, | ||
| 736 | struct kthread_work *work) | ||
| 608 | { | 737 | { |
| 609 | lockdep_assert_held(&worker->lock); | 738 | lockdep_assert_held(&worker->lock); |
| 610 | 739 | ||
| 740 | return !list_empty(&work->node) || work->canceling; | ||
| 741 | } | ||
| 742 | |||
| 743 | static void kthread_insert_work_sanity_check(struct kthread_worker *worker, | ||
| 744 | struct kthread_work *work) | ||
| 745 | { | ||
| 746 | lockdep_assert_held(&worker->lock); | ||
| 747 | WARN_ON_ONCE(!list_empty(&work->node)); | ||
| 748 | /* Do not use a work with >1 worker, see kthread_queue_work() */ | ||
| 749 | WARN_ON_ONCE(work->worker && work->worker != worker); | ||
| 750 | } | ||
| 751 | |||
| 752 | /* insert @work before @pos in @worker */ | ||
| 753 | static void kthread_insert_work(struct kthread_worker *worker, | ||
| 754 | struct kthread_work *work, | ||
| 755 | struct list_head *pos) | ||
| 756 | { | ||
| 757 | kthread_insert_work_sanity_check(worker, work); | ||
| 758 | |||
| 611 | list_add_tail(&work->node, pos); | 759 | list_add_tail(&work->node, pos); |
| 612 | work->worker = worker; | 760 | work->worker = worker; |
| 613 | if (!worker->current_work && likely(worker->task)) | 761 | if (!worker->current_work && likely(worker->task)) |
| @@ -615,29 +763,133 @@ static void insert_kthread_work(struct kthread_worker *worker, | |||
| 615 | } | 763 | } |
| 616 | 764 | ||
| 617 | /** | 765 | /** |
| 618 | * queue_kthread_work - queue a kthread_work | 766 | * kthread_queue_work - queue a kthread_work |
| 619 | * @worker: target kthread_worker | 767 | * @worker: target kthread_worker |
| 620 | * @work: kthread_work to queue | 768 | * @work: kthread_work to queue |
| 621 | * | 769 | * |
| 622 | * Queue @work to work processor @task for async execution. @task | 770 | * Queue @work to work processor @task for async execution. @task |
| 623 | * must have been created with kthread_worker_create(). Returns %true | 771 | * must have been created with kthread_worker_create(). Returns %true |
| 624 | * if @work was successfully queued, %false if it was already pending. | 772 | * if @work was successfully queued, %false if it was already pending. |
| 773 | * | ||
| 774 | * Reinitialize the work if it needs to be used by another worker. | ||
| 775 | * For example, when the worker was stopped and started again. | ||
| 625 | */ | 776 | */ |
| 626 | bool queue_kthread_work(struct kthread_worker *worker, | 777 | bool kthread_queue_work(struct kthread_worker *worker, |
| 627 | struct kthread_work *work) | 778 | struct kthread_work *work) |
| 628 | { | 779 | { |
| 629 | bool ret = false; | 780 | bool ret = false; |
| 630 | unsigned long flags; | 781 | unsigned long flags; |
| 631 | 782 | ||
| 632 | spin_lock_irqsave(&worker->lock, flags); | 783 | spin_lock_irqsave(&worker->lock, flags); |
| 633 | if (list_empty(&work->node)) { | 784 | if (!queuing_blocked(worker, work)) { |
| 634 | insert_kthread_work(worker, work, &worker->work_list); | 785 | kthread_insert_work(worker, work, &worker->work_list); |
| 786 | ret = true; | ||
| 787 | } | ||
| 788 | spin_unlock_irqrestore(&worker->lock, flags); | ||
| 789 | return ret; | ||
| 790 | } | ||
| 791 | EXPORT_SYMBOL_GPL(kthread_queue_work); | ||
| 792 | |||
| 793 | /** | ||
| 794 | * kthread_delayed_work_timer_fn - callback that queues the associated kthread | ||
| 795 | * delayed work when the timer expires. | ||
| 796 | * @__data: pointer to the data associated with the timer | ||
| 797 | * | ||
| 798 | * The format of the function is defined by struct timer_list. | ||
| 799 | * It is expected to be called from an irq-safe timer with interrupts already disabled. | ||
| 800 | */ | ||
| 801 | void kthread_delayed_work_timer_fn(unsigned long __data) | ||
| 802 | { | ||
| 803 | struct kthread_delayed_work *dwork = | ||
| 804 | (struct kthread_delayed_work *)__data; | ||
| 805 | struct kthread_work *work = &dwork->work; | ||
| 806 | struct kthread_worker *worker = work->worker; | ||
| 807 | |||
| 808 | /* | ||
| 809 | * This might happen when a pending work is reinitialized. | ||
| 810 | * It means that it is being used the wrong way. | ||
| 811 | */ | ||
| 812 | if (WARN_ON_ONCE(!worker)) | ||
| 813 | return; | ||
| 814 | |||
| 815 | spin_lock(&worker->lock); | ||
| 816 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ | ||
| 817 | WARN_ON_ONCE(work->worker != worker); | ||
| 818 | |||
| 819 | /* Move the work from worker->delayed_work_list. */ | ||
| 820 | WARN_ON_ONCE(list_empty(&work->node)); | ||
| 821 | list_del_init(&work->node); | ||
| 822 | kthread_insert_work(worker, work, &worker->work_list); | ||
| 823 | |||
| 824 | spin_unlock(&worker->lock); | ||
| 825 | } | ||
| 826 | EXPORT_SYMBOL(kthread_delayed_work_timer_fn); | ||
| 827 | |||
| 828 | void __kthread_queue_delayed_work(struct kthread_worker *worker, | ||
| 829 | struct kthread_delayed_work *dwork, | ||
| 830 | unsigned long delay) | ||
| 831 | { | ||
| 832 | struct timer_list *timer = &dwork->timer; | ||
| 833 | struct kthread_work *work = &dwork->work; | ||
| 834 | |||
| 835 | WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn || | ||
| 836 | timer->data != (unsigned long)dwork); | ||
| 837 | |||
| 838 | /* | ||
| 839 | * If @delay is 0, queue @dwork->work immediately. This is for | ||
| 840 | * both optimization and correctness. The earliest @timer can | ||
| 841 | * expire is on the closest next tick, and delayed_work users depend | ||
| 842 | * on there being no such delay when @delay is 0. | ||
| 843 | */ | ||
| 844 | if (!delay) { | ||
| 845 | kthread_insert_work(worker, work, &worker->work_list); | ||
| 846 | return; | ||
| 847 | } | ||
| 848 | |||
| 849 | /* Be paranoid and try to detect possible races already now. */ | ||
| 850 | kthread_insert_work_sanity_check(worker, work); | ||
| 851 | |||
| 852 | list_add(&work->node, &worker->delayed_work_list); | ||
| 853 | work->worker = worker; | ||
| 854 | timer_stats_timer_set_start_info(&dwork->timer); | ||
| 855 | timer->expires = jiffies + delay; | ||
| 856 | add_timer(timer); | ||
| 857 | } | ||
| 858 | |||
| 859 | /** | ||
| 860 | * kthread_queue_delayed_work - queue the associated kthread work | ||
| 861 | * after a delay. | ||
| 862 | * @worker: target kthread_worker | ||
| 863 | * @dwork: kthread_delayed_work to queue | ||
| 864 | * @delay: number of jiffies to wait before queuing | ||
| 865 | * | ||
| 866 | * If the work has not been pending it starts a timer that will queue | ||
| 867 | * the work after the given @delay. If @delay is zero, it queues the | ||
| 868 | * work immediately. | ||
| 869 | * | ||
| 870 | * Return: %false if @work was already pending, meaning that either | ||
| 871 | * the timer was running or the work was queued. Returns %true | ||
| 872 | * otherwise. | ||
| 873 | */ | ||
| 874 | bool kthread_queue_delayed_work(struct kthread_worker *worker, | ||
| 875 | struct kthread_delayed_work *dwork, | ||
| 876 | unsigned long delay) | ||
| 877 | { | ||
| 878 | struct kthread_work *work = &dwork->work; | ||
| 879 | unsigned long flags; | ||
| 880 | bool ret = false; | ||
| 881 | |||
| 882 | spin_lock_irqsave(&worker->lock, flags); | ||
| 883 | |||
| 884 | if (!queuing_blocked(worker, work)) { | ||
| 885 | __kthread_queue_delayed_work(worker, dwork, delay); | ||
| 635 | ret = true; | 886 | ret = true; |
| 636 | } | 887 | } |
| 888 | |||
| 637 | spin_unlock_irqrestore(&worker->lock, flags); | 889 | spin_unlock_irqrestore(&worker->lock, flags); |
| 638 | return ret; | 890 | return ret; |
| 639 | } | 891 | } |
| 640 | EXPORT_SYMBOL_GPL(queue_kthread_work); | 892 | EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); |
| 641 | 893 | ||
| 642 | struct kthread_flush_work { | 894 | struct kthread_flush_work { |
| 643 | struct kthread_work work; | 895 | struct kthread_work work; |
| @@ -652,12 +904,12 @@ static void kthread_flush_work_fn(struct kthread_work *work) | |||
| 652 | } | 904 | } |
| 653 | 905 | ||
| 654 | /** | 906 | /** |
| 655 | * flush_kthread_work - flush a kthread_work | 907 | * kthread_flush_work - flush a kthread_work |
| 656 | * @work: work to flush | 908 | * @work: work to flush |
| 657 | * | 909 | * |
| 658 | * If @work is queued or executing, wait for it to finish execution. | 910 | * If @work is queued or executing, wait for it to finish execution. |
| 659 | */ | 911 | */ |
| 660 | void flush_kthread_work(struct kthread_work *work) | 912 | void kthread_flush_work(struct kthread_work *work) |
| 661 | { | 913 | { |
| 662 | struct kthread_flush_work fwork = { | 914 | struct kthread_flush_work fwork = { |
| 663 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), | 915 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), |
| @@ -666,21 +918,19 @@ void flush_kthread_work(struct kthread_work *work) | |||
| 666 | struct kthread_worker *worker; | 918 | struct kthread_worker *worker; |
| 667 | bool noop = false; | 919 | bool noop = false; |
| 668 | 920 | ||
| 669 | retry: | ||
| 670 | worker = work->worker; | 921 | worker = work->worker; |
| 671 | if (!worker) | 922 | if (!worker) |
| 672 | return; | 923 | return; |
| 673 | 924 | ||
| 674 | spin_lock_irq(&worker->lock); | 925 | spin_lock_irq(&worker->lock); |
| 675 | if (work->worker != worker) { | 926 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ |
| 676 | spin_unlock_irq(&worker->lock); | 927 | WARN_ON_ONCE(work->worker != worker); |
| 677 | goto retry; | ||
| 678 | } | ||
| 679 | 928 | ||
| 680 | if (!list_empty(&work->node)) | 929 | if (!list_empty(&work->node)) |
| 681 | insert_kthread_work(worker, &fwork.work, work->node.next); | 930 | kthread_insert_work(worker, &fwork.work, work->node.next); |
| 682 | else if (worker->current_work == work) | 931 | else if (worker->current_work == work) |
| 683 | insert_kthread_work(worker, &fwork.work, worker->work_list.next); | 932 | kthread_insert_work(worker, &fwork.work, |
| 933 | worker->work_list.next); | ||
| 684 | else | 934 | else |
| 685 | noop = true; | 935 | noop = true; |
| 686 | 936 | ||
| @@ -689,23 +939,214 @@ retry: | |||
| 689 | if (!noop) | 939 | if (!noop) |
| 690 | wait_for_completion(&fwork.done); | 940 | wait_for_completion(&fwork.done); |
| 691 | } | 941 | } |
| 692 | EXPORT_SYMBOL_GPL(flush_kthread_work); | 942 | EXPORT_SYMBOL_GPL(kthread_flush_work); |
| 943 | |||
| 944 | /* | ||
| 945 | * This function removes the work from the worker queue. Also it makes sure | ||
| 946 | * that it won't get queued later via the delayed work's timer. | ||
| 947 | * | ||
| 948 | * The work might still be in use when this function finishes. See the | ||
| 949 | * current_work processed by the worker. | ||
| 950 | * | ||
| 951 | * Return: %true if @work was pending and successfully canceled, | ||
| 952 | * %false if @work was not pending | ||
| 953 | */ | ||
| 954 | static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, | ||
| 955 | unsigned long *flags) | ||
| 956 | { | ||
| 957 | /* Try to cancel the timer if exists. */ | ||
| 958 | if (is_dwork) { | ||
| 959 | struct kthread_delayed_work *dwork = | ||
| 960 | container_of(work, struct kthread_delayed_work, work); | ||
| 961 | struct kthread_worker *worker = work->worker; | ||
| 962 | |||
| 963 | /* | ||
| 964 | * del_timer_sync() must be called to make sure that the timer | ||
| 965 | * callback is not running. The lock must be temporarily released | ||
| 966 | * to avoid a deadlock with the callback. In the meantime, | ||
| 967 | * any queuing is blocked by setting the canceling counter. | ||
| 968 | */ | ||
| 969 | work->canceling++; | ||
| 970 | spin_unlock_irqrestore(&worker->lock, *flags); | ||
| 971 | del_timer_sync(&dwork->timer); | ||
| 972 | spin_lock_irqsave(&worker->lock, *flags); | ||
| 973 | work->canceling--; | ||
| 974 | } | ||
| 975 | |||
| 976 | /* | ||
| 977 | * Try to remove the work from a worker list. It might either | ||
| 978 | * be from worker->work_list or from worker->delayed_work_list. | ||
| 979 | */ | ||
| 980 | if (!list_empty(&work->node)) { | ||
| 981 | list_del_init(&work->node); | ||
| 982 | return true; | ||
| 983 | } | ||
| 984 | |||
| 985 | return false; | ||
| 986 | } | ||
| 693 | 987 | ||
| 694 | /** | 988 | /** |
| 695 | * flush_kthread_worker - flush all current works on a kthread_worker | 989 | * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work |
| 990 | * @worker: kthread worker to use | ||
| 991 | * @dwork: kthread delayed work to queue | ||
| 992 | * @delay: number of jiffies to wait before queuing | ||
| 993 | * | ||
| 994 | * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, | ||
| 995 | * modify @dwork's timer so that it expires after @delay. If @delay is zero, | ||
| 996 | * @work is guaranteed to be queued immediately. | ||
| 997 | * | ||
| 998 | * Return: %true if @dwork was pending and its timer was modified, | ||
| 999 | * %false otherwise. | ||
| 1000 | * | ||
| 1001 | * A special case is when the work is being canceled in parallel. | ||
| 1002 | * It might be caused either by the real kthread_cancel_delayed_work_sync() | ||
| 1003 | * or yet another kthread_mod_delayed_work() call. We let the other command | ||
| 1004 | * win and return %false here. The caller is supposed to synchronize these | ||
| 1005 | * operations in a reasonable way. | ||
| 1006 | * | ||
| 1007 | * This function is safe to call from any context including IRQ handler. | ||
| 1008 | * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() | ||
| 1009 | * for details. | ||
| 1010 | */ | ||
| 1011 | bool kthread_mod_delayed_work(struct kthread_worker *worker, | ||
| 1012 | struct kthread_delayed_work *dwork, | ||
| 1013 | unsigned long delay) | ||
| 1014 | { | ||
| 1015 | struct kthread_work *work = &dwork->work; | ||
| 1016 | unsigned long flags; | ||
| 1017 | int ret = false; | ||
| 1018 | |||
| 1019 | spin_lock_irqsave(&worker->lock, flags); | ||
| 1020 | |||
| 1021 | /* Do not bother with canceling when never queued. */ | ||
| 1022 | if (!work->worker) | ||
| 1023 | goto fast_queue; | ||
| 1024 | |||
| 1025 | /* Work must not be used with >1 worker, see kthread_queue_work() */ | ||
| 1026 | WARN_ON_ONCE(work->worker != worker); | ||
| 1027 | |||
| 1028 | /* Do not fight with another command that is canceling this work. */ | ||
| 1029 | if (work->canceling) | ||
| 1030 | goto out; | ||
| 1031 | |||
| 1032 | ret = __kthread_cancel_work(work, true, &flags); | ||
| 1033 | fast_queue: | ||
| 1034 | __kthread_queue_delayed_work(worker, dwork, delay); | ||
| 1035 | out: | ||
| 1036 | spin_unlock_irqrestore(&worker->lock, flags); | ||
| 1037 | return ret; | ||
| 1038 | } | ||
| 1039 | EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); | ||
| 1040 | |||
| 1041 | static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) | ||
| 1042 | { | ||
| 1043 | struct kthread_worker *worker = work->worker; | ||
| 1044 | unsigned long flags; | ||
| 1045 | int ret = false; | ||
| 1046 | |||
| 1047 | if (!worker) | ||
| 1048 | goto out; | ||
| 1049 | |||
| 1050 | spin_lock_irqsave(&worker->lock, flags); | ||
| 1051 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ | ||
| 1052 | WARN_ON_ONCE(work->worker != worker); | ||
| 1053 | |||
| 1054 | ret = __kthread_cancel_work(work, is_dwork, &flags); | ||
| 1055 | |||
| 1056 | if (worker->current_work != work) | ||
| 1057 | goto out_fast; | ||
| 1058 | |||
| 1059 | /* | ||
| 1060 | * The work is in progress and we need to wait with the lock released. | ||
| 1061 | * In the meantime, block any queuing by setting the canceling counter. | ||
| 1062 | */ | ||
| 1063 | work->canceling++; | ||
| 1064 | spin_unlock_irqrestore(&worker->lock, flags); | ||
| 1065 | kthread_flush_work(work); | ||
| 1066 | spin_lock_irqsave(&worker->lock, flags); | ||
| 1067 | work->canceling--; | ||
| 1068 | |||
| 1069 | out_fast: | ||
| 1070 | spin_unlock_irqrestore(&worker->lock, flags); | ||
| 1071 | out: | ||
| 1072 | return ret; | ||
| 1073 | } | ||
| 1074 | |||
| 1075 | /** | ||
| 1076 | * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish | ||
| 1077 | * @work: the kthread work to cancel | ||
| 1078 | * | ||
| 1079 | * Cancel @work and wait for its execution to finish. This function | ||
| 1080 | * can be used even if the work re-queues itself. On return from this | ||
| 1081 | * function, @work is guaranteed to be not pending or executing on any CPU. | ||
| 1082 | * | ||
| 1083 | * kthread_cancel_work_sync(&delayed_work->work) must not be used for | ||
| 1084 | * delayed works. Use kthread_cancel_delayed_work_sync() instead. | ||
| 1085 | * | ||
| 1086 | * The caller must ensure that the worker on which @work was last | ||
| 1087 | * queued can't be destroyed before this function returns. | ||
| 1088 | * | ||
| 1089 | * Return: %true if @work was pending, %false otherwise. | ||
| 1090 | */ | ||
| 1091 | bool kthread_cancel_work_sync(struct kthread_work *work) | ||
| 1092 | { | ||
| 1093 | return __kthread_cancel_work_sync(work, false); | ||
| 1094 | } | ||
| 1095 | EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); | ||
| 1096 | |||
| 1097 | /** | ||
| 1098 | * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and | ||
| 1099 | * wait for it to finish. | ||
| 1100 | * @dwork: the kthread delayed work to cancel | ||
| 1101 | * | ||
| 1102 | * This is kthread_cancel_work_sync() for delayed works. | ||
| 1103 | * | ||
| 1104 | * Return: %true if @dwork was pending, %false otherwise. | ||
| 1105 | */ | ||
| 1106 | bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) | ||
| 1107 | { | ||
| 1108 | return __kthread_cancel_work_sync(&dwork->work, true); | ||
| 1109 | } | ||
| 1110 | EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); | ||
| 1111 | |||
| 1112 | /** | ||
| 1113 | * kthread_flush_worker - flush all current works on a kthread_worker | ||
| 696 | * @worker: worker to flush | 1114 | * @worker: worker to flush |
| 697 | * | 1115 | * |
| 698 | * Wait until all currently executing or pending works on @worker are | 1116 | * Wait until all currently executing or pending works on @worker are |
| 699 | * finished. | 1117 | * finished. |
| 700 | */ | 1118 | */ |
| 701 | void flush_kthread_worker(struct kthread_worker *worker) | 1119 | void kthread_flush_worker(struct kthread_worker *worker) |
| 702 | { | 1120 | { |
| 703 | struct kthread_flush_work fwork = { | 1121 | struct kthread_flush_work fwork = { |
| 704 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), | 1122 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), |
| 705 | COMPLETION_INITIALIZER_ONSTACK(fwork.done), | 1123 | COMPLETION_INITIALIZER_ONSTACK(fwork.done), |
| 706 | }; | 1124 | }; |
| 707 | 1125 | ||
| 708 | queue_kthread_work(worker, &fwork.work); | 1126 | kthread_queue_work(worker, &fwork.work); |
| 709 | wait_for_completion(&fwork.done); | 1127 | wait_for_completion(&fwork.done); |
| 710 | } | 1128 | } |
| 711 | EXPORT_SYMBOL_GPL(flush_kthread_worker); | 1129 | EXPORT_SYMBOL_GPL(kthread_flush_worker); |
| 1130 | |||
| 1131 | /** | ||
| 1132 | * kthread_destroy_worker - destroy a kthread worker | ||
| 1133 | * @worker: worker to be destroyed | ||
| 1134 | * | ||
| 1135 | * Flush and destroy @worker. The simple flush is enough because the kthread | ||
| 1136 | * worker API is used only in trivial scenarios. There are no multi-step state | ||
| 1137 | * machines needed. | ||
| 1138 | */ | ||
| 1139 | void kthread_destroy_worker(struct kthread_worker *worker) | ||
| 1140 | { | ||
| 1141 | struct task_struct *task; | ||
| 1142 | |||
| 1143 | task = worker->task; | ||
| 1144 | if (WARN_ON(!task)) | ||
| 1145 | return; | ||
| 1146 | |||
| 1147 | kthread_flush_worker(worker); | ||
| 1148 | kthread_stop(task); | ||
| 1149 | WARN_ON(!list_empty(&worker->work_list)); | ||
| 1150 | kfree(worker); | ||
| 1151 | } | ||
| 1152 | EXPORT_SYMBOL(kthread_destroy_worker); | ||
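The kernel/kthread.c hunks above complete the kthread-worker rework: flush_kthread_worker()/queue_kthread_work() become kthread_flush_worker()/kthread_queue_work(), and kthread_mod_delayed_work(), the *_cancel_*_sync() helpers and kthread_destroy_worker() are new. A minimal usage sketch; it assumes the kthread_create_worker() and kthread_init_delayed_work() helpers added elsewhere in this series, and my_poll_fn() plus the one-second period are made up for illustration:

#include <linux/err.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>

static struct kthread_worker *poll_worker;
static struct kthread_delayed_work poll_work;

static void my_poll_fn(struct kthread_work *work)
{
	/* do the periodic work, then re-arm ourselves */
	kthread_queue_delayed_work(poll_worker, &poll_work, HZ);
}

static int my_start(void)
{
	poll_worker = kthread_create_worker(0, "my-poller");
	if (IS_ERR(poll_worker))
		return PTR_ERR(poll_worker);

	kthread_init_delayed_work(&poll_work, my_poll_fn);
	kthread_queue_delayed_work(poll_worker, &poll_work, HZ);
	return 0;
}

static void my_stop(void)
{
	/* safe even though the work re-queues itself */
	kthread_cancel_delayed_work_sync(&poll_work);
	kthread_destroy_worker(poll_worker);
}

Note that kthread_destroy_worker() flushes the worker, stops its task and frees it, so it is meant for workers allocated by kthread_create_worker().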
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 8bbe50704621..af4643873e71 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c | |||
| @@ -274,7 +274,6 @@ static int klp_write_object_relocations(struct module *pmod, | |||
| 274 | 274 | ||
| 275 | objname = klp_is_module(obj) ? obj->name : "vmlinux"; | 275 | objname = klp_is_module(obj) ? obj->name : "vmlinux"; |
| 276 | 276 | ||
| 277 | module_disable_ro(pmod); | ||
| 278 | /* For each klp relocation section */ | 277 | /* For each klp relocation section */ |
| 279 | for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { | 278 | for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { |
| 280 | sec = pmod->klp_info->sechdrs + i; | 279 | sec = pmod->klp_info->sechdrs + i; |
| @@ -309,7 +308,6 @@ static int klp_write_object_relocations(struct module *pmod, | |||
| 309 | break; | 308 | break; |
| 310 | } | 309 | } |
| 311 | 310 | ||
| 312 | module_enable_ro(pmod, true); | ||
| 313 | return ret; | 311 | return ret; |
| 314 | } | 312 | } |
| 315 | 313 | ||
| @@ -547,9 +545,6 @@ static int __klp_enable_patch(struct klp_patch *patch) | |||
| 547 | list_prev_entry(patch, list)->state == KLP_DISABLED) | 545 | list_prev_entry(patch, list)->state == KLP_DISABLED) |
| 548 | return -EBUSY; | 546 | return -EBUSY; |
| 549 | 547 | ||
| 550 | pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); | ||
| 551 | add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); | ||
| 552 | |||
| 553 | pr_notice("enabling patch '%s'\n", patch->mod->name); | 548 | pr_notice("enabling patch '%s'\n", patch->mod->name); |
| 554 | 549 | ||
| 555 | klp_for_each_object(patch, obj) { | 550 | klp_for_each_object(patch, obj) { |
| @@ -763,6 +758,12 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) | |||
| 763 | func->old_sympos ? func->old_sympos : 1); | 758 | func->old_sympos ? func->old_sympos : 1); |
| 764 | } | 759 | } |
| 765 | 760 | ||
| 761 | /* Arches may override this to finish any remaining arch-specific tasks */ | ||
| 762 | void __weak arch_klp_init_object_loaded(struct klp_patch *patch, | ||
| 763 | struct klp_object *obj) | ||
| 764 | { | ||
| 765 | } | ||
| 766 | |||
| 766 | /* parts of the initialization that is done only when the object is loaded */ | 767 | /* parts of the initialization that is done only when the object is loaded */ |
| 767 | static int klp_init_object_loaded(struct klp_patch *patch, | 768 | static int klp_init_object_loaded(struct klp_patch *patch, |
| 768 | struct klp_object *obj) | 769 | struct klp_object *obj) |
| @@ -770,9 +771,15 @@ static int klp_init_object_loaded(struct klp_patch *patch, | |||
| 770 | struct klp_func *func; | 771 | struct klp_func *func; |
| 771 | int ret; | 772 | int ret; |
| 772 | 773 | ||
| 774 | module_disable_ro(patch->mod); | ||
| 773 | ret = klp_write_object_relocations(patch->mod, obj); | 775 | ret = klp_write_object_relocations(patch->mod, obj); |
| 774 | if (ret) | 776 | if (ret) { |
| 777 | module_enable_ro(patch->mod, true); | ||
| 775 | return ret; | 778 | return ret; |
| 779 | } | ||
| 780 | |||
| 781 | arch_klp_init_object_loaded(patch, obj); | ||
| 782 | module_enable_ro(patch->mod, true); | ||
| 776 | 783 | ||
| 777 | klp_for_each_func(obj, func) { | 784 | klp_for_each_func(obj, func) { |
| 778 | ret = klp_find_object_symbol(obj->name, func->old_name, | 785 | ret = klp_find_object_symbol(obj->name, func->old_name, |
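Net effect of the kernel/livepatch/core.c changes: module_disable_ro()/module_enable_ro() move out of klp_write_object_relocations() and now bracket both the relocation pass and a new __weak arch_klp_init_object_loaded() hook in klp_init_object_loaded(), and the unconditional TAINT_LIVEPATCH in __klp_enable_patch() is dropped in favour of tainting at module load time (see the kernel/module.c hunk further down). A sketch of what an architecture-side override of the weak hook could look like; the body is illustrative only and not taken from any real port:

/* Runs after the klp relocations have been written and while patch->mod
 * is still writable, so arch code can fix up extra per-object data here. */
void arch_klp_init_object_loaded(struct klp_patch *patch,
				 struct klp_object *obj)
{
	struct klp_func *func;

	klp_for_each_func(obj, func)
		pr_debug("livepatch: prepared %s:%s\n",
			 obj->name ? obj->name : "vmlinux", func->old_name);
}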
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 31322a4275cd..6f88e352cd4f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile | |||
| @@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o | |||
| 18 | endif | 18 | endif |
| 19 | obj-$(CONFIG_SMP) += spinlock.o | 19 | obj-$(CONFIG_SMP) += spinlock.o |
| 20 | obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o | 20 | obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o |
| 21 | obj-$(CONFIG_SMP) += lglock.o | ||
| 22 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | 21 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o |
| 23 | obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o | 22 | obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o |
| 24 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | 23 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o |
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c deleted file mode 100644 index 951cfcd10b4a..000000000000 --- a/kernel/locking/lglock.c +++ /dev/null | |||
| @@ -1,111 +0,0 @@ | |||
| 1 | /* See include/linux/lglock.h for description */ | ||
| 2 | #include <linux/module.h> | ||
| 3 | #include <linux/lglock.h> | ||
| 4 | #include <linux/cpu.h> | ||
| 5 | #include <linux/string.h> | ||
| 6 | |||
| 7 | /* | ||
| 8 | * Note there is no uninit, so lglocks cannot be defined in | ||
| 9 | * modules (but it's fine to use them from there) | ||
| 10 | * Could be added though, just undo lg_lock_init | ||
| 11 | */ | ||
| 12 | |||
| 13 | void lg_lock_init(struct lglock *lg, char *name) | ||
| 14 | { | ||
| 15 | LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); | ||
| 16 | } | ||
| 17 | EXPORT_SYMBOL(lg_lock_init); | ||
| 18 | |||
| 19 | void lg_local_lock(struct lglock *lg) | ||
| 20 | { | ||
| 21 | arch_spinlock_t *lock; | ||
| 22 | |||
| 23 | preempt_disable(); | ||
| 24 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
| 25 | lock = this_cpu_ptr(lg->lock); | ||
| 26 | arch_spin_lock(lock); | ||
| 27 | } | ||
| 28 | EXPORT_SYMBOL(lg_local_lock); | ||
| 29 | |||
| 30 | void lg_local_unlock(struct lglock *lg) | ||
| 31 | { | ||
| 32 | arch_spinlock_t *lock; | ||
| 33 | |||
| 34 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
| 35 | lock = this_cpu_ptr(lg->lock); | ||
| 36 | arch_spin_unlock(lock); | ||
| 37 | preempt_enable(); | ||
| 38 | } | ||
| 39 | EXPORT_SYMBOL(lg_local_unlock); | ||
| 40 | |||
| 41 | void lg_local_lock_cpu(struct lglock *lg, int cpu) | ||
| 42 | { | ||
| 43 | arch_spinlock_t *lock; | ||
| 44 | |||
| 45 | preempt_disable(); | ||
| 46 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
| 47 | lock = per_cpu_ptr(lg->lock, cpu); | ||
| 48 | arch_spin_lock(lock); | ||
| 49 | } | ||
| 50 | EXPORT_SYMBOL(lg_local_lock_cpu); | ||
| 51 | |||
| 52 | void lg_local_unlock_cpu(struct lglock *lg, int cpu) | ||
| 53 | { | ||
| 54 | arch_spinlock_t *lock; | ||
| 55 | |||
| 56 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
| 57 | lock = per_cpu_ptr(lg->lock, cpu); | ||
| 58 | arch_spin_unlock(lock); | ||
| 59 | preempt_enable(); | ||
| 60 | } | ||
| 61 | EXPORT_SYMBOL(lg_local_unlock_cpu); | ||
| 62 | |||
| 63 | void lg_double_lock(struct lglock *lg, int cpu1, int cpu2) | ||
| 64 | { | ||
| 65 | BUG_ON(cpu1 == cpu2); | ||
| 66 | |||
| 67 | /* lock in cpu order, just like lg_global_lock */ | ||
| 68 | if (cpu2 < cpu1) | ||
| 69 | swap(cpu1, cpu2); | ||
| 70 | |||
| 71 | preempt_disable(); | ||
| 72 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
| 73 | arch_spin_lock(per_cpu_ptr(lg->lock, cpu1)); | ||
| 74 | arch_spin_lock(per_cpu_ptr(lg->lock, cpu2)); | ||
| 75 | } | ||
| 76 | |||
| 77 | void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2) | ||
| 78 | { | ||
| 79 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
| 80 | arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1)); | ||
| 81 | arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2)); | ||
| 82 | preempt_enable(); | ||
| 83 | } | ||
| 84 | |||
| 85 | void lg_global_lock(struct lglock *lg) | ||
| 86 | { | ||
| 87 | int i; | ||
| 88 | |||
| 89 | preempt_disable(); | ||
| 90 | lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); | ||
| 91 | for_each_possible_cpu(i) { | ||
| 92 | arch_spinlock_t *lock; | ||
| 93 | lock = per_cpu_ptr(lg->lock, i); | ||
| 94 | arch_spin_lock(lock); | ||
| 95 | } | ||
| 96 | } | ||
| 97 | EXPORT_SYMBOL(lg_global_lock); | ||
| 98 | |||
| 99 | void lg_global_unlock(struct lglock *lg) | ||
| 100 | { | ||
| 101 | int i; | ||
| 102 | |||
| 103 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
| 104 | for_each_possible_cpu(i) { | ||
| 105 | arch_spinlock_t *lock; | ||
| 106 | lock = per_cpu_ptr(lg->lock, i); | ||
| 107 | arch_spin_unlock(lock); | ||
| 108 | } | ||
| 109 | preempt_enable(); | ||
| 110 | } | ||
| 111 | EXPORT_SYMBOL(lg_global_unlock); | ||
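The now-unused lglock implementation is deleted outright (its Makefile entry was removed above). As rough orientation only, this is how the shape of an lglock user maps onto the percpu-rwsem reworked in the next file; it is not semantics-preserving (percpu_down_read() may sleep, while lg_local_lock() merely disabled preemption), and my_lock is a hypothetical example:

#include <linux/percpu-rwsem.h>

static DEFINE_STATIC_PERCPU_RWSEM(my_lock);

static void frequent_local_path(void)
{
	percpu_down_read(&my_lock);	/* roughly lg_local_lock() */
	/* per-CPU / read-side work */
	percpu_up_read(&my_lock);	/* roughly lg_local_unlock() */
}

static void rare_global_path(void)
{
	percpu_down_write(&my_lock);	/* roughly lg_global_lock() */
	/* exclusive work against all CPUs */
	percpu_up_write(&my_lock);	/* roughly lg_global_unlock() */
}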
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index bec0b647f9cc..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c | |||
| @@ -8,152 +8,186 @@ | |||
| 8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
| 9 | #include <linux/errno.h> | 9 | #include <linux/errno.h> |
| 10 | 10 | ||
| 11 | int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, | 11 | int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, |
| 12 | const char *name, struct lock_class_key *rwsem_key) | 12 | const char *name, struct lock_class_key *rwsem_key) |
| 13 | { | 13 | { |
| 14 | brw->fast_read_ctr = alloc_percpu(int); | 14 | sem->read_count = alloc_percpu(int); |
| 15 | if (unlikely(!brw->fast_read_ctr)) | 15 | if (unlikely(!sem->read_count)) |
| 16 | return -ENOMEM; | 16 | return -ENOMEM; |
| 17 | 17 | ||
| 18 | /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ | 18 | /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ |
| 19 | __init_rwsem(&brw->rw_sem, name, rwsem_key); | 19 | rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); |
| 20 | rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); | 20 | __init_rwsem(&sem->rw_sem, name, rwsem_key); |
| 21 | atomic_set(&brw->slow_read_ctr, 0); | 21 | init_waitqueue_head(&sem->writer); |
| 22 | init_waitqueue_head(&brw->write_waitq); | 22 | sem->readers_block = 0; |
| 23 | return 0; | 23 | return 0; |
| 24 | } | 24 | } |
| 25 | EXPORT_SYMBOL_GPL(__percpu_init_rwsem); | 25 | EXPORT_SYMBOL_GPL(__percpu_init_rwsem); |
| 26 | 26 | ||
| 27 | void percpu_free_rwsem(struct percpu_rw_semaphore *brw) | 27 | void percpu_free_rwsem(struct percpu_rw_semaphore *sem) |
| 28 | { | 28 | { |
| 29 | /* | 29 | /* |
| 30 | * XXX: temporary kludge. The error path in alloc_super() | 30 | * XXX: temporary kludge. The error path in alloc_super() |
| 31 | * assumes that percpu_free_rwsem() is safe after kzalloc(). | 31 | * assumes that percpu_free_rwsem() is safe after kzalloc(). |
| 32 | */ | 32 | */ |
| 33 | if (!brw->fast_read_ctr) | 33 | if (!sem->read_count) |
| 34 | return; | 34 | return; |
| 35 | 35 | ||
| 36 | rcu_sync_dtor(&brw->rss); | 36 | rcu_sync_dtor(&sem->rss); |
| 37 | free_percpu(brw->fast_read_ctr); | 37 | free_percpu(sem->read_count); |
| 38 | brw->fast_read_ctr = NULL; /* catch use after free bugs */ | 38 | sem->read_count = NULL; /* catch use after free bugs */ |
| 39 | } | 39 | } |
| 40 | EXPORT_SYMBOL_GPL(percpu_free_rwsem); | 40 | EXPORT_SYMBOL_GPL(percpu_free_rwsem); |
| 41 | 41 | ||
| 42 | /* | 42 | int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) |
| 43 | * This is the fast-path for down_read/up_read. If it succeeds we rely | ||
| 44 | * on the barriers provided by rcu_sync_enter/exit; see the comments in | ||
| 45 | * percpu_down_write() and percpu_up_write(). | ||
| 46 | * | ||
| 47 | * If this helper fails the callers rely on the normal rw_semaphore and | ||
| 48 | * atomic_dec_and_test(), so in this case we have the necessary barriers. | ||
| 49 | */ | ||
| 50 | static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) | ||
| 51 | { | 43 | { |
| 52 | bool success; | 44 | /* |
| 45 | * Due to having preemption disabled the decrement happens on | ||
| 46 | * the same CPU as the increment, avoiding the | ||
| 47 | * increment-on-one-CPU-and-decrement-on-another problem. | ||
| 48 | * | ||
| 49 | * If the reader misses the writer's assignment of readers_block, then | ||
| 50 | * the writer is guaranteed to see the reader's increment. | ||
| 51 | * | ||
| 52 | * Conversely, any readers that increment their sem->read_count after | ||
| 53 | * the writer looks are guaranteed to see the readers_block value, | ||
| 54 | * which in turn means that they are guaranteed to immediately | ||
| 55 | * decrement their sem->read_count, so that it doesn't matter that the | ||
| 56 | * writer missed them. | ||
| 57 | */ | ||
| 53 | 58 | ||
| 54 | preempt_disable(); | 59 | smp_mb(); /* A matches D */ |
| 55 | success = rcu_sync_is_idle(&brw->rss); | ||
| 56 | if (likely(success)) | ||
| 57 | __this_cpu_add(*brw->fast_read_ctr, val); | ||
| 58 | preempt_enable(); | ||
| 59 | 60 | ||
| 60 | return success; | 61 | /* |
| 61 | } | 62 | * If !readers_block the critical section starts here, matched by the |
| 63 | * release in percpu_up_write(). | ||
| 64 | */ | ||
| 65 | if (likely(!smp_load_acquire(&sem->readers_block))) | ||
| 66 | return 1; | ||
| 62 | 67 | ||
| 63 | /* | 68 | /* |
| 64 | * Like the normal down_read() this is not recursive, the writer can | 69 | * Per the above comment; we still have preemption disabled and |
| 65 | * come after the first percpu_down_read() and create the deadlock. | 70 | * will thus decrement on the same CPU as we incremented. |
| 66 | * | 71 | */ |
| 67 | * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, | 72 | __percpu_up_read(sem); |
| 68 | * percpu_up_read() does rwsem_release(). This pairs with the usage | ||
| 69 | * of ->rw_sem in percpu_down/up_write(). | ||
| 70 | */ | ||
| 71 | void percpu_down_read(struct percpu_rw_semaphore *brw) | ||
| 72 | { | ||
| 73 | might_sleep(); | ||
| 74 | rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); | ||
| 75 | 73 | ||
| 76 | if (likely(update_fast_ctr(brw, +1))) | 74 | if (try) |
| 77 | return; | 75 | return 0; |
| 78 | 76 | ||
| 79 | /* Avoid rwsem_acquire_read() and rwsem_release() */ | 77 | /* |
| 80 | __down_read(&brw->rw_sem); | 78 | * We either call schedule() in the wait, or we'll fall through |
| 81 | atomic_inc(&brw->slow_read_ctr); | 79 | * and reschedule on the preempt_enable() in percpu_down_read(). |
| 82 | __up_read(&brw->rw_sem); | 80 | */ |
| 83 | } | 81 | preempt_enable_no_resched(); |
| 84 | EXPORT_SYMBOL_GPL(percpu_down_read); | ||
| 85 | 82 | ||
| 86 | int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) | 83 | /* |
| 87 | { | 84 | * Avoid lockdep for the down/up_read() we already have them. |
| 88 | if (unlikely(!update_fast_ctr(brw, +1))) { | 85 | */ |
| 89 | if (!__down_read_trylock(&brw->rw_sem)) | 86 | __down_read(&sem->rw_sem); |
| 90 | return 0; | 87 | this_cpu_inc(*sem->read_count); |
| 91 | atomic_inc(&brw->slow_read_ctr); | 88 | __up_read(&sem->rw_sem); |
| 92 | __up_read(&brw->rw_sem); | 89 | |
| 93 | } | 90 | preempt_disable(); |
| 94 | |||
| 95 | rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); | ||
| 96 | return 1; | 91 | return 1; |
| 97 | } | 92 | } |
| 93 | EXPORT_SYMBOL_GPL(__percpu_down_read); | ||
| 98 | 94 | ||
| 99 | void percpu_up_read(struct percpu_rw_semaphore *brw) | 95 | void __percpu_up_read(struct percpu_rw_semaphore *sem) |
| 100 | { | 96 | { |
| 101 | rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); | 97 | smp_mb(); /* B matches C */ |
| 102 | 98 | /* | |
| 103 | if (likely(update_fast_ctr(brw, -1))) | 99 | * In other words, if they see our decrement (presumably to aggregate |
| 104 | return; | 100 | * zero, as that is the only time it matters) they will also see our |
| 101 | * critical section. | ||
| 102 | */ | ||
| 103 | __this_cpu_dec(*sem->read_count); | ||
| 105 | 104 | ||
| 106 | /* false-positive is possible but harmless */ | 105 | /* Prod writer to recheck readers_active */ |
| 107 | if (atomic_dec_and_test(&brw->slow_read_ctr)) | 106 | wake_up(&sem->writer); |
| 108 | wake_up_all(&brw->write_waitq); | ||
| 109 | } | 107 | } |
| 110 | EXPORT_SYMBOL_GPL(percpu_up_read); | 108 | EXPORT_SYMBOL_GPL(__percpu_up_read); |
| 109 | |||
| 110 | #define per_cpu_sum(var) \ | ||
| 111 | ({ \ | ||
| 112 | typeof(var) __sum = 0; \ | ||
| 113 | int cpu; \ | ||
| 114 | compiletime_assert_atomic_type(__sum); \ | ||
| 115 | for_each_possible_cpu(cpu) \ | ||
| 116 | __sum += per_cpu(var, cpu); \ | ||
| 117 | __sum; \ | ||
| 118 | }) | ||
| 111 | 119 | ||
| 112 | static int clear_fast_ctr(struct percpu_rw_semaphore *brw) | 120 | /* |
| 121 | * Return true if the modular sum of the sem->read_count per-CPU variable is | ||
| 122 | * zero. If this sum is zero, then it is stable due to the fact that if any | ||
| 123 | * newly arriving readers increment a given counter, they will immediately | ||
| 124 | * decrement that same counter. | ||
| 125 | */ | ||
| 126 | static bool readers_active_check(struct percpu_rw_semaphore *sem) | ||
| 113 | { | 127 | { |
| 114 | unsigned int sum = 0; | 128 | if (per_cpu_sum(*sem->read_count) != 0) |
| 115 | int cpu; | 129 | return false; |
| 130 | |||
| 131 | /* | ||
| 132 | * If we observed the decrement; ensure we see the entire critical | ||
| 133 | * section. | ||
| 134 | */ | ||
| 116 | 135 | ||
| 117 | for_each_possible_cpu(cpu) { | 136 | smp_mb(); /* C matches B */ |
| 118 | sum += per_cpu(*brw->fast_read_ctr, cpu); | ||
| 119 | per_cpu(*brw->fast_read_ctr, cpu) = 0; | ||
| 120 | } | ||
| 121 | 137 | ||
| 122 | return sum; | 138 | return true; |
| 123 | } | 139 | } |
| 124 | 140 | ||
| 125 | void percpu_down_write(struct percpu_rw_semaphore *brw) | 141 | void percpu_down_write(struct percpu_rw_semaphore *sem) |
| 126 | { | 142 | { |
| 143 | /* Notify readers to take the slow path. */ | ||
| 144 | rcu_sync_enter(&sem->rss); | ||
| 145 | |||
| 146 | down_write(&sem->rw_sem); | ||
| 147 | |||
| 127 | /* | 148 | /* |
| 128 | * Make rcu_sync_is_idle() == F and thus disable the fast-path in | 149 | * Notify new readers to block; up until now, and thus throughout the |
| 129 | * percpu_down_read() and percpu_up_read(), and wait for gp pass. | 150 | * longish rcu_sync_enter() above, new readers could still come in. |
| 130 | * | ||
| 131 | * The latter synchronises us with the preceding readers which used | ||
| 132 | * the fast-path, so we cannot miss the result of __this_cpu_add() | ||
| 133 | * or anything else inside their critical sections. | ||
| 134 | */ | 151 | */ |
| 135 | rcu_sync_enter(&brw->rss); | 152 | WRITE_ONCE(sem->readers_block, 1); |
| 136 | 153 | ||
| 137 | /* exclude other writers, and block the new readers completely */ | 154 | smp_mb(); /* D matches A */ |
| 138 | down_write(&brw->rw_sem); | ||
| 139 | 155 | ||
| 140 | /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ | 156 | /* |
| 141 | atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); | 157 | * If they don't see our writer of readers_block, then we are |
| 158 | * guaranteed to see their sem->read_count increment, and therefore | ||
| 159 | * will wait for them. | ||
| 160 | */ | ||
| 142 | 161 | ||
| 143 | /* wait for all readers to complete their percpu_up_read() */ | 162 | /* Wait for all now active readers to complete. */ |
| 144 | wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); | 163 | wait_event(sem->writer, readers_active_check(sem)); |
| 145 | } | 164 | } |
| 146 | EXPORT_SYMBOL_GPL(percpu_down_write); | 165 | EXPORT_SYMBOL_GPL(percpu_down_write); |
| 147 | 166 | ||
| 148 | void percpu_up_write(struct percpu_rw_semaphore *brw) | 167 | void percpu_up_write(struct percpu_rw_semaphore *sem) |
| 149 | { | 168 | { |
| 150 | /* release the lock, but the readers can't use the fast-path */ | ||
| 151 | up_write(&brw->rw_sem); | ||
| 152 | /* | 169 | /* |
| 153 | * Enable the fast-path in percpu_down_read() and percpu_up_read() | 170 | * Signal the writer is done, no fast path yet. |
| 154 | * but only after another gp pass; this adds the necessary barrier | 171 | * |
| 155 | * to ensure the reader can't miss the changes done by us. | 172 | * One reason that we cannot just immediately flip to readers_fast is |
| 173 | * that new readers might fail to see the results of this writer's | ||
| 174 | * critical section. | ||
| 175 | * | ||
| 176 | * Therefore we force it through the slow path which guarantees an | ||
| 177 | * acquire and thereby guarantees the critical section's consistency. | ||
| 178 | */ | ||
| 179 | smp_store_release(&sem->readers_block, 0); | ||
| 180 | |||
| 181 | /* | ||
| 182 | * Release the write lock, this will allow readers back in the game. | ||
| 183 | */ | ||
| 184 | up_write(&sem->rw_sem); | ||
| 185 | |||
| 186 | /* | ||
| 187 | * Once this completes (at least one RCU-sched grace period hence) the | ||
| 188 | * reader fast path will be available again. Safe to use outside the | ||
| 189 | * exclusive write lock because it's counting. | ||
| 156 | */ | 190 | */ |
| 157 | rcu_sync_exit(&brw->rss); | 191 | rcu_sync_exit(&sem->rss); |
| 158 | } | 192 | } |
| 159 | EXPORT_SYMBOL_GPL(percpu_up_write); | 193 | EXPORT_SYMBOL_GPL(percpu_up_write); |
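The percpu-rwsem rewrite above replaces the fast_read_ctr/slow_read_ctr pair with a single per-CPU read_count plus a readers_block flag; the labelled barriers pair up as A with D (reader entry vs. the writer setting readers_block) and B with C (reader exit vs. the writer summing the counters). For context, a condensed sketch of the header-side fast paths that end up calling __percpu_down_read()/__percpu_up_read() above; the real inlines live in <linux/percpu-rwsem.h> and also carry the lockdep annotations omitted here:

static inline void my_percpu_down_read(struct percpu_rw_semaphore *sem)
{
	might_sleep();
	preempt_disable();
	__this_cpu_inc(*sem->read_count);
	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
		__percpu_down_read(sem, false);	/* full barrier, may sleep */
	preempt_enable();
}

static inline void my_percpu_up_read(struct percpu_rw_semaphore *sem)
{
	preempt_disable();
	if (likely(rcu_sync_is_idle(&sem->rss)))
		__this_cpu_dec(*sem->read_count);
	else
		__percpu_up_read(sem);		/* smp_mb() plus writer wakeup */
	preempt_enable();
}

While the writer holds the rcu_sync state out of idle, every reader takes the slow path, which is exactly what percpu_down_write() relies on.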
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8a99abf58080..e3b5520005db 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h | |||
| @@ -70,11 +70,14 @@ struct pv_node { | |||
| 70 | static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) | 70 | static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) |
| 71 | { | 71 | { |
| 72 | struct __qspinlock *l = (void *)lock; | 72 | struct __qspinlock *l = (void *)lock; |
| 73 | int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && | ||
| 74 | (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); | ||
| 75 | 73 | ||
| 76 | qstat_inc(qstat_pv_lock_stealing, ret); | 74 | if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && |
| 77 | return ret; | 75 | (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { |
| 76 | qstat_inc(qstat_pv_lock_stealing, true); | ||
| 77 | return true; | ||
| 78 | } | ||
| 79 | |||
| 80 | return false; | ||
| 78 | } | 81 | } |
| 79 | 82 | ||
| 80 | /* | 83 | /* |
| @@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) | |||
| 257 | static inline bool | 260 | static inline bool |
| 258 | pv_wait_early(struct pv_node *prev, int loop) | 261 | pv_wait_early(struct pv_node *prev, int loop) |
| 259 | { | 262 | { |
| 260 | |||
| 261 | if ((loop & PV_PREV_CHECK_MASK) != 0) | 263 | if ((loop & PV_PREV_CHECK_MASK) != 0) |
| 262 | return false; | 264 | return false; |
| 263 | 265 | ||
| @@ -286,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) | |||
| 286 | { | 288 | { |
| 287 | struct pv_node *pn = (struct pv_node *)node; | 289 | struct pv_node *pn = (struct pv_node *)node; |
| 288 | struct pv_node *pp = (struct pv_node *)prev; | 290 | struct pv_node *pp = (struct pv_node *)prev; |
| 289 | int waitcnt = 0; | ||
| 290 | int loop; | 291 | int loop; |
| 291 | bool wait_early; | 292 | bool wait_early; |
| 292 | 293 | ||
| 293 | /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ | 294 | for (;;) { |
| 294 | for (;; waitcnt++) { | ||
| 295 | for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { | 295 | for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { |
| 296 | if (READ_ONCE(node->locked)) | 296 | if (READ_ONCE(node->locked)) |
| 297 | return; | 297 | return; |
| @@ -315,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) | |||
| 315 | 315 | ||
| 316 | if (!READ_ONCE(node->locked)) { | 316 | if (!READ_ONCE(node->locked)) { |
| 317 | qstat_inc(qstat_pv_wait_node, true); | 317 | qstat_inc(qstat_pv_wait_node, true); |
| 318 | qstat_inc(qstat_pv_wait_again, waitcnt); | ||
| 319 | qstat_inc(qstat_pv_wait_early, wait_early); | 318 | qstat_inc(qstat_pv_wait_early, wait_early); |
| 320 | pv_wait(&pn->state, vcpu_halted); | 319 | pv_wait(&pn->state, vcpu_halted); |
| 321 | } | 320 | } |
| @@ -456,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) | |||
| 456 | pv_wait(&l->locked, _Q_SLOW_VAL); | 455 | pv_wait(&l->locked, _Q_SLOW_VAL); |
| 457 | 456 | ||
| 458 | /* | 457 | /* |
| 459 | * The unlocker should have freed the lock before kicking the | 458 | * Because of lock stealing, the queue head vCPU may not be |
| 460 | * CPU. So if the lock is still not free, it is a spurious | 459 | * able to acquire the lock before it has to wait again. |
| 461 | * wakeup or another vCPU has stolen the lock. The current | ||
| 462 | * vCPU should spin again. | ||
| 463 | */ | 460 | */ |
| 464 | qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); | ||
| 465 | } | 461 | } |
| 466 | 462 | ||
| 467 | /* | 463 | /* |
| @@ -544,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) | |||
| 544 | * unhash. Otherwise it would be possible to have multiple @lock | 540 | * unhash. Otherwise it would be possible to have multiple @lock |
| 545 | * entries, which would be BAD. | 541 | * entries, which would be BAD. |
| 546 | */ | 542 | */ |
| 547 | locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); | 543 | locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); |
| 548 | if (likely(locked == _Q_LOCKED_VAL)) | 544 | if (likely(locked == _Q_LOCKED_VAL)) |
| 549 | return; | 545 | return; |
| 550 | 546 | ||
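Two small qspinlock_paravirt changes: lock stealing is only counted when the steal actually succeeds, and the unlock-side cmpxchg() is relaxed to cmpxchg_release(), since releasing the lock only has to order the critical section before the store that clears ->locked (the acquire half comes from the next locker). The steal path itself is the usual read-then-cmpxchg trylock shape; a generic illustration, not tied to the qspinlock internals:

/* Illustrative trylock: cheap read first, a single cmpxchg only when the
 * lock looks free, so contended CPUs do not keep dirtying the cache line. */
static inline bool try_grab(atomic_t *lock_word)
{
	return atomic_read(lock_word) == 0 &&
	       atomic_cmpxchg_acquire(lock_word, 0, 1) == 0;
}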
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index b9d031516254..eb0a599fcf58 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h | |||
| @@ -24,8 +24,8 @@ | |||
| 24 | * pv_latency_wake - average latency (ns) from vCPU kick to wakeup | 24 | * pv_latency_wake - average latency (ns) from vCPU kick to wakeup |
| 25 | * pv_lock_slowpath - # of locking operations via the slowpath | 25 | * pv_lock_slowpath - # of locking operations via the slowpath |
| 26 | * pv_lock_stealing - # of lock stealing operations | 26 | * pv_lock_stealing - # of lock stealing operations |
| 27 | * pv_spurious_wakeup - # of spurious wakeups | 27 | * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs |
| 28 | * pv_wait_again - # of vCPU wait's that happened after a vCPU kick | 28 | * pv_wait_again - # of wait's after a queue head vCPU kick |
| 29 | * pv_wait_early - # of early vCPU wait's | 29 | * pv_wait_early - # of early vCPU wait's |
| 30 | * pv_wait_head - # of vCPU wait's at the queue head | 30 | * pv_wait_head - # of vCPU wait's at the queue head |
| 31 | * pv_wait_node - # of vCPU wait's at a non-head queue node | 31 | * pv_wait_node - # of vCPU wait's at a non-head queue node |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 447e08de1fab..2337b4bb2366 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
| @@ -121,16 +121,19 @@ enum rwsem_wake_type { | |||
| 121 | * - woken process blocks are discarded from the list after having task zeroed | 121 | * - woken process blocks are discarded from the list after having task zeroed |
| 122 | * - writers are only marked woken if downgrading is false | 122 | * - writers are only marked woken if downgrading is false |
| 123 | */ | 123 | */ |
| 124 | static struct rw_semaphore * | 124 | static void __rwsem_mark_wake(struct rw_semaphore *sem, |
| 125 | __rwsem_mark_wake(struct rw_semaphore *sem, | 125 | enum rwsem_wake_type wake_type, |
| 126 | enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) | 126 | struct wake_q_head *wake_q) |
| 127 | { | 127 | { |
| 128 | struct rwsem_waiter *waiter; | 128 | struct rwsem_waiter *waiter, *tmp; |
| 129 | struct task_struct *tsk; | 129 | long oldcount, woken = 0, adjustment = 0; |
| 130 | struct list_head *next; | 130 | |
| 131 | long oldcount, woken, loop, adjustment; | 131 | /* |
| 132 | * Take a peek at the queue head waiter such that we can determine | ||
| 133 | * the wakeup(s) to perform. | ||
| 134 | */ | ||
| 135 | waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); | ||
| 132 | 136 | ||
| 133 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | ||
| 134 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { | 137 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { |
| 135 | if (wake_type == RWSEM_WAKE_ANY) { | 138 | if (wake_type == RWSEM_WAKE_ANY) { |
| 136 | /* | 139 | /* |
| @@ -142,19 +145,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
| 142 | */ | 145 | */ |
| 143 | wake_q_add(wake_q, waiter->task); | 146 | wake_q_add(wake_q, waiter->task); |
| 144 | } | 147 | } |
| 145 | goto out; | 148 | |
| 149 | return; | ||
| 146 | } | 150 | } |
| 147 | 151 | ||
| 148 | /* Writers might steal the lock before we grant it to the next reader. | 152 | /* |
| 153 | * Writers might steal the lock before we grant it to the next reader. | ||
| 149 | * We prefer to do the first reader grant before counting readers | 154 | * We prefer to do the first reader grant before counting readers |
| 150 | * so we can bail out early if a writer stole the lock. | 155 | * so we can bail out early if a writer stole the lock. |
| 151 | */ | 156 | */ |
| 152 | adjustment = 0; | ||
| 153 | if (wake_type != RWSEM_WAKE_READ_OWNED) { | 157 | if (wake_type != RWSEM_WAKE_READ_OWNED) { |
| 154 | adjustment = RWSEM_ACTIVE_READ_BIAS; | 158 | adjustment = RWSEM_ACTIVE_READ_BIAS; |
| 155 | try_reader_grant: | 159 | try_reader_grant: |
| 156 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); | 160 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); |
| 157 | |||
| 158 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { | 161 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { |
| 159 | /* | 162 | /* |
| 160 | * If the count is still less than RWSEM_WAITING_BIAS | 163 | * If the count is still less than RWSEM_WAITING_BIAS |
| @@ -164,7 +167,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
| 164 | */ | 167 | */ |
| 165 | if (atomic_long_add_return(-adjustment, &sem->count) < | 168 | if (atomic_long_add_return(-adjustment, &sem->count) < |
| 166 | RWSEM_WAITING_BIAS) | 169 | RWSEM_WAITING_BIAS) |
| 167 | goto out; | 170 | return; |
| 171 | |||
| 168 | /* Last active locker left. Retry waking readers. */ | 172 | /* Last active locker left. Retry waking readers. */ |
| 169 | goto try_reader_grant; | 173 | goto try_reader_grant; |
| 170 | } | 174 | } |
| @@ -176,38 +180,23 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
| 176 | rwsem_set_reader_owned(sem); | 180 | rwsem_set_reader_owned(sem); |
| 177 | } | 181 | } |
| 178 | 182 | ||
| 179 | /* Grant an infinite number of read locks to the readers at the front | 183 | /* |
| 180 | * of the queue. Note we increment the 'active part' of the count by | 184 | * Grant an infinite number of read locks to the readers at the front |
| 181 | * the number of readers before waking any processes up. | 185 | * of the queue. We know that woken will be at least 1 as we accounted |
| 186 | * for above. Note we increment the 'active part' of the count by the | ||
| 187 | * number of readers before waking any processes up. | ||
| 182 | */ | 188 | */ |
| 183 | woken = 0; | 189 | list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { |
| 184 | do { | 190 | struct task_struct *tsk; |
| 185 | woken++; | ||
| 186 | 191 | ||
| 187 | if (waiter->list.next == &sem->wait_list) | 192 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) |
| 188 | break; | 193 | break; |
| 189 | 194 | ||
| 190 | waiter = list_entry(waiter->list.next, | 195 | woken++; |
| 191 | struct rwsem_waiter, list); | ||
| 192 | |||
| 193 | } while (waiter->type != RWSEM_WAITING_FOR_WRITE); | ||
| 194 | |||
| 195 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; | ||
| 196 | if (waiter->type != RWSEM_WAITING_FOR_WRITE) | ||
| 197 | /* hit end of list above */ | ||
| 198 | adjustment -= RWSEM_WAITING_BIAS; | ||
| 199 | |||
| 200 | if (adjustment) | ||
| 201 | atomic_long_add(adjustment, &sem->count); | ||
| 202 | |||
| 203 | next = sem->wait_list.next; | ||
| 204 | loop = woken; | ||
| 205 | do { | ||
| 206 | waiter = list_entry(next, struct rwsem_waiter, list); | ||
| 207 | next = waiter->list.next; | ||
| 208 | tsk = waiter->task; | 196 | tsk = waiter->task; |
| 209 | 197 | ||
| 210 | wake_q_add(wake_q, tsk); | 198 | wake_q_add(wake_q, tsk); |
| 199 | list_del(&waiter->list); | ||
| 211 | /* | 200 | /* |
| 212 | * Ensure that the last operation is setting the reader | 201 | * Ensure that the last operation is setting the reader |
| 213 | * waiter to nil such that rwsem_down_read_failed() cannot | 202 | * waiter to nil such that rwsem_down_read_failed() cannot |
| @@ -215,13 +204,16 @@ __rwsem_mark_wake(struct rw_semaphore *sem, | |||
| 215 | * to the task to wakeup. | 204 | * to the task to wakeup. |
| 216 | */ | 205 | */ |
| 217 | smp_store_release(&waiter->task, NULL); | 206 | smp_store_release(&waiter->task, NULL); |
| 218 | } while (--loop); | 207 | } |
| 219 | 208 | ||
| 220 | sem->wait_list.next = next; | 209 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; |
| 221 | next->prev = &sem->wait_list; | 210 | if (list_empty(&sem->wait_list)) { |
| 211 | /* hit end of list above */ | ||
| 212 | adjustment -= RWSEM_WAITING_BIAS; | ||
| 213 | } | ||
| 222 | 214 | ||
| 223 | out: | 215 | if (adjustment) |
| 224 | return sem; | 216 | atomic_long_add(adjustment, &sem->count); |
| 225 | } | 217 | } |
| 226 | 218 | ||
| 227 | /* | 219 | /* |
| @@ -235,7 +227,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 235 | struct task_struct *tsk = current; | 227 | struct task_struct *tsk = current; |
| 236 | WAKE_Q(wake_q); | 228 | WAKE_Q(wake_q); |
| 237 | 229 | ||
| 238 | /* set up my own style of waitqueue */ | ||
| 239 | waiter.task = tsk; | 230 | waiter.task = tsk; |
| 240 | waiter.type = RWSEM_WAITING_FOR_READ; | 231 | waiter.type = RWSEM_WAITING_FOR_READ; |
| 241 | 232 | ||
| @@ -247,7 +238,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 247 | /* we're now waiting on the lock, but no longer actively locking */ | 238 | /* we're now waiting on the lock, but no longer actively locking */ |
| 248 | count = atomic_long_add_return(adjustment, &sem->count); | 239 | count = atomic_long_add_return(adjustment, &sem->count); |
| 249 | 240 | ||
| 250 | /* If there are no active locks, wake the front queued process(es). | 241 | /* |
| 242 | * If there are no active locks, wake the front queued process(es). | ||
| 251 | * | 243 | * |
| 252 | * If there are no writers and we are first in the queue, | 244 | * If there are no writers and we are first in the queue, |
| 253 | * wake our own waiter to join the existing active readers ! | 245 | * wake our own waiter to join the existing active readers ! |
| @@ -255,7 +247,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 255 | if (count == RWSEM_WAITING_BIAS || | 247 | if (count == RWSEM_WAITING_BIAS || |
| 256 | (count > RWSEM_WAITING_BIAS && | 248 | (count > RWSEM_WAITING_BIAS && |
| 257 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) | 249 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) |
| 258 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | 250 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
| 259 | 251 | ||
| 260 | raw_spin_unlock_irq(&sem->wait_lock); | 252 | raw_spin_unlock_irq(&sem->wait_lock); |
| 261 | wake_up_q(&wake_q); | 253 | wake_up_q(&wake_q); |
| @@ -505,7 +497,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | |||
| 505 | if (count > RWSEM_WAITING_BIAS) { | 497 | if (count > RWSEM_WAITING_BIAS) { |
| 506 | WAKE_Q(wake_q); | 498 | WAKE_Q(wake_q); |
| 507 | 499 | ||
| 508 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); | 500 | __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); |
| 509 | /* | 501 | /* |
| 510 | * The wakeup is normally called _after_ the wait_lock | 502 | * The wakeup is normally called _after_ the wait_lock |
| 511 | * is released, but given that we are proactively waking | 503 | * is released, but given that we are proactively waking |
| @@ -614,9 +606,8 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | |||
| 614 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 606 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
| 615 | locked: | 607 | locked: |
| 616 | 608 | ||
| 617 | /* do nothing if list empty */ | ||
| 618 | if (!list_empty(&sem->wait_list)) | 609 | if (!list_empty(&sem->wait_list)) |
| 619 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | 610 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
| 620 | 611 | ||
| 621 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 612 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
| 622 | wake_up_q(&wake_q); | 613 | wake_up_q(&wake_q); |
| @@ -638,9 +629,8 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) | |||
| 638 | 629 | ||
| 639 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 630 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
| 640 | 631 | ||
| 641 | /* do nothing if list empty */ | ||
| 642 | if (!list_empty(&sem->wait_list)) | 632 | if (!list_empty(&sem->wait_list)) |
| 643 | sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); | 633 | __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); |
| 644 | 634 | ||
| 645 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 635 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
| 646 | wake_up_q(&wake_q); | 636 | wake_up_q(&wake_q); |
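__rwsem_mark_wake() now walks the wait list with list_for_each_entry_safe(), deletes each woken reader as it goes, applies the count adjustment once after the walk, and no longer returns the semaphore. Callers keep the wake_q pattern: wakeups are queued while the wait_lock is held and issued only after it is dropped. A generic illustration of that pattern, with hypothetical my_* names:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct my_waiter {
	struct list_head list;
	struct task_struct *task;
};

static void my_wake_all(raw_spinlock_t *lock, struct list_head *waiters)
{
	struct my_waiter *w, *tmp;
	WAKE_Q(wake_q);

	raw_spin_lock(lock);
	list_for_each_entry_safe(w, tmp, waiters, list) {
		wake_q_add(&wake_q, w->task);
		list_del(&w->list);
	}
	raw_spin_unlock(lock);

	wake_up_q(&wake_q);	/* the actual wakeups happen outside the lock */
}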
diff --git a/kernel/memremap.c b/kernel/memremap.c index 251d16b4cb41..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
| @@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) | |||
| 247 | align_start = res->start & ~(SECTION_SIZE - 1); | 247 | align_start = res->start & ~(SECTION_SIZE - 1); |
| 248 | align_size = ALIGN(resource_size(res), SECTION_SIZE); | 248 | align_size = ALIGN(resource_size(res), SECTION_SIZE); |
| 249 | arch_remove_memory(align_start, align_size); | 249 | arch_remove_memory(align_start, align_size); |
| 250 | untrack_pfn(NULL, PHYS_PFN(align_start), align_size); | ||
| 250 | pgmap_radix_release(res); | 251 | pgmap_radix_release(res); |
| 251 | dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, | 252 | dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, |
| 252 | "%s: failed to free all reserved pages\n", __func__); | 253 | "%s: failed to free all reserved pages\n", __func__); |
| @@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 282 | struct percpu_ref *ref, struct vmem_altmap *altmap) | 283 | struct percpu_ref *ref, struct vmem_altmap *altmap) |
| 283 | { | 284 | { |
| 284 | resource_size_t key, align_start, align_size, align_end; | 285 | resource_size_t key, align_start, align_size, align_end; |
| 286 | pgprot_t pgprot = PAGE_KERNEL; | ||
| 285 | struct dev_pagemap *pgmap; | 287 | struct dev_pagemap *pgmap; |
| 286 | struct page_map *page_map; | 288 | struct page_map *page_map; |
| 287 | int error, nid, is_ram; | 289 | int error, nid, is_ram; |
| @@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 351 | if (nid < 0) | 353 | if (nid < 0) |
| 352 | nid = numa_mem_id(); | 354 | nid = numa_mem_id(); |
| 353 | 355 | ||
| 356 | error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, | ||
| 357 | align_size); | ||
| 358 | if (error) | ||
| 359 | goto err_pfn_remap; | ||
| 360 | |||
| 354 | error = arch_add_memory(nid, align_start, align_size, true); | 361 | error = arch_add_memory(nid, align_start, align_size, true); |
| 355 | if (error) | 362 | if (error) |
| 356 | goto err_add_memory; | 363 | goto err_add_memory; |
| @@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 371 | return __va(res->start); | 378 | return __va(res->start); |
| 372 | 379 | ||
| 373 | err_add_memory: | 380 | err_add_memory: |
| 381 | untrack_pfn(NULL, PHYS_PFN(align_start), align_size); | ||
| 382 | err_pfn_remap: | ||
| 374 | err_radix: | 383 | err_radix: |
| 375 | pgmap_radix_release(res); | 384 | pgmap_radix_release(res); |
| 376 | devres_free(page_map); | 385 | devres_free(page_map); |
diff --git a/kernel/module.c b/kernel/module.c index 529efae9f481..f57dd63186e6 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -1149,6 +1149,8 @@ static size_t module_flags_taint(struct module *mod, char *buf) | |||
| 1149 | buf[l++] = 'C'; | 1149 | buf[l++] = 'C'; |
| 1150 | if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) | 1150 | if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) |
| 1151 | buf[l++] = 'E'; | 1151 | buf[l++] = 'E'; |
| 1152 | if (mod->taints & (1 << TAINT_LIVEPATCH)) | ||
| 1153 | buf[l++] = 'K'; | ||
| 1152 | /* | 1154 | /* |
| 1153 | * TAINT_FORCED_RMMOD: could be added. | 1155 | * TAINT_FORCED_RMMOD: could be added. |
| 1154 | * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | 1156 | * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't |
| @@ -2792,14 +2794,17 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l | |||
| 2792 | } | 2794 | } |
| 2793 | 2795 | ||
| 2794 | #ifdef CONFIG_LIVEPATCH | 2796 | #ifdef CONFIG_LIVEPATCH |
| 2795 | static int find_livepatch_modinfo(struct module *mod, struct load_info *info) | 2797 | static int check_modinfo_livepatch(struct module *mod, struct load_info *info) |
| 2796 | { | 2798 | { |
| 2797 | mod->klp = get_modinfo(info, "livepatch") ? true : false; | 2799 | if (get_modinfo(info, "livepatch")) { |
| 2800 | mod->klp = true; | ||
| 2801 | add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); | ||
| 2802 | } | ||
| 2798 | 2803 | ||
| 2799 | return 0; | 2804 | return 0; |
| 2800 | } | 2805 | } |
| 2801 | #else /* !CONFIG_LIVEPATCH */ | 2806 | #else /* !CONFIG_LIVEPATCH */ |
| 2802 | static int find_livepatch_modinfo(struct module *mod, struct load_info *info) | 2807 | static int check_modinfo_livepatch(struct module *mod, struct load_info *info) |
| 2803 | { | 2808 | { |
| 2804 | if (get_modinfo(info, "livepatch")) { | 2809 | if (get_modinfo(info, "livepatch")) { |
| 2805 | pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", | 2810 | pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", |
| @@ -2969,7 +2974,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) | |||
| 2969 | "is unknown, you have been warned.\n", mod->name); | 2974 | "is unknown, you have been warned.\n", mod->name); |
| 2970 | } | 2975 | } |
| 2971 | 2976 | ||
| 2972 | err = find_livepatch_modinfo(mod, info); | 2977 | err = check_modinfo_livepatch(mod, info); |
| 2973 | if (err) | 2978 | if (err) |
| 2974 | return err; | 2979 | return err; |
| 2975 | 2980 | ||
diff --git a/kernel/padata.c b/kernel/padata.c index 993278895ccc..7848f0566403 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
| 31 | #include <linux/sysfs.h> | 31 | #include <linux/sysfs.h> |
| 32 | #include <linux/rcupdate.h> | 32 | #include <linux/rcupdate.h> |
| 33 | #include <linux/module.h> | ||
| 33 | 34 | ||
| 34 | #define MAX_OBJ_NUM 1000 | 35 | #define MAX_OBJ_NUM 1000 |
| 35 | 36 | ||
| @@ -769,52 +770,43 @@ static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) | |||
| 769 | cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); | 770 | cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); |
| 770 | } | 771 | } |
| 771 | 772 | ||
| 772 | 773 | static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) | |
| 773 | static int padata_cpu_callback(struct notifier_block *nfb, | ||
| 774 | unsigned long action, void *hcpu) | ||
| 775 | { | 774 | { |
| 776 | int err; | ||
| 777 | struct padata_instance *pinst; | 775 | struct padata_instance *pinst; |
| 778 | int cpu = (unsigned long)hcpu; | 776 | int ret; |
| 779 | 777 | ||
| 780 | pinst = container_of(nfb, struct padata_instance, cpu_notifier); | 778 | pinst = hlist_entry_safe(node, struct padata_instance, node); |
| 779 | if (!pinst_has_cpu(pinst, cpu)) | ||
| 780 | return 0; | ||
| 781 | 781 | ||
| 782 | switch (action) { | 782 | mutex_lock(&pinst->lock); |
| 783 | case CPU_ONLINE: | 783 | ret = __padata_add_cpu(pinst, cpu); |
| 784 | case CPU_ONLINE_FROZEN: | 784 | mutex_unlock(&pinst->lock); |
| 785 | case CPU_DOWN_FAILED: | 785 | return ret; |
| 786 | case CPU_DOWN_FAILED_FROZEN: | 786 | } |
| 787 | if (!pinst_has_cpu(pinst, cpu)) | ||
| 788 | break; | ||
| 789 | mutex_lock(&pinst->lock); | ||
| 790 | err = __padata_add_cpu(pinst, cpu); | ||
| 791 | mutex_unlock(&pinst->lock); | ||
| 792 | if (err) | ||
| 793 | return notifier_from_errno(err); | ||
| 794 | break; | ||
| 795 | 787 | ||
| 796 | case CPU_DOWN_PREPARE: | 788 | static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node) |
| 797 | case CPU_DOWN_PREPARE_FROZEN: | 789 | { |
| 798 | case CPU_UP_CANCELED: | 790 | struct padata_instance *pinst; |
| 799 | case CPU_UP_CANCELED_FROZEN: | 791 | int ret; |
| 800 | if (!pinst_has_cpu(pinst, cpu)) | 792 | |
| 801 | break; | 793 | pinst = hlist_entry_safe(node, struct padata_instance, node); |
| 802 | mutex_lock(&pinst->lock); | 794 | if (!pinst_has_cpu(pinst, cpu)) |
| 803 | err = __padata_remove_cpu(pinst, cpu); | 795 | return 0; |
| 804 | mutex_unlock(&pinst->lock); | ||
| 805 | if (err) | ||
| 806 | return notifier_from_errno(err); | ||
| 807 | break; | ||
| 808 | } | ||
| 809 | 796 | ||
| 810 | return NOTIFY_OK; | 797 | mutex_lock(&pinst->lock); |
| 798 | ret = __padata_remove_cpu(pinst, cpu); | ||
| 799 | mutex_unlock(&pinst->lock); | ||
| 800 | return ret; | ||
| 811 | } | 801 | } |
| 802 | |||
| 803 | static enum cpuhp_state hp_online; | ||
| 812 | #endif | 804 | #endif |
| 813 | 805 | ||
| 814 | static void __padata_free(struct padata_instance *pinst) | 806 | static void __padata_free(struct padata_instance *pinst) |
| 815 | { | 807 | { |
| 816 | #ifdef CONFIG_HOTPLUG_CPU | 808 | #ifdef CONFIG_HOTPLUG_CPU |
| 817 | unregister_hotcpu_notifier(&pinst->cpu_notifier); | 809 | cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); |
| 818 | #endif | 810 | #endif |
| 819 | 811 | ||
| 820 | padata_stop(pinst); | 812 | padata_stop(pinst); |
| @@ -1012,11 +1004,8 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, | |||
| 1012 | mutex_init(&pinst->lock); | 1004 | mutex_init(&pinst->lock); |
| 1013 | 1005 | ||
| 1014 | #ifdef CONFIG_HOTPLUG_CPU | 1006 | #ifdef CONFIG_HOTPLUG_CPU |
| 1015 | pinst->cpu_notifier.notifier_call = padata_cpu_callback; | 1007 | cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); |
| 1016 | pinst->cpu_notifier.priority = 0; | ||
| 1017 | register_hotcpu_notifier(&pinst->cpu_notifier); | ||
| 1018 | #endif | 1008 | #endif |
| 1019 | |||
| 1020 | return pinst; | 1009 | return pinst; |
| 1021 | 1010 | ||
| 1022 | err_free_masks: | 1011 | err_free_masks: |
| @@ -1039,3 +1028,26 @@ void padata_free(struct padata_instance *pinst) | |||
| 1039 | kobject_put(&pinst->kobj); | 1028 | kobject_put(&pinst->kobj); |
| 1040 | } | 1029 | } |
| 1041 | EXPORT_SYMBOL(padata_free); | 1030 | EXPORT_SYMBOL(padata_free); |
| 1031 | |||
| 1032 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1033 | |||
| 1034 | static __init int padata_driver_init(void) | ||
| 1035 | { | ||
| 1036 | int ret; | ||
| 1037 | |||
| 1038 | ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", | ||
| 1039 | padata_cpu_online, | ||
| 1040 | padata_cpu_prep_down); | ||
| 1041 | if (ret < 0) | ||
| 1042 | return ret; | ||
| 1043 | hp_online = ret; | ||
| 1044 | return 0; | ||
| 1045 | } | ||
| 1046 | module_init(padata_driver_init); | ||
| 1047 | |||
| 1048 | static __exit void padata_driver_exit(void) | ||
| 1049 | { | ||
| 1050 | cpuhp_remove_multi_state(hp_online); | ||
| 1051 | } | ||
| 1052 | module_exit(padata_driver_exit); | ||
| 1053 | #endif | ||
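kernel/padata.c drops its CPU notifier in favour of the hotplug state machine's multi-instance API: one dynamic state is set up at init, and every padata_instance hooks its hlist_node into that state with cpuhp_state_add_instance_nocalls(). The generic shape of such a conversion, using hypothetical mydrv names:

#include <linux/cpuhotplug.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/list.h>

static enum cpuhp_state mydrv_hp_online;

struct mydrv_instance {
	struct hlist_node node;		/* linked into the cpuhp state */
	/* per-instance data ... */
};

static int mydrv_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct mydrv_instance *inst =
		hlist_entry_safe(node, struct mydrv_instance, node);

	if (!inst)
		return -EINVAL;
	/* bring @cpu into service for this instance */
	return 0;
}

static int mydrv_cpu_prep_down(unsigned int cpu, struct hlist_node *node)
{
	/* quiesce @cpu for this instance before it goes down */
	return 0;
}

static int __init mydrv_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "mydrv:online",
				      mydrv_cpu_online, mydrv_cpu_prep_down);
	if (ret < 0)
		return ret;
	mydrv_hp_online = ret;	/* dynamically allocated state */
	return 0;
}

/* per instance, typically in the allocation and teardown paths:
 *	cpuhp_state_add_instance_nocalls(mydrv_hp_online, &inst->node);
 *	cpuhp_state_remove_instance_nocalls(mydrv_hp_online, &inst->node);
 */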
diff --git a/kernel/panic.c b/kernel/panic.c index ca8cea1ef673..e6480e20379e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -71,6 +71,32 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs) | |||
| 71 | panic_smp_self_stop(); | 71 | panic_smp_self_stop(); |
| 72 | } | 72 | } |
| 73 | 73 | ||
| 74 | /* | ||
| 75 | * Stop other CPUs in panic. Architecture dependent code may override this | ||
| 76 | * with more suitable version. For example, if the architecture supports | ||
| 77 | * crash dump, it should save registers of each stopped CPU and disable | ||
| 78 | * per-CPU features such as virtualization extensions. | ||
| 79 | */ | ||
| 80 | void __weak crash_smp_send_stop(void) | ||
| 81 | { | ||
| 82 | static int cpus_stopped; | ||
| 83 | |||
| 84 | /* | ||
| 85 | * This function can be called twice in the panic path, but obviously | ||
| 86 | * we execute this only once. | ||
| 87 | */ | ||
| 88 | if (cpus_stopped) | ||
| 89 | return; | ||
| 90 | |||
| 91 | /* | ||
| 92 | * Note smp_send_stop is the usual smp shutdown function, which | ||
| 93 | * unfortunately means it may not be hardened to work in a panic | ||
| 94 | * situation. | ||
| 95 | */ | ||
| 96 | smp_send_stop(); | ||
| 97 | cpus_stopped = 1; | ||
| 98 | } | ||
| 99 | |||
| 74 | atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); | 100 | atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); |
| 75 | 101 | ||
| 76 | /* | 102 | /* |
| @@ -164,14 +190,21 @@ void panic(const char *fmt, ...) | |||
| 164 | if (!_crash_kexec_post_notifiers) { | 190 | if (!_crash_kexec_post_notifiers) { |
| 165 | printk_nmi_flush_on_panic(); | 191 | printk_nmi_flush_on_panic(); |
| 166 | __crash_kexec(NULL); | 192 | __crash_kexec(NULL); |
| 167 | } | ||
| 168 | 193 | ||
| 169 | /* | 194 | /* |
| 170 | * Note smp_send_stop is the usual smp shutdown function, which | 195 | * Note smp_send_stop is the usual smp shutdown function, which |
| 171 | * unfortunately means it may not be hardened to work in a panic | 196 | * unfortunately means it may not be hardened to work in a |
| 172 | * situation. | 197 | * panic situation. |
| 173 | */ | 198 | */ |
| 174 | smp_send_stop(); | 199 | smp_send_stop(); |
| 200 | } else { | ||
| 201 | /* | ||
| 202 | * If we want to do crash dump after notifier calls and | ||
| 203 | * kmsg_dump, we will need architecture dependent extra | ||
| 204 | * works in addition to stopping other CPUs. | ||
| 205 | */ | ||
| 206 | crash_smp_send_stop(); | ||
| 207 | } | ||
| 175 | 208 | ||
| 176 | /* | 209 | /* |
| 177 | * Run any panic handlers, including those that might need to | 210 | * Run any panic handlers, including those that might need to |
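The new __weak crash_smp_send_stop() gives the crash_kexec_post_notifiers path a dump-friendly way to park the other CPUs instead of the plain smp_send_stop(). A hypothetical arch-side override might look like the following; smp_send_crash_ipi() is an invented placeholder for whatever mechanism the port uses to make each CPU save its registers and halt:

void crash_smp_send_stop(void)
{
	static int cpus_stopped;

	/* may be reached twice on the panic path; only act once */
	if (cpus_stopped)
		return;

	smp_send_crash_ipi();	/* hypothetical arch helper */
	cpus_stopped = 1;
}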
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba137fd15..df9e8e9e0be7 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
| @@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work) | |||
| 79 | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ | 79 | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ |
| 80 | #define MAX_PID_NS_LEVEL 32 | 80 | #define MAX_PID_NS_LEVEL 32 |
| 81 | 81 | ||
| 82 | static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) | ||
| 83 | { | ||
| 84 | return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); | ||
| 85 | } | ||
| 86 | |||
| 87 | static void dec_pid_namespaces(struct ucounts *ucounts) | ||
| 88 | { | ||
| 89 | dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); | ||
| 90 | } | ||
| 91 | |||
| 82 | static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, | 92 | static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, |
| 83 | struct pid_namespace *parent_pid_ns) | 93 | struct pid_namespace *parent_pid_ns) |
| 84 | { | 94 | { |
| 85 | struct pid_namespace *ns; | 95 | struct pid_namespace *ns; |
| 86 | unsigned int level = parent_pid_ns->level + 1; | 96 | unsigned int level = parent_pid_ns->level + 1; |
| 97 | struct ucounts *ucounts; | ||
| 87 | int i; | 98 | int i; |
| 88 | int err; | 99 | int err; |
| 89 | 100 | ||
| 90 | if (level > MAX_PID_NS_LEVEL) { | 101 | err = -ENOSPC; |
| 91 | err = -EINVAL; | 102 | if (level > MAX_PID_NS_LEVEL) |
| 103 | goto out; | ||
| 104 | ucounts = inc_pid_namespaces(user_ns); | ||
| 105 | if (!ucounts) | ||
| 92 | goto out; | 106 | goto out; |
| 93 | } | ||
| 94 | 107 | ||
| 95 | err = -ENOMEM; | 108 | err = -ENOMEM; |
| 96 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); | 109 | ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); |
| 97 | if (ns == NULL) | 110 | if (ns == NULL) |
| 98 | goto out; | 111 | goto out_dec; |
| 99 | 112 | ||
| 100 | ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 113 | ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
| 101 | if (!ns->pidmap[0].page) | 114 | if (!ns->pidmap[0].page) |
| @@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns | |||
| 114 | ns->level = level; | 127 | ns->level = level; |
| 115 | ns->parent = get_pid_ns(parent_pid_ns); | 128 | ns->parent = get_pid_ns(parent_pid_ns); |
| 116 | ns->user_ns = get_user_ns(user_ns); | 129 | ns->user_ns = get_user_ns(user_ns); |
| 130 | ns->ucounts = ucounts; | ||
| 117 | ns->nr_hashed = PIDNS_HASH_ADDING; | 131 | ns->nr_hashed = PIDNS_HASH_ADDING; |
| 118 | INIT_WORK(&ns->proc_work, proc_cleanup_work); | 132 | INIT_WORK(&ns->proc_work, proc_cleanup_work); |
| 119 | 133 | ||
| @@ -129,6 +143,8 @@ out_free_map: | |||
| 129 | kfree(ns->pidmap[0].page); | 143 | kfree(ns->pidmap[0].page); |
| 130 | out_free: | 144 | out_free: |
| 131 | kmem_cache_free(pid_ns_cachep, ns); | 145 | kmem_cache_free(pid_ns_cachep, ns); |
| 146 | out_dec: | ||
| 147 | dec_pid_namespaces(ucounts); | ||
| 132 | out: | 148 | out: |
| 133 | return ERR_PTR(err); | 149 | return ERR_PTR(err); |
| 134 | } | 150 | } |
| @@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) | |||
| 146 | ns_free_inum(&ns->ns); | 162 | ns_free_inum(&ns->ns); |
| 147 | for (i = 0; i < PIDMAP_ENTRIES; i++) | 163 | for (i = 0; i < PIDMAP_ENTRIES; i++) |
| 148 | kfree(ns->pidmap[i].page); | 164 | kfree(ns->pidmap[i].page); |
| 165 | dec_pid_namespaces(ns->ucounts); | ||
| 149 | put_user_ns(ns->user_ns); | 166 | put_user_ns(ns->user_ns); |
| 150 | call_rcu(&ns->rcu, delayed_free_pidns); | 167 | call_rcu(&ns->rcu, delayed_free_pidns); |
| 151 | } | 168 | } |
| @@ -388,12 +405,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) | |||
| 388 | return 0; | 405 | return 0; |
| 389 | } | 406 | } |
| 390 | 407 | ||
| 408 | static struct ns_common *pidns_get_parent(struct ns_common *ns) | ||
| 409 | { | ||
| 410 | struct pid_namespace *active = task_active_pid_ns(current); | ||
| 411 | struct pid_namespace *pid_ns, *p; | ||
| 412 | |||
| 413 | /* See if the parent is in the current namespace */ | ||
| 414 | pid_ns = p = to_pid_ns(ns)->parent; | ||
| 415 | for (;;) { | ||
| 416 | if (!p) | ||
| 417 | return ERR_PTR(-EPERM); | ||
| 418 | if (p == active) | ||
| 419 | break; | ||
| 420 | p = p->parent; | ||
| 421 | } | ||
| 422 | |||
| 423 | return &get_pid_ns(pid_ns)->ns; | ||
| 424 | } | ||
| 425 | |||
| 426 | static struct user_namespace *pidns_owner(struct ns_common *ns) | ||
| 427 | { | ||
| 428 | return to_pid_ns(ns)->user_ns; | ||
| 429 | } | ||
| 430 | |||
| 391 | const struct proc_ns_operations pidns_operations = { | 431 | const struct proc_ns_operations pidns_operations = { |
| 392 | .name = "pid", | 432 | .name = "pid", |
| 393 | .type = CLONE_NEWPID, | 433 | .type = CLONE_NEWPID, |
| 394 | .get = pidns_get, | 434 | .get = pidns_get, |
| 395 | .put = pidns_put, | 435 | .put = pidns_put, |
| 396 | .install = pidns_install, | 436 | .install = pidns_install, |
| 437 | .owner = pidns_owner, | ||
| 438 | .get_parent = pidns_get_parent, | ||
| 397 | }; | 439 | }; |
| 398 | 440 | ||
| 399 | static __init int pid_namespaces_init(void) | 441 | static __init int pid_namespaces_init(void) |
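The pid_namespace hunks above wire the new ucount infrastructure into namespace creation: the charge is taken before any allocation, remembered in ns->ucounts, and released both on the error path and in destroy_pid_namespace(). A condensed sketch of that pairing, with the allocation details elided (labels and helpers as in the hunks; the ucount internals live in kernel/ucount.c and are assumed here):

	static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
							  struct pid_namespace *parent_pid_ns)
	{
		struct pid_namespace *ns;
		struct ucounts *ucounts;
		int err = -ENOSPC;

		if (parent_pid_ns->level + 1 > MAX_PID_NS_LEVEL)
			goto out;
		ucounts = inc_ucount(user_ns, current_euid(), UCOUNT_PID_NAMESPACES);
		if (!ucounts)			/* per-user limit reached */
			goto out;

		err = -ENOMEM;
		ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
		if (!ns)
			goto out_dec;

		ns->ucounts = ucounts;		/* destroy_pid_namespace() uncharges this */
		/* ... pidmap setup, get_pid_ns(parent), get_user_ns(), etc. ... */
		return ns;

	out_dec:
		dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
	out:
		return ERR_PTR(err);
	}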
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 68d3ebc12601..e8517b63eb37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -186,7 +186,7 @@ config PM_SLEEP_DEBUG | |||
| 186 | 186 | ||
| 187 | config DPM_WATCHDOG | 187 | config DPM_WATCHDOG |
| 188 | bool "Device suspend/resume watchdog" | 188 | bool "Device suspend/resume watchdog" |
| 189 | depends on PM_DEBUG && PSTORE | 189 | depends on PM_DEBUG && PSTORE && EXPERT |
| 190 | ---help--- | 190 | ---help--- |
| 191 | Sets up a watchdog timer to capture drivers that are | 191 | Sets up a watchdog timer to capture drivers that are |
| 192 | locked up attempting to suspend/resume a device. | 192 | locked up attempting to suspend/resume a device. |
| @@ -197,7 +197,7 @@ config DPM_WATCHDOG | |||
| 197 | config DPM_WATCHDOG_TIMEOUT | 197 | config DPM_WATCHDOG_TIMEOUT |
| 198 | int "Watchdog timeout in seconds" | 198 | int "Watchdog timeout in seconds" |
| 199 | range 1 120 | 199 | range 1 120 |
| 200 | default 60 | 200 | default 120 |
| 201 | depends on DPM_WATCHDOG | 201 | depends on DPM_WATCHDOG |
| 202 | 202 | ||
| 203 | config PM_TRACE | 203 | config PM_TRACE |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 33c79b6105c5..b26dbc48c75b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -306,8 +306,10 @@ static int create_image(int platform_mode) | |||
| 306 | if (error) | 306 | if (error) |
| 307 | printk(KERN_ERR "PM: Error %d creating hibernation image\n", | 307 | printk(KERN_ERR "PM: Error %d creating hibernation image\n", |
| 308 | error); | 308 | error); |
| 309 | if (!in_suspend) | 309 | if (!in_suspend) { |
| 310 | events_check_enabled = false; | 310 | events_check_enabled = false; |
| 311 | clear_free_pages(); | ||
| 312 | } | ||
| 311 | 313 | ||
| 312 | platform_leave(platform_mode); | 314 | platform_leave(platform_mode); |
| 313 | 315 | ||
| @@ -1189,22 +1191,6 @@ static int __init nohibernate_setup(char *str) | |||
| 1189 | return 1; | 1191 | return 1; |
| 1190 | } | 1192 | } |
| 1191 | 1193 | ||
| 1192 | static int __init page_poison_nohibernate_setup(char *str) | ||
| 1193 | { | ||
| 1194 | #ifdef CONFIG_PAGE_POISONING_ZERO | ||
| 1195 | /* | ||
| 1196 | * The zeroing option for page poison skips the checks on alloc. | ||
| 1197 | * since hibernation doesn't save free pages there's no way to | ||
| 1198 | * guarantee the pages will still be zeroed. | ||
| 1199 | */ | ||
| 1200 | if (!strcmp(str, "on")) { | ||
| 1201 | pr_info("Disabling hibernation due to page poisoning\n"); | ||
| 1202 | return nohibernate_setup(str); | ||
| 1203 | } | ||
| 1204 | #endif | ||
| 1205 | return 1; | ||
| 1206 | } | ||
| 1207 | |||
| 1208 | __setup("noresume", noresume_setup); | 1194 | __setup("noresume", noresume_setup); |
| 1209 | __setup("resume_offset=", resume_offset_setup); | 1195 | __setup("resume_offset=", resume_offset_setup); |
| 1210 | __setup("resume=", resume_setup); | 1196 | __setup("resume=", resume_setup); |
| @@ -1212,4 +1198,3 @@ __setup("hibernate=", hibernate_setup); | |||
| 1212 | __setup("resumewait", resumewait_setup); | 1198 | __setup("resumewait", resumewait_setup); |
| 1213 | __setup("resumedelay=", resumedelay_setup); | 1199 | __setup("resumedelay=", resumedelay_setup); |
| 1214 | __setup("nohibernate", nohibernate_setup); | 1200 | __setup("nohibernate", nohibernate_setup); |
| 1215 | __setup("page_poison=", page_poison_nohibernate_setup); | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index 5ea50b1b7595..281a697fd458 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -644,6 +644,7 @@ static int __init pm_init(void) | |||
| 644 | return error; | 644 | return error; |
| 645 | hibernate_image_size_init(); | 645 | hibernate_image_size_init(); |
| 646 | hibernate_reserved_size_init(); | 646 | hibernate_reserved_size_init(); |
| 647 | pm_states_init(); | ||
| 647 | power_kobj = kobject_create_and_add("power", NULL); | 648 | power_kobj = kobject_create_and_add("power", NULL); |
| 648 | if (!power_kobj) | 649 | if (!power_kobj) |
| 649 | return -ENOMEM; | 650 | return -ENOMEM; |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 242d8b827dd5..56d1d0dedf76 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -110,6 +110,8 @@ extern int create_basic_memory_bitmaps(void); | |||
| 110 | extern void free_basic_memory_bitmaps(void); | 110 | extern void free_basic_memory_bitmaps(void); |
| 111 | extern int hibernate_preallocate_memory(void); | 111 | extern int hibernate_preallocate_memory(void); |
| 112 | 112 | ||
| 113 | extern void clear_free_pages(void); | ||
| 114 | |||
| 113 | /** | 115 | /** |
| 114 | * Auxiliary structure used for reading the snapshot image data and | 116 | * Auxiliary structure used for reading the snapshot image data and |
| 115 | * metadata from and writing them to the list of page backup entries | 117 | * metadata from and writing them to the list of page backup entries |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 8f27d5a8adf6..2fba066e125f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -144,23 +144,12 @@ int freeze_processes(void) | |||
| 144 | /* | 144 | /* |
| 145 | * Now that the whole userspace is frozen we need to disbale | 145 | * Now that the whole userspace is frozen we need to disbale |
| 146 | * the OOM killer to disallow any further interference with | 146 | * the OOM killer to disallow any further interference with |
| 147 | * killable tasks. | 147 | * killable tasks. There is no guarantee oom victims will |
| 148 | * ever reach a point they go away we have to wait with a timeout. | ||
| 148 | */ | 149 | */ |
| 149 | if (!error && !oom_killer_disable()) | 150 | if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) |
| 150 | error = -EBUSY; | 151 | error = -EBUSY; |
| 151 | 152 | ||
| 152 | /* | ||
| 153 | * There is a hard to fix race between oom_reaper kernel thread | ||
| 154 | * and oom_killer_disable. oom_reaper calls exit_oom_victim | ||
| 155 | * before the victim reaches exit_mm so try to freeze all the tasks | ||
| 156 | * again and catch such a left over task. | ||
| 157 | */ | ||
| 158 | if (!error) { | ||
| 159 | pr_info("Double checking all user space processes after OOM killer disable... "); | ||
| 160 | error = try_to_freeze_tasks(true); | ||
| 161 | pr_cont("\n"); | ||
| 162 | } | ||
| 163 | |||
| 164 | if (error) | 153 | if (error) |
| 165 | thaw_processes(); | 154 | thaw_processes(); |
| 166 | return error; | 155 | return error; |
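freeze_processes() now passes oom_killer_disable() a timeout (freeze_timeout_msecs converted to jiffies) instead of re-freezing all tasks afterwards to catch leftover OOM victims. The mm side is not part of this diff; a hedged sketch of what the new call site assumes the function does:

	/* Assumed shape of oom_killer_disable(timeout) in mm/oom_kill.c: forbid
	 * further OOM kills, then wait - bounded by 'timeout' jiffies - for
	 * already-selected victims to go away, re-enabling on failure. */
	bool oom_killer_disable(signed long timeout)
	{
		signed long ret;

		if (mutex_lock_killable(&oom_lock))
			return false;
		oom_killer_disabled = true;
		mutex_unlock(&oom_lock);

		ret = wait_event_interruptible_timeout(oom_victims_wait,
						       !atomic_read(&oom_victims),
						       timeout);
		if (ret <= 0) {
			oom_killer_enable();	/* timed out or interrupted */
			return false;
		}
		return true;
	}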
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..168ff442ebde 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
| @@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req, | |||
| 482 | return; | 482 | return; |
| 483 | } | 483 | } |
| 484 | 484 | ||
| 485 | cancel_delayed_work_sync(&req->work); | 485 | /* |
| 486 | * This function may be called very early during boot, for example, | ||
| 487 | * from of_clk_init(), where irq needs to stay disabled. | ||
| 488 | * cancel_delayed_work_sync() assumes that irq is enabled on | ||
| 489 | * invocation and re-enables it on return. Avoid calling it until | ||
| 490 | * workqueue is initialized. | ||
| 491 | */ | ||
| 492 | if (keventd_up()) | ||
| 493 | cancel_delayed_work_sync(&req->work); | ||
| 494 | |||
| 486 | __pm_qos_update_request(req, new_value); | 495 | __pm_qos_update_request(req, new_value); |
| 487 | } | 496 | } |
| 488 | EXPORT_SYMBOL_GPL(pm_qos_update_request); | 497 | EXPORT_SYMBOL_GPL(pm_qos_update_request); |
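The guard above only matters for very early callers; once workqueues exist the behaviour is unchanged, and before they exist nothing can have been queued on req->work anyway. Condensed, the update path now looks like this (a sketch, with the early-return sanity checks abridged):

	void pm_qos_update_request(struct pm_qos_request *req, s32 new_value)
	{
		if (!req)
			return;		/* bogus request, ignore as before */

		/* cancel_delayed_work_sync() assumes IRQs are enabled and re-enables
		 * them on return; skip it until the workqueue subsystem is up, e.g.
		 * when called from of_clk_init() during early boot. */
		if (keventd_up())
			cancel_delayed_work_sync(&req->work);

		__pm_qos_update_request(req, new_value);
	}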
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 9a0178c2ac1d..4f0f0604f1c4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) | |||
| 835 | */ | 835 | */ |
| 836 | static bool rtree_next_node(struct memory_bitmap *bm) | 836 | static bool rtree_next_node(struct memory_bitmap *bm) |
| 837 | { | 837 | { |
| 838 | bm->cur.node = list_entry(bm->cur.node->list.next, | 838 | if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) { |
| 839 | struct rtree_node, list); | 839 | bm->cur.node = list_entry(bm->cur.node->list.next, |
| 840 | if (&bm->cur.node->list != &bm->cur.zone->leaves) { | 840 | struct rtree_node, list); |
| 841 | bm->cur.node_pfn += BM_BITS_PER_BLOCK; | 841 | bm->cur.node_pfn += BM_BITS_PER_BLOCK; |
| 842 | bm->cur.node_bit = 0; | 842 | bm->cur.node_bit = 0; |
| 843 | touch_softlockup_watchdog(); | 843 | touch_softlockup_watchdog(); |
| @@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm) | |||
| 845 | } | 845 | } |
| 846 | 846 | ||
| 847 | /* No more nodes, goto next zone */ | 847 | /* No more nodes, goto next zone */ |
| 848 | bm->cur.zone = list_entry(bm->cur.zone->list.next, | 848 | if (!list_is_last(&bm->cur.zone->list, &bm->zones)) { |
| 849 | bm->cur.zone = list_entry(bm->cur.zone->list.next, | ||
| 849 | struct mem_zone_bm_rtree, list); | 850 | struct mem_zone_bm_rtree, list); |
| 850 | if (&bm->cur.zone->list != &bm->zones) { | ||
| 851 | bm->cur.node = list_entry(bm->cur.zone->leaves.next, | 851 | bm->cur.node = list_entry(bm->cur.zone->leaves.next, |
| 852 | struct rtree_node, list); | 852 | struct rtree_node, list); |
| 853 | bm->cur.node_pfn = 0; | 853 | bm->cur.node_pfn = 0; |
| @@ -1132,6 +1132,28 @@ void free_basic_memory_bitmaps(void) | |||
| 1132 | pr_debug("PM: Basic memory bitmaps freed\n"); | 1132 | pr_debug("PM: Basic memory bitmaps freed\n"); |
| 1133 | } | 1133 | } |
| 1134 | 1134 | ||
| 1135 | void clear_free_pages(void) | ||
| 1136 | { | ||
| 1137 | #ifdef CONFIG_PAGE_POISONING_ZERO | ||
| 1138 | struct memory_bitmap *bm = free_pages_map; | ||
| 1139 | unsigned long pfn; | ||
| 1140 | |||
| 1141 | if (WARN_ON(!(free_pages_map))) | ||
| 1142 | return; | ||
| 1143 | |||
| 1144 | memory_bm_position_reset(bm); | ||
| 1145 | pfn = memory_bm_next_pfn(bm); | ||
| 1146 | while (pfn != BM_END_OF_MAP) { | ||
| 1147 | if (pfn_valid(pfn)) | ||
| 1148 | clear_highpage(pfn_to_page(pfn)); | ||
| 1149 | |||
| 1150 | pfn = memory_bm_next_pfn(bm); | ||
| 1151 | } | ||
| 1152 | memory_bm_position_reset(bm); | ||
| 1153 | pr_info("PM: free pages cleared after restore\n"); | ||
| 1154 | #endif /* PAGE_POISONING_ZERO */ | ||
| 1155 | } | ||
| 1156 | |||
| 1135 | /** | 1157 | /** |
| 1136 | * snapshot_additional_pages - Estimate the number of extra pages needed. | 1158 | * snapshot_additional_pages - Estimate the number of extra pages needed. |
| 1137 | * @zone: Memory zone to carry out the computation for. | 1159 | * @zone: Memory zone to carry out the computation for. |
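Two independent things happen in snapshot.c. First, clear_free_pages() restores the "free pages are zeroed" guarantee that CONFIG_PAGE_POISONING_ZERO relies on, which is why the page_poison= hibernate hook could be deleted above. Second, rtree_next_node() now tests for the end of a list before converting the next element; the old code converted first and compared afterwards, leaving the cursor pointing at a bogus container_of() of the list head when the walk was already at the end. The traversal fix boils down to this pattern (sketch of the leaf-node branch):

	/* Ask list_is_last() first; only convert list.next when a real next
	 * element exists, so bm->cur.node never points at the list head. */
	if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
		bm->cur.node = list_entry(bm->cur.node->list.next,
					  struct rtree_node, list);
		bm->cur.node_pfn += BM_BITS_PER_BLOCK;
		bm->cur.node_bit = 0;
		touch_softlockup_watchdog();
		return true;
	}
	/* otherwise fall through and try the next zone in the same way */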
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0acab9d7f96f..1e7f5da648d9 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -118,10 +118,18 @@ static bool valid_state(suspend_state_t state) | |||
| 118 | */ | 118 | */ |
| 119 | static bool relative_states; | 119 | static bool relative_states; |
| 120 | 120 | ||
| 121 | void __init pm_states_init(void) | ||
| 122 | { | ||
| 123 | /* | ||
| 124 | * freeze state should be supported even without any suspend_ops, | ||
| 125 | * initialize pm_states accordingly here | ||
| 126 | */ | ||
| 127 | pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; | ||
| 128 | } | ||
| 129 | |||
| 121 | static int __init sleep_states_setup(char *str) | 130 | static int __init sleep_states_setup(char *str) |
| 122 | { | 131 | { |
| 123 | relative_states = !strncmp(str, "1", 1); | 132 | relative_states = !strncmp(str, "1", 1); |
| 124 | pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; | ||
| 125 | return 1; | 133 | return 1; |
| 126 | } | 134 | } |
| 127 | 135 | ||
| @@ -211,7 +219,7 @@ static int platform_suspend_begin(suspend_state_t state) | |||
| 211 | { | 219 | { |
| 212 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) | 220 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) |
| 213 | return freeze_ops->begin(); | 221 | return freeze_ops->begin(); |
| 214 | else if (suspend_ops->begin) | 222 | else if (suspend_ops && suspend_ops->begin) |
| 215 | return suspend_ops->begin(state); | 223 | return suspend_ops->begin(state); |
| 216 | else | 224 | else |
| 217 | return 0; | 225 | return 0; |
| @@ -221,7 +229,7 @@ static void platform_resume_end(suspend_state_t state) | |||
| 221 | { | 229 | { |
| 222 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) | 230 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) |
| 223 | freeze_ops->end(); | 231 | freeze_ops->end(); |
| 224 | else if (suspend_ops->end) | 232 | else if (suspend_ops && suspend_ops->end) |
| 225 | suspend_ops->end(); | 233 | suspend_ops->end(); |
| 226 | } | 234 | } |
| 227 | 235 | ||
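Initializing the "freeze" label moves out of the __setup() hook, which only runs when relative_sleep_states= is given on the command line, into an unconditional call made from pm_init() (see the main.c hunk above). Because pm_init() is an initcall, it still runs after early parameter parsing and therefore still honours relative_sleep_states=. Sketch of the resulting split:

	/* "freeze" must be a valid sleep state even with no suspend_ops
	 * registered, so its label is installed unconditionally at boot. */
	void __init pm_states_init(void)
	{
		pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
	}

	static int __init sleep_states_setup(char *str)
	{
		relative_states = !strncmp(str, "1", 1);
		return 1;	/* label selection now happens in pm_states_init() */
	}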
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 276762f3a460..d5760c42f042 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c | |||
| @@ -9,10 +9,10 @@ | |||
| 9 | 9 | ||
| 10 | char *_braille_console_setup(char **str, char **brl_options) | 10 | char *_braille_console_setup(char **str, char **brl_options) |
| 11 | { | 11 | { |
| 12 | if (!memcmp(*str, "brl,", 4)) { | 12 | if (!strncmp(*str, "brl,", 4)) { |
| 13 | *brl_options = ""; | 13 | *brl_options = ""; |
| 14 | *str += 4; | 14 | *str += 4; |
| 15 | } else if (!memcmp(str, "brl=", 4)) { | 15 | } else if (!strncmp(*str, "brl=", 4)) { |
| 16 | *brl_options = *str + 4; | 16 | *brl_options = *str + 4; |
| 17 | *str = strchr(*brl_options, ','); | 17 | *str = strchr(*brl_options, ','); |
| 18 | if (!*str) | 18 | if (!*str) |
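The second branch compared the bytes of the char ** itself (memcmp(str, ...)) rather than the string it points to, so a console=brl=... option was never recognized; switching to strncmp(*str, ...) also stops the comparison at a terminating NUL instead of reading past a short string. Abridged, the fixed parser looks like this (the error handling for a missing port name is elided):

	char *_braille_console_setup(char **str, char **brl_options)
	{
		if (!strncmp(*str, "brl,", 4)) {
			*brl_options = "";		/* braille console with default options */
			*str += 4;
		} else if (!strncmp(*str, "brl=", 4)) {
			*brl_options = *str + 4;	/* options run up to the next ',' */
			*str = strchr(*brl_options, ',');
			/* ... complain and bail out if no port name follows ... */
		}
		return *str;
	}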
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index b69eb8a2876f..16bab471c7e2 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c | |||
| @@ -99,27 +99,33 @@ again: | |||
| 99 | return add; | 99 | return add; |
| 100 | } | 100 | } |
| 101 | 101 | ||
| 102 | /* | 102 | static void printk_nmi_flush_line(const char *text, int len) |
| 103 | * printk one line from the temporary buffer from @start index until | ||
| 104 | * and including the @end index. | ||
| 105 | */ | ||
| 106 | static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) | ||
| 107 | { | 103 | { |
| 108 | const char *buf = s->buffer + start; | ||
| 109 | |||
| 110 | /* | 104 | /* |
| 111 | * The buffers are flushed in NMI only on panic. The messages must | 105 | * The buffers are flushed in NMI only on panic. The messages must |
| 112 | * go only into the ring buffer at this stage. Consoles will get | 106 | * go only into the ring buffer at this stage. Consoles will get |
| 113 | * explicitly called later when a crashdump is not generated. | 107 | * explicitly called later when a crashdump is not generated. |
| 114 | */ | 108 | */ |
| 115 | if (in_nmi()) | 109 | if (in_nmi()) |
| 116 | printk_deferred("%.*s", (end - start) + 1, buf); | 110 | printk_deferred("%.*s", len, text); |
| 117 | else | 111 | else |
| 118 | printk("%.*s", (end - start) + 1, buf); | 112 | printk("%.*s", len, text); |
| 119 | 113 | ||
| 120 | } | 114 | } |
| 121 | 115 | ||
| 122 | /* | 116 | /* |
| 117 | * printk one line from the temporary buffer from @start index until | ||
| 118 | * and including the @end index. | ||
| 119 | */ | ||
| 120 | static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, | ||
| 121 | int start, int end) | ||
| 122 | { | ||
| 123 | const char *buf = s->buffer + start; | ||
| 124 | |||
| 125 | printk_nmi_flush_line(buf, (end - start) + 1); | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | ||
| 123 | * Flush data from the associated per_CPU buffer. The function | 129 | * Flush data from the associated per_CPU buffer. The function |
| 124 | * can be called either via IRQ work or independently. | 130 | * can be called either via IRQ work or independently. |
| 125 | */ | 131 | */ |
| @@ -150,9 +156,11 @@ more: | |||
| 150 | * the buffer an unexpected way. If we printed something then | 156 | * the buffer an unexpected way. If we printed something then |
| 151 | * @len must only increase. | 157 | * @len must only increase. |
| 152 | */ | 158 | */ |
| 153 | if (i && i >= len) | 159 | if (i && i >= len) { |
| 154 | pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", | 160 | const char *msg = "printk_nmi_flush: internal error\n"; |
| 155 | i, len); | 161 | |
| 162 | printk_nmi_flush_line(msg, strlen(msg)); | ||
| 163 | } | ||
| 156 | 164 | ||
| 157 | if (!len) | 165 | if (!len) |
| 158 | goto out; /* Someone else has already flushed the buffer. */ | 166 | goto out; /* Someone else has already flushed the buffer. */ |
| @@ -166,14 +174,14 @@ more: | |||
| 166 | /* Print line by line. */ | 174 | /* Print line by line. */ |
| 167 | for (; i < size; i++) { | 175 | for (; i < size; i++) { |
| 168 | if (s->buffer[i] == '\n') { | 176 | if (s->buffer[i] == '\n') { |
| 169 | print_nmi_seq_line(s, last_i, i); | 177 | printk_nmi_flush_seq_line(s, last_i, i); |
| 170 | last_i = i + 1; | 178 | last_i = i + 1; |
| 171 | } | 179 | } |
| 172 | } | 180 | } |
| 173 | /* Check if there was a partial line. */ | 181 | /* Check if there was a partial line. */ |
| 174 | if (last_i < size) { | 182 | if (last_i < size) { |
| 175 | print_nmi_seq_line(s, last_i, size - 1); | 183 | printk_nmi_flush_seq_line(s, last_i, size - 1); |
| 176 | pr_cont("\n"); | 184 | printk_nmi_flush_line("\n", strlen("\n")); |
| 177 | } | 185 | } |
| 178 | 186 | ||
| 179 | /* | 187 | /* |
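Splitting out printk_nmi_flush_line() means the same NMI-safety decision (printk_deferred() versus printk()) is applied both to buffered log lines and to ad-hoc strings such as the internal-error message and the trailing newline, which previously went through pr_err()/pr_cont() and could hit the consoles from NMI context. The whole decision now lives in one small helper (sketch of the pair as introduced above):

	static void printk_nmi_flush_line(const char *text, int len)
	{
		/* When flushing from NMI (panic), messages may only go into the
		 * ring buffer; consoles are driven later if no crashdump is made. */
		if (in_nmi())
			printk_deferred("%.*s", len, text);
		else
			printk("%.*s", len, text);
	}

	/* Print one line of the per-CPU buffer, indexes [start, end] inclusive. */
	static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, int start, int end)
	{
		printk_nmi_flush_line(s->buffer + start, (end - start) + 1);
	}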
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index eea6dbc2d8cf..de08fc90baaf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -253,6 +253,17 @@ static int preferred_console = -1; | |||
| 253 | int console_set_on_cmdline; | 253 | int console_set_on_cmdline; |
| 254 | EXPORT_SYMBOL(console_set_on_cmdline); | 254 | EXPORT_SYMBOL(console_set_on_cmdline); |
| 255 | 255 | ||
| 256 | #ifdef CONFIG_OF | ||
| 257 | static bool of_specified_console; | ||
| 258 | |||
| 259 | void console_set_by_of(void) | ||
| 260 | { | ||
| 261 | of_specified_console = true; | ||
| 262 | } | ||
| 263 | #else | ||
| 264 | # define of_specified_console false | ||
| 265 | #endif | ||
| 266 | |||
| 256 | /* Flag: console code may call schedule() */ | 267 | /* Flag: console code may call schedule() */ |
| 257 | static int console_may_schedule; | 268 | static int console_may_schedule; |
| 258 | 269 | ||
| @@ -655,11 +666,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, | |||
| 655 | * better readable output. 'c' in the record flags mark the first | 666 | * better readable output. 'c' in the record flags mark the first |
| 656 | * fragment of a line, '+' the following. | 667 | * fragment of a line, '+' the following. |
| 657 | */ | 668 | */ |
| 658 | if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT)) | 669 | if (msg->flags & LOG_CONT) |
| 659 | cont = 'c'; | 670 | cont = (prev_flags & LOG_CONT) ? '+' : 'c'; |
| 660 | else if ((msg->flags & LOG_CONT) || | ||
| 661 | ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX))) | ||
| 662 | cont = '+'; | ||
| 663 | 671 | ||
| 664 | return scnprintf(buf, size, "%u,%llu,%llu,%c;", | 672 | return scnprintf(buf, size, "%u,%llu,%llu,%c;", |
| 665 | (msg->facility << 3) | msg->level, seq, ts_usec, cont); | 673 | (msg->facility << 3) | msg->level, seq, ts_usec, cont); |
| @@ -786,6 +794,8 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) | |||
| 786 | return ret; | 794 | return ret; |
| 787 | } | 795 | } |
| 788 | 796 | ||
| 797 | static void cont_flush(void); | ||
| 798 | |||
| 789 | static ssize_t devkmsg_read(struct file *file, char __user *buf, | 799 | static ssize_t devkmsg_read(struct file *file, char __user *buf, |
| 790 | size_t count, loff_t *ppos) | 800 | size_t count, loff_t *ppos) |
| 791 | { | 801 | { |
| @@ -801,6 +811,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
| 801 | if (ret) | 811 | if (ret) |
| 802 | return ret; | 812 | return ret; |
| 803 | raw_spin_lock_irq(&logbuf_lock); | 813 | raw_spin_lock_irq(&logbuf_lock); |
| 814 | cont_flush(); | ||
| 804 | while (user->seq == log_next_seq) { | 815 | while (user->seq == log_next_seq) { |
| 805 | if (file->f_flags & O_NONBLOCK) { | 816 | if (file->f_flags & O_NONBLOCK) { |
| 806 | ret = -EAGAIN; | 817 | ret = -EAGAIN; |
| @@ -863,6 +874,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | |||
| 863 | return -ESPIPE; | 874 | return -ESPIPE; |
| 864 | 875 | ||
| 865 | raw_spin_lock_irq(&logbuf_lock); | 876 | raw_spin_lock_irq(&logbuf_lock); |
| 877 | cont_flush(); | ||
| 866 | switch (whence) { | 878 | switch (whence) { |
| 867 | case SEEK_SET: | 879 | case SEEK_SET: |
| 868 | /* the first record */ | 880 | /* the first record */ |
| @@ -901,6 +913,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | |||
| 901 | poll_wait(file, &log_wait, wait); | 913 | poll_wait(file, &log_wait, wait); |
| 902 | 914 | ||
| 903 | raw_spin_lock_irq(&logbuf_lock); | 915 | raw_spin_lock_irq(&logbuf_lock); |
| 916 | cont_flush(); | ||
| 904 | if (user->seq < log_next_seq) { | 917 | if (user->seq < log_next_seq) { |
| 905 | /* return error when data has vanished underneath us */ | 918 | /* return error when data has vanished underneath us */ |
| 906 | if (user->seq < log_first_seq) | 919 | if (user->seq < log_first_seq) |
| @@ -1287,6 +1300,7 @@ static int syslog_print(char __user *buf, int size) | |||
| 1287 | size_t skip; | 1300 | size_t skip; |
| 1288 | 1301 | ||
| 1289 | raw_spin_lock_irq(&logbuf_lock); | 1302 | raw_spin_lock_irq(&logbuf_lock); |
| 1303 | cont_flush(); | ||
| 1290 | if (syslog_seq < log_first_seq) { | 1304 | if (syslog_seq < log_first_seq) { |
| 1291 | /* messages are gone, move to first one */ | 1305 | /* messages are gone, move to first one */ |
| 1292 | syslog_seq = log_first_seq; | 1306 | syslog_seq = log_first_seq; |
| @@ -1346,6 +1360,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 1346 | return -ENOMEM; | 1360 | return -ENOMEM; |
| 1347 | 1361 | ||
| 1348 | raw_spin_lock_irq(&logbuf_lock); | 1362 | raw_spin_lock_irq(&logbuf_lock); |
| 1363 | cont_flush(); | ||
| 1349 | if (buf) { | 1364 | if (buf) { |
| 1350 | u64 next_seq; | 1365 | u64 next_seq; |
| 1351 | u64 seq; | 1366 | u64 seq; |
| @@ -1507,6 +1522,7 @@ int do_syslog(int type, char __user *buf, int len, int source) | |||
| 1507 | /* Number of chars in the log buffer */ | 1522 | /* Number of chars in the log buffer */ |
| 1508 | case SYSLOG_ACTION_SIZE_UNREAD: | 1523 | case SYSLOG_ACTION_SIZE_UNREAD: |
| 1509 | raw_spin_lock_irq(&logbuf_lock); | 1524 | raw_spin_lock_irq(&logbuf_lock); |
| 1525 | cont_flush(); | ||
| 1510 | if (syslog_seq < log_first_seq) { | 1526 | if (syslog_seq < log_first_seq) { |
| 1511 | /* messages are gone, move to first one */ | 1527 | /* messages are gone, move to first one */ |
| 1512 | syslog_seq = log_first_seq; | 1528 | syslog_seq = log_first_seq; |
| @@ -1643,35 +1659,33 @@ static struct cont { | |||
| 1643 | bool flushed:1; /* buffer sealed and committed */ | 1659 | bool flushed:1; /* buffer sealed and committed */ |
| 1644 | } cont; | 1660 | } cont; |
| 1645 | 1661 | ||
| 1646 | static void cont_flush(enum log_flags flags) | 1662 | static void cont_flush(void) |
| 1647 | { | 1663 | { |
| 1648 | if (cont.flushed) | 1664 | if (cont.flushed) |
| 1649 | return; | 1665 | return; |
| 1650 | if (cont.len == 0) | 1666 | if (cont.len == 0) |
| 1651 | return; | 1667 | return; |
| 1652 | |||
| 1653 | if (cont.cons) { | 1668 | if (cont.cons) { |
| 1654 | /* | 1669 | /* |
| 1655 | * If a fragment of this line was directly flushed to the | 1670 | * If a fragment of this line was directly flushed to the |
| 1656 | * console; wait for the console to pick up the rest of the | 1671 | * console; wait for the console to pick up the rest of the |
| 1657 | * line. LOG_NOCONS suppresses a duplicated output. | 1672 | * line. LOG_NOCONS suppresses a duplicated output. |
| 1658 | */ | 1673 | */ |
| 1659 | log_store(cont.facility, cont.level, flags | LOG_NOCONS, | 1674 | log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS, |
| 1660 | cont.ts_nsec, NULL, 0, cont.buf, cont.len); | 1675 | cont.ts_nsec, NULL, 0, cont.buf, cont.len); |
| 1661 | cont.flags = flags; | ||
| 1662 | cont.flushed = true; | 1676 | cont.flushed = true; |
| 1663 | } else { | 1677 | } else { |
| 1664 | /* | 1678 | /* |
| 1665 | * If no fragment of this line ever reached the console, | 1679 | * If no fragment of this line ever reached the console, |
| 1666 | * just submit it to the store and free the buffer. | 1680 | * just submit it to the store and free the buffer. |
| 1667 | */ | 1681 | */ |
| 1668 | log_store(cont.facility, cont.level, flags, 0, | 1682 | log_store(cont.facility, cont.level, cont.flags, 0, |
| 1669 | NULL, 0, cont.buf, cont.len); | 1683 | NULL, 0, cont.buf, cont.len); |
| 1670 | cont.len = 0; | 1684 | cont.len = 0; |
| 1671 | } | 1685 | } |
| 1672 | } | 1686 | } |
| 1673 | 1687 | ||
| 1674 | static bool cont_add(int facility, int level, const char *text, size_t len) | 1688 | static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) |
| 1675 | { | 1689 | { |
| 1676 | if (cont.len && cont.flushed) | 1690 | if (cont.len && cont.flushed) |
| 1677 | return false; | 1691 | return false; |
| @@ -1682,7 +1696,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) | |||
| 1682 | * the line gets too long, split it up in separate records. | 1696 | * the line gets too long, split it up in separate records. |
| 1683 | */ | 1697 | */ |
| 1684 | if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { | 1698 | if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { |
| 1685 | cont_flush(LOG_CONT); | 1699 | cont_flush(); |
| 1686 | return false; | 1700 | return false; |
| 1687 | } | 1701 | } |
| 1688 | 1702 | ||
| @@ -1691,7 +1705,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) | |||
| 1691 | cont.level = level; | 1705 | cont.level = level; |
| 1692 | cont.owner = current; | 1706 | cont.owner = current; |
| 1693 | cont.ts_nsec = local_clock(); | 1707 | cont.ts_nsec = local_clock(); |
| 1694 | cont.flags = 0; | 1708 | cont.flags = flags; |
| 1695 | cont.cons = 0; | 1709 | cont.cons = 0; |
| 1696 | cont.flushed = false; | 1710 | cont.flushed = false; |
| 1697 | } | 1711 | } |
| @@ -1699,8 +1713,15 @@ static bool cont_add(int facility, int level, const char *text, size_t len) | |||
| 1699 | memcpy(cont.buf + cont.len, text, len); | 1713 | memcpy(cont.buf + cont.len, text, len); |
| 1700 | cont.len += len; | 1714 | cont.len += len; |
| 1701 | 1715 | ||
| 1716 | // The original flags come from the first line, | ||
| 1717 | // but later continuations can add a newline. | ||
| 1718 | if (flags & LOG_NEWLINE) { | ||
| 1719 | cont.flags |= LOG_NEWLINE; | ||
| 1720 | cont_flush(); | ||
| 1721 | } | ||
| 1722 | |||
| 1702 | if (cont.len > (sizeof(cont.buf) * 80) / 100) | 1723 | if (cont.len > (sizeof(cont.buf) * 80) / 100) |
| 1703 | cont_flush(LOG_CONT); | 1724 | cont_flush(); |
| 1704 | 1725 | ||
| 1705 | return true; | 1726 | return true; |
| 1706 | } | 1727 | } |
| @@ -1733,6 +1754,35 @@ static size_t cont_print_text(char *text, size_t size) | |||
| 1733 | return textlen; | 1754 | return textlen; |
| 1734 | } | 1755 | } |
| 1735 | 1756 | ||
| 1757 | static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) | ||
| 1758 | { | ||
| 1759 | /* | ||
| 1760 | * If an earlier line was buffered, and we're a continuation | ||
| 1761 | * write from the same process, try to add it to the buffer. | ||
| 1762 | */ | ||
| 1763 | if (cont.len) { | ||
| 1764 | if (cont.owner == current && (lflags & LOG_CONT)) { | ||
| 1765 | if (cont_add(facility, level, lflags, text, text_len)) | ||
| 1766 | return text_len; | ||
| 1767 | } | ||
| 1768 | /* Otherwise, make sure it's flushed */ | ||
| 1769 | cont_flush(); | ||
| 1770 | } | ||
| 1771 | |||
| 1772 | /* Skip empty continuation lines that couldn't be added - they just flush */ | ||
| 1773 | if (!text_len && (lflags & LOG_CONT)) | ||
| 1774 | return 0; | ||
| 1775 | |||
| 1776 | /* If it doesn't end in a newline, try to buffer the current line */ | ||
| 1777 | if (!(lflags & LOG_NEWLINE)) { | ||
| 1778 | if (cont_add(facility, level, lflags, text, text_len)) | ||
| 1779 | return text_len; | ||
| 1780 | } | ||
| 1781 | |||
| 1782 | /* Store it in the record log */ | ||
| 1783 | return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); | ||
| 1784 | } | ||
| 1785 | |||
| 1736 | asmlinkage int vprintk_emit(int facility, int level, | 1786 | asmlinkage int vprintk_emit(int facility, int level, |
| 1737 | const char *dict, size_t dictlen, | 1787 | const char *dict, size_t dictlen, |
| 1738 | const char *fmt, va_list args) | 1788 | const char *fmt, va_list args) |
| @@ -1819,10 +1869,9 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1819 | 1869 | ||
| 1820 | /* strip kernel syslog prefix and extract log level or control flags */ | 1870 | /* strip kernel syslog prefix and extract log level or control flags */ |
| 1821 | if (facility == 0) { | 1871 | if (facility == 0) { |
| 1822 | int kern_level = printk_get_level(text); | 1872 | int kern_level; |
| 1823 | 1873 | ||
| 1824 | if (kern_level) { | 1874 | while ((kern_level = printk_get_level(text)) != 0) { |
| 1825 | const char *end_of_header = printk_skip_level(text); | ||
| 1826 | switch (kern_level) { | 1875 | switch (kern_level) { |
| 1827 | case '0' ... '7': | 1876 | case '0' ... '7': |
| 1828 | if (level == LOGLEVEL_DEFAULT) | 1877 | if (level == LOGLEVEL_DEFAULT) |
| @@ -1830,14 +1879,13 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1830 | /* fallthrough */ | 1879 | /* fallthrough */ |
| 1831 | case 'd': /* KERN_DEFAULT */ | 1880 | case 'd': /* KERN_DEFAULT */ |
| 1832 | lflags |= LOG_PREFIX; | 1881 | lflags |= LOG_PREFIX; |
| 1882 | break; | ||
| 1883 | case 'c': /* KERN_CONT */ | ||
| 1884 | lflags |= LOG_CONT; | ||
| 1833 | } | 1885 | } |
| 1834 | /* | 1886 | |
| 1835 | * No need to check length here because vscnprintf | 1887 | text_len -= 2; |
| 1836 | * put '\0' at the end of the string. Only valid and | 1888 | text += 2; |
| 1837 | * newly printed level is detected. | ||
| 1838 | */ | ||
| 1839 | text_len -= end_of_header - text; | ||
| 1840 | text = (char *)end_of_header; | ||
| 1841 | } | 1889 | } |
| 1842 | } | 1890 | } |
| 1843 | 1891 | ||
| @@ -1847,45 +1895,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1847 | if (dict) | 1895 | if (dict) |
| 1848 | lflags |= LOG_PREFIX|LOG_NEWLINE; | 1896 | lflags |= LOG_PREFIX|LOG_NEWLINE; |
| 1849 | 1897 | ||
| 1850 | if (!(lflags & LOG_NEWLINE)) { | 1898 | printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); |
| 1851 | /* | ||
| 1852 | * Flush the conflicting buffer. An earlier newline was missing, | ||
| 1853 | * or another task also prints continuation lines. | ||
| 1854 | */ | ||
| 1855 | if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) | ||
| 1856 | cont_flush(LOG_NEWLINE); | ||
| 1857 | |||
| 1858 | /* buffer line if possible, otherwise store it right away */ | ||
| 1859 | if (cont_add(facility, level, text, text_len)) | ||
| 1860 | printed_len += text_len; | ||
| 1861 | else | ||
| 1862 | printed_len += log_store(facility, level, | ||
| 1863 | lflags | LOG_CONT, 0, | ||
| 1864 | dict, dictlen, text, text_len); | ||
| 1865 | } else { | ||
| 1866 | bool stored = false; | ||
| 1867 | |||
| 1868 | /* | ||
| 1869 | * If an earlier newline was missing and it was the same task, | ||
| 1870 | * either merge it with the current buffer and flush, or if | ||
| 1871 | * there was a race with interrupts (prefix == true) then just | ||
| 1872 | * flush it out and store this line separately. | ||
| 1873 | * If the preceding printk was from a different task and missed | ||
| 1874 | * a newline, flush and append the newline. | ||
| 1875 | */ | ||
| 1876 | if (cont.len) { | ||
| 1877 | if (cont.owner == current && !(lflags & LOG_PREFIX)) | ||
| 1878 | stored = cont_add(facility, level, text, | ||
| 1879 | text_len); | ||
| 1880 | cont_flush(LOG_NEWLINE); | ||
| 1881 | } | ||
| 1882 | |||
| 1883 | if (stored) | ||
| 1884 | printed_len += text_len; | ||
| 1885 | else | ||
| 1886 | printed_len += log_store(facility, level, lflags, 0, | ||
| 1887 | dict, dictlen, text, text_len); | ||
| 1888 | } | ||
| 1889 | 1899 | ||
| 1890 | logbuf_cpu = UINT_MAX; | 1900 | logbuf_cpu = UINT_MAX; |
| 1891 | raw_spin_unlock(&logbuf_lock); | 1901 | raw_spin_unlock(&logbuf_lock); |
| @@ -2647,7 +2657,7 @@ void register_console(struct console *newcon) | |||
| 2647 | * didn't select a console we take the first one | 2657 | * didn't select a console we take the first one |
| 2648 | * that registers here. | 2658 | * that registers here. |
| 2649 | */ | 2659 | */ |
| 2650 | if (preferred_console < 0) { | 2660 | if (preferred_console < 0 && !of_specified_console) { |
| 2651 | if (newcon->index < 0) | 2661 | if (newcon->index < 0) |
| 2652 | newcon->index = 0; | 2662 | newcon->index = 0; |
| 2653 | if (newcon->setup == NULL || | 2663 | if (newcon->setup == NULL || |
| @@ -3029,6 +3039,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
| 3029 | dumper->active = true; | 3039 | dumper->active = true; |
| 3030 | 3040 | ||
| 3031 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 3041 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
| 3042 | cont_flush(); | ||
| 3032 | dumper->cur_seq = clear_seq; | 3043 | dumper->cur_seq = clear_seq; |
| 3033 | dumper->cur_idx = clear_idx; | 3044 | dumper->cur_idx = clear_idx; |
| 3034 | dumper->next_seq = log_next_seq; | 3045 | dumper->next_seq = log_next_seq; |
| @@ -3119,6 +3130,7 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, | |||
| 3119 | bool ret; | 3130 | bool ret; |
| 3120 | 3131 | ||
| 3121 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 3132 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
| 3133 | cont_flush(); | ||
| 3122 | ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); | 3134 | ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); |
| 3123 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 3135 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
| 3124 | 3136 | ||
| @@ -3161,6 +3173,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | |||
| 3161 | goto out; | 3173 | goto out; |
| 3162 | 3174 | ||
| 3163 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 3175 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
| 3176 | cont_flush(); | ||
| 3164 | if (dumper->cur_seq < log_first_seq) { | 3177 | if (dumper->cur_seq < log_first_seq) { |
| 3165 | /* messages are gone, move to first available one */ | 3178 | /* messages are gone, move to first available one */ |
| 3166 | dumper->cur_seq = log_first_seq; | 3179 | dumper->cur_seq = log_first_seq; |
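From a caller's point of view the printk() API is unchanged; what the rework above changes is when fragments are merged and when pending fragments become visible. KERN_CONT now sets LOG_CONT on the message, log_output() appends same-task continuations to the cont buffer (flushing on a newline), and every reader path - /dev/kmsg, syslog, kmsg_dump - calls cont_flush() under logbuf_lock so a half-written line is not held back indefinitely. A caller-side illustration (the device name and variable are made up):

	/* Fragments from the same task are coalesced into a single record ... */
	pr_info("probing widget at %#x", base);	/* no newline: buffered in cont */
	printk(KERN_CONT " ... ok\n");		/* LOG_CONT: appended, then the
						 * newline triggers cont_flush() */

	/* ... while a message from another task, or one carrying LOG_PREFIX,
	 * forces the buffered fragment out as its own record first. */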
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1d3b7665d0be..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -73,6 +73,8 @@ void __ptrace_unlink(struct task_struct *child) | |||
| 73 | { | 73 | { |
| 74 | BUG_ON(!child->ptrace); | 74 | BUG_ON(!child->ptrace); |
| 75 | 75 | ||
| 76 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
| 77 | |||
| 76 | child->parent = child->real_parent; | 78 | child->parent = child->real_parent; |
| 77 | list_del_init(&child->ptrace_entry); | 79 | list_del_init(&child->ptrace_entry); |
| 78 | 80 | ||
| @@ -489,7 +491,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
| 489 | 491 | ||
| 490 | /* Architecture-specific hardware disable .. */ | 492 | /* Architecture-specific hardware disable .. */ |
| 491 | ptrace_disable(child); | 493 | ptrace_disable(child); |
| 492 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
| 493 | 494 | ||
| 494 | write_lock_irq(&tasklist_lock); | 495 | write_lock_irq(&tasklist_lock); |
| 495 | /* | 496 | /* |
| @@ -536,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst | |||
| 536 | int this_len, retval; | 537 | int this_len, retval; |
| 537 | 538 | ||
| 538 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; | 539 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; |
| 539 | retval = access_process_vm(tsk, src, buf, this_len, 0); | 540 | retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); |
| 540 | if (!retval) { | 541 | if (!retval) { |
| 541 | if (copied) | 542 | if (copied) |
| 542 | break; | 543 | break; |
| @@ -563,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds | |||
| 563 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; | 564 | this_len = (len > sizeof(buf)) ? sizeof(buf) : len; |
| 564 | if (copy_from_user(buf, src, this_len)) | 565 | if (copy_from_user(buf, src, this_len)) |
| 565 | return -EFAULT; | 566 | return -EFAULT; |
| 566 | retval = access_process_vm(tsk, dst, buf, this_len, 1); | 567 | retval = access_process_vm(tsk, dst, buf, this_len, |
| 568 | FOLL_FORCE | FOLL_WRITE); | ||
| 567 | if (!retval) { | 569 | if (!retval) { |
| 568 | if (copied) | 570 | if (copied) |
| 569 | break; | 571 | break; |
| @@ -1126,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, | |||
| 1126 | unsigned long tmp; | 1128 | unsigned long tmp; |
| 1127 | int copied; | 1129 | int copied; |
| 1128 | 1130 | ||
| 1129 | copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); | 1131 | copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); |
| 1130 | if (copied != sizeof(tmp)) | 1132 | if (copied != sizeof(tmp)) |
| 1131 | return -EIO; | 1133 | return -EIO; |
| 1132 | return put_user(tmp, (unsigned long __user *)data); | 1134 | return put_user(tmp, (unsigned long __user *)data); |
| @@ -1137,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, | |||
| 1137 | { | 1139 | { |
| 1138 | int copied; | 1140 | int copied; |
| 1139 | 1141 | ||
| 1140 | copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); | 1142 | copied = access_process_vm(tsk, addr, &data, sizeof(data), |
| 1143 | FOLL_FORCE | FOLL_WRITE); | ||
| 1141 | return (copied == sizeof(data)) ? 0 : -EIO; | 1144 | return (copied == sizeof(data)) ? 0 : -EIO; |
| 1142 | } | 1145 | } |
| 1143 | 1146 | ||
| @@ -1154,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
| 1154 | switch (request) { | 1157 | switch (request) { |
| 1155 | case PTRACE_PEEKTEXT: | 1158 | case PTRACE_PEEKTEXT: |
| 1156 | case PTRACE_PEEKDATA: | 1159 | case PTRACE_PEEKDATA: |
| 1157 | ret = access_process_vm(child, addr, &word, sizeof(word), 0); | 1160 | ret = access_process_vm(child, addr, &word, sizeof(word), |
| 1161 | FOLL_FORCE); | ||
| 1158 | if (ret != sizeof(word)) | 1162 | if (ret != sizeof(word)) |
| 1159 | ret = -EIO; | 1163 | ret = -EIO; |
| 1160 | else | 1164 | else |
| @@ -1163,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
| 1163 | 1167 | ||
| 1164 | case PTRACE_POKETEXT: | 1168 | case PTRACE_POKETEXT: |
| 1165 | case PTRACE_POKEDATA: | 1169 | case PTRACE_POKEDATA: |
| 1166 | ret = access_process_vm(child, addr, &data, sizeof(data), 1); | 1170 | ret = access_process_vm(child, addr, &data, sizeof(data), |
| 1171 | FOLL_FORCE | FOLL_WRITE); | ||
| 1167 | ret = (ret != sizeof(data) ? -EIO : 0); | 1172 | ret = (ret != sizeof(data) ? -EIO : 0); |
| 1168 | break; | 1173 | break; |
| 1169 | 1174 | ||
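All the ptrace call sites switch from access_process_vm()'s old 0/1 "write" argument to an explicit gup_flags mask, so the forced access that ptrace has always relied on is now spelled out at each call. The two idioms used above:

	/* Peek: forced read from the tracee's address space. */
	copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE);
	if (copied != sizeof(tmp))
		return -EIO;		/* short access means the address was bad */

	/* Poke: forced write, which may break COW on read-only mappings. */
	copied = access_process_vm(child, addr, &data, sizeof(data),
				   FOLL_FORCE | FOLL_WRITE);
	return (copied == sizeof(data)) ? 0 : -EIO;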
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d38ab08a3fe7..123ccbd22449 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c | |||
| @@ -52,7 +52,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); | |||
| 52 | 52 | ||
| 53 | #define PERF_FLAG "-perf:" | 53 | #define PERF_FLAG "-perf:" |
| 54 | #define PERFOUT_STRING(s) \ | 54 | #define PERFOUT_STRING(s) \ |
| 55 | pr_alert("%s" PERF_FLAG s "\n", perf_type) | 55 | pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) |
| 56 | #define VERBOSE_PERFOUT_STRING(s) \ | 56 | #define VERBOSE_PERFOUT_STRING(s) \ |
| 57 | do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) | 57 | do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) |
| 58 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ | 58 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ |
| @@ -400,9 +400,8 @@ rcu_perf_writer(void *arg) | |||
| 400 | sp.sched_priority = 0; | 400 | sp.sched_priority = 0; |
| 401 | sched_setscheduler_nocheck(current, | 401 | sched_setscheduler_nocheck(current, |
| 402 | SCHED_NORMAL, &sp); | 402 | SCHED_NORMAL, &sp); |
| 403 | pr_alert("%s" PERF_FLAG | 403 | pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", |
| 404 | "rcu_perf_writer %ld has %d measurements\n", | 404 | perf_type, PERF_FLAG, me, MIN_MEAS); |
| 405 | perf_type, me, MIN_MEAS); | ||
| 406 | if (atomic_inc_return(&n_rcu_perf_writer_finished) >= | 405 | if (atomic_inc_return(&n_rcu_perf_writer_finished) >= |
| 407 | nrealwriters) { | 406 | nrealwriters) { |
| 408 | schedule_timeout_interruptible(10); | 407 | schedule_timeout_interruptible(10); |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e2b138063..bf08fee53dc7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -1238,6 +1238,7 @@ rcu_torture_stats_print(void) | |||
| 1238 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1238 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
| 1239 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1239 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
| 1240 | static unsigned long rtcv_snap = ULONG_MAX; | 1240 | static unsigned long rtcv_snap = ULONG_MAX; |
| 1241 | struct task_struct *wtp; | ||
| 1241 | 1242 | ||
| 1242 | for_each_possible_cpu(cpu) { | 1243 | for_each_possible_cpu(cpu) { |
| 1243 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 1244 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
| @@ -1258,8 +1259,9 @@ rcu_torture_stats_print(void) | |||
| 1258 | atomic_read(&n_rcu_torture_alloc), | 1259 | atomic_read(&n_rcu_torture_alloc), |
| 1259 | atomic_read(&n_rcu_torture_alloc_fail), | 1260 | atomic_read(&n_rcu_torture_alloc_fail), |
| 1260 | atomic_read(&n_rcu_torture_free)); | 1261 | atomic_read(&n_rcu_torture_free)); |
| 1261 | pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", | 1262 | pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ", |
| 1262 | atomic_read(&n_rcu_torture_mberror), | 1263 | atomic_read(&n_rcu_torture_mberror), |
| 1264 | n_rcu_torture_barrier_error, | ||
| 1263 | n_rcu_torture_boost_ktrerror, | 1265 | n_rcu_torture_boost_ktrerror, |
| 1264 | n_rcu_torture_boost_rterror); | 1266 | n_rcu_torture_boost_rterror); |
| 1265 | pr_cont("rtbf: %ld rtb: %ld nt: %ld ", | 1267 | pr_cont("rtbf: %ld rtb: %ld nt: %ld ", |
| @@ -1312,10 +1314,12 @@ rcu_torture_stats_print(void) | |||
| 1312 | 1314 | ||
| 1313 | rcutorture_get_gp_data(cur_ops->ttype, | 1315 | rcutorture_get_gp_data(cur_ops->ttype, |
| 1314 | &flags, &gpnum, &completed); | 1316 | &flags, &gpnum, &completed); |
| 1315 | pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", | 1317 | wtp = READ_ONCE(writer_task); |
| 1318 | pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", | ||
| 1316 | rcu_torture_writer_state_getname(), | 1319 | rcu_torture_writer_state_getname(), |
| 1317 | rcu_torture_writer_state, | 1320 | rcu_torture_writer_state, |
| 1318 | gpnum, completed, flags); | 1321 | gpnum, completed, flags, |
| 1322 | wtp == NULL ? ~0UL : wtp->state); | ||
| 1319 | show_rcu_gp_kthreads(); | 1323 | show_rcu_gp_kthreads(); |
| 1320 | rcu_ftrace_dump(DUMP_ALL); | 1324 | rcu_ftrace_dump(DUMP_ALL); |
| 1321 | } | 1325 | } |
| @@ -1362,12 +1366,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) | |||
| 1362 | onoff_interval, onoff_holdoff); | 1366 | onoff_interval, onoff_holdoff); |
| 1363 | } | 1367 | } |
| 1364 | 1368 | ||
| 1365 | static void rcutorture_booster_cleanup(int cpu) | 1369 | static int rcutorture_booster_cleanup(unsigned int cpu) |
| 1366 | { | 1370 | { |
| 1367 | struct task_struct *t; | 1371 | struct task_struct *t; |
| 1368 | 1372 | ||
| 1369 | if (boost_tasks[cpu] == NULL) | 1373 | if (boost_tasks[cpu] == NULL) |
| 1370 | return; | 1374 | return 0; |
| 1371 | mutex_lock(&boost_mutex); | 1375 | mutex_lock(&boost_mutex); |
| 1372 | t = boost_tasks[cpu]; | 1376 | t = boost_tasks[cpu]; |
| 1373 | boost_tasks[cpu] = NULL; | 1377 | boost_tasks[cpu] = NULL; |
| @@ -1375,9 +1379,10 @@ static void rcutorture_booster_cleanup(int cpu) | |||
| 1375 | 1379 | ||
| 1376 | /* This must be outside of the mutex, otherwise deadlock! */ | 1380 | /* This must be outside of the mutex, otherwise deadlock! */ |
| 1377 | torture_stop_kthread(rcu_torture_boost, t); | 1381 | torture_stop_kthread(rcu_torture_boost, t); |
| 1382 | return 0; | ||
| 1378 | } | 1383 | } |
| 1379 | 1384 | ||
| 1380 | static int rcutorture_booster_init(int cpu) | 1385 | static int rcutorture_booster_init(unsigned int cpu) |
| 1381 | { | 1386 | { |
| 1382 | int retval; | 1387 | int retval; |
| 1383 | 1388 | ||
| @@ -1577,28 +1582,7 @@ static void rcu_torture_barrier_cleanup(void) | |||
| 1577 | } | 1582 | } |
| 1578 | } | 1583 | } |
| 1579 | 1584 | ||
| 1580 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1585 | static enum cpuhp_state rcutor_hp; |
| 1581 | unsigned long action, void *hcpu) | ||
| 1582 | { | ||
| 1583 | long cpu = (long)hcpu; | ||
| 1584 | |||
| 1585 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 1586 | case CPU_ONLINE: | ||
| 1587 | case CPU_DOWN_FAILED: | ||
| 1588 | (void)rcutorture_booster_init(cpu); | ||
| 1589 | break; | ||
| 1590 | case CPU_DOWN_PREPARE: | ||
| 1591 | rcutorture_booster_cleanup(cpu); | ||
| 1592 | break; | ||
| 1593 | default: | ||
| 1594 | break; | ||
| 1595 | } | ||
| 1596 | return NOTIFY_OK; | ||
| 1597 | } | ||
| 1598 | |||
| 1599 | static struct notifier_block rcutorture_cpu_nb = { | ||
| 1600 | .notifier_call = rcutorture_cpu_notify, | ||
| 1601 | }; | ||
| 1602 | 1586 | ||
| 1603 | static void | 1587 | static void |
| 1604 | rcu_torture_cleanup(void) | 1588 | rcu_torture_cleanup(void) |
| @@ -1638,11 +1622,8 @@ rcu_torture_cleanup(void) | |||
| 1638 | for (i = 0; i < ncbflooders; i++) | 1622 | for (i = 0; i < ncbflooders; i++) |
| 1639 | torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); | 1623 | torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); |
| 1640 | if ((test_boost == 1 && cur_ops->can_boost) || | 1624 | if ((test_boost == 1 && cur_ops->can_boost) || |
| 1641 | test_boost == 2) { | 1625 | test_boost == 2) |
| 1642 | unregister_cpu_notifier(&rcutorture_cpu_nb); | 1626 | cpuhp_remove_state(rcutor_hp); |
| 1643 | for_each_possible_cpu(i) | ||
| 1644 | rcutorture_booster_cleanup(i); | ||
| 1645 | } | ||
| 1646 | 1627 | ||
| 1647 | /* | 1628 | /* |
| 1648 | * Wait for all RCU callbacks to fire, then do flavor-specific | 1629 | * Wait for all RCU callbacks to fire, then do flavor-specific |
| @@ -1869,14 +1850,13 @@ rcu_torture_init(void) | |||
| 1869 | test_boost == 2) { | 1850 | test_boost == 2) { |
| 1870 | 1851 | ||
| 1871 | boost_starttime = jiffies + test_boost_interval * HZ; | 1852 | boost_starttime = jiffies + test_boost_interval * HZ; |
| 1872 | register_cpu_notifier(&rcutorture_cpu_nb); | 1853 | |
| 1873 | for_each_possible_cpu(i) { | 1854 | firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE", |
| 1874 | if (cpu_is_offline(i)) | 1855 | rcutorture_booster_init, |
| 1875 | continue; /* Heuristic: CPU can go offline. */ | 1856 | rcutorture_booster_cleanup); |
| 1876 | firsterr = rcutorture_booster_init(i); | 1857 | if (firsterr < 0) |
| 1877 | if (firsterr) | 1858 | goto unwind; |
| 1878 | goto unwind; | 1859 | rcutor_hp = firsterr; |
| 1879 | } | ||
| 1880 | } | 1860 | } |
| 1881 | firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); | 1861 | firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); |
| 1882 | if (firsterr) | 1862 | if (firsterr) |
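The open-coded CPU notifier plus for_each_possible_cpu() loop becomes a single dynamic hotplug state: cpuhp_setup_state() registers the online/offline callbacks and immediately runs the online callback on every CPU that is already up, and cpuhp_remove_state() undoes both at cleanup. Sketch of the registration as used above (callbacks now take an unsigned int cpu and return int):

	static enum cpuhp_state rcutor_hp;

	/* setup: a CPUHP_AP_ONLINE_DYN request returns the allocated state */
	firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
				     rcutorture_booster_init,
				     rcutorture_booster_cleanup);
	if (firsterr < 0)
		goto unwind;
	rcutor_hp = firsterr;

	/* cleanup: runs rcutorture_booster_cleanup() on each online CPU and
	 * then releases the dynamic state. */
	cpuhp_remove_state(rcutor_hp);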
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be922c9f3d37..50d1861f7759 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c | |||
| @@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp) | |||
| 68 | RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), | 68 | RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), |
| 69 | "suspicious rcu_sync_is_idle() usage"); | 69 | "suspicious rcu_sync_is_idle() usage"); |
| 70 | } | 70 | } |
| 71 | |||
| 72 | EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); | ||
| 71 | #endif | 73 | #endif |
| 72 | 74 | ||
| 73 | /** | 75 | /** |
| @@ -83,6 +85,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) | |||
| 83 | } | 85 | } |
| 84 | 86 | ||
| 85 | /** | 87 | /** |
| 88 | * Must be called after rcu_sync_init() and before first use. | ||
| 89 | * | ||
| 90 | * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() | ||
| 91 | * pairs turn into NO-OPs. | ||
| 92 | */ | ||
| 93 | void rcu_sync_enter_start(struct rcu_sync *rsp) | ||
| 94 | { | ||
| 95 | rsp->gp_count++; | ||
| 96 | rsp->gp_state = GP_PASSED; | ||
| 97 | } | ||
| 98 | |||
| 99 | /** | ||
| 86 | * rcu_sync_enter() - Force readers onto slowpath | 100 | * rcu_sync_enter() - Force readers onto slowpath |
| 87 | * @rsp: Pointer to rcu_sync structure to use for synchronization | 101 | * @rsp: Pointer to rcu_sync structure to use for synchronization |
| 88 | * | 102 | * |
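rcu_sync_enter_start() is a lighter-weight cousin of rcu_sync_enter() for callers that can guarantee there are no concurrent readers yet, typically during early boot before the protected object is published: it bumps gp_count and jumps straight to GP_PASSED without waiting for a grace period. A hedged usage sketch; the consumer shown is an assumption, not part of this diff:

	struct rcu_sync my_sync;		/* hypothetical user */

	rcu_sync_init(&my_sync, RCU_SCHED_SYNC);
	rcu_sync_enter_start(&my_sync);		/* readers now see !rcu_sync_is_idle() */
	/* ... publish the structure; subsequent rcu_sync_enter()/exit() pairs
	 * are effectively no-ops until a final rcu_sync_exit() drops the
	 * count taken here ... */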
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 944b1b491ed8..1898559e6b60 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 170 | false)); | 170 | false)); |
| 171 | } | 171 | } |
| 172 | 172 | ||
| 173 | static void rcu_process_callbacks(struct softirq_action *unused) | 173 | static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) |
| 174 | { | 174 | { |
| 175 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | 175 | __rcu_process_callbacks(&rcu_sched_ctrlblk); |
| 176 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | 176 | __rcu_process_callbacks(&rcu_bh_ctrlblk); |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..69a5611a7e7c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -41,7 +41,6 @@ | |||
| 41 | #include <linux/export.h> | 41 | #include <linux/export.h> |
| 42 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
| 43 | #include <linux/moduleparam.h> | 43 | #include <linux/moduleparam.h> |
| 44 | #include <linux/module.h> | ||
| 45 | #include <linux/percpu.h> | 44 | #include <linux/percpu.h> |
| 46 | #include <linux/notifier.h> | 45 | #include <linux/notifier.h> |
| 47 | #include <linux/cpu.h> | 46 | #include <linux/cpu.h> |
| @@ -60,7 +59,6 @@ | |||
| 60 | #include "tree.h" | 59 | #include "tree.h" |
| 61 | #include "rcu.h" | 60 | #include "rcu.h" |
| 62 | 61 | ||
| 63 | MODULE_ALIAS("rcutree"); | ||
| 64 | #ifdef MODULE_PARAM_PREFIX | 62 | #ifdef MODULE_PARAM_PREFIX |
| 65 | #undef MODULE_PARAM_PREFIX | 63 | #undef MODULE_PARAM_PREFIX |
| 66 | #endif | 64 | #endif |
| @@ -1848,6 +1846,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1848 | struct rcu_data *rdp) | 1846 | struct rcu_data *rdp) |
| 1849 | { | 1847 | { |
| 1850 | bool ret; | 1848 | bool ret; |
| 1849 | bool need_gp; | ||
| 1851 | 1850 | ||
| 1852 | /* Handle the ends of any preceding grace periods first. */ | 1851 | /* Handle the ends of any preceding grace periods first. */ |
| 1853 | if (rdp->completed == rnp->completed && | 1852 | if (rdp->completed == rnp->completed && |
| @@ -1874,9 +1873,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1874 | */ | 1873 | */ |
| 1875 | rdp->gpnum = rnp->gpnum; | 1874 | rdp->gpnum = rnp->gpnum; |
| 1876 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); | 1875 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
| 1877 | rdp->cpu_no_qs.b.norm = true; | 1876 | need_gp = !!(rnp->qsmask & rdp->grpmask); |
| 1877 | rdp->cpu_no_qs.b.norm = need_gp; | ||
| 1878 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 1878 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); |
| 1879 | rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); | 1879 | rdp->core_needs_qs = need_gp; |
| 1880 | zero_cpu_stall_ticks(rdp); | 1880 | zero_cpu_stall_ticks(rdp); |
| 1881 | WRITE_ONCE(rdp->gpwrap, false); | 1881 | WRITE_ONCE(rdp->gpwrap, false); |
| 1882 | } | 1882 | } |
| @@ -2344,7 +2344,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
| 2344 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 2344 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
| 2345 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2345 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
| 2346 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); | 2346 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); |
| 2347 | swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ | 2347 | rcu_gp_kthread_wake(rsp); |
| 2348 | } | 2348 | } |
| 2349 | 2349 | ||
| 2350 | /* | 2350 | /* |
| @@ -2970,7 +2970,7 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
| 2970 | } | 2970 | } |
| 2971 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2971 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
| 2972 | raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); | 2972 | raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); |
| 2973 | swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ | 2973 | rcu_gp_kthread_wake(rsp); |
| 2974 | } | 2974 | } |
| 2975 | 2975 | ||
| 2976 | /* | 2976 | /* |
| @@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
| 3013 | /* | 3013 | /* |
| 3014 | * Do RCU core processing for the current CPU. | 3014 | * Do RCU core processing for the current CPU. |
| 3015 | */ | 3015 | */ |
| 3016 | static void rcu_process_callbacks(struct softirq_action *unused) | 3016 | static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) |
| 3017 | { | 3017 | { |
| 3018 | struct rcu_state *rsp; | 3018 | struct rcu_state *rsp; |
| 3019 | 3019 | ||
| @@ -3792,8 +3792,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 3792 | rnp = rdp->mynode; | 3792 | rnp = rdp->mynode; |
| 3793 | mask = rdp->grpmask; | 3793 | mask = rdp->grpmask; |
| 3794 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ | 3794 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ |
| 3795 | rnp->qsmaskinitnext |= mask; | ||
| 3796 | rnp->expmaskinitnext |= mask; | ||
| 3797 | if (!rdp->beenonline) | 3795 | if (!rdp->beenonline) |
| 3798 | WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); | 3796 | WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); |
| 3799 | rdp->beenonline = true; /* We have now been online. */ | 3797 | rdp->beenonline = true; /* We have now been online. */ |
| @@ -3860,6 +3858,32 @@ int rcutree_dead_cpu(unsigned int cpu) | |||
| 3860 | return 0; | 3858 | return 0; |
| 3861 | } | 3859 | } |
| 3862 | 3860 | ||
| 3861 | /* | ||
| 3862 | * Mark the specified CPU as being online so that subsequent grace periods | ||
| 3863 | * (both expedited and normal) will wait on it. Note that this means that | ||
| 3864 | * incoming CPUs are not allowed to use RCU read-side critical sections | ||
| 3865 | * until this function is called. Failing to observe this restriction | ||
| 3866 | * will result in lockdep splats. | ||
| 3867 | */ | ||
| 3868 | void rcu_cpu_starting(unsigned int cpu) | ||
| 3869 | { | ||
| 3870 | unsigned long flags; | ||
| 3871 | unsigned long mask; | ||
| 3872 | struct rcu_data *rdp; | ||
| 3873 | struct rcu_node *rnp; | ||
| 3874 | struct rcu_state *rsp; | ||
| 3875 | |||
| 3876 | for_each_rcu_flavor(rsp) { | ||
| 3877 | rdp = this_cpu_ptr(rsp->rda); | ||
| 3878 | rnp = rdp->mynode; | ||
| 3879 | mask = rdp->grpmask; | ||
| 3880 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 3881 | rnp->qsmaskinitnext |= mask; | ||
| 3882 | rnp->expmaskinitnext |= mask; | ||
| 3883 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3884 | } | ||
| 3885 | } | ||
| 3886 | |||
| 3863 | #ifdef CONFIG_HOTPLUG_CPU | 3887 | #ifdef CONFIG_HOTPLUG_CPU |
| 3864 | /* | 3888 | /* |
| 3865 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | 3889 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() |
| @@ -4209,8 +4233,10 @@ void __init rcu_init(void) | |||
| 4209 | * or the scheduler are operational. | 4233 | * or the scheduler are operational. |
| 4210 | */ | 4234 | */ |
| 4211 | pm_notifier(rcu_pm_notify, 0); | 4235 | pm_notifier(rcu_pm_notify, 0); |
| 4212 | for_each_online_cpu(cpu) | 4236 | for_each_online_cpu(cpu) { |
| 4213 | rcutree_prepare_cpu(cpu); | 4237 | rcutree_prepare_cpu(cpu); |
| 4238 | rcu_cpu_starting(cpu); | ||
| 4239 | } | ||
| 4214 | } | 4240 | } |
| 4215 | 4241 | ||
| 4216 | #include "tree_exp.h" | 4242 | #include "tree_exp.h" |
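For context on the rcu_cpu_starting() hunk above: the patch moves the qsmaskinitnext/expmaskinitnext updates out of rcu_init_percpu_data() so that an incoming CPU is marked online for both normal and expedited grace periods before it ever enters an RCU read-side critical section. The following is only a minimal sketch of the intended ordering on the incoming CPU; example_secondary_start() and the header choice are illustrative assumptions, not part of the patch.

#include <linux/rcupdate.h>

void example_secondary_start(unsigned int cpu)
{
        /* Tell RCU this CPU is online; must precede any rcu_read_lock(). */
        rcu_cpu_starting(cpu);

        /* Only from here on are read-side critical sections legal. */
        rcu_read_lock();
        /* ... early bringup work that dereferences RCU-protected data ... */
        rcu_read_unlock();
}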
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f714f873bf9d..e99a5234d9ed 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -400,6 +400,7 @@ struct rcu_data { | |||
| 400 | #ifdef CONFIG_RCU_FAST_NO_HZ | 400 | #ifdef CONFIG_RCU_FAST_NO_HZ |
| 401 | struct rcu_head oom_head; | 401 | struct rcu_head oom_head; |
| 402 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 402 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
| 403 | atomic_long_t exp_workdone0; /* # done by workqueue. */ | ||
| 403 | atomic_long_t exp_workdone1; /* # done by others #1. */ | 404 | atomic_long_t exp_workdone1; /* # done by others #1. */ |
| 404 | atomic_long_t exp_workdone2; /* # done by others #2. */ | 405 | atomic_long_t exp_workdone2; /* # done by others #2. */ |
| 405 | atomic_long_t exp_workdone3; /* # done by others #3. */ | 406 | atomic_long_t exp_workdone3; /* # done by others #3. */ |
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6d86ab6ec2c9..24343eb87b58 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
| @@ -359,7 +359,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | |||
| 359 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 359 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
| 360 | 360 | ||
| 361 | if (raw_smp_processor_id() == cpu || | 361 | if (raw_smp_processor_id() == cpu || |
| 362 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | 362 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1) || |
| 363 | !(rnp->qsmaskinitnext & rdp->grpmask)) | ||
| 363 | mask_ofl_test |= rdp->grpmask; | 364 | mask_ofl_test |= rdp->grpmask; |
| 364 | } | 365 | } |
| 365 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; | 366 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; |
| @@ -384,17 +385,16 @@ retry_ipi: | |||
| 384 | mask_ofl_ipi &= ~mask; | 385 | mask_ofl_ipi &= ~mask; |
| 385 | continue; | 386 | continue; |
| 386 | } | 387 | } |
| 387 | /* Failed, raced with offline. */ | 388 | /* Failed, raced with CPU hotplug operation. */ |
| 388 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 389 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 389 | if (cpu_online(cpu) && | 390 | if ((rnp->qsmaskinitnext & mask) && |
| 390 | (rnp->expmask & mask)) { | 391 | (rnp->expmask & mask)) { |
| 392 | /* Online, so delay for a bit and try again. */ | ||
| 391 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 393 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 392 | schedule_timeout_uninterruptible(1); | 394 | schedule_timeout_uninterruptible(1); |
| 393 | if (cpu_online(cpu) && | 395 | goto retry_ipi; |
| 394 | (rnp->expmask & mask)) | ||
| 395 | goto retry_ipi; | ||
| 396 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 397 | } | 396 | } |
| 397 | /* CPU really is offline, so we can ignore it. */ | ||
| 398 | if (!(rnp->expmask & mask)) | 398 | if (!(rnp->expmask & mask)) |
| 399 | mask_ofl_ipi &= ~mask; | 399 | mask_ofl_ipi &= ~mask; |
| 400 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 400 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| @@ -427,12 +427,10 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
| 427 | jiffies_stall); | 427 | jiffies_stall); |
| 428 | if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) | 428 | if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) |
| 429 | return; | 429 | return; |
| 430 | if (ret < 0) { | 430 | WARN_ON(ret < 0); /* workqueues should not be signaled. */ |
| 431 | /* Hit a signal, disable CPU stall warnings. */ | 431 | if (rcu_cpu_stall_suppress) |
| 432 | swait_event(rsp->expedited_wq, | 432 | continue; |
| 433 | sync_rcu_preempt_exp_done(rnp_root)); | 433 | panic_on_rcu_stall(); |
| 434 | return; | ||
| 435 | } | ||
| 436 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", | 434 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", |
| 437 | rsp->name); | 435 | rsp->name); |
| 438 | ndetected = 0; | 436 | ndetected = 0; |
| @@ -500,7 +498,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
| 500 | * next GP, to proceed. | 498 | * next GP, to proceed. |
| 501 | */ | 499 | */ |
| 502 | mutex_lock(&rsp->exp_wake_mutex); | 500 | mutex_lock(&rsp->exp_wake_mutex); |
| 503 | mutex_unlock(&rsp->exp_mutex); | ||
| 504 | 501 | ||
| 505 | rcu_for_each_node_breadth_first(rsp, rnp) { | 502 | rcu_for_each_node_breadth_first(rsp, rnp) { |
| 506 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { | 503 | if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { |
| @@ -516,6 +513,70 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
| 516 | mutex_unlock(&rsp->exp_wake_mutex); | 513 | mutex_unlock(&rsp->exp_wake_mutex); |
| 517 | } | 514 | } |
| 518 | 515 | ||
| 516 | /* Let the workqueue handler know what it is supposed to do. */ | ||
| 517 | struct rcu_exp_work { | ||
| 518 | smp_call_func_t rew_func; | ||
| 519 | struct rcu_state *rew_rsp; | ||
| 520 | unsigned long rew_s; | ||
| 521 | struct work_struct rew_work; | ||
| 522 | }; | ||
| 523 | |||
| 524 | /* | ||
| 525 | * Work-queue handler to drive an expedited grace period forward. | ||
| 526 | */ | ||
| 527 | static void wait_rcu_exp_gp(struct work_struct *wp) | ||
| 528 | { | ||
| 529 | struct rcu_exp_work *rewp; | ||
| 530 | |||
| 531 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
| 532 | rewp = container_of(wp, struct rcu_exp_work, rew_work); | ||
| 533 | sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func); | ||
| 534 | |||
| 535 | /* Wait and clean up, including waking everyone. */ | ||
| 536 | rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s); | ||
| 537 | } | ||
| 538 | |||
| 539 | /* | ||
| 540 | * Given an rcu_state pointer and a smp_call_function() handler, kick | ||
| 541 | * off the specified flavor of expedited grace period. | ||
| 542 | */ | ||
| 543 | static void _synchronize_rcu_expedited(struct rcu_state *rsp, | ||
| 544 | smp_call_func_t func) | ||
| 545 | { | ||
| 546 | struct rcu_data *rdp; | ||
| 547 | struct rcu_exp_work rew; | ||
| 548 | struct rcu_node *rnp; | ||
| 549 | unsigned long s; | ||
| 550 | |||
| 551 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
| 552 | if (rcu_gp_is_normal()) { | ||
| 553 | wait_rcu_gp(rsp->call); | ||
| 554 | return; | ||
| 555 | } | ||
| 556 | |||
| 557 | /* Take a snapshot of the sequence number. */ | ||
| 558 | s = rcu_exp_gp_seq_snap(rsp); | ||
| 559 | if (exp_funnel_lock(rsp, s)) | ||
| 560 | return; /* Someone else did our work for us. */ | ||
| 561 | |||
| 562 | /* Marshall arguments and schedule the expedited grace period. */ | ||
| 563 | rew.rew_func = func; | ||
| 564 | rew.rew_rsp = rsp; | ||
| 565 | rew.rew_s = s; | ||
| 566 | INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); | ||
| 567 | schedule_work(&rew.rew_work); | ||
| 568 | |||
| 569 | /* Wait for expedited grace period to complete. */ | ||
| 570 | rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | ||
| 571 | rnp = rcu_get_root(rsp); | ||
| 572 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | ||
| 573 | sync_exp_work_done(rsp, | ||
| 574 | &rdp->exp_workdone0, s)); | ||
| 575 | |||
| 576 | /* Let the next expedited grace period start. */ | ||
| 577 | mutex_unlock(&rsp->exp_mutex); | ||
| 578 | } | ||
| 579 | |||
| 519 | /** | 580 | /** |
| 520 | * synchronize_sched_expedited - Brute-force RCU-sched grace period | 581 | * synchronize_sched_expedited - Brute-force RCU-sched grace period |
| 521 | * | 582 | * |
| @@ -534,29 +595,13 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
| 534 | */ | 595 | */ |
| 535 | void synchronize_sched_expedited(void) | 596 | void synchronize_sched_expedited(void) |
| 536 | { | 597 | { |
| 537 | unsigned long s; | ||
| 538 | struct rcu_state *rsp = &rcu_sched_state; | 598 | struct rcu_state *rsp = &rcu_sched_state; |
| 539 | 599 | ||
| 540 | /* If only one CPU, this is automatically a grace period. */ | 600 | /* If only one CPU, this is automatically a grace period. */ |
| 541 | if (rcu_blocking_is_gp()) | 601 | if (rcu_blocking_is_gp()) |
| 542 | return; | 602 | return; |
| 543 | 603 | ||
| 544 | /* If expedited grace periods are prohibited, fall back to normal. */ | 604 | _synchronize_rcu_expedited(rsp, sync_sched_exp_handler); |
| 545 | if (rcu_gp_is_normal()) { | ||
| 546 | wait_rcu_gp(call_rcu_sched); | ||
| 547 | return; | ||
| 548 | } | ||
| 549 | |||
| 550 | /* Take a snapshot of the sequence number. */ | ||
| 551 | s = rcu_exp_gp_seq_snap(rsp); | ||
| 552 | if (exp_funnel_lock(rsp, s)) | ||
| 553 | return; /* Someone else did our work for us. */ | ||
| 554 | |||
| 555 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
| 556 | sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); | ||
| 557 | |||
| 558 | /* Wait and clean up, including waking everyone. */ | ||
| 559 | rcu_exp_wait_wake(rsp, s); | ||
| 560 | } | 605 | } |
| 561 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | 606 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); |
| 562 | 607 | ||
| @@ -620,23 +665,8 @@ static void sync_rcu_exp_handler(void *info) | |||
| 620 | void synchronize_rcu_expedited(void) | 665 | void synchronize_rcu_expedited(void) |
| 621 | { | 666 | { |
| 622 | struct rcu_state *rsp = rcu_state_p; | 667 | struct rcu_state *rsp = rcu_state_p; |
| 623 | unsigned long s; | ||
| 624 | |||
| 625 | /* If expedited grace periods are prohibited, fall back to normal. */ | ||
| 626 | if (rcu_gp_is_normal()) { | ||
| 627 | wait_rcu_gp(call_rcu); | ||
| 628 | return; | ||
| 629 | } | ||
| 630 | |||
| 631 | s = rcu_exp_gp_seq_snap(rsp); | ||
| 632 | if (exp_funnel_lock(rsp, s)) | ||
| 633 | return; /* Someone else did our work for us. */ | ||
| 634 | |||
| 635 | /* Initialize the rcu_node tree in preparation for the wait. */ | ||
| 636 | sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); | ||
| 637 | 668 | ||
| 638 | /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ | 669 | _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); |
| 639 | rcu_exp_wait_wake(rsp, s); | ||
| 640 | } | 670 | } |
| 641 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 671 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
| 642 | 672 | ||
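The new _synchronize_rcu_expedited() above drives the expedited grace period from a workqueue: it marshals its arguments into an on-stack work item, schedules it, and then waits for completion. Below is a hedged sketch of that on-stack work-item pattern with hypothetical names (struct my_req, my_handler, kick_and_wait); the patch itself waits with wait_event() on rnp->exp_wq rather than flush_work().

#include <linux/workqueue.h>

struct my_req {
        struct work_struct work;
        int arg;                        /* request payload */
};

static void my_handler(struct work_struct *w)
{
        struct my_req *req = container_of(w, struct my_req, work);

        /* Heavy lifting runs in process context, using req->arg. */
        (void)req;
}

static void kick_and_wait(int arg)
{
        struct my_req req = { .arg = arg };

        INIT_WORK_ONSTACK(&req.work, my_handler);
        schedule_work(&req.work);
        flush_work(&req.work);          /* simplest wait; the patch uses wait_event() */
        destroy_work_on_stack(&req.work);
}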
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0082fce402a0..85c5a883c6e3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -2173,6 +2173,7 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2173 | cl++; | 2173 | cl++; |
| 2174 | c++; | 2174 | c++; |
| 2175 | local_bh_enable(); | 2175 | local_bh_enable(); |
| 2176 | cond_resched_rcu_qs(); | ||
| 2176 | list = next; | 2177 | list = next; |
| 2177 | } | 2178 | } |
| 2178 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | 2179 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); |
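The one-line tree_plugin.h change has the no-CBs kthread call cond_resched_rcu_qs() inside its callback-invocation loop, so a long-running pass both yields the CPU when needed and reports an RCU quiescent state. A hedged sketch of the idiom in a generic kthread loop follows; have_work() and process_one_item() are hypothetical helpers.

#include <linux/kthread.h>
#include <linux/rcupdate.h>

extern bool have_work(void);
extern void process_one_item(void);

static int example_kthread(void *arg)
{
        while (!kthread_should_stop()) {
                if (have_work())
                        process_one_item();
                /* Yield if needed and report a quiescent state to RCU. */
                cond_resched_rcu_qs();
        }
        return 0;
}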
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 86782f9a4604..b1f28972872c 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
| @@ -185,16 +185,17 @@ static int show_rcuexp(struct seq_file *m, void *v) | |||
| 185 | int cpu; | 185 | int cpu; |
| 186 | struct rcu_state *rsp = (struct rcu_state *)m->private; | 186 | struct rcu_state *rsp = (struct rcu_state *)m->private; |
| 187 | struct rcu_data *rdp; | 187 | struct rcu_data *rdp; |
| 188 | unsigned long s1 = 0, s2 = 0, s3 = 0; | 188 | unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; |
| 189 | 189 | ||
| 190 | for_each_possible_cpu(cpu) { | 190 | for_each_possible_cpu(cpu) { |
| 191 | rdp = per_cpu_ptr(rsp->rda, cpu); | 191 | rdp = per_cpu_ptr(rsp->rda, cpu); |
| 192 | s0 += atomic_long_read(&rdp->exp_workdone0); | ||
| 192 | s1 += atomic_long_read(&rdp->exp_workdone1); | 193 | s1 += atomic_long_read(&rdp->exp_workdone1); |
| 193 | s2 += atomic_long_read(&rdp->exp_workdone2); | 194 | s2 += atomic_long_read(&rdp->exp_workdone2); |
| 194 | s3 += atomic_long_read(&rdp->exp_workdone3); | 195 | s3 += atomic_long_read(&rdp->exp_workdone3); |
| 195 | } | 196 | } |
| 196 | seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", | 197 | seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", |
| 197 | rsp->expedited_sequence, s1, s2, s3, | 198 | rsp->expedited_sequence, s0, s1, s2, s3, |
| 198 | atomic_long_read(&rsp->expedited_normal), | 199 | atomic_long_read(&rsp->expedited_normal), |
| 199 | atomic_read(&rsp->expedited_need_qs), | 200 | atomic_read(&rsp->expedited_need_qs), |
| 200 | rsp->expedited_sequence / 2); | 201 | rsp->expedited_sequence / 2); |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f0d8322bc3ec..f19271dce0a9 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -46,7 +46,7 @@ | |||
| 46 | #include <linux/export.h> | 46 | #include <linux/export.h> |
| 47 | #include <linux/hardirq.h> | 47 | #include <linux/hardirq.h> |
| 48 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
| 49 | #include <linux/module.h> | 49 | #include <linux/moduleparam.h> |
| 50 | #include <linux/kthread.h> | 50 | #include <linux/kthread.h> |
| 51 | #include <linux/tick.h> | 51 | #include <linux/tick.h> |
| 52 | 52 | ||
| @@ -54,7 +54,6 @@ | |||
| 54 | 54 | ||
| 55 | #include "rcu.h" | 55 | #include "rcu.h" |
| 56 | 56 | ||
| 57 | MODULE_ALIAS("rcupdate"); | ||
| 58 | #ifdef MODULE_PARAM_PREFIX | 57 | #ifdef MODULE_PARAM_PREFIX |
| 59 | #undef MODULE_PARAM_PREFIX | 58 | #undef MODULE_PARAM_PREFIX |
| 60 | #endif | 59 | #endif |
diff --git a/kernel/relay.c b/kernel/relay.c index d797502140b9..da79a109dbeb 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -214,7 +214,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) | |||
| 214 | __free_page(buf->page_array[i]); | 214 | __free_page(buf->page_array[i]); |
| 215 | relay_free_page_array(buf->page_array); | 215 | relay_free_page_array(buf->page_array); |
| 216 | } | 216 | } |
| 217 | chan->buf[buf->cpu] = NULL; | 217 | *per_cpu_ptr(chan->buf, buf->cpu) = NULL; |
| 218 | kfree(buf->padding); | 218 | kfree(buf->padding); |
| 219 | kfree(buf); | 219 | kfree(buf); |
| 220 | kref_put(&chan->kref, relay_destroy_channel); | 220 | kref_put(&chan->kref, relay_destroy_channel); |
| @@ -328,13 +328,15 @@ static struct rchan_callbacks default_channel_callbacks = { | |||
| 328 | 328 | ||
| 329 | /** | 329 | /** |
| 330 | * wakeup_readers - wake up readers waiting on a channel | 330 | * wakeup_readers - wake up readers waiting on a channel |
| 331 | * @data: contains the channel buffer | 331 | * @work: contains the channel buffer |
| 332 | * | 332 | * |
| 333 | * This is the timer function used to defer reader waking. | 333 | * This is the function used to defer reader waking |
| 334 | */ | 334 | */ |
| 335 | static void wakeup_readers(unsigned long data) | 335 | static void wakeup_readers(struct irq_work *work) |
| 336 | { | 336 | { |
| 337 | struct rchan_buf *buf = (struct rchan_buf *)data; | 337 | struct rchan_buf *buf; |
| 338 | |||
| 339 | buf = container_of(work, struct rchan_buf, wakeup_work); | ||
| 338 | wake_up_interruptible(&buf->read_wait); | 340 | wake_up_interruptible(&buf->read_wait); |
| 339 | } | 341 | } |
| 340 | 342 | ||
| @@ -352,9 +354,10 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) | |||
| 352 | if (init) { | 354 | if (init) { |
| 353 | init_waitqueue_head(&buf->read_wait); | 355 | init_waitqueue_head(&buf->read_wait); |
| 354 | kref_init(&buf->kref); | 356 | kref_init(&buf->kref); |
| 355 | setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); | 357 | init_irq_work(&buf->wakeup_work, wakeup_readers); |
| 356 | } else | 358 | } else { |
| 357 | del_timer_sync(&buf->timer); | 359 | irq_work_sync(&buf->wakeup_work); |
| 360 | } | ||
| 358 | 361 | ||
| 359 | buf->subbufs_produced = 0; | 362 | buf->subbufs_produced = 0; |
| 360 | buf->subbufs_consumed = 0; | 363 | buf->subbufs_consumed = 0; |
| @@ -382,20 +385,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) | |||
| 382 | */ | 385 | */ |
| 383 | void relay_reset(struct rchan *chan) | 386 | void relay_reset(struct rchan *chan) |
| 384 | { | 387 | { |
| 388 | struct rchan_buf *buf; | ||
| 385 | unsigned int i; | 389 | unsigned int i; |
| 386 | 390 | ||
| 387 | if (!chan) | 391 | if (!chan) |
| 388 | return; | 392 | return; |
| 389 | 393 | ||
| 390 | if (chan->is_global && chan->buf[0]) { | 394 | if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { |
| 391 | __relay_reset(chan->buf[0], 0); | 395 | __relay_reset(buf, 0); |
| 392 | return; | 396 | return; |
| 393 | } | 397 | } |
| 394 | 398 | ||
| 395 | mutex_lock(&relay_channels_mutex); | 399 | mutex_lock(&relay_channels_mutex); |
| 396 | for_each_possible_cpu(i) | 400 | for_each_possible_cpu(i) |
| 397 | if (chan->buf[i]) | 401 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
| 398 | __relay_reset(chan->buf[i], 0); | 402 | __relay_reset(buf, 0); |
| 399 | mutex_unlock(&relay_channels_mutex); | 403 | mutex_unlock(&relay_channels_mutex); |
| 400 | } | 404 | } |
| 401 | EXPORT_SYMBOL_GPL(relay_reset); | 405 | EXPORT_SYMBOL_GPL(relay_reset); |
| @@ -440,7 +444,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) | |||
| 440 | struct dentry *dentry; | 444 | struct dentry *dentry; |
| 441 | 445 | ||
| 442 | if (chan->is_global) | 446 | if (chan->is_global) |
| 443 | return chan->buf[0]; | 447 | return *per_cpu_ptr(chan->buf, 0); |
| 444 | 448 | ||
| 445 | buf = relay_create_buf(chan); | 449 | buf = relay_create_buf(chan); |
| 446 | if (!buf) | 450 | if (!buf) |
| @@ -464,7 +468,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) | |||
| 464 | __relay_reset(buf, 1); | 468 | __relay_reset(buf, 1); |
| 465 | 469 | ||
| 466 | if(chan->is_global) { | 470 | if(chan->is_global) { |
| 467 | chan->buf[0] = buf; | 471 | *per_cpu_ptr(chan->buf, 0) = buf; |
| 468 | buf->cpu = 0; | 472 | buf->cpu = 0; |
| 469 | } | 473 | } |
| 470 | 474 | ||
| @@ -486,7 +490,7 @@ free_buf: | |||
| 486 | static void relay_close_buf(struct rchan_buf *buf) | 490 | static void relay_close_buf(struct rchan_buf *buf) |
| 487 | { | 491 | { |
| 488 | buf->finalized = 1; | 492 | buf->finalized = 1; |
| 489 | del_timer_sync(&buf->timer); | 493 | irq_work_sync(&buf->wakeup_work); |
| 490 | buf->chan->cb->remove_buf_file(buf->dentry); | 494 | buf->chan->cb->remove_buf_file(buf->dentry); |
| 491 | kref_put(&buf->kref, relay_remove_buf); | 495 | kref_put(&buf->kref, relay_remove_buf); |
| 492 | } | 496 | } |
| @@ -512,46 +516,25 @@ static void setup_callbacks(struct rchan *chan, | |||
| 512 | chan->cb = cb; | 516 | chan->cb = cb; |
| 513 | } | 517 | } |
| 514 | 518 | ||
| 515 | /** | 519 | int relay_prepare_cpu(unsigned int cpu) |
| 516 | * relay_hotcpu_callback - CPU hotplug callback | ||
| 517 | * @nb: notifier block | ||
| 518 | * @action: hotplug action to take | ||
| 519 | * @hcpu: CPU number | ||
| 520 | * | ||
| 521 | * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) | ||
| 522 | */ | ||
| 523 | static int relay_hotcpu_callback(struct notifier_block *nb, | ||
| 524 | unsigned long action, | ||
| 525 | void *hcpu) | ||
| 526 | { | 520 | { |
| 527 | unsigned int hotcpu = (unsigned long)hcpu; | ||
| 528 | struct rchan *chan; | 521 | struct rchan *chan; |
| 522 | struct rchan_buf *buf; | ||
| 529 | 523 | ||
| 530 | switch(action) { | 524 | mutex_lock(&relay_channels_mutex); |
| 531 | case CPU_UP_PREPARE: | 525 | list_for_each_entry(chan, &relay_channels, list) { |
| 532 | case CPU_UP_PREPARE_FROZEN: | 526 | if ((buf = *per_cpu_ptr(chan->buf, cpu))) |
| 533 | mutex_lock(&relay_channels_mutex); | 527 | continue; |
| 534 | list_for_each_entry(chan, &relay_channels, list) { | 528 | buf = relay_open_buf(chan, cpu); |
| 535 | if (chan->buf[hotcpu]) | 529 | if (!buf) { |
| 536 | continue; | 530 | pr_err("relay: cpu %d buffer creation failed\n", cpu); |
| 537 | chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); | 531 | mutex_unlock(&relay_channels_mutex); |
| 538 | if(!chan->buf[hotcpu]) { | 532 | return -ENOMEM; |
| 539 | printk(KERN_ERR | ||
| 540 | "relay_hotcpu_callback: cpu %d buffer " | ||
| 541 | "creation failed\n", hotcpu); | ||
| 542 | mutex_unlock(&relay_channels_mutex); | ||
| 543 | return notifier_from_errno(-ENOMEM); | ||
| 544 | } | ||
| 545 | } | 533 | } |
| 546 | mutex_unlock(&relay_channels_mutex); | 534 | *per_cpu_ptr(chan->buf, cpu) = buf; |
| 547 | break; | ||
| 548 | case CPU_DEAD: | ||
| 549 | case CPU_DEAD_FROZEN: | ||
| 550 | /* No need to flush the cpu : will be flushed upon | ||
| 551 | * final relay_flush() call. */ | ||
| 552 | break; | ||
| 553 | } | 535 | } |
| 554 | return NOTIFY_OK; | 536 | mutex_unlock(&relay_channels_mutex); |
| 537 | return 0; | ||
| 555 | } | 538 | } |
| 556 | 539 | ||
| 557 | /** | 540 | /** |
| @@ -583,6 +566,7 @@ struct rchan *relay_open(const char *base_filename, | |||
| 583 | { | 566 | { |
| 584 | unsigned int i; | 567 | unsigned int i; |
| 585 | struct rchan *chan; | 568 | struct rchan *chan; |
| 569 | struct rchan_buf *buf; | ||
| 586 | 570 | ||
| 587 | if (!(subbuf_size && n_subbufs)) | 571 | if (!(subbuf_size && n_subbufs)) |
| 588 | return NULL; | 572 | return NULL; |
| @@ -593,6 +577,7 @@ struct rchan *relay_open(const char *base_filename, | |||
| 593 | if (!chan) | 577 | if (!chan) |
| 594 | return NULL; | 578 | return NULL; |
| 595 | 579 | ||
| 580 | chan->buf = alloc_percpu(struct rchan_buf *); | ||
| 596 | chan->version = RELAYFS_CHANNEL_VERSION; | 581 | chan->version = RELAYFS_CHANNEL_VERSION; |
| 597 | chan->n_subbufs = n_subbufs; | 582 | chan->n_subbufs = n_subbufs; |
| 598 | chan->subbuf_size = subbuf_size; | 583 | chan->subbuf_size = subbuf_size; |
| @@ -608,9 +593,10 @@ struct rchan *relay_open(const char *base_filename, | |||
| 608 | 593 | ||
| 609 | mutex_lock(&relay_channels_mutex); | 594 | mutex_lock(&relay_channels_mutex); |
| 610 | for_each_online_cpu(i) { | 595 | for_each_online_cpu(i) { |
| 611 | chan->buf[i] = relay_open_buf(chan, i); | 596 | buf = relay_open_buf(chan, i); |
| 612 | if (!chan->buf[i]) | 597 | if (!buf) |
| 613 | goto free_bufs; | 598 | goto free_bufs; |
| 599 | *per_cpu_ptr(chan->buf, i) = buf; | ||
| 614 | } | 600 | } |
| 615 | list_add(&chan->list, &relay_channels); | 601 | list_add(&chan->list, &relay_channels); |
| 616 | mutex_unlock(&relay_channels_mutex); | 602 | mutex_unlock(&relay_channels_mutex); |
| @@ -619,8 +605,8 @@ struct rchan *relay_open(const char *base_filename, | |||
| 619 | 605 | ||
| 620 | free_bufs: | 606 | free_bufs: |
| 621 | for_each_possible_cpu(i) { | 607 | for_each_possible_cpu(i) { |
| 622 | if (chan->buf[i]) | 608 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
| 623 | relay_close_buf(chan->buf[i]); | 609 | relay_close_buf(buf); |
| 624 | } | 610 | } |
| 625 | 611 | ||
| 626 | kref_put(&chan->kref, relay_destroy_channel); | 612 | kref_put(&chan->kref, relay_destroy_channel); |
| @@ -666,6 +652,7 @@ int relay_late_setup_files(struct rchan *chan, | |||
| 666 | unsigned int i, curr_cpu; | 652 | unsigned int i, curr_cpu; |
| 667 | unsigned long flags; | 653 | unsigned long flags; |
| 668 | struct dentry *dentry; | 654 | struct dentry *dentry; |
| 655 | struct rchan_buf *buf; | ||
| 669 | struct rchan_percpu_buf_dispatcher disp; | 656 | struct rchan_percpu_buf_dispatcher disp; |
| 670 | 657 | ||
| 671 | if (!chan || !base_filename) | 658 | if (!chan || !base_filename) |
| @@ -684,10 +671,11 @@ int relay_late_setup_files(struct rchan *chan, | |||
| 684 | 671 | ||
| 685 | if (chan->is_global) { | 672 | if (chan->is_global) { |
| 686 | err = -EINVAL; | 673 | err = -EINVAL; |
| 687 | if (!WARN_ON_ONCE(!chan->buf[0])) { | 674 | buf = *per_cpu_ptr(chan->buf, 0); |
| 688 | dentry = relay_create_buf_file(chan, chan->buf[0], 0); | 675 | if (!WARN_ON_ONCE(!buf)) { |
| 676 | dentry = relay_create_buf_file(chan, buf, 0); | ||
| 689 | if (dentry && !WARN_ON_ONCE(!chan->is_global)) { | 677 | if (dentry && !WARN_ON_ONCE(!chan->is_global)) { |
| 690 | relay_set_buf_dentry(chan->buf[0], dentry); | 678 | relay_set_buf_dentry(buf, dentry); |
| 691 | err = 0; | 679 | err = 0; |
| 692 | } | 680 | } |
| 693 | } | 681 | } |
| @@ -702,13 +690,14 @@ int relay_late_setup_files(struct rchan *chan, | |||
| 702 | * on all currently online CPUs. | 690 | * on all currently online CPUs. |
| 703 | */ | 691 | */ |
| 704 | for_each_online_cpu(i) { | 692 | for_each_online_cpu(i) { |
| 705 | if (unlikely(!chan->buf[i])) { | 693 | buf = *per_cpu_ptr(chan->buf, i); |
| 694 | if (unlikely(!buf)) { | ||
| 706 | WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); | 695 | WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); |
| 707 | err = -EINVAL; | 696 | err = -EINVAL; |
| 708 | break; | 697 | break; |
| 709 | } | 698 | } |
| 710 | 699 | ||
| 711 | dentry = relay_create_buf_file(chan, chan->buf[i], i); | 700 | dentry = relay_create_buf_file(chan, buf, i); |
| 712 | if (unlikely(!dentry)) { | 701 | if (unlikely(!dentry)) { |
| 713 | err = -EINVAL; | 702 | err = -EINVAL; |
| 714 | break; | 703 | break; |
| @@ -716,10 +705,10 @@ int relay_late_setup_files(struct rchan *chan, | |||
| 716 | 705 | ||
| 717 | if (curr_cpu == i) { | 706 | if (curr_cpu == i) { |
| 718 | local_irq_save(flags); | 707 | local_irq_save(flags); |
| 719 | relay_set_buf_dentry(chan->buf[i], dentry); | 708 | relay_set_buf_dentry(buf, dentry); |
| 720 | local_irq_restore(flags); | 709 | local_irq_restore(flags); |
| 721 | } else { | 710 | } else { |
| 722 | disp.buf = chan->buf[i]; | 711 | disp.buf = buf; |
| 723 | disp.dentry = dentry; | 712 | disp.dentry = dentry; |
| 724 | smp_mb(); | 713 | smp_mb(); |
| 725 | /* relay_channels_mutex must be held, so wait. */ | 714 | /* relay_channels_mutex must be held, so wait. */ |
| @@ -768,14 +757,15 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) | |||
| 768 | buf->early_bytes += buf->chan->subbuf_size - | 757 | buf->early_bytes += buf->chan->subbuf_size - |
| 769 | buf->padding[old_subbuf]; | 758 | buf->padding[old_subbuf]; |
| 770 | smp_mb(); | 759 | smp_mb(); |
| 771 | if (waitqueue_active(&buf->read_wait)) | 760 | if (waitqueue_active(&buf->read_wait)) { |
| 772 | /* | 761 | /* |
| 773 | * Calling wake_up_interruptible() from here | 762 | * Calling wake_up_interruptible() from here |
| 774 | * will deadlock if we happen to be logging | 763 | * will deadlock if we happen to be logging |
| 775 | * from the scheduler (trying to re-grab | 764 | * from the scheduler (trying to re-grab |
| 776 | * rq->lock), so defer it. | 765 | * rq->lock), so defer it. |
| 777 | */ | 766 | */ |
| 778 | mod_timer(&buf->timer, jiffies + 1); | 767 | irq_work_queue(&buf->wakeup_work); |
| 768 | } | ||
| 779 | } | 769 | } |
| 780 | 770 | ||
| 781 | old = buf->data; | 771 | old = buf->data; |
| @@ -822,11 +812,10 @@ void relay_subbufs_consumed(struct rchan *chan, | |||
| 822 | if (!chan) | 812 | if (!chan) |
| 823 | return; | 813 | return; |
| 824 | 814 | ||
| 825 | if (cpu >= NR_CPUS || !chan->buf[cpu] || | 815 | buf = *per_cpu_ptr(chan->buf, cpu); |
| 826 | subbufs_consumed > chan->n_subbufs) | 816 | if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) |
| 827 | return; | 817 | return; |
| 828 | 818 | ||
| 829 | buf = chan->buf[cpu]; | ||
| 830 | if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) | 819 | if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) |
| 831 | buf->subbufs_consumed = buf->subbufs_produced; | 820 | buf->subbufs_consumed = buf->subbufs_produced; |
| 832 | else | 821 | else |
| @@ -842,18 +831,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed); | |||
| 842 | */ | 831 | */ |
| 843 | void relay_close(struct rchan *chan) | 832 | void relay_close(struct rchan *chan) |
| 844 | { | 833 | { |
| 834 | struct rchan_buf *buf; | ||
| 845 | unsigned int i; | 835 | unsigned int i; |
| 846 | 836 | ||
| 847 | if (!chan) | 837 | if (!chan) |
| 848 | return; | 838 | return; |
| 849 | 839 | ||
| 850 | mutex_lock(&relay_channels_mutex); | 840 | mutex_lock(&relay_channels_mutex); |
| 851 | if (chan->is_global && chan->buf[0]) | 841 | if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) |
| 852 | relay_close_buf(chan->buf[0]); | 842 | relay_close_buf(buf); |
| 853 | else | 843 | else |
| 854 | for_each_possible_cpu(i) | 844 | for_each_possible_cpu(i) |
| 855 | if (chan->buf[i]) | 845 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
| 856 | relay_close_buf(chan->buf[i]); | 846 | relay_close_buf(buf); |
| 857 | 847 | ||
| 858 | if (chan->last_toobig) | 848 | if (chan->last_toobig) |
| 859 | printk(KERN_WARNING "relay: one or more items not logged " | 849 | printk(KERN_WARNING "relay: one or more items not logged " |
| @@ -874,20 +864,21 @@ EXPORT_SYMBOL_GPL(relay_close); | |||
| 874 | */ | 864 | */ |
| 875 | void relay_flush(struct rchan *chan) | 865 | void relay_flush(struct rchan *chan) |
| 876 | { | 866 | { |
| 867 | struct rchan_buf *buf; | ||
| 877 | unsigned int i; | 868 | unsigned int i; |
| 878 | 869 | ||
| 879 | if (!chan) | 870 | if (!chan) |
| 880 | return; | 871 | return; |
| 881 | 872 | ||
| 882 | if (chan->is_global && chan->buf[0]) { | 873 | if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { |
| 883 | relay_switch_subbuf(chan->buf[0], 0); | 874 | relay_switch_subbuf(buf, 0); |
| 884 | return; | 875 | return; |
| 885 | } | 876 | } |
| 886 | 877 | ||
| 887 | mutex_lock(&relay_channels_mutex); | 878 | mutex_lock(&relay_channels_mutex); |
| 888 | for_each_possible_cpu(i) | 879 | for_each_possible_cpu(i) |
| 889 | if (chan->buf[i]) | 880 | if ((buf = *per_cpu_ptr(chan->buf, i))) |
| 890 | relay_switch_subbuf(chan->buf[i], 0); | 881 | relay_switch_subbuf(buf, 0); |
| 891 | mutex_unlock(&relay_channels_mutex); | 882 | mutex_unlock(&relay_channels_mutex); |
| 892 | } | 883 | } |
| 893 | EXPORT_SYMBOL_GPL(relay_flush); | 884 | EXPORT_SYMBOL_GPL(relay_flush); |
| @@ -1121,51 +1112,23 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, | |||
| 1121 | return end_pos; | 1112 | return end_pos; |
| 1122 | } | 1113 | } |
| 1123 | 1114 | ||
| 1124 | /* | 1115 | static ssize_t relay_file_read(struct file *filp, |
| 1125 | * subbuf_read_actor - read up to one subbuf's worth of data | 1116 | char __user *buffer, |
| 1126 | */ | 1117 | size_t count, |
| 1127 | static int subbuf_read_actor(size_t read_start, | 1118 | loff_t *ppos) |
| 1128 | struct rchan_buf *buf, | ||
| 1129 | size_t avail, | ||
| 1130 | read_descriptor_t *desc) | ||
| 1131 | { | ||
| 1132 | void *from; | ||
| 1133 | int ret = 0; | ||
| 1134 | |||
| 1135 | from = buf->start + read_start; | ||
| 1136 | ret = avail; | ||
| 1137 | if (copy_to_user(desc->arg.buf, from, avail)) { | ||
| 1138 | desc->error = -EFAULT; | ||
| 1139 | ret = 0; | ||
| 1140 | } | ||
| 1141 | desc->arg.data += ret; | ||
| 1142 | desc->written += ret; | ||
| 1143 | desc->count -= ret; | ||
| 1144 | |||
| 1145 | return ret; | ||
| 1146 | } | ||
| 1147 | |||
| 1148 | typedef int (*subbuf_actor_t) (size_t read_start, | ||
| 1149 | struct rchan_buf *buf, | ||
| 1150 | size_t avail, | ||
| 1151 | read_descriptor_t *desc); | ||
| 1152 | |||
| 1153 | /* | ||
| 1154 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries | ||
| 1155 | */ | ||
| 1156 | static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | ||
| 1157 | subbuf_actor_t subbuf_actor, | ||
| 1158 | read_descriptor_t *desc) | ||
| 1159 | { | 1119 | { |
| 1160 | struct rchan_buf *buf = filp->private_data; | 1120 | struct rchan_buf *buf = filp->private_data; |
| 1161 | size_t read_start, avail; | 1121 | size_t read_start, avail; |
| 1122 | size_t written = 0; | ||
| 1162 | int ret; | 1123 | int ret; |
| 1163 | 1124 | ||
| 1164 | if (!desc->count) | 1125 | if (!count) |
| 1165 | return 0; | 1126 | return 0; |
| 1166 | 1127 | ||
| 1167 | inode_lock(file_inode(filp)); | 1128 | inode_lock(file_inode(filp)); |
| 1168 | do { | 1129 | do { |
| 1130 | void *from; | ||
| 1131 | |||
| 1169 | if (!relay_file_read_avail(buf, *ppos)) | 1132 | if (!relay_file_read_avail(buf, *ppos)) |
| 1170 | break; | 1133 | break; |
| 1171 | 1134 | ||
| @@ -1174,32 +1137,22 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | |||
| 1174 | if (!avail) | 1137 | if (!avail) |
| 1175 | break; | 1138 | break; |
| 1176 | 1139 | ||
| 1177 | avail = min(desc->count, avail); | 1140 | avail = min(count, avail); |
| 1178 | ret = subbuf_actor(read_start, buf, avail, desc); | 1141 | from = buf->start + read_start; |
| 1179 | if (desc->error < 0) | 1142 | ret = avail; |
| 1143 | if (copy_to_user(buffer, from, avail)) | ||
| 1180 | break; | 1144 | break; |
| 1181 | 1145 | ||
| 1182 | if (ret) { | 1146 | buffer += ret; |
| 1183 | relay_file_read_consume(buf, read_start, ret); | 1147 | written += ret; |
| 1184 | *ppos = relay_file_read_end_pos(buf, read_start, ret); | 1148 | count -= ret; |
| 1185 | } | ||
| 1186 | } while (desc->count && ret); | ||
| 1187 | inode_unlock(file_inode(filp)); | ||
| 1188 | 1149 | ||
| 1189 | return desc->written; | 1150 | relay_file_read_consume(buf, read_start, ret); |
| 1190 | } | 1151 | *ppos = relay_file_read_end_pos(buf, read_start, ret); |
| 1152 | } while (count); | ||
| 1153 | inode_unlock(file_inode(filp)); | ||
| 1191 | 1154 | ||
| 1192 | static ssize_t relay_file_read(struct file *filp, | 1155 | return written; |
| 1193 | char __user *buffer, | ||
| 1194 | size_t count, | ||
| 1195 | loff_t *ppos) | ||
| 1196 | { | ||
| 1197 | read_descriptor_t desc; | ||
| 1198 | desc.written = 0; | ||
| 1199 | desc.count = count; | ||
| 1200 | desc.arg.buf = buffer; | ||
| 1201 | desc.error = 0; | ||
| 1202 | return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc); | ||
| 1203 | } | 1156 | } |
| 1204 | 1157 | ||
| 1205 | static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) | 1158 | static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) |
| @@ -1377,12 +1330,3 @@ const struct file_operations relay_file_operations = { | |||
| 1377 | .splice_read = relay_file_splice_read, | 1330 | .splice_read = relay_file_splice_read, |
| 1378 | }; | 1331 | }; |
| 1379 | EXPORT_SYMBOL_GPL(relay_file_operations); | 1332 | EXPORT_SYMBOL_GPL(relay_file_operations); |
| 1380 | |||
| 1381 | static __init int relay_init(void) | ||
| 1382 | { | ||
| 1383 | |||
| 1384 | hotcpu_notifier(relay_hotcpu_callback, 0); | ||
| 1385 | return 0; | ||
| 1386 | } | ||
| 1387 | |||
| 1388 | early_initcall(relay_init); | ||
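The relay.c conversion above replaces the NR_CPUS-sized chan->buf[] array with a pointer allocated by alloc_percpu() and accessed through per_cpu_ptr(), and swaps the deferred-wakeup timer for irq_work. A hedged sketch of the per-CPU pointer-slot idiom follows, with my_chan/my_buf as hypothetical stand-ins (the real code keeps this inside struct rchan):

#include <linux/percpu.h>
#include <linux/errno.h>

struct my_buf {
        unsigned int cpu;
};

struct my_chan {
        struct my_buf * __percpu *buf;  /* one pointer slot per CPU */
};

static int my_chan_init(struct my_chan *chan)
{
        chan->buf = alloc_percpu(struct my_buf *);
        return chan->buf ? 0 : -ENOMEM;
}

static void my_chan_set(struct my_chan *chan, unsigned int cpu, struct my_buf *buf)
{
        *per_cpu_ptr(chan->buf, cpu) = buf;     /* install that CPU's buffer */
}

static struct my_buf *my_chan_get(struct my_chan *chan, unsigned int cpu)
{
        return *per_cpu_ptr(chan->buf, cpu);    /* NULL until installed */
}

Compared with a fixed array indexed up to NR_CPUS, the per-CPU allocation sizes itself to the CPUs actually present, which appears to be the point of the conversion.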
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a906f20fba7..94732d1ab00a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -581,6 +581,8 @@ static bool wake_up_full_nohz_cpu(int cpu) | |||
| 581 | * If needed we can still optimize that later with an | 581 | * If needed we can still optimize that later with an |
| 582 | * empty IRQ. | 582 | * empty IRQ. |
| 583 | */ | 583 | */ |
| 584 | if (cpu_is_offline(cpu)) | ||
| 585 | return true; /* Don't try to wake offline CPUs. */ | ||
| 584 | if (tick_nohz_full_cpu(cpu)) { | 586 | if (tick_nohz_full_cpu(cpu)) { |
| 585 | if (cpu != smp_processor_id() || | 587 | if (cpu != smp_processor_id() || |
| 586 | tick_nohz_tick_stopped()) | 588 | tick_nohz_tick_stopped()) |
| @@ -591,6 +593,11 @@ static bool wake_up_full_nohz_cpu(int cpu) | |||
| 591 | return false; | 593 | return false; |
| 592 | } | 594 | } |
| 593 | 595 | ||
| 596 | /* | ||
| 597 | * Wake up the specified CPU. If the CPU is going offline, it is the | ||
| 598 | * caller's responsibility to deal with the lost wakeup, for example, | ||
| 599 | * by hooking into the CPU_DEAD notifier like timers and hrtimers do. | ||
| 600 | */ | ||
| 594 | void wake_up_nohz_cpu(int cpu) | 601 | void wake_up_nohz_cpu(int cpu) |
| 595 | { | 602 | { |
| 596 | if (!wake_up_full_nohz_cpu(cpu)) | 603 | if (!wake_up_full_nohz_cpu(cpu)) |
| @@ -1063,8 +1070,12 @@ static int migration_cpu_stop(void *data) | |||
| 1063 | * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because | 1070 | * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because |
| 1064 | * we're holding p->pi_lock. | 1071 | * we're holding p->pi_lock. |
| 1065 | */ | 1072 | */ |
| 1066 | if (task_rq(p) == rq && task_on_rq_queued(p)) | 1073 | if (task_rq(p) == rq) { |
| 1067 | rq = __migrate_task(rq, p, arg->dest_cpu); | 1074 | if (task_on_rq_queued(p)) |
| 1075 | rq = __migrate_task(rq, p, arg->dest_cpu); | ||
| 1076 | else | ||
| 1077 | p->wake_cpu = arg->dest_cpu; | ||
| 1078 | } | ||
| 1068 | raw_spin_unlock(&rq->lock); | 1079 | raw_spin_unlock(&rq->lock); |
| 1069 | raw_spin_unlock(&p->pi_lock); | 1080 | raw_spin_unlock(&p->pi_lock); |
| 1070 | 1081 | ||
| @@ -1105,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
| 1105 | 1116 | ||
| 1106 | p->sched_class->set_cpus_allowed(p, new_mask); | 1117 | p->sched_class->set_cpus_allowed(p, new_mask); |
| 1107 | 1118 | ||
| 1108 | if (running) | ||
| 1109 | p->sched_class->set_curr_task(rq); | ||
| 1110 | if (queued) | 1119 | if (queued) |
| 1111 | enqueue_task(rq, p, ENQUEUE_RESTORE); | 1120 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
| 1121 | if (running) | ||
| 1122 | set_curr_task(rq, p); | ||
| 1112 | } | 1123 | } |
| 1113 | 1124 | ||
| 1114 | /* | 1125 | /* |
| @@ -1265,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) | |||
| 1265 | /* | 1276 | /* |
| 1266 | * Task isn't running anymore; make it appear like we migrated | 1277 | * Task isn't running anymore; make it appear like we migrated |
| 1267 | * it before it went to sleep. This means on wakeup we make the | 1278 | * it before it went to sleep. This means on wakeup we make the |
| 1268 | * previous cpu our targer instead of where it really is. | 1279 | * previous cpu our target instead of where it really is. |
| 1269 | */ | 1280 | */ |
| 1270 | p->wake_cpu = cpu; | 1281 | p->wake_cpu = cpu; |
| 1271 | } | 1282 | } |
| @@ -1629,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, | |||
| 1629 | static void | 1640 | static void |
| 1630 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | 1641 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
| 1631 | { | 1642 | { |
| 1632 | #ifdef CONFIG_SCHEDSTATS | 1643 | struct rq *rq; |
| 1633 | struct rq *rq = this_rq(); | ||
| 1634 | 1644 | ||
| 1635 | #ifdef CONFIG_SMP | 1645 | if (!schedstat_enabled()) |
| 1636 | int this_cpu = smp_processor_id(); | 1646 | return; |
| 1637 | 1647 | ||
| 1638 | if (cpu == this_cpu) { | 1648 | rq = this_rq(); |
| 1639 | schedstat_inc(rq, ttwu_local); | 1649 | |
| 1640 | schedstat_inc(p, se.statistics.nr_wakeups_local); | 1650 | #ifdef CONFIG_SMP |
| 1651 | if (cpu == rq->cpu) { | ||
| 1652 | schedstat_inc(rq->ttwu_local); | ||
| 1653 | schedstat_inc(p->se.statistics.nr_wakeups_local); | ||
| 1641 | } else { | 1654 | } else { |
| 1642 | struct sched_domain *sd; | 1655 | struct sched_domain *sd; |
| 1643 | 1656 | ||
| 1644 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | 1657 | schedstat_inc(p->se.statistics.nr_wakeups_remote); |
| 1645 | rcu_read_lock(); | 1658 | rcu_read_lock(); |
| 1646 | for_each_domain(this_cpu, sd) { | 1659 | for_each_domain(rq->cpu, sd) { |
| 1647 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 1660 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
| 1648 | schedstat_inc(sd, ttwu_wake_remote); | 1661 | schedstat_inc(sd->ttwu_wake_remote); |
| 1649 | break; | 1662 | break; |
| 1650 | } | 1663 | } |
| 1651 | } | 1664 | } |
| @@ -1653,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | |||
| 1653 | } | 1666 | } |
| 1654 | 1667 | ||
| 1655 | if (wake_flags & WF_MIGRATED) | 1668 | if (wake_flags & WF_MIGRATED) |
| 1656 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 1669 | schedstat_inc(p->se.statistics.nr_wakeups_migrate); |
| 1657 | |||
| 1658 | #endif /* CONFIG_SMP */ | 1670 | #endif /* CONFIG_SMP */ |
| 1659 | 1671 | ||
| 1660 | schedstat_inc(rq, ttwu_count); | 1672 | schedstat_inc(rq->ttwu_count); |
| 1661 | schedstat_inc(p, se.statistics.nr_wakeups); | 1673 | schedstat_inc(p->se.statistics.nr_wakeups); |
| 1662 | 1674 | ||
| 1663 | if (wake_flags & WF_SYNC) | 1675 | if (wake_flags & WF_SYNC) |
| 1664 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 1676 | schedstat_inc(p->se.statistics.nr_wakeups_sync); |
| 1665 | |||
| 1666 | #endif /* CONFIG_SCHEDSTATS */ | ||
| 1667 | } | 1677 | } |
| 1668 | 1678 | ||
| 1669 | static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | 1679 | static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
| @@ -2016,6 +2026,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 2016 | success = 1; /* we're going to change ->state */ | 2026 | success = 1; /* we're going to change ->state */ |
| 2017 | cpu = task_cpu(p); | 2027 | cpu = task_cpu(p); |
| 2018 | 2028 | ||
| 2029 | /* | ||
| 2030 | * Ensure we load p->on_rq _after_ p->state, otherwise it would | ||
| 2031 | * be possible to, falsely, observe p->on_rq == 0 and get stuck | ||
| 2032 | * in smp_cond_load_acquire() below. | ||
| 2033 | * | ||
| 2034 | * sched_ttwu_pending() try_to_wake_up() | ||
| 2035 | * [S] p->on_rq = 1; [L] p->state | ||
| 2036 | * UNLOCK rq->lock -----. | ||
| 2037 | * \ | ||
| 2038 | * +--- RMB | ||
| 2039 | * schedule() / | ||
| 2040 | * LOCK rq->lock -----' | ||
| 2041 | * UNLOCK rq->lock | ||
| 2042 | * | ||
| 2043 | * [task p] | ||
| 2044 | * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq | ||
| 2045 | * | ||
| 2046 | * Pairs with the UNLOCK+LOCK on rq->lock from the | ||
| 2047 | * last wakeup of our task and the schedule that got our task | ||
| 2048 | * current. | ||
| 2049 | */ | ||
| 2050 | smp_rmb(); | ||
| 2019 | if (p->on_rq && ttwu_remote(p, wake_flags)) | 2051 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
| 2020 | goto stat; | 2052 | goto stat; |
| 2021 | 2053 | ||
| @@ -2062,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 2062 | 2094 | ||
| 2063 | ttwu_queue(p, cpu, wake_flags); | 2095 | ttwu_queue(p, cpu, wake_flags); |
| 2064 | stat: | 2096 | stat: |
| 2065 | if (schedstat_enabled()) | 2097 | ttwu_stat(p, cpu, wake_flags); |
| 2066 | ttwu_stat(p, cpu, wake_flags); | ||
| 2067 | out: | 2098 | out: |
| 2068 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2099 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 2069 | 2100 | ||
| @@ -2073,6 +2104,7 @@ out: | |||
| 2073 | /** | 2104 | /** |
| 2074 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2105 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
| 2075 | * @p: the thread to be awakened | 2106 | * @p: the thread to be awakened |
| 2107 | * @cookie: context's cookie for pinning | ||
| 2076 | * | 2108 | * |
| 2077 | * Put @p on the run-queue if it's not already there. The caller must | 2109 | * Put @p on the run-queue if it's not already there. The caller must |
| 2078 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2110 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
| @@ -2111,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie | |||
| 2111 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2143 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
| 2112 | 2144 | ||
| 2113 | ttwu_do_wakeup(rq, p, 0, cookie); | 2145 | ttwu_do_wakeup(rq, p, 0, cookie); |
| 2114 | if (schedstat_enabled()) | 2146 | ttwu_stat(p, smp_processor_id(), 0); |
| 2115 | ttwu_stat(p, smp_processor_id(), 0); | ||
| 2116 | out: | 2147 | out: |
| 2117 | raw_spin_unlock(&p->pi_lock); | 2148 | raw_spin_unlock(&p->pi_lock); |
| 2118 | } | 2149 | } |
| @@ -2750,6 +2781,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
| 2750 | * task and put them back on the free list. | 2781 | * task and put them back on the free list. |
| 2751 | */ | 2782 | */ |
| 2752 | kprobe_flush_task(prev); | 2783 | kprobe_flush_task(prev); |
| 2784 | |||
| 2785 | /* Task is done with its stack. */ | ||
| 2786 | put_task_stack(prev); | ||
| 2787 | |||
| 2753 | put_task_struct(prev); | 2788 | put_task_struct(prev); |
| 2754 | } | 2789 | } |
| 2755 | 2790 | ||
| @@ -3170,6 +3205,9 @@ static inline void preempt_latency_stop(int val) { } | |||
| 3170 | */ | 3205 | */ |
| 3171 | static noinline void __schedule_bug(struct task_struct *prev) | 3206 | static noinline void __schedule_bug(struct task_struct *prev) |
| 3172 | { | 3207 | { |
| 3208 | /* Save this before calling printk(), since that will clobber it */ | ||
| 3209 | unsigned long preempt_disable_ip = get_preempt_disable_ip(current); | ||
| 3210 | |||
| 3173 | if (oops_in_progress) | 3211 | if (oops_in_progress) |
| 3174 | return; | 3212 | return; |
| 3175 | 3213 | ||
| @@ -3180,13 +3218,12 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
| 3180 | print_modules(); | 3218 | print_modules(); |
| 3181 | if (irqs_disabled()) | 3219 | if (irqs_disabled()) |
| 3182 | print_irqtrace_events(prev); | 3220 | print_irqtrace_events(prev); |
| 3183 | #ifdef CONFIG_DEBUG_PREEMPT | 3221 | if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) |
| 3184 | if (in_atomic_preempt_off()) { | 3222 | && in_atomic_preempt_off()) { |
| 3185 | pr_err("Preemption disabled at:"); | 3223 | pr_err("Preemption disabled at:"); |
| 3186 | print_ip_sym(current->preempt_disable_ip); | 3224 | print_ip_sym(preempt_disable_ip); |
| 3187 | pr_cont("\n"); | 3225 | pr_cont("\n"); |
| 3188 | } | 3226 | } |
| 3189 | #endif | ||
| 3190 | if (panic_on_warn) | 3227 | if (panic_on_warn) |
| 3191 | panic("scheduling while atomic\n"); | 3228 | panic("scheduling while atomic\n"); |
| 3192 | 3229 | ||
| @@ -3212,7 +3249,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 3212 | 3249 | ||
| 3213 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3250 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
| 3214 | 3251 | ||
| 3215 | schedstat_inc(this_rq(), sched_count); | 3252 | schedstat_inc(this_rq()->sched_count); |
| 3216 | } | 3253 | } |
| 3217 | 3254 | ||
| 3218 | /* | 3255 | /* |
| @@ -3305,17 +3342,6 @@ static void __sched notrace __schedule(bool preempt) | |||
| 3305 | rq = cpu_rq(cpu); | 3342 | rq = cpu_rq(cpu); |
| 3306 | prev = rq->curr; | 3343 | prev = rq->curr; |
| 3307 | 3344 | ||
| 3308 | /* | ||
| 3309 | * do_exit() calls schedule() with preemption disabled as an exception; | ||
| 3310 | * however we must fix that up, otherwise the next task will see an | ||
| 3311 | * inconsistent (higher) preempt count. | ||
| 3312 | * | ||
| 3313 | * It also avoids the below schedule_debug() test from complaining | ||
| 3314 | * about this. | ||
| 3315 | */ | ||
| 3316 | if (unlikely(prev->state == TASK_DEAD)) | ||
| 3317 | preempt_enable_no_resched_notrace(); | ||
| 3318 | |||
| 3319 | schedule_debug(prev); | 3345 | schedule_debug(prev); |
| 3320 | 3346 | ||
| 3321 | if (sched_feat(HRTICK)) | 3347 | if (sched_feat(HRTICK)) |
| @@ -3381,7 +3407,33 @@ static void __sched notrace __schedule(bool preempt) | |||
| 3381 | 3407 | ||
| 3382 | balance_callback(rq); | 3408 | balance_callback(rq); |
| 3383 | } | 3409 | } |
| 3384 | STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ | 3410 | |
| 3411 | void __noreturn do_task_dead(void) | ||
| 3412 | { | ||
| 3413 | /* | ||
| 3414 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed | ||
| 3415 | * when the following two conditions become true. | ||
| 3416 | * - There is a race condition on mmap_sem (it is acquired by | ||
| 3417 | * exit_mm()), and | ||
| 3418 | * - SMI occurs before setting TASK_RUNNING. | ||
| 3419 | * (or hypervisor of virtual machine switches to other guest) | ||
| 3420 | * As a result, we may become TASK_RUNNING after becoming TASK_DEAD | ||
| 3421 | * | ||
| 3422 | * To avoid it, we have to wait until tsk->pi_lock, which is held | ||
| 3423 | * by try_to_wake_up(), is released. | ||
| 3424 | */ | ||
| 3425 | smp_mb(); | ||
| 3426 | raw_spin_unlock_wait(&current->pi_lock); | ||
| 3427 | |||
| 3428 | /* causes final put_task_struct in finish_task_switch(). */ | ||
| 3429 | __set_current_state(TASK_DEAD); | ||
| 3430 | current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | ||
| 3431 | __schedule(false); | ||
| 3432 | BUG(); | ||
| 3433 | /* Avoid "noreturn function does return". */ | ||
| 3434 | for (;;) | ||
| 3435 | cpu_relax(); /* For when BUG is null */ | ||
| 3436 | } | ||
| 3385 | 3437 | ||
| 3386 | static inline void sched_submit_work(struct task_struct *tsk) | 3438 | static inline void sched_submit_work(struct task_struct *tsk) |
| 3387 | { | 3439 | { |
| @@ -3665,10 +3717,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3665 | 3717 | ||
| 3666 | p->prio = prio; | 3718 | p->prio = prio; |
| 3667 | 3719 | ||
| 3668 | if (running) | ||
| 3669 | p->sched_class->set_curr_task(rq); | ||
| 3670 | if (queued) | 3720 | if (queued) |
| 3671 | enqueue_task(rq, p, queue_flag); | 3721 | enqueue_task(rq, p, queue_flag); |
| 3722 | if (running) | ||
| 3723 | set_curr_task(rq, p); | ||
| 3672 | 3724 | ||
| 3673 | check_class_changed(rq, p, prev_class, oldprio); | 3725 | check_class_changed(rq, p, prev_class, oldprio); |
| 3674 | out_unlock: | 3726 | out_unlock: |
| @@ -3682,7 +3734,8 @@ out_unlock: | |||
| 3682 | 3734 | ||
| 3683 | void set_user_nice(struct task_struct *p, long nice) | 3735 | void set_user_nice(struct task_struct *p, long nice) |
| 3684 | { | 3736 | { |
| 3685 | int old_prio, delta, queued; | 3737 | bool queued, running; |
| 3738 | int old_prio, delta; | ||
| 3686 | struct rq_flags rf; | 3739 | struct rq_flags rf; |
| 3687 | struct rq *rq; | 3740 | struct rq *rq; |
| 3688 | 3741 | ||
| @@ -3704,8 +3757,11 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 3704 | goto out_unlock; | 3757 | goto out_unlock; |
| 3705 | } | 3758 | } |
| 3706 | queued = task_on_rq_queued(p); | 3759 | queued = task_on_rq_queued(p); |
| 3760 | running = task_current(rq, p); | ||
| 3707 | if (queued) | 3761 | if (queued) |
| 3708 | dequeue_task(rq, p, DEQUEUE_SAVE); | 3762 | dequeue_task(rq, p, DEQUEUE_SAVE); |
| 3763 | if (running) | ||
| 3764 | put_prev_task(rq, p); | ||
| 3709 | 3765 | ||
| 3710 | p->static_prio = NICE_TO_PRIO(nice); | 3766 | p->static_prio = NICE_TO_PRIO(nice); |
| 3711 | set_load_weight(p); | 3767 | set_load_weight(p); |
| @@ -3722,6 +3778,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 3722 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 3778 | if (delta < 0 || (delta > 0 && task_running(rq, p))) |
| 3723 | resched_curr(rq); | 3779 | resched_curr(rq); |
| 3724 | } | 3780 | } |
| 3781 | if (running) | ||
| 3782 | set_curr_task(rq, p); | ||
| 3725 | out_unlock: | 3783 | out_unlock: |
| 3726 | task_rq_unlock(rq, p, &rf); | 3784 | task_rq_unlock(rq, p, &rf); |
| 3727 | } | 3785 | } |
| @@ -4221,8 +4279,6 @@ change: | |||
| 4221 | prev_class = p->sched_class; | 4279 | prev_class = p->sched_class; |
| 4222 | __setscheduler(rq, p, attr, pi); | 4280 | __setscheduler(rq, p, attr, pi); |
| 4223 | 4281 | ||
| 4224 | if (running) | ||
| 4225 | p->sched_class->set_curr_task(rq); | ||
| 4226 | if (queued) { | 4282 | if (queued) { |
| 4227 | /* | 4283 | /* |
| 4228 | * We enqueue to tail when the priority of a task is | 4284 | * We enqueue to tail when the priority of a task is |
| @@ -4233,6 +4289,8 @@ change: | |||
| 4233 | 4289 | ||
| 4234 | enqueue_task(rq, p, queue_flags); | 4290 | enqueue_task(rq, p, queue_flags); |
| 4235 | } | 4291 | } |
| 4292 | if (running) | ||
| 4293 | set_curr_task(rq, p); | ||
| 4236 | 4294 | ||
| 4237 | check_class_changed(rq, p, prev_class, oldprio); | 4295 | check_class_changed(rq, p, prev_class, oldprio); |
| 4238 | preempt_disable(); /* avoid rq from going away on us */ | 4296 | preempt_disable(); /* avoid rq from going away on us */ |
| @@ -4824,7 +4882,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
| 4824 | { | 4882 | { |
| 4825 | struct rq *rq = this_rq_lock(); | 4883 | struct rq *rq = this_rq_lock(); |
| 4826 | 4884 | ||
| 4827 | schedstat_inc(rq, yld_count); | 4885 | schedstat_inc(rq->yld_count); |
| 4828 | current->sched_class->yield_task(rq); | 4886 | current->sched_class->yield_task(rq); |
| 4829 | 4887 | ||
| 4830 | /* | 4888 | /* |
| @@ -4841,6 +4899,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
| 4841 | return 0; | 4899 | return 0; |
| 4842 | } | 4900 | } |
| 4843 | 4901 | ||
| 4902 | #ifndef CONFIG_PREEMPT | ||
| 4844 | int __sched _cond_resched(void) | 4903 | int __sched _cond_resched(void) |
| 4845 | { | 4904 | { |
| 4846 | if (should_resched(0)) { | 4905 | if (should_resched(0)) { |
| @@ -4850,6 +4909,7 @@ int __sched _cond_resched(void) | |||
| 4850 | return 0; | 4909 | return 0; |
| 4851 | } | 4910 | } |
| 4852 | EXPORT_SYMBOL(_cond_resched); | 4911 | EXPORT_SYMBOL(_cond_resched); |
| 4912 | #endif | ||
| 4853 | 4913 | ||
| 4854 | /* | 4914 | /* |
| 4855 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4915 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
| @@ -4975,7 +5035,7 @@ again: | |||
| 4975 | 5035 | ||
| 4976 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | 5036 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); |
| 4977 | if (yielded) { | 5037 | if (yielded) { |
| 4978 | schedstat_inc(rq, yld_count); | 5038 | schedstat_inc(rq->yld_count); |
| 4979 | /* | 5039 | /* |
| 4980 | * Make p's CPU reschedule; pick_next_entity takes care of | 5040 | * Make p's CPU reschedule; pick_next_entity takes care of |
| 4981 | * fairness. | 5041 | * fairness. |
| @@ -5395,10 +5455,10 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
| 5395 | 5455 | ||
| 5396 | p->numa_preferred_nid = nid; | 5456 | p->numa_preferred_nid = nid; |
| 5397 | 5457 | ||
| 5398 | if (running) | ||
| 5399 | p->sched_class->set_curr_task(rq); | ||
| 5400 | if (queued) | 5458 | if (queued) |
| 5401 | enqueue_task(rq, p, ENQUEUE_RESTORE); | 5459 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
| 5460 | if (running) | ||
| 5461 | set_curr_task(rq, p); | ||
| 5402 | task_rq_unlock(rq, p, &rf); | 5462 | task_rq_unlock(rq, p, &rf); |
| 5403 | } | 5463 | } |
| 5404 | #endif /* CONFIG_NUMA_BALANCING */ | 5464 | #endif /* CONFIG_NUMA_BALANCING */ |
| @@ -5695,6 +5755,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5695 | } | 5755 | } |
| 5696 | } | 5756 | } |
| 5697 | #else /* !CONFIG_SCHED_DEBUG */ | 5757 | #else /* !CONFIG_SCHED_DEBUG */ |
| 5758 | |||
| 5759 | # define sched_debug_enabled 0 | ||
| 5698 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5760 | # define sched_domain_debug(sd, cpu) do { } while (0) |
| 5699 | static inline bool sched_debug(void) | 5761 | static inline bool sched_debug(void) |
| 5700 | { | 5762 | { |
| @@ -5713,6 +5775,7 @@ static int sd_degenerate(struct sched_domain *sd) | |||
| 5713 | SD_BALANCE_FORK | | 5775 | SD_BALANCE_FORK | |
| 5714 | SD_BALANCE_EXEC | | 5776 | SD_BALANCE_EXEC | |
| 5715 | SD_SHARE_CPUCAPACITY | | 5777 | SD_SHARE_CPUCAPACITY | |
| 5778 | SD_ASYM_CPUCAPACITY | | ||
| 5716 | SD_SHARE_PKG_RESOURCES | | 5779 | SD_SHARE_PKG_RESOURCES | |
| 5717 | SD_SHARE_POWERDOMAIN)) { | 5780 | SD_SHARE_POWERDOMAIN)) { |
| 5718 | if (sd->groups != sd->groups->next) | 5781 | if (sd->groups != sd->groups->next) |
| @@ -5743,6 +5806,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 5743 | SD_BALANCE_NEWIDLE | | 5806 | SD_BALANCE_NEWIDLE | |
| 5744 | SD_BALANCE_FORK | | 5807 | SD_BALANCE_FORK | |
| 5745 | SD_BALANCE_EXEC | | 5808 | SD_BALANCE_EXEC | |
| 5809 | SD_ASYM_CPUCAPACITY | | ||
| 5746 | SD_SHARE_CPUCAPACITY | | 5810 | SD_SHARE_CPUCAPACITY | |
| 5747 | SD_SHARE_PKG_RESOURCES | | 5811 | SD_SHARE_PKG_RESOURCES | |
| 5748 | SD_PREFER_SIBLING | | 5812 | SD_PREFER_SIBLING | |
| @@ -5887,10 +5951,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) | |||
| 5887 | } while (sg != first); | 5951 | } while (sg != first); |
| 5888 | } | 5952 | } |
| 5889 | 5953 | ||
| 5890 | static void free_sched_domain(struct rcu_head *rcu) | 5954 | static void destroy_sched_domain(struct sched_domain *sd) |
| 5891 | { | 5955 | { |
| 5892 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
| 5893 | |||
| 5894 | /* | 5956 | /* |
| 5895 | * If its an overlapping domain it has private groups, iterate and | 5957 | * If its an overlapping domain it has private groups, iterate and |
| 5896 | * nuke them all. | 5958 | * nuke them all. |
| @@ -5901,18 +5963,26 @@ static void free_sched_domain(struct rcu_head *rcu) | |||
| 5901 | kfree(sd->groups->sgc); | 5963 | kfree(sd->groups->sgc); |
| 5902 | kfree(sd->groups); | 5964 | kfree(sd->groups); |
| 5903 | } | 5965 | } |
| 5966 | if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) | ||
| 5967 | kfree(sd->shared); | ||
| 5904 | kfree(sd); | 5968 | kfree(sd); |
| 5905 | } | 5969 | } |
| 5906 | 5970 | ||
| 5907 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | 5971 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) |
| 5908 | { | 5972 | { |
| 5909 | call_rcu(&sd->rcu, free_sched_domain); | 5973 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); |
| 5974 | |||
| 5975 | while (sd) { | ||
| 5976 | struct sched_domain *parent = sd->parent; | ||
| 5977 | destroy_sched_domain(sd); | ||
| 5978 | sd = parent; | ||
| 5979 | } | ||
| 5910 | } | 5980 | } |
| 5911 | 5981 | ||
| 5912 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | 5982 | static void destroy_sched_domains(struct sched_domain *sd) |
| 5913 | { | 5983 | { |
| 5914 | for (; sd; sd = sd->parent) | 5984 | if (sd) |
| 5915 | destroy_sched_domain(sd, cpu); | 5985 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); |
| 5916 | } | 5986 | } |
| 5917 | 5987 | ||
| 5918 | /* | 5988 | /* |
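Instead of queueing one RCU callback per domain level, the new code queues a single callback for the lowest domain and lets that callback walk the ->parent chain, freeing each level and dropping a reference on the new shared structure as it goes. A user-space model of "one deferred callback tears down the whole chain" follows; the RCU grace period is simulated by a direct call and the names are illustrative.

    /* Model of a single deferred callback tearing down a parent chain and
     * dropping a refcount on shared state. The grace period is simulated by
     * simply invoking the callback. */
    #include <stdio.h>
    #include <stdlib.h>

    struct shared { int ref; int nr_busy; };

    struct domain {
        struct domain *parent;
        struct shared *shared;   /* may be shared by several domains */
    };

    static void destroy_domain(struct domain *d)
    {
        if (d->shared && --d->shared->ref == 0)
            free(d->shared);
        free(d);
    }

    /* The "RCU callback": walk and free the whole chain in one go. */
    static void destroy_domains_cb(struct domain *d)
    {
        while (d) {
            struct domain *parent = d->parent;
            destroy_domain(d);
            d = parent;
        }
    }

    int main(void)
    {
        struct shared *s  = malloc(sizeof(*s));
        struct domain *llc = malloc(sizeof(*llc));
        struct domain *pkg = malloc(sizeof(*pkg));

        s->ref = 1; s->nr_busy = 0;
        llc->parent = pkg;  llc->shared = s;
        pkg->parent = NULL; pkg->shared = NULL;

        destroy_domains_cb(llc);   /* stands in for call_rcu(&sd->rcu, ...) */
        printf("chain torn down with one callback\n");
        return 0;
    }

One callback per chain also keeps the parent pointers valid for the walk, since nothing is freed until the whole chain is processed.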
| @@ -5927,14 +5997,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
| 5927 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 5997 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
| 5928 | DEFINE_PER_CPU(int, sd_llc_size); | 5998 | DEFINE_PER_CPU(int, sd_llc_size); |
| 5929 | DEFINE_PER_CPU(int, sd_llc_id); | 5999 | DEFINE_PER_CPU(int, sd_llc_id); |
| 6000 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
| 5930 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | 6001 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); |
| 5931 | DEFINE_PER_CPU(struct sched_domain *, sd_busy); | ||
| 5932 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | 6002 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); |
| 5933 | 6003 | ||
| 5934 | static void update_top_cache_domain(int cpu) | 6004 | static void update_top_cache_domain(int cpu) |
| 5935 | { | 6005 | { |
| 6006 | struct sched_domain_shared *sds = NULL; | ||
| 5936 | struct sched_domain *sd; | 6007 | struct sched_domain *sd; |
| 5937 | struct sched_domain *busy_sd = NULL; | ||
| 5938 | int id = cpu; | 6008 | int id = cpu; |
| 5939 | int size = 1; | 6009 | int size = 1; |
| 5940 | 6010 | ||
| @@ -5942,13 +6012,13 @@ static void update_top_cache_domain(int cpu) | |||
| 5942 | if (sd) { | 6012 | if (sd) { |
| 5943 | id = cpumask_first(sched_domain_span(sd)); | 6013 | id = cpumask_first(sched_domain_span(sd)); |
| 5944 | size = cpumask_weight(sched_domain_span(sd)); | 6014 | size = cpumask_weight(sched_domain_span(sd)); |
| 5945 | busy_sd = sd->parent; /* sd_busy */ | 6015 | sds = sd->shared; |
| 5946 | } | 6016 | } |
| 5947 | rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); | ||
| 5948 | 6017 | ||
| 5949 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 6018 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
| 5950 | per_cpu(sd_llc_size, cpu) = size; | 6019 | per_cpu(sd_llc_size, cpu) = size; |
| 5951 | per_cpu(sd_llc_id, cpu) = id; | 6020 | per_cpu(sd_llc_id, cpu) = id; |
| 6021 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); | ||
| 5952 | 6022 | ||
| 5953 | sd = lowest_flag_domain(cpu, SD_NUMA); | 6023 | sd = lowest_flag_domain(cpu, SD_NUMA); |
| 5954 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | 6024 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); |
| @@ -5984,7 +6054,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 5984 | */ | 6054 | */ |
| 5985 | if (parent->flags & SD_PREFER_SIBLING) | 6055 | if (parent->flags & SD_PREFER_SIBLING) |
| 5986 | tmp->flags |= SD_PREFER_SIBLING; | 6056 | tmp->flags |= SD_PREFER_SIBLING; |
| 5987 | destroy_sched_domain(parent, cpu); | 6057 | destroy_sched_domain(parent); |
| 5988 | } else | 6058 | } else |
| 5989 | tmp = tmp->parent; | 6059 | tmp = tmp->parent; |
| 5990 | } | 6060 | } |
| @@ -5992,7 +6062,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 5992 | if (sd && sd_degenerate(sd)) { | 6062 | if (sd && sd_degenerate(sd)) { |
| 5993 | tmp = sd; | 6063 | tmp = sd; |
| 5994 | sd = sd->parent; | 6064 | sd = sd->parent; |
| 5995 | destroy_sched_domain(tmp, cpu); | 6065 | destroy_sched_domain(tmp); |
| 5996 | if (sd) | 6066 | if (sd) |
| 5997 | sd->child = NULL; | 6067 | sd->child = NULL; |
| 5998 | } | 6068 | } |
| @@ -6002,7 +6072,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 6002 | rq_attach_root(rq, rd); | 6072 | rq_attach_root(rq, rd); |
| 6003 | tmp = rq->sd; | 6073 | tmp = rq->sd; |
| 6004 | rcu_assign_pointer(rq->sd, sd); | 6074 | rcu_assign_pointer(rq->sd, sd); |
| 6005 | destroy_sched_domains(tmp, cpu); | 6075 | destroy_sched_domains(tmp); |
| 6006 | 6076 | ||
| 6007 | update_top_cache_domain(cpu); | 6077 | update_top_cache_domain(cpu); |
| 6008 | } | 6078 | } |
| @@ -6245,7 +6315,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | |||
| 6245 | return; | 6315 | return; |
| 6246 | 6316 | ||
| 6247 | update_group_capacity(sd, cpu); | 6317 | update_group_capacity(sd, cpu); |
| 6248 | atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); | ||
| 6249 | } | 6318 | } |
| 6250 | 6319 | ||
| 6251 | /* | 6320 | /* |
| @@ -6333,6 +6402,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
| 6333 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | 6402 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
| 6334 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | 6403 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
| 6335 | 6404 | ||
| 6405 | if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) | ||
| 6406 | *per_cpu_ptr(sdd->sds, cpu) = NULL; | ||
| 6407 | |||
| 6336 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | 6408 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) |
| 6337 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | 6409 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
| 6338 | 6410 | ||
| @@ -6352,26 +6424,37 @@ static int sched_domains_curr_level; | |||
| 6352 | /* | 6424 | /* |
| 6353 | * SD_flags allowed in topology descriptions. | 6425 | * SD_flags allowed in topology descriptions. |
| 6354 | * | 6426 | * |
| 6355 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | 6427 | * These flags are purely descriptive of the topology and do not prescribe |
| 6356 | * SD_SHARE_PKG_RESOURCES - describes shared caches | 6428 | * behaviour. Behaviour is artificial and mapped in the below sd_init() |
| 6357 | * SD_NUMA - describes NUMA topologies | 6429 | * function: |
| 6358 | * SD_SHARE_POWERDOMAIN - describes shared power domain | 6430 | * |
| 6431 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | ||
| 6432 | * SD_SHARE_PKG_RESOURCES - describes shared caches | ||
| 6433 | * SD_NUMA - describes NUMA topologies | ||
| 6434 | * SD_SHARE_POWERDOMAIN - describes shared power domain | ||
| 6435 | * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies | ||
| 6359 | * | 6436 | * |
| 6360 | * Odd one out: | 6437 | * Odd one out, which beside describing the topology has a quirk also |
| 6361 | * SD_ASYM_PACKING - describes SMT quirks | 6438 | * prescribes the desired behaviour that goes along with it: |
| 6439 | * | ||
| 6440 | * SD_ASYM_PACKING - describes SMT quirks | ||
| 6362 | */ | 6441 | */ |
| 6363 | #define TOPOLOGY_SD_FLAGS \ | 6442 | #define TOPOLOGY_SD_FLAGS \ |
| 6364 | (SD_SHARE_CPUCAPACITY | \ | 6443 | (SD_SHARE_CPUCAPACITY | \ |
| 6365 | SD_SHARE_PKG_RESOURCES | \ | 6444 | SD_SHARE_PKG_RESOURCES | \ |
| 6366 | SD_NUMA | \ | 6445 | SD_NUMA | \ |
| 6367 | SD_ASYM_PACKING | \ | 6446 | SD_ASYM_PACKING | \ |
| 6447 | SD_ASYM_CPUCAPACITY | \ | ||
| 6368 | SD_SHARE_POWERDOMAIN) | 6448 | SD_SHARE_POWERDOMAIN) |
| 6369 | 6449 | ||
| 6370 | static struct sched_domain * | 6450 | static struct sched_domain * |
| 6371 | sd_init(struct sched_domain_topology_level *tl, int cpu) | 6451 | sd_init(struct sched_domain_topology_level *tl, |
| 6452 | const struct cpumask *cpu_map, | ||
| 6453 | struct sched_domain *child, int cpu) | ||
| 6372 | { | 6454 | { |
| 6373 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | 6455 | struct sd_data *sdd = &tl->data; |
| 6374 | int sd_weight, sd_flags = 0; | 6456 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
| 6457 | int sd_id, sd_weight, sd_flags = 0; | ||
| 6375 | 6458 | ||
| 6376 | #ifdef CONFIG_NUMA | 6459 | #ifdef CONFIG_NUMA |
| 6377 | /* | 6460 | /* |
| @@ -6420,15 +6503,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 6420 | .smt_gain = 0, | 6503 | .smt_gain = 0, |
| 6421 | .max_newidle_lb_cost = 0, | 6504 | .max_newidle_lb_cost = 0, |
| 6422 | .next_decay_max_lb_cost = jiffies, | 6505 | .next_decay_max_lb_cost = jiffies, |
| 6506 | .child = child, | ||
| 6423 | #ifdef CONFIG_SCHED_DEBUG | 6507 | #ifdef CONFIG_SCHED_DEBUG |
| 6424 | .name = tl->name, | 6508 | .name = tl->name, |
| 6425 | #endif | 6509 | #endif |
| 6426 | }; | 6510 | }; |
| 6427 | 6511 | ||
| 6512 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
| 6513 | sd_id = cpumask_first(sched_domain_span(sd)); | ||
| 6514 | |||
| 6428 | /* | 6515 | /* |
| 6429 | * Convert topological properties into behaviour. | 6516 | * Convert topological properties into behaviour. |
| 6430 | */ | 6517 | */ |
| 6431 | 6518 | ||
| 6519 | if (sd->flags & SD_ASYM_CPUCAPACITY) { | ||
| 6520 | struct sched_domain *t = sd; | ||
| 6521 | |||
| 6522 | for_each_lower_domain(t) | ||
| 6523 | t->flags |= SD_BALANCE_WAKE; | ||
| 6524 | } | ||
| 6525 | |||
| 6432 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6526 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
| 6433 | sd->flags |= SD_PREFER_SIBLING; | 6527 | sd->flags |= SD_PREFER_SIBLING; |
| 6434 | sd->imbalance_pct = 110; | 6528 | sd->imbalance_pct = 110; |
| @@ -6460,7 +6554,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 6460 | sd->idle_idx = 1; | 6554 | sd->idle_idx = 1; |
| 6461 | } | 6555 | } |
| 6462 | 6556 | ||
| 6463 | sd->private = &tl->data; | 6557 | /* |
| 6558 | * For all levels sharing cache; connect a sched_domain_shared | ||
| 6559 | * instance. | ||
| 6560 | */ | ||
| 6561 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
| 6562 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); | ||
| 6563 | atomic_inc(&sd->shared->ref); | ||
| 6564 | atomic_set(&sd->shared->nr_busy_cpus, sd_weight); | ||
| 6565 | } | ||
| 6566 | |||
| 6567 | sd->private = sdd; | ||
| 6464 | 6568 | ||
| 6465 | return sd; | 6569 | return sd; |
| 6466 | } | 6570 | } |
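Two behaviours now fall out of sd_init(): when a level carries SD_ASYM_CPUCAPACITY, wake-time balancing is switched on for every lower level, and each cache-sharing level attaches the per-LLC sched_domain_shared instance (keyed by the first CPU in the span) while bumping its refcount. The sketch below models only the flag-propagation part; the flag values and the two-level hierarchy are illustrative.

    /* Model of propagating a behaviour flag down a domain hierarchy when an
     * upper level is marked asymmetric. Flag values are illustrative. */
    #include <stdio.h>

    #define SD_ASYM_CPUCAPACITY  0x1
    #define SD_BALANCE_WAKE      0x2

    struct domain { struct domain *child; unsigned int flags; const char *name; };

    static void propagate_wake_balance(struct domain *sd)
    {
        if (sd->flags & SD_ASYM_CPUCAPACITY) {
            struct domain *t;

            /* equivalent of for_each_lower_domain(t): t, t->child, ... */
            for (t = sd; t; t = t->child)
                t->flags |= SD_BALANCE_WAKE;
        }
    }

    int main(void)
    {
        struct domain smt = { .child = NULL, .flags = 0, .name = "SMT" };
        struct domain die = { .child = &smt, .flags = SD_ASYM_CPUCAPACITY,
                              .name = "DIE" };

        propagate_wake_balance(&die);
        printf("%s wake-balance enabled: %d\n", smt.name,
               !!(smt.flags & SD_BALANCE_WAKE));
        return 0;
    }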
| @@ -6487,6 +6591,9 @@ static struct sched_domain_topology_level *sched_domain_topology = | |||
| 6487 | 6591 | ||
| 6488 | void set_sched_topology(struct sched_domain_topology_level *tl) | 6592 | void set_sched_topology(struct sched_domain_topology_level *tl) |
| 6489 | { | 6593 | { |
| 6594 | if (WARN_ON_ONCE(sched_smp_initialized)) | ||
| 6595 | return; | ||
| 6596 | |||
| 6490 | sched_domain_topology = tl; | 6597 | sched_domain_topology = tl; |
| 6491 | } | 6598 | } |
| 6492 | 6599 | ||
| @@ -6767,6 +6874,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
| 6767 | if (!sdd->sd) | 6874 | if (!sdd->sd) |
| 6768 | return -ENOMEM; | 6875 | return -ENOMEM; |
| 6769 | 6876 | ||
| 6877 | sdd->sds = alloc_percpu(struct sched_domain_shared *); | ||
| 6878 | if (!sdd->sds) | ||
| 6879 | return -ENOMEM; | ||
| 6880 | |||
| 6770 | sdd->sg = alloc_percpu(struct sched_group *); | 6881 | sdd->sg = alloc_percpu(struct sched_group *); |
| 6771 | if (!sdd->sg) | 6882 | if (!sdd->sg) |
| 6772 | return -ENOMEM; | 6883 | return -ENOMEM; |
| @@ -6777,6 +6888,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
| 6777 | 6888 | ||
| 6778 | for_each_cpu(j, cpu_map) { | 6889 | for_each_cpu(j, cpu_map) { |
| 6779 | struct sched_domain *sd; | 6890 | struct sched_domain *sd; |
| 6891 | struct sched_domain_shared *sds; | ||
| 6780 | struct sched_group *sg; | 6892 | struct sched_group *sg; |
| 6781 | struct sched_group_capacity *sgc; | 6893 | struct sched_group_capacity *sgc; |
| 6782 | 6894 | ||
| @@ -6787,6 +6899,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
| 6787 | 6899 | ||
| 6788 | *per_cpu_ptr(sdd->sd, j) = sd; | 6900 | *per_cpu_ptr(sdd->sd, j) = sd; |
| 6789 | 6901 | ||
| 6902 | sds = kzalloc_node(sizeof(struct sched_domain_shared), | ||
| 6903 | GFP_KERNEL, cpu_to_node(j)); | ||
| 6904 | if (!sds) | ||
| 6905 | return -ENOMEM; | ||
| 6906 | |||
| 6907 | *per_cpu_ptr(sdd->sds, j) = sds; | ||
| 6908 | |||
| 6790 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 6909 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
| 6791 | GFP_KERNEL, cpu_to_node(j)); | 6910 | GFP_KERNEL, cpu_to_node(j)); |
| 6792 | if (!sg) | 6911 | if (!sg) |
| @@ -6826,6 +6945,8 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
| 6826 | kfree(*per_cpu_ptr(sdd->sd, j)); | 6945 | kfree(*per_cpu_ptr(sdd->sd, j)); |
| 6827 | } | 6946 | } |
| 6828 | 6947 | ||
| 6948 | if (sdd->sds) | ||
| 6949 | kfree(*per_cpu_ptr(sdd->sds, j)); | ||
| 6829 | if (sdd->sg) | 6950 | if (sdd->sg) |
| 6830 | kfree(*per_cpu_ptr(sdd->sg, j)); | 6951 | kfree(*per_cpu_ptr(sdd->sg, j)); |
| 6831 | if (sdd->sgc) | 6952 | if (sdd->sgc) |
| @@ -6833,6 +6954,8 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
| 6833 | } | 6954 | } |
| 6834 | free_percpu(sdd->sd); | 6955 | free_percpu(sdd->sd); |
| 6835 | sdd->sd = NULL; | 6956 | sdd->sd = NULL; |
| 6957 | free_percpu(sdd->sds); | ||
| 6958 | sdd->sds = NULL; | ||
| 6836 | free_percpu(sdd->sg); | 6959 | free_percpu(sdd->sg); |
| 6837 | sdd->sg = NULL; | 6960 | sdd->sg = NULL; |
| 6838 | free_percpu(sdd->sgc); | 6961 | free_percpu(sdd->sgc); |
| @@ -6844,16 +6967,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
| 6844 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 6967 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
| 6845 | struct sched_domain *child, int cpu) | 6968 | struct sched_domain *child, int cpu) |
| 6846 | { | 6969 | { |
| 6847 | struct sched_domain *sd = sd_init(tl, cpu); | 6970 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); |
| 6848 | if (!sd) | ||
| 6849 | return child; | ||
| 6850 | 6971 | ||
| 6851 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
| 6852 | if (child) { | 6972 | if (child) { |
| 6853 | sd->level = child->level + 1; | 6973 | sd->level = child->level + 1; |
| 6854 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | 6974 | sched_domain_level_max = max(sched_domain_level_max, sd->level); |
| 6855 | child->parent = sd; | 6975 | child->parent = sd; |
| 6856 | sd->child = child; | ||
| 6857 | 6976 | ||
| 6858 | if (!cpumask_subset(sched_domain_span(child), | 6977 | if (!cpumask_subset(sched_domain_span(child), |
| 6859 | sched_domain_span(sd))) { | 6978 | sched_domain_span(sd))) { |
| @@ -6884,6 +7003,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
| 6884 | enum s_alloc alloc_state; | 7003 | enum s_alloc alloc_state; |
| 6885 | struct sched_domain *sd; | 7004 | struct sched_domain *sd; |
| 6886 | struct s_data d; | 7005 | struct s_data d; |
| 7006 | struct rq *rq = NULL; | ||
| 6887 | int i, ret = -ENOMEM; | 7007 | int i, ret = -ENOMEM; |
| 6888 | 7008 | ||
| 6889 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7009 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
| @@ -6934,11 +7054,22 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
| 6934 | /* Attach the domains */ | 7054 | /* Attach the domains */ |
| 6935 | rcu_read_lock(); | 7055 | rcu_read_lock(); |
| 6936 | for_each_cpu(i, cpu_map) { | 7056 | for_each_cpu(i, cpu_map) { |
| 7057 | rq = cpu_rq(i); | ||
| 6937 | sd = *per_cpu_ptr(d.sd, i); | 7058 | sd = *per_cpu_ptr(d.sd, i); |
| 7059 | |||
| 7060 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
| 7061 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
| 7062 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
| 7063 | |||
| 6938 | cpu_attach_domain(sd, d.rd, i); | 7064 | cpu_attach_domain(sd, d.rd, i); |
| 6939 | } | 7065 | } |
| 6940 | rcu_read_unlock(); | 7066 | rcu_read_unlock(); |
| 6941 | 7067 | ||
| 7068 | if (rq && sched_debug_enabled) { | ||
| 7069 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
| 7070 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
| 7071 | } | ||
| 7072 | |||
| 6942 | ret = 0; | 7073 | ret = 0; |
| 6943 | error: | 7074 | error: |
| 6944 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7075 | __free_domain_allocs(&d, alloc_state, cpu_map); |
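The root domain's max_cpu_capacity is updated with READ_ONCE()/WRITE_ONCE() so concurrent readers never observe a torn value on architectures where plain accesses could be split. Outside the kernel, C11 relaxed atomics give the same no-tearing guarantee; the sketch below models the "track the maximum" update with them (the capacity values are arbitrary examples).

    /* Model of a "track the maximum without tearing" update, using C11 relaxed
     * atomics in place of READ_ONCE()/WRITE_ONCE(). */
    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned long max_cpu_capacity;

    static void note_capacity(unsigned long cap)
    {
        unsigned long cur = atomic_load_explicit(&max_cpu_capacity,
                                                 memory_order_relaxed);
        /* The kernel path has a single writer, so compare-then-store is
         * enough there; with multiple writers a CAS loop would be needed. */
        if (cap > cur)
            atomic_store_explicit(&max_cpu_capacity, cap,
                                  memory_order_relaxed);
    }

    int main(void)
    {
        unsigned long caps[] = { 446, 1024, 871 };   /* e.g. big.LITTLE-style */
        for (unsigned i = 0; i < 3; i++)
            note_capacity(caps[i]);
        printf("max cpu_capacity = %lu\n",
               atomic_load_explicit(&max_cpu_capacity, memory_order_relaxed));
        return 0;
    }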
| @@ -7297,6 +7428,22 @@ int sched_cpu_dying(unsigned int cpu) | |||
| 7297 | } | 7428 | } |
| 7298 | #endif | 7429 | #endif |
| 7299 | 7430 | ||
| 7431 | #ifdef CONFIG_SCHED_SMT | ||
| 7432 | DEFINE_STATIC_KEY_FALSE(sched_smt_present); | ||
| 7433 | |||
| 7434 | static void sched_init_smt(void) | ||
| 7435 | { | ||
| 7436 | /* | ||
| 7437 | * We've enumerated all CPUs and will assume that if any CPU | ||
| 7438 | * has SMT siblings, CPU0 will too. | ||
| 7439 | */ | ||
| 7440 | if (cpumask_weight(cpu_smt_mask(0)) > 1) | ||
| 7441 | static_branch_enable(&sched_smt_present); | ||
| 7442 | } | ||
| 7443 | #else | ||
| 7444 | static inline void sched_init_smt(void) { } | ||
| 7445 | #endif | ||
| 7446 | |||
| 7300 | void __init sched_init_smp(void) | 7447 | void __init sched_init_smp(void) |
| 7301 | { | 7448 | { |
| 7302 | cpumask_var_t non_isolated_cpus; | 7449 | cpumask_var_t non_isolated_cpus; |
| @@ -7326,6 +7473,9 @@ void __init sched_init_smp(void) | |||
| 7326 | 7473 | ||
| 7327 | init_sched_rt_class(); | 7474 | init_sched_rt_class(); |
| 7328 | init_sched_dl_class(); | 7475 | init_sched_dl_class(); |
| 7476 | |||
| 7477 | sched_init_smt(); | ||
| 7478 | |||
| 7329 | sched_smp_initialized = true; | 7479 | sched_smp_initialized = true; |
| 7330 | } | 7480 | } |
| 7331 | 7481 | ||
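sched_init_smt() flips a static branch once, after all CPUs have been enumerated, so SMT-only fast paths cost effectively nothing on machines without SMT siblings. The closest cheap user-space analogue is a write-once flag checked on hot paths; the sketch below models the enable-once usage only, and the sibling-count helper is a stand-in (the real static_branch machinery is jump-label based and not reproduced here).

    /* User-space stand-in for a "static key": set once at init, read on hot
     * paths. The actual jump-label patching is not modelled. */
    #include <stdbool.h>
    #include <stdio.h>

    static bool sched_smt_present;   /* models DEFINE_STATIC_KEY_FALSE(...) */

    static unsigned int smt_siblings_of_cpu0(void) { return 2; }  /* stand-in */

    static void sched_init_smt(void)
    {
        /* Enable only if CPU0 has SMT siblings, mirroring the patch's check. */
        if (smt_siblings_of_cpu0() > 1)
            sched_smt_present = true;   /* models static_branch_enable() */
    }

    static void idle_search_fastpath(void)
    {
        if (sched_smt_present)          /* models the static-branch test */
            printf("scan SMT siblings for an idle core\n");
        else
            printf("skip the SMT scan entirely\n");
    }

    int main(void)
    {
        sched_init_smt();
        idle_search_fastpath();
        return 0;
    }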
| @@ -7363,6 +7513,7 @@ static struct kmem_cache *task_group_cache __read_mostly; | |||
| 7363 | #endif | 7513 | #endif |
| 7364 | 7514 | ||
| 7365 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); | 7515 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); |
| 7516 | DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); | ||
| 7366 | 7517 | ||
| 7367 | void __init sched_init(void) | 7518 | void __init sched_init(void) |
| 7368 | { | 7519 | { |
| @@ -7399,6 +7550,8 @@ void __init sched_init(void) | |||
| 7399 | for_each_possible_cpu(i) { | 7550 | for_each_possible_cpu(i) { |
| 7400 | per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( | 7551 | per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( |
| 7401 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); | 7552 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); |
| 7553 | per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( | ||
| 7554 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); | ||
| 7402 | } | 7555 | } |
| 7403 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 7556 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
| 7404 | 7557 | ||
| @@ -7501,10 +7654,6 @@ void __init sched_init(void) | |||
| 7501 | 7654 | ||
| 7502 | set_load_weight(&init_task); | 7655 | set_load_weight(&init_task); |
| 7503 | 7656 | ||
| 7504 | #ifdef CONFIG_PREEMPT_NOTIFIERS | ||
| 7505 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | ||
| 7506 | #endif | ||
| 7507 | |||
| 7508 | /* | 7657 | /* |
| 7509 | * The boot idle thread does lazy MMU switching as well: | 7658 | * The boot idle thread does lazy MMU switching as well: |
| 7510 | */ | 7659 | */ |
| @@ -7512,11 +7661,6 @@ void __init sched_init(void) | |||
| 7512 | enter_lazy_tlb(&init_mm, current); | 7661 | enter_lazy_tlb(&init_mm, current); |
| 7513 | 7662 | ||
| 7514 | /* | 7663 | /* |
| 7515 | * During early bootup we pretend to be a normal task: | ||
| 7516 | */ | ||
| 7517 | current->sched_class = &fair_sched_class; | ||
| 7518 | |||
| 7519 | /* | ||
| 7520 | * Make us the idle thread. Technically, schedule() should not be | 7664 | * Make us the idle thread. Technically, schedule() should not be |
| 7521 | * called from this thread, however somewhere below it might be, | 7665 | * called from this thread, however somewhere below it might be, |
| 7522 | * but because we are the idle thread, we just pick up running again | 7666 | * but because we are the idle thread, we just pick up running again |
| @@ -7570,6 +7714,7 @@ EXPORT_SYMBOL(__might_sleep); | |||
| 7570 | void ___might_sleep(const char *file, int line, int preempt_offset) | 7714 | void ___might_sleep(const char *file, int line, int preempt_offset) |
| 7571 | { | 7715 | { |
| 7572 | static unsigned long prev_jiffy; /* ratelimiting */ | 7716 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 7717 | unsigned long preempt_disable_ip; | ||
| 7573 | 7718 | ||
| 7574 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | 7719 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ |
| 7575 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && | 7720 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && |
| @@ -7580,6 +7725,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
| 7580 | return; | 7725 | return; |
| 7581 | prev_jiffy = jiffies; | 7726 | prev_jiffy = jiffies; |
| 7582 | 7727 | ||
| 7728 | /* Save this before calling printk(), since that will clobber it */ | ||
| 7729 | preempt_disable_ip = get_preempt_disable_ip(current); | ||
| 7730 | |||
| 7583 | printk(KERN_ERR | 7731 | printk(KERN_ERR |
| 7584 | "BUG: sleeping function called from invalid context at %s:%d\n", | 7732 | "BUG: sleeping function called from invalid context at %s:%d\n", |
| 7585 | file, line); | 7733 | file, line); |
| @@ -7594,14 +7742,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
| 7594 | debug_show_held_locks(current); | 7742 | debug_show_held_locks(current); |
| 7595 | if (irqs_disabled()) | 7743 | if (irqs_disabled()) |
| 7596 | print_irqtrace_events(current); | 7744 | print_irqtrace_events(current); |
| 7597 | #ifdef CONFIG_DEBUG_PREEMPT | 7745 | if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) |
| 7598 | if (!preempt_count_equals(preempt_offset)) { | 7746 | && !preempt_count_equals(preempt_offset)) { |
| 7599 | pr_err("Preemption disabled at:"); | 7747 | pr_err("Preemption disabled at:"); |
| 7600 | print_ip_sym(current->preempt_disable_ip); | 7748 | print_ip_sym(preempt_disable_ip); |
| 7601 | pr_cont("\n"); | 7749 | pr_cont("\n"); |
| 7602 | } | 7750 | } |
| 7603 | #endif | ||
| 7604 | dump_stack(); | 7751 | dump_stack(); |
| 7752 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); | ||
| 7605 | } | 7753 | } |
| 7606 | EXPORT_SYMBOL(___might_sleep); | 7754 | EXPORT_SYMBOL(___might_sleep); |
| 7607 | #endif | 7755 | #endif |
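Two small robustness points in the ___might_sleep() hunk: the instruction pointer recorded at preempt_disable time is saved before the first printk() (which can clobber it), and the #ifdef block becomes an IS_ENABLED() condition, so the code is always parsed and type-checked even when the config option is off. The sketch below illustrates the IS_ENABLED() idiom with a trivial stand-in macro; it is not the kconfig implementation.

    /* Illustration of preferring an always-compiled IS_ENABLED() check over
     * #ifdef. The IS_ENABLED() stand-in here is a plain constant. */
    #include <stdio.h>

    #define CONFIG_DEBUG_PREEMPT_ON 0      /* flip to 1 to enable */
    #define IS_ENABLED(x) (x)              /* stand-in for the kconfig macro */

    static int preempt_count_ok(void) { return 0; }   /* stand-in */

    static void report(unsigned long preempt_disable_ip)
    {
        /* Dead-code elimination removes the body when the option is off, but
         * the compiler still sees and checks it -- unlike an #ifdef block. */
        if (IS_ENABLED(CONFIG_DEBUG_PREEMPT_ON) && !preempt_count_ok())
            printf("Preemption disabled at: %#lx\n", preempt_disable_ip);
    }

    int main(void)
    {
        unsigned long ip = 0xffffffff81000000UL;  /* captured before printing */
        report(ip);
        return 0;
    }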
| @@ -7622,12 +7770,10 @@ void normalize_rt_tasks(void) | |||
| 7622 | if (p->flags & PF_KTHREAD) | 7770 | if (p->flags & PF_KTHREAD) |
| 7623 | continue; | 7771 | continue; |
| 7624 | 7772 | ||
| 7625 | p->se.exec_start = 0; | 7773 | p->se.exec_start = 0; |
| 7626 | #ifdef CONFIG_SCHEDSTATS | 7774 | schedstat_set(p->se.statistics.wait_start, 0); |
| 7627 | p->se.statistics.wait_start = 0; | 7775 | schedstat_set(p->se.statistics.sleep_start, 0); |
| 7628 | p->se.statistics.sleep_start = 0; | 7776 | schedstat_set(p->se.statistics.block_start, 0); |
| 7629 | p->se.statistics.block_start = 0; | ||
| 7630 | #endif | ||
| 7631 | 7777 | ||
| 7632 | if (!dl_task(p) && !rt_task(p)) { | 7778 | if (!dl_task(p) && !rt_task(p)) { |
| 7633 | /* | 7779 | /* |
| @@ -7688,7 +7834,7 @@ struct task_struct *curr_task(int cpu) | |||
| 7688 | * | 7834 | * |
| 7689 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 7835 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
| 7690 | */ | 7836 | */ |
| 7691 | void set_curr_task(int cpu, struct task_struct *p) | 7837 | void ia64_set_curr_task(int cpu, struct task_struct *p) |
| 7692 | { | 7838 | { |
| 7693 | cpu_curr(cpu) = p; | 7839 | cpu_curr(cpu) = p; |
| 7694 | } | 7840 | } |
| @@ -7819,10 +7965,10 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7819 | 7965 | ||
| 7820 | sched_change_group(tsk, TASK_MOVE_GROUP); | 7966 | sched_change_group(tsk, TASK_MOVE_GROUP); |
| 7821 | 7967 | ||
| 7822 | if (unlikely(running)) | ||
| 7823 | tsk->sched_class->set_curr_task(rq); | ||
| 7824 | if (queued) | 7968 | if (queued) |
| 7825 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); | 7969 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); |
| 7970 | if (unlikely(running)) | ||
| 7971 | set_curr_task(rq, tsk); | ||
| 7826 | 7972 | ||
| 7827 | task_rq_unlock(rq, tsk, &rf); | 7973 | task_rq_unlock(rq, tsk, &rf); |
| 7828 | } | 7974 | } |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index d4184498c9f5..e73119013c53 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
| @@ -31,56 +31,81 @@ static inline int right_child(int i) | |||
| 31 | return (i << 1) + 2; | 31 | return (i << 1) + 2; |
| 32 | } | 32 | } |
| 33 | 33 | ||
| 34 | static void cpudl_exchange(struct cpudl *cp, int a, int b) | 34 | static void cpudl_heapify_down(struct cpudl *cp, int idx) |
| 35 | { | 35 | { |
| 36 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | 36 | int l, r, largest; |
| 37 | 37 | ||
| 38 | swap(cp->elements[a].cpu, cp->elements[b].cpu); | 38 | int orig_cpu = cp->elements[idx].cpu; |
| 39 | swap(cp->elements[a].dl , cp->elements[b].dl ); | 39 | u64 orig_dl = cp->elements[idx].dl; |
| 40 | 40 | ||
| 41 | swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); | 41 | if (left_child(idx) >= cp->size) |
| 42 | } | 42 | return; |
| 43 | |||
| 44 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
| 45 | { | ||
| 46 | int l, r, largest; | ||
| 47 | 43 | ||
| 48 | /* adapted from lib/prio_heap.c */ | 44 | /* adapted from lib/prio_heap.c */ |
| 49 | while(1) { | 45 | while(1) { |
| 46 | u64 largest_dl; | ||
| 50 | l = left_child(idx); | 47 | l = left_child(idx); |
| 51 | r = right_child(idx); | 48 | r = right_child(idx); |
| 52 | largest = idx; | 49 | largest = idx; |
| 50 | largest_dl = orig_dl; | ||
| 53 | 51 | ||
| 54 | if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, | 52 | if ((l < cp->size) && dl_time_before(orig_dl, |
| 55 | cp->elements[l].dl)) | 53 | cp->elements[l].dl)) { |
| 56 | largest = l; | 54 | largest = l; |
| 57 | if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, | 55 | largest_dl = cp->elements[l].dl; |
| 58 | cp->elements[r].dl)) | 56 | } |
| 57 | if ((r < cp->size) && dl_time_before(largest_dl, | ||
| 58 | cp->elements[r].dl)) | ||
| 59 | largest = r; | 59 | largest = r; |
| 60 | |||
| 60 | if (largest == idx) | 61 | if (largest == idx) |
| 61 | break; | 62 | break; |
| 62 | 63 | ||
| 63 | /* Push idx down the heap one level and bump one up */ | 64 | /* pull largest child onto idx */ |
| 64 | cpudl_exchange(cp, largest, idx); | 65 | cp->elements[idx].cpu = cp->elements[largest].cpu; |
| 66 | cp->elements[idx].dl = cp->elements[largest].dl; | ||
| 67 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
| 65 | idx = largest; | 68 | idx = largest; |
| 66 | } | 69 | } |
| 70 | /* actual push down of saved original values orig_* */ | ||
| 71 | cp->elements[idx].cpu = orig_cpu; | ||
| 72 | cp->elements[idx].dl = orig_dl; | ||
| 73 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
| 67 | } | 74 | } |
| 68 | 75 | ||
| 69 | static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) | 76 | static void cpudl_heapify_up(struct cpudl *cp, int idx) |
| 70 | { | 77 | { |
| 71 | WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); | 78 | int p; |
| 72 | 79 | ||
| 73 | if (dl_time_before(new_dl, cp->elements[idx].dl)) { | 80 | int orig_cpu = cp->elements[idx].cpu; |
| 74 | cp->elements[idx].dl = new_dl; | 81 | u64 orig_dl = cp->elements[idx].dl; |
| 75 | cpudl_heapify(cp, idx); | 82 | |
| 76 | } else { | 83 | if (idx == 0) |
| 77 | cp->elements[idx].dl = new_dl; | 84 | return; |
| 78 | while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | 85 | |
| 79 | cp->elements[idx].dl)) { | 86 | do { |
| 80 | cpudl_exchange(cp, idx, parent(idx)); | 87 | p = parent(idx); |
| 81 | idx = parent(idx); | 88 | if (dl_time_before(orig_dl, cp->elements[p].dl)) |
| 82 | } | 89 | break; |
| 83 | } | 90 | /* pull parent onto idx */ |
| 91 | cp->elements[idx].cpu = cp->elements[p].cpu; | ||
| 92 | cp->elements[idx].dl = cp->elements[p].dl; | ||
| 93 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
| 94 | idx = p; | ||
| 95 | } while (idx != 0); | ||
| 96 | /* actual push up of saved original values orig_* */ | ||
| 97 | cp->elements[idx].cpu = orig_cpu; | ||
| 98 | cp->elements[idx].dl = orig_dl; | ||
| 99 | cp->elements[cp->elements[idx].cpu].idx = idx; | ||
| 100 | } | ||
| 101 | |||
| 102 | static void cpudl_heapify(struct cpudl *cp, int idx) | ||
| 103 | { | ||
| 104 | if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, | ||
| 105 | cp->elements[idx].dl)) | ||
| 106 | cpudl_heapify_up(cp, idx); | ||
| 107 | else | ||
| 108 | cpudl_heapify_down(cp, idx); | ||
| 84 | } | 109 | } |
| 85 | 110 | ||
| 86 | static inline int cpudl_maximum(struct cpudl *cp) | 111 | static inline int cpudl_maximum(struct cpudl *cp) |
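The rewritten cpudl_heapify_down()/cpudl_heapify_up() avoid a three-way swap at every level: the displaced element's cpu and deadline are saved once, larger children (or parents) are pulled over it, and the saved values are written back at the final slot, cutting the stores per level roughly in half. The same idea in a plain standalone max-heap, keyed on deadlines only and without the cpu-to-index back-map:

    /* Standalone sift-down that saves the displaced key once and writes it
     * back at its final position, instead of swapping at every level. */
    #include <stdio.h>

    static void heapify_down(unsigned long h[], int size, int idx)
    {
        unsigned long orig = h[idx];

        while (1) {
            int l = 2 * idx + 1, r = 2 * idx + 2, largest = idx;
            unsigned long largest_key = orig;

            if (l < size && h[l] > largest_key) { largest = l; largest_key = h[l]; }
            if (r < size && h[r] > largest_key)   largest = r;
            if (largest == idx)
                break;

            h[idx] = h[largest];   /* pull the larger child up one level */
            idx = largest;
        }
        h[idx] = orig;             /* single final store of the saved key */
    }

    int main(void)
    {
        /* root deadline shrank; restore the max-heap property from index 0 */
        unsigned long heap[] = { 10, 90, 80, 40, 30, 70 };
        int n = sizeof(heap) / sizeof(heap[0]);

        heapify_down(heap, n, 0);
        for (int i = 0; i < n; i++)
            printf("%lu ", heap[i]);
        printf("\n");
        return 0;
    }

The kernel version additionally keeps cp->elements[cpu].idx in sync on every move, which is why each pull updates the back-map as well.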
| @@ -120,16 +145,15 @@ out: | |||
| 120 | } | 145 | } |
| 121 | 146 | ||
| 122 | /* | 147 | /* |
| 123 | * cpudl_set - update the cpudl max-heap | 148 | * cpudl_clear - remove a cpu from the cpudl max-heap |
| 124 | * @cp: the cpudl max-heap context | 149 | * @cp: the cpudl max-heap context |
| 125 | * @cpu: the target cpu | 150 | * @cpu: the target cpu |
| 126 | * @dl: the new earliest deadline for this cpu | ||
| 127 | * | 151 | * |
| 128 | * Notes: assumes cpu_rq(cpu)->lock is locked | 152 | * Notes: assumes cpu_rq(cpu)->lock is locked |
| 129 | * | 153 | * |
| 130 | * Returns: (void) | 154 | * Returns: (void) |
| 131 | */ | 155 | */ |
| 132 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | 156 | void cpudl_clear(struct cpudl *cp, int cpu) |
| 133 | { | 157 | { |
| 134 | int old_idx, new_cpu; | 158 | int old_idx, new_cpu; |
| 135 | unsigned long flags; | 159 | unsigned long flags; |
| @@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | |||
| 137 | WARN_ON(!cpu_present(cpu)); | 161 | WARN_ON(!cpu_present(cpu)); |
| 138 | 162 | ||
| 139 | raw_spin_lock_irqsave(&cp->lock, flags); | 163 | raw_spin_lock_irqsave(&cp->lock, flags); |
| 164 | |||
| 140 | old_idx = cp->elements[cpu].idx; | 165 | old_idx = cp->elements[cpu].idx; |
| 141 | if (!is_valid) { | 166 | if (old_idx == IDX_INVALID) { |
| 142 | /* remove item */ | 167 | /* |
| 143 | if (old_idx == IDX_INVALID) { | 168 | * Nothing to remove if old_idx was invalid. |
| 144 | /* | 169 | * This could happen if a rq_offline_dl is |
| 145 | * Nothing to remove if old_idx was invalid. | 170 | * called for a CPU without -dl tasks running. |
| 146 | * This could happen if a rq_offline_dl is | 171 | */ |
| 147 | * called for a CPU without -dl tasks running. | 172 | } else { |
| 148 | */ | ||
| 149 | goto out; | ||
| 150 | } | ||
| 151 | new_cpu = cp->elements[cp->size - 1].cpu; | 173 | new_cpu = cp->elements[cp->size - 1].cpu; |
| 152 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; | 174 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; |
| 153 | cp->elements[old_idx].cpu = new_cpu; | 175 | cp->elements[old_idx].cpu = new_cpu; |
| 154 | cp->size--; | 176 | cp->size--; |
| 155 | cp->elements[new_cpu].idx = old_idx; | 177 | cp->elements[new_cpu].idx = old_idx; |
| 156 | cp->elements[cpu].idx = IDX_INVALID; | 178 | cp->elements[cpu].idx = IDX_INVALID; |
| 157 | while (old_idx > 0 && dl_time_before( | 179 | cpudl_heapify(cp, old_idx); |
| 158 | cp->elements[parent(old_idx)].dl, | ||
| 159 | cp->elements[old_idx].dl)) { | ||
| 160 | cpudl_exchange(cp, old_idx, parent(old_idx)); | ||
| 161 | old_idx = parent(old_idx); | ||
| 162 | } | ||
| 163 | cpumask_set_cpu(cpu, cp->free_cpus); | ||
| 164 | cpudl_heapify(cp, old_idx); | ||
| 165 | 180 | ||
| 166 | goto out; | 181 | cpumask_set_cpu(cpu, cp->free_cpus); |
| 167 | } | 182 | } |
| 183 | raw_spin_unlock_irqrestore(&cp->lock, flags); | ||
| 184 | } | ||
| 185 | |||
| 186 | /* | ||
| 187 | * cpudl_set - update the cpudl max-heap | ||
| 188 | * @cp: the cpudl max-heap context | ||
| 189 | * @cpu: the target cpu | ||
| 190 | * @dl: the new earliest deadline for this cpu | ||
| 191 | * | ||
| 192 | * Notes: assumes cpu_rq(cpu)->lock is locked | ||
| 193 | * | ||
| 194 | * Returns: (void) | ||
| 195 | */ | ||
| 196 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | ||
| 197 | { | ||
| 198 | int old_idx; | ||
| 199 | unsigned long flags; | ||
| 168 | 200 | ||
| 201 | WARN_ON(!cpu_present(cpu)); | ||
| 202 | |||
| 203 | raw_spin_lock_irqsave(&cp->lock, flags); | ||
| 204 | |||
| 205 | old_idx = cp->elements[cpu].idx; | ||
| 169 | if (old_idx == IDX_INVALID) { | 206 | if (old_idx == IDX_INVALID) { |
| 170 | cp->size++; | 207 | int new_idx = cp->size++; |
| 171 | cp->elements[cp->size - 1].dl = dl; | 208 | cp->elements[new_idx].dl = dl; |
| 172 | cp->elements[cp->size - 1].cpu = cpu; | 209 | cp->elements[new_idx].cpu = cpu; |
| 173 | cp->elements[cpu].idx = cp->size - 1; | 210 | cp->elements[cpu].idx = new_idx; |
| 174 | cpudl_change_key(cp, cp->size - 1, dl); | 211 | cpudl_heapify_up(cp, new_idx); |
| 175 | cpumask_clear_cpu(cpu, cp->free_cpus); | 212 | cpumask_clear_cpu(cpu, cp->free_cpus); |
| 176 | } else { | 213 | } else { |
| 177 | cpudl_change_key(cp, old_idx, dl); | 214 | cp->elements[old_idx].dl = dl; |
| 215 | cpudl_heapify(cp, old_idx); | ||
| 178 | } | 216 | } |
| 179 | 217 | ||
| 180 | out: | ||
| 181 | raw_spin_unlock_irqrestore(&cp->lock, flags); | 218 | raw_spin_unlock_irqrestore(&cp->lock, flags); |
| 182 | } | 219 | } |
| 183 | 220 | ||
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index fcbdf83fed7e..f7da8c55bba0 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
| @@ -23,7 +23,8 @@ struct cpudl { | |||
| 23 | #ifdef CONFIG_SMP | 23 | #ifdef CONFIG_SMP |
| 24 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | 24 | int cpudl_find(struct cpudl *cp, struct task_struct *p, |
| 25 | struct cpumask *later_mask); | 25 | struct cpumask *later_mask); |
| 26 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | 26 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); |
| 27 | void cpudl_clear(struct cpudl *cp, int cpu); | ||
| 27 | int cpudl_init(struct cpudl *cp); | 28 | int cpudl_init(struct cpudl *cp); |
| 28 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); | 29 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); |
| 29 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); | 30 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); |
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 1141954e73b4..dbc51442ecbc 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c | |||
| @@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | |||
| 33 | */ | 33 | */ |
| 34 | void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, | 34 | void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, |
| 35 | void (*func)(struct update_util_data *data, u64 time, | 35 | void (*func)(struct update_util_data *data, u64 time, |
| 36 | unsigned long util, unsigned long max)) | 36 | unsigned int flags)) |
| 37 | { | 37 | { |
| 38 | if (WARN_ON(!data || !func)) | 38 | if (WARN_ON(!data || !func)) |
| 39 | return; | 39 | return; |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a84641b222c1..69e06898997d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
| @@ -12,7 +12,6 @@ | |||
| 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| 13 | 13 | ||
| 14 | #include <linux/cpufreq.h> | 14 | #include <linux/cpufreq.h> |
| 15 | #include <linux/module.h> | ||
| 16 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
| 17 | #include <trace/events/power.h> | 16 | #include <trace/events/power.h> |
| 18 | 17 | ||
| @@ -48,11 +47,14 @@ struct sugov_cpu { | |||
| 48 | struct sugov_policy *sg_policy; | 47 | struct sugov_policy *sg_policy; |
| 49 | 48 | ||
| 50 | unsigned int cached_raw_freq; | 49 | unsigned int cached_raw_freq; |
| 50 | unsigned long iowait_boost; | ||
| 51 | unsigned long iowait_boost_max; | ||
| 52 | u64 last_update; | ||
| 51 | 53 | ||
| 52 | /* The fields below are only needed when sharing a policy. */ | 54 | /* The fields below are only needed when sharing a policy. */ |
| 53 | unsigned long util; | 55 | unsigned long util; |
| 54 | unsigned long max; | 56 | unsigned long max; |
| 55 | u64 last_update; | 57 | unsigned int flags; |
| 56 | }; | 58 | }; |
| 57 | 59 | ||
| 58 | static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); | 60 | static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); |
| @@ -144,24 +146,75 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, | |||
| 144 | return cpufreq_driver_resolve_freq(policy, freq); | 146 | return cpufreq_driver_resolve_freq(policy, freq); |
| 145 | } | 147 | } |
| 146 | 148 | ||
| 149 | static void sugov_get_util(unsigned long *util, unsigned long *max) | ||
| 150 | { | ||
| 151 | struct rq *rq = this_rq(); | ||
| 152 | unsigned long cfs_max; | ||
| 153 | |||
| 154 | cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); | ||
| 155 | |||
| 156 | *util = min(rq->cfs.avg.util_avg, cfs_max); | ||
| 157 | *max = cfs_max; | ||
| 158 | } | ||
| 159 | |||
| 160 | static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, | ||
| 161 | unsigned int flags) | ||
| 162 | { | ||
| 163 | if (flags & SCHED_CPUFREQ_IOWAIT) { | ||
| 164 | sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; | ||
| 165 | } else if (sg_cpu->iowait_boost) { | ||
| 166 | s64 delta_ns = time - sg_cpu->last_update; | ||
| 167 | |||
| 168 | /* Clear iowait_boost if the CPU appears to have been idle. */ | ||
| 169 | if (delta_ns > TICK_NSEC) | ||
| 170 | sg_cpu->iowait_boost = 0; | ||
| 171 | } | ||
| 172 | } | ||
| 173 | |||
| 174 | static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, | ||
| 175 | unsigned long *max) | ||
| 176 | { | ||
| 177 | unsigned long boost_util = sg_cpu->iowait_boost; | ||
| 178 | unsigned long boost_max = sg_cpu->iowait_boost_max; | ||
| 179 | |||
| 180 | if (!boost_util) | ||
| 181 | return; | ||
| 182 | |||
| 183 | if (*util * boost_max < *max * boost_util) { | ||
| 184 | *util = boost_util; | ||
| 185 | *max = boost_max; | ||
| 186 | } | ||
| 187 | sg_cpu->iowait_boost >>= 1; | ||
| 188 | } | ||
| 189 | |||
| 147 | static void sugov_update_single(struct update_util_data *hook, u64 time, | 190 | static void sugov_update_single(struct update_util_data *hook, u64 time, |
| 148 | unsigned long util, unsigned long max) | 191 | unsigned int flags) |
| 149 | { | 192 | { |
| 150 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 193 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
| 151 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 194 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
| 152 | struct cpufreq_policy *policy = sg_policy->policy; | 195 | struct cpufreq_policy *policy = sg_policy->policy; |
| 196 | unsigned long util, max; | ||
| 153 | unsigned int next_f; | 197 | unsigned int next_f; |
| 154 | 198 | ||
| 199 | sugov_set_iowait_boost(sg_cpu, time, flags); | ||
| 200 | sg_cpu->last_update = time; | ||
| 201 | |||
| 155 | if (!sugov_should_update_freq(sg_policy, time)) | 202 | if (!sugov_should_update_freq(sg_policy, time)) |
| 156 | return; | 203 | return; |
| 157 | 204 | ||
| 158 | next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : | 205 | if (flags & SCHED_CPUFREQ_RT_DL) { |
| 159 | get_next_freq(sg_cpu, util, max); | 206 | next_f = policy->cpuinfo.max_freq; |
| 207 | } else { | ||
| 208 | sugov_get_util(&util, &max); | ||
| 209 | sugov_iowait_boost(sg_cpu, &util, &max); | ||
| 210 | next_f = get_next_freq(sg_cpu, util, max); | ||
| 211 | } | ||
| 160 | sugov_update_commit(sg_policy, time, next_f); | 212 | sugov_update_commit(sg_policy, time, next_f); |
| 161 | } | 213 | } |
| 162 | 214 | ||
| 163 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | 215 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, |
| 164 | unsigned long util, unsigned long max) | 216 | unsigned long util, unsigned long max, |
| 217 | unsigned int flags) | ||
| 165 | { | 218 | { |
| 166 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 219 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
| 167 | struct cpufreq_policy *policy = sg_policy->policy; | 220 | struct cpufreq_policy *policy = sg_policy->policy; |
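The schedutil hunks add an "iowait boost": on a wakeup flagged as coming from I/O the boost jumps to its maximum, and every later update halves it; when choosing a frequency, the boosted ratio replaces the utilization ratio only if it is larger, compared by cross-multiplication so no division is needed. A compact standalone model of that decay and comparison is below; the numeric values are arbitrary and the struct is illustrative.

    /* Model of schedutil's iowait boost: set to max on an I/O wakeup, halved
     * on every later update, applied via a cross-multiplied ratio compare. */
    #include <stdio.h>

    struct boost { unsigned long cur, max; };

    static void note_update(struct boost *b, int iowait_wakeup)
    {
        if (iowait_wakeup)
            b->cur = b->max;      /* full boost right after an I/O wakeup */
        else
            b->cur >>= 1;         /* decay by half on every other update */
    }

    /* Use the boost if cur/max exceeds util/max_cap, i.e. if
     * util * boost_max < max_cap * boost_cur, avoiding any division. */
    static void apply_boost(const struct boost *b,
                            unsigned long *util, unsigned long *max)
    {
        if (!b->cur)
            return;
        if (*util * b->max < *max * b->cur) {
            *util = b->cur;
            *max  = b->max;
        }
    }

    int main(void)
    {
        struct boost b = { .cur = 0, .max = 1800000 };  /* kHz-style ceiling */
        unsigned long util = 200, max = 1024;

        note_update(&b, 1);                   /* wakeup from I/O */
        for (int i = 0; i < 4; i++) {
            unsigned long u = util, m = max;
            apply_boost(&b, &u, &m);
            printf("update %d: util/max = %lu/%lu\n", i, u, m);
            note_update(&b, 0);               /* decay on later updates */
        }
        return 0;
    }

The halving means the boost fades within a handful of updates once the I/O burst stops, so it lifts frequency for bursty I/O without pinning it high.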
| @@ -169,9 +222,11 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | |||
| 169 | u64 last_freq_update_time = sg_policy->last_freq_update_time; | 222 | u64 last_freq_update_time = sg_policy->last_freq_update_time; |
| 170 | unsigned int j; | 223 | unsigned int j; |
| 171 | 224 | ||
| 172 | if (util == ULONG_MAX) | 225 | if (flags & SCHED_CPUFREQ_RT_DL) |
| 173 | return max_f; | 226 | return max_f; |
| 174 | 227 | ||
| 228 | sugov_iowait_boost(sg_cpu, &util, &max); | ||
| 229 | |||
| 175 | for_each_cpu(j, policy->cpus) { | 230 | for_each_cpu(j, policy->cpus) { |
| 176 | struct sugov_cpu *j_sg_cpu; | 231 | struct sugov_cpu *j_sg_cpu; |
| 177 | unsigned long j_util, j_max; | 232 | unsigned long j_util, j_max; |
| @@ -186,41 +241,50 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | |||
| 186 | * frequency update and the time elapsed between the last update | 241 | * frequency update and the time elapsed between the last update |
| 187 | * of the CPU utilization and the last frequency update is long | 242 | * of the CPU utilization and the last frequency update is long |
| 188 | * enough, don't take the CPU into account as it probably is | 243 | * enough, don't take the CPU into account as it probably is |
| 189 | * idle now. | 244 | * idle now (and clear iowait_boost for it). |
| 190 | */ | 245 | */ |
| 191 | delta_ns = last_freq_update_time - j_sg_cpu->last_update; | 246 | delta_ns = last_freq_update_time - j_sg_cpu->last_update; |
| 192 | if (delta_ns > TICK_NSEC) | 247 | if (delta_ns > TICK_NSEC) { |
| 248 | j_sg_cpu->iowait_boost = 0; | ||
| 193 | continue; | 249 | continue; |
| 194 | 250 | } | |
| 195 | j_util = j_sg_cpu->util; | 251 | if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) |
| 196 | if (j_util == ULONG_MAX) | ||
| 197 | return max_f; | 252 | return max_f; |
| 198 | 253 | ||
| 254 | j_util = j_sg_cpu->util; | ||
| 199 | j_max = j_sg_cpu->max; | 255 | j_max = j_sg_cpu->max; |
| 200 | if (j_util * max > j_max * util) { | 256 | if (j_util * max > j_max * util) { |
| 201 | util = j_util; | 257 | util = j_util; |
| 202 | max = j_max; | 258 | max = j_max; |
| 203 | } | 259 | } |
| 260 | |||
| 261 | sugov_iowait_boost(j_sg_cpu, &util, &max); | ||
| 204 | } | 262 | } |
| 205 | 263 | ||
| 206 | return get_next_freq(sg_cpu, util, max); | 264 | return get_next_freq(sg_cpu, util, max); |
| 207 | } | 265 | } |
| 208 | 266 | ||
| 209 | static void sugov_update_shared(struct update_util_data *hook, u64 time, | 267 | static void sugov_update_shared(struct update_util_data *hook, u64 time, |
| 210 | unsigned long util, unsigned long max) | 268 | unsigned int flags) |
| 211 | { | 269 | { |
| 212 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 270 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
| 213 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 271 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
| 272 | unsigned long util, max; | ||
| 214 | unsigned int next_f; | 273 | unsigned int next_f; |
| 215 | 274 | ||
| 275 | sugov_get_util(&util, &max); | ||
| 276 | |||
| 216 | raw_spin_lock(&sg_policy->update_lock); | 277 | raw_spin_lock(&sg_policy->update_lock); |
| 217 | 278 | ||
| 218 | sg_cpu->util = util; | 279 | sg_cpu->util = util; |
| 219 | sg_cpu->max = max; | 280 | sg_cpu->max = max; |
| 281 | sg_cpu->flags = flags; | ||
| 282 | |||
| 283 | sugov_set_iowait_boost(sg_cpu, time, flags); | ||
| 220 | sg_cpu->last_update = time; | 284 | sg_cpu->last_update = time; |
| 221 | 285 | ||
| 222 | if (sugov_should_update_freq(sg_policy, time)) { | 286 | if (sugov_should_update_freq(sg_policy, time)) { |
| 223 | next_f = sugov_next_freq_shared(sg_cpu, util, max); | 287 | next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); |
| 224 | sugov_update_commit(sg_policy, time, next_f); | 288 | sugov_update_commit(sg_policy, time, next_f); |
| 225 | } | 289 | } |
| 226 | 290 | ||
| @@ -444,10 +508,13 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
| 444 | 508 | ||
| 445 | sg_cpu->sg_policy = sg_policy; | 509 | sg_cpu->sg_policy = sg_policy; |
| 446 | if (policy_is_shared(policy)) { | 510 | if (policy_is_shared(policy)) { |
| 447 | sg_cpu->util = ULONG_MAX; | 511 | sg_cpu->util = 0; |
| 448 | sg_cpu->max = 0; | 512 | sg_cpu->max = 0; |
| 513 | sg_cpu->flags = SCHED_CPUFREQ_RT; | ||
| 449 | sg_cpu->last_update = 0; | 514 | sg_cpu->last_update = 0; |
| 450 | sg_cpu->cached_raw_freq = 0; | 515 | sg_cpu->cached_raw_freq = 0; |
| 516 | sg_cpu->iowait_boost = 0; | ||
| 517 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; | ||
| 451 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, | 518 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, |
| 452 | sugov_update_shared); | 519 | sugov_update_shared); |
| 453 | } else { | 520 | } else { |
| @@ -495,28 +562,15 @@ static struct cpufreq_governor schedutil_gov = { | |||
| 495 | .limits = sugov_limits, | 562 | .limits = sugov_limits, |
| 496 | }; | 563 | }; |
| 497 | 564 | ||
| 498 | static int __init sugov_module_init(void) | ||
| 499 | { | ||
| 500 | return cpufreq_register_governor(&schedutil_gov); | ||
| 501 | } | ||
| 502 | |||
| 503 | static void __exit sugov_module_exit(void) | ||
| 504 | { | ||
| 505 | cpufreq_unregister_governor(&schedutil_gov); | ||
| 506 | } | ||
| 507 | |||
| 508 | MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>"); | ||
| 509 | MODULE_DESCRIPTION("Utilization-based CPU frequency selection"); | ||
| 510 | MODULE_LICENSE("GPL"); | ||
| 511 | |||
| 512 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL | 565 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL |
| 513 | struct cpufreq_governor *cpufreq_default_governor(void) | 566 | struct cpufreq_governor *cpufreq_default_governor(void) |
| 514 | { | 567 | { |
| 515 | return &schedutil_gov; | 568 | return &schedutil_gov; |
| 516 | } | 569 | } |
| 517 | |||
| 518 | fs_initcall(sugov_module_init); | ||
| 519 | #else | ||
| 520 | module_init(sugov_module_init); | ||
| 521 | #endif | 570 | #endif |
| 522 | module_exit(sugov_module_exit); | 571 | |
| 572 | static int __init sugov_register(void) | ||
| 573 | { | ||
| 574 | return cpufreq_register_governor(&schedutil_gov); | ||
| 575 | } | ||
| 576 | fs_initcall(sugov_register); | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9858266fb0b3..5ebee3164e64 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -23,10 +23,8 @@ | |||
| 23 | * task when irq is in progress while we read rq->clock. That is a worthy | 23 | * task when irq is in progress while we read rq->clock. That is a worthy |
| 24 | * compromise in place of having locks on each irq in account_system_time. | 24 | * compromise in place of having locks on each irq in account_system_time. |
| 25 | */ | 25 | */ |
| 26 | DEFINE_PER_CPU(u64, cpu_hardirq_time); | 26 | DEFINE_PER_CPU(struct irqtime, cpu_irqtime); |
| 27 | DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
| 28 | 27 | ||
| 29 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
| 30 | static int sched_clock_irqtime; | 28 | static int sched_clock_irqtime; |
| 31 | 29 | ||
| 32 | void enable_sched_clock_irqtime(void) | 30 | void enable_sched_clock_irqtime(void) |
| @@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void) | |||
| 39 | sched_clock_irqtime = 0; | 37 | sched_clock_irqtime = 0; |
| 40 | } | 38 | } |
| 41 | 39 | ||
| 42 | #ifndef CONFIG_64BIT | ||
| 43 | DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 44 | #endif /* CONFIG_64BIT */ | ||
| 45 | |||
| 46 | /* | 40 | /* |
| 47 | * Called before incrementing preempt_count on {soft,}irq_enter | 41 | * Called before incrementing preempt_count on {soft,}irq_enter |
| 48 | * and before decrementing preempt_count on {soft,}irq_exit. | 42 | * and before decrementing preempt_count on {soft,}irq_exit. |
| 49 | */ | 43 | */ |
| 50 | void irqtime_account_irq(struct task_struct *curr) | 44 | void irqtime_account_irq(struct task_struct *curr) |
| 51 | { | 45 | { |
| 46 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); | ||
| 52 | s64 delta; | 47 | s64 delta; |
| 53 | int cpu; | 48 | int cpu; |
| 54 | 49 | ||
| @@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr) | |||
| 56 | return; | 51 | return; |
| 57 | 52 | ||
| 58 | cpu = smp_processor_id(); | 53 | cpu = smp_processor_id(); |
| 59 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | 54 | delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; |
| 60 | __this_cpu_add(irq_start_time, delta); | 55 | irqtime->irq_start_time += delta; |
| 61 | 56 | ||
| 62 | irq_time_write_begin(); | 57 | u64_stats_update_begin(&irqtime->sync); |
| 63 | /* | 58 | /* |
| 64 | * We do not account for softirq time from ksoftirqd here. | 59 | * We do not account for softirq time from ksoftirqd here. |
| 65 | * We want to continue accounting softirq time to ksoftirqd thread | 60 | * We want to continue accounting softirq time to ksoftirqd thread |
| @@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr) | |||
| 67 | * that do not consume any time, but still wants to run. | 62 | * that do not consume any time, but still wants to run. |
| 68 | */ | 63 | */ |
| 69 | if (hardirq_count()) | 64 | if (hardirq_count()) |
| 70 | __this_cpu_add(cpu_hardirq_time, delta); | 65 | irqtime->hardirq_time += delta; |
| 71 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | 66 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) |
| 72 | __this_cpu_add(cpu_softirq_time, delta); | 67 | irqtime->softirq_time += delta; |
| 73 | 68 | ||
| 74 | irq_time_write_end(); | 69 | u64_stats_update_end(&irqtime->sync); |
| 75 | } | 70 | } |
| 76 | EXPORT_SYMBOL_GPL(irqtime_account_irq); | 71 | EXPORT_SYMBOL_GPL(irqtime_account_irq); |
| 77 | 72 | ||
| 78 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) | 73 | static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) |
| 79 | { | 74 | { |
| 80 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 75 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
| 81 | unsigned long flags; | ||
| 82 | cputime_t irq_cputime; | 76 | cputime_t irq_cputime; |
| 83 | 77 | ||
| 84 | local_irq_save(flags); | 78 | irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; |
| 85 | irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - | ||
| 86 | cpustat[CPUTIME_IRQ]; | ||
| 87 | irq_cputime = min(irq_cputime, maxtime); | 79 | irq_cputime = min(irq_cputime, maxtime); |
| 88 | cpustat[CPUTIME_IRQ] += irq_cputime; | 80 | cpustat[idx] += irq_cputime; |
| 89 | local_irq_restore(flags); | 81 | |
| 90 | return irq_cputime; | 82 | return irq_cputime; |
| 91 | } | 83 | } |
| 92 | 84 | ||
| 93 | static cputime_t irqtime_account_si_update(cputime_t maxtime) | 85 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) |
| 94 | { | 86 | { |
| 95 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 87 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), |
| 96 | unsigned long flags; | 88 | CPUTIME_IRQ, maxtime); |
| 97 | cputime_t softirq_cputime; | 89 | } |
| 98 | 90 | ||
| 99 | local_irq_save(flags); | 91 | static cputime_t irqtime_account_si_update(cputime_t maxtime) |
| 100 | softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - | 92 | { |
| 101 | cpustat[CPUTIME_SOFTIRQ]; | 93 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), |
| 102 | softirq_cputime = min(softirq_cputime, maxtime); | 94 | CPUTIME_SOFTIRQ, maxtime); |
| 103 | cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; | ||
| 104 | local_irq_restore(flags); | ||
| 105 | return softirq_cputime; | ||
| 106 | } | 95 | } |
| 107 | 96 | ||
| 108 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 97 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
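Grouping the per-CPU irq time counters into one struct lets them be protected by a u64_stats-style sequence counter, so 32-bit readers can retry around a concurrent update instead of the writer taking an irq-save lock. The sketch below is a user-space model of that writer/reader protocol with a bare sequence counter standing in for the u64_stats helpers; it assumes a single writer per structure, as the per-CPU usage does.

    /* Model of a seqcount-protected pair of 64-bit counters: the writer bumps
     * the sequence around updates, readers retry if they raced with a write. */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct irqtime {
        uint64_t hardirq_time;
        uint64_t softirq_time;
        _Atomic unsigned seq;
    };

    static void account(struct irqtime *it, uint64_t delta, int hardirq)
    {
        atomic_store_explicit(&it->seq, it->seq + 1, memory_order_release);
        if (hardirq)
            it->hardirq_time += delta;
        else
            it->softirq_time += delta;
        atomic_store_explicit(&it->seq, it->seq + 1, memory_order_release);
    }

    static uint64_t read_total(struct irqtime *it)
    {
        unsigned s;
        uint64_t sum;

        do {   /* retry while a write is in progress or raced with us */
            s = atomic_load_explicit(&it->seq, memory_order_acquire);
            sum = it->hardirq_time + it->softirq_time;
        } while ((s & 1) ||
                 s != atomic_load_explicit(&it->seq, memory_order_acquire));
        return sum;
    }

    int main(void)
    {
        struct irqtime it = { 0, 0, 0 };
        account(&it, 1500, 1);
        account(&it, 700, 0);
        printf("irq time total: %llu ns\n",
               (unsigned long long)read_total(&it));
        return 0;
    }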
| @@ -263,6 +252,11 @@ void account_idle_time(cputime_t cputime) | |||
| 263 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | 252 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; |
| 264 | } | 253 | } |
| 265 | 254 | ||
| 255 | /* | ||
| 256 | * When a guest is interrupted for a longer amount of time, missed clock | ||
| 257 | * ticks are not redelivered later. Due to that, this function may on | ||
| 258 | * occasion account more time than the calling functions think elapsed. | ||
| 259 | */ | ||
| 266 | static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) | 260 | static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) |
| 267 | { | 261 | { |
| 268 | #ifdef CONFIG_PARAVIRT | 262 | #ifdef CONFIG_PARAVIRT |
| @@ -290,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max) | |||
| 290 | { | 284 | { |
| 291 | cputime_t accounted; | 285 | cputime_t accounted; |
| 292 | 286 | ||
| 287 | /* Shall be converted to a lockdep-enabled lightweight check */ | ||
| 288 | WARN_ON_ONCE(!irqs_disabled()); | ||
| 289 | |||
| 293 | accounted = steal_account_process_time(max); | 290 | accounted = steal_account_process_time(max); |
| 294 | 291 | ||
| 295 | if (accounted < max) | 292 | if (accounted < max) |
| @@ -301,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max) | |||
| 301 | return accounted; | 298 | return accounted; |
| 302 | } | 299 | } |
| 303 | 300 | ||
| 301 | #ifdef CONFIG_64BIT | ||
| 302 | static inline u64 read_sum_exec_runtime(struct task_struct *t) | ||
| 303 | { | ||
| 304 | return t->se.sum_exec_runtime; | ||
| 305 | } | ||
| 306 | #else | ||
| 307 | static u64 read_sum_exec_runtime(struct task_struct *t) | ||
| 308 | { | ||
| 309 | u64 ns; | ||
| 310 | struct rq_flags rf; | ||
| 311 | struct rq *rq; | ||
| 312 | |||
| 313 | rq = task_rq_lock(t, &rf); | ||
| 314 | ns = t->se.sum_exec_runtime; | ||
| 315 | task_rq_unlock(rq, t, &rf); | ||
| 316 | |||
| 317 | return ns; | ||
| 318 | } | ||
| 319 | #endif | ||
| 320 | |||
| 304 | /* | 321 | /* |
| 305 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live | 322 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live |
| 306 | * tasks (sum on group iteration) belonging to @tsk's group. | 323 | * tasks (sum on group iteration) belonging to @tsk's group. |
| @@ -313,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
| 313 | unsigned int seq, nextseq; | 330 | unsigned int seq, nextseq; |
| 314 | unsigned long flags; | 331 | unsigned long flags; |
| 315 | 332 | ||
| 333 | /* | ||
| 334 | * Update current task runtime to account pending time since last | ||
| 335 | * scheduler action or thread_group_cputime() call. This thread group | ||
| 336 | * might have other running tasks on different CPUs, but updating | ||
| 337 | * their runtime can affect syscall performance, so we skip accounting | ||
| 338 | * those pending times and rely only on values updated on tick or | ||
| 339 | * other scheduler action. | ||
| 340 | */ | ||
| 341 | if (same_thread_group(current, tsk)) | ||
| 342 | (void) task_sched_runtime(current); | ||
| 343 | |||
| 316 | rcu_read_lock(); | 344 | rcu_read_lock(); |
| 317 | /* Attempt a lockless read on the first round. */ | 345 | /* Attempt a lockless read on the first round. */ |
| 318 | nextseq = 0; | 346 | nextseq = 0; |
| @@ -327,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
| 327 | task_cputime(t, &utime, &stime); | 355 | task_cputime(t, &utime, &stime); |
| 328 | times->utime += utime; | 356 | times->utime += utime; |
| 329 | times->stime += stime; | 357 | times->stime += stime; |
| 330 | times->sum_exec_runtime += task_sched_runtime(t); | 358 | times->sum_exec_runtime += read_sum_exec_runtime(t); |
| 331 | } | 359 | } |
| 332 | /* If lockless access failed, take the lock. */ | 360 | /* If lockless access failed, take the lock. */ |
| 333 | nextseq = 1; | 361 | nextseq = 1; |
| @@ -371,7 +399,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
| 371 | * idle, or potentially user or system time. Due to rounding, | 399 | * idle, or potentially user or system time. Due to rounding, |
| 372 | * other time can exceed ticks occasionally. | 400 | * other time can exceed ticks occasionally. |
| 373 | */ | 401 | */ |
| 374 | other = account_other_time(cputime); | 402 | other = account_other_time(ULONG_MAX); |
| 375 | if (other >= cputime) | 403 | if (other >= cputime) |
| 376 | return; | 404 | return; |
| 377 | cputime -= other; | 405 | cputime -= other; |
| @@ -486,7 +514,7 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
| 486 | } | 514 | } |
| 487 | 515 | ||
| 488 | cputime = cputime_one_jiffy; | 516 | cputime = cputime_one_jiffy; |
| 489 | steal = steal_account_process_time(cputime); | 517 | steal = steal_account_process_time(ULONG_MAX); |
| 490 | 518 | ||
| 491 | if (steal >= cputime) | 519 | if (steal >= cputime) |
| 492 | return; | 520 | return; |
| @@ -516,7 +544,7 @@ void account_idle_ticks(unsigned long ticks) | |||
| 516 | } | 544 | } |
| 517 | 545 | ||
| 518 | cputime = jiffies_to_cputime(ticks); | 546 | cputime = jiffies_to_cputime(ticks); |
| 519 | steal = steal_account_process_time(cputime); | 547 | steal = steal_account_process_time(ULONG_MAX); |
| 520 | 548 | ||
| 521 | if (steal >= cputime) | 549 | if (steal >= cputime) |
| 522 | return; | 550 | return; |
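Both tick hunks above stop clamping the steal query to one tick: steal_account_process_time(ULONG_MAX) drains everything that is pending, and the tick is charged to the task only if the stolen time did not already cover it. A hedged sketch of that control flow (pending_other_time is an illustrative stand-in for the paravirt steal counter):

#include <limits.h>

static unsigned long pending_other_time;	/* pretend steal/irq backlog */

static unsigned long account_other_time_model(unsigned long max)
{
	unsigned long other = pending_other_time;

	if (other > max)
		other = max;
	pending_other_time -= other;
	return other;
}

/* Charge one tick: "other" time is drained in full (ULONG_MAX), and the
 * tick goes to user/system time only if something is left of it. */
static unsigned long charge_tick_model(unsigned long one_tick)
{
	unsigned long other = account_other_time_model(ULONG_MAX);

	if (other >= one_tick)
		return 0;		/* tick fully covered by other time */
	return one_tick - other;	/* remainder is charged to the task */
}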
| @@ -614,19 +642,25 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 614 | stime = curr->stime; | 642 | stime = curr->stime; |
| 615 | utime = curr->utime; | 643 | utime = curr->utime; |
| 616 | 644 | ||
| 617 | if (utime == 0) { | 645 | /* |
| 618 | stime = rtime; | 646 | * If either stime or both stime and utime are 0, assume all runtime is |
| 647 | * userspace. Once a task gets some ticks, the monotonicity code at | ||
| 648 | * 'update' will ensure things converge to the observed ratio. | ||
| 649 | */ | ||
| 650 | if (stime == 0) { | ||
| 651 | utime = rtime; | ||
| 619 | goto update; | 652 | goto update; |
| 620 | } | 653 | } |
| 621 | 654 | ||
| 622 | if (stime == 0) { | 655 | if (utime == 0) { |
| 623 | utime = rtime; | 656 | stime = rtime; |
| 624 | goto update; | 657 | goto update; |
| 625 | } | 658 | } |
| 626 | 659 | ||
| 627 | stime = scale_stime((__force u64)stime, (__force u64)rtime, | 660 | stime = scale_stime((__force u64)stime, (__force u64)rtime, |
| 628 | (__force u64)(stime + utime)); | 661 | (__force u64)(stime + utime)); |
| 629 | 662 | ||
| 663 | update: | ||
| 630 | /* | 664 | /* |
| 631 | * Make sure stime doesn't go backwards; this preserves monotonicity | 665 | * Make sure stime doesn't go backwards; this preserves monotonicity |
| 632 | * for utime because rtime is monotonic. | 666 | * for utime because rtime is monotonic. |
| @@ -649,7 +683,6 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 649 | stime = rtime - utime; | 683 | stime = rtime - utime; |
| 650 | } | 684 | } |
| 651 | 685 | ||
| 652 | update: | ||
| 653 | prev->stime = stime; | 686 | prev->stime = stime; |
| 654 | prev->utime = utime; | 687 | prev->utime = utime; |
| 655 | out: | 688 | out: |
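The reordered cputime_adjust() above first handles the degenerate cases (no system ticks, then no user ticks), then splits the monotonic rtime in the observed stime:utime ratio, and finally clamps both values so neither report ever goes backwards. A user-space rendition of that arithmetic, with prev_stime/prev_utime standing in for prev->stime/prev->utime and a gcc/clang __int128 doing the widening multiply of scale_stime():

#include <stdint.h>

static uint64_t scale_stime_model(uint64_t stime, uint64_t rtime,
				  uint64_t total)
{
	return (uint64_t)(((unsigned __int128)stime * rtime) / total);
}

static void cputime_adjust_model(uint64_t rtime, uint64_t stime, uint64_t utime,
				 uint64_t *prev_stime, uint64_t *prev_utime)
{
	if (stime == 0) {		/* no system ticks yet: all user time */
		utime = rtime;
		goto update;
	}
	if (utime == 0) {		/* no user ticks yet: all system time */
		stime = rtime;
		goto update;
	}

	stime = scale_stime_model(stime, rtime, stime + utime);

update:
	if (stime < *prev_stime)	/* stime must not go backwards */
		stime = *prev_stime;
	utime = rtime - stime;
	if (utime < *prev_utime) {	/* neither must utime */
		utime = *prev_utime;
		stime = rtime - utime;
	}
	*prev_stime = stime;
	*prev_utime = utime;
}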
| @@ -694,6 +727,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) | |||
| 694 | unsigned long now = READ_ONCE(jiffies); | 727 | unsigned long now = READ_ONCE(jiffies); |
| 695 | cputime_t delta, other; | 728 | cputime_t delta, other; |
| 696 | 729 | ||
| 730 | /* | ||
| 731 | * Unlike tick based timing, vtime based timing never has lost | ||
| 732 | * ticks, and there is no need for steal time accounting to make up for | ||
| 733 | * lost ticks. Vtime accounts a rounded version of actual | ||
| 734 | * elapsed time. Limit account_other_time to prevent rounding | ||
| 735 | * errors from causing elapsed vtime to go negative. | ||
| 736 | */ | ||
| 697 | delta = jiffies_to_cputime(now - tsk->vtime_snap); | 737 | delta = jiffies_to_cputime(now - tsk->vtime_snap); |
| 698 | other = account_other_time(delta); | 738 | other = account_other_time(delta); |
| 699 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); | 739 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); |
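The new comment in get_vtime_delta() explains why this path, unlike the tick paths above, still bounds account_other_time() by the locally elapsed delta: vtime never loses ticks, so any steal/irq time in excess of the delta can only be a rounding artifact and would drive the remaining vtime negative. A small sketch of that clamp, with illustrative names:

#include <stdint.h>

/* Elapsed jiffies since the last snapshot, minus whatever of it was
 * really steal/irq time -- never negative by construction. */
static uint64_t get_vtime_delta_model(uint64_t now_jiffies,
				      uint64_t *vtime_snap,
				      uint64_t other_available)
{
	uint64_t delta = now_jiffies - *vtime_snap;
	uint64_t other = other_available < delta ? other_available : delta;

	*vtime_snap = now_jiffies;
	return delta - other;
}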
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1ce8867283dc..37e2449186c4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | |||
| 243 | static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) | 243 | static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) |
| 244 | { | 244 | { |
| 245 | struct rq *later_rq = NULL; | 245 | struct rq *later_rq = NULL; |
| 246 | bool fallback = false; | ||
| 247 | 246 | ||
| 248 | later_rq = find_lock_later_rq(p, rq); | 247 | later_rq = find_lock_later_rq(p, rq); |
| 249 | |||
| 250 | if (!later_rq) { | 248 | if (!later_rq) { |
| 251 | int cpu; | 249 | int cpu; |
| 252 | 250 | ||
| @@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
| 254 | * If we cannot preempt any rq, fall back to pick any | 252 | * If we cannot preempt any rq, fall back to pick any |
| 255 | * online cpu. | 253 | * online cpu. |
| 256 | */ | 254 | */ |
| 257 | fallback = true; | ||
| 258 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | 255 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); |
| 259 | if (cpu >= nr_cpu_ids) { | 256 | if (cpu >= nr_cpu_ids) { |
| 260 | /* | 257 | /* |
| @@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
| 274 | double_lock_balance(rq, later_rq); | 271 | double_lock_balance(rq, later_rq); |
| 275 | } | 272 | } |
| 276 | 273 | ||
| 277 | /* | ||
| 278 | * By now the task is replenished and enqueued; migrate it. | ||
| 279 | */ | ||
| 280 | deactivate_task(rq, p, 0); | ||
| 281 | set_task_cpu(p, later_rq->cpu); | 274 | set_task_cpu(p, later_rq->cpu); |
| 282 | activate_task(later_rq, p, 0); | ||
| 283 | |||
| 284 | if (!fallback) | ||
| 285 | resched_curr(later_rq); | ||
| 286 | |||
| 287 | double_unlock_balance(later_rq, rq); | 275 | double_unlock_balance(later_rq, rq); |
| 288 | 276 | ||
| 289 | return later_rq; | 277 | return later_rq; |
| @@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | |||
| 346 | * one, and to (try to!) reconcile itself with its own scheduling | 334 | * one, and to (try to!) reconcile itself with its own scheduling |
| 347 | * parameters. | 335 | * parameters. |
| 348 | */ | 336 | */ |
| 349 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | 337 | static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) |
| 350 | struct sched_dl_entity *pi_se) | ||
| 351 | { | 338 | { |
| 352 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | 339 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); |
| 353 | struct rq *rq = rq_of_dl_rq(dl_rq); | 340 | struct rq *rq = rq_of_dl_rq(dl_rq); |
| 354 | 341 | ||
| 342 | WARN_ON(dl_se->dl_boosted); | ||
| 355 | WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); | 343 | WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); |
| 356 | 344 | ||
| 357 | /* | 345 | /* |
| @@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | |||
| 367 | * future; in fact, we must consider execution overheads (time | 355 | * future; in fact, we must consider execution overheads (time |
| 368 | * spent on hardirq context, etc.). | 356 | * spent on hardirq context, etc.). |
| 369 | */ | 357 | */ |
| 370 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 358 | dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; |
| 371 | dl_se->runtime = pi_se->dl_runtime; | 359 | dl_se->runtime = dl_se->dl_runtime; |
| 372 | } | 360 | } |
| 373 | 361 | ||
| 374 | /* | 362 | /* |
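With the pi_se argument gone, setup_new_dl_entity() above replenishes a deadline entity purely from its own parameters: the absolute deadline becomes rq_clock(rq) plus the relative deadline, and the runtime budget is refilled. A stripped-down model of that replenishment (nanosecond clock, field names mirroring sched_dl_entity):

#include <stdint.h>

struct dl_entity_model {
	/* static parameters set via sched_setattr() */
	uint64_t dl_runtime;	/* budget per period */
	uint64_t dl_deadline;	/* relative deadline */
	/* dynamic state */
	uint64_t deadline;	/* absolute deadline */
	int64_t  runtime;	/* remaining budget */
};

static void setup_new_dl_entity_model(struct dl_entity_model *dl_se,
				      uint64_t rq_clock_now)
{
	/* The new parameters take effect from now on. */
	dl_se->deadline = rq_clock_now + dl_se->dl_deadline;
	dl_se->runtime  = dl_se->dl_runtime;
}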
| @@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 641 | goto unlock; | 629 | goto unlock; |
| 642 | } | 630 | } |
| 643 | 631 | ||
| 644 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
| 645 | if (dl_task(rq->curr)) | ||
| 646 | check_preempt_curr_dl(rq, p, 0); | ||
| 647 | else | ||
| 648 | resched_curr(rq); | ||
| 649 | |||
| 650 | #ifdef CONFIG_SMP | 632 | #ifdef CONFIG_SMP |
| 651 | /* | ||
| 652 | * Perform balancing operations here; after the replenishments. We | ||
| 653 | * cannot drop rq->lock before this, otherwise the assertion in | ||
| 654 | * start_dl_timer() about not missing updates is not true. | ||
| 655 | * | ||
| 656 | * If we find that the rq the task was on is no longer available, we | ||
| 657 | * need to select a new rq. | ||
| 658 | * | ||
| 659 | * XXX figure out if select_task_rq_dl() deals with offline cpus. | ||
| 660 | */ | ||
| 661 | if (unlikely(!rq->online)) { | 633 | if (unlikely(!rq->online)) { |
| 634 | /* | ||
| 635 | * If the runqueue is no longer available, migrate the | ||
| 636 | * task elsewhere. This necessarily changes rq. | ||
| 637 | */ | ||
| 662 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 638 | lockdep_unpin_lock(&rq->lock, rf.cookie); |
| 663 | rq = dl_task_offline_migration(rq, p); | 639 | rq = dl_task_offline_migration(rq, p); |
| 664 | rf.cookie = lockdep_pin_lock(&rq->lock); | 640 | rf.cookie = lockdep_pin_lock(&rq->lock); |
| 641 | |||
| 642 | /* | ||
| 643 | * Now that the task has been migrated to the new RQ and we | ||
| 644 | * have that locked, proceed as normal and enqueue the task | ||
| 645 | * there. | ||
| 646 | */ | ||
| 665 | } | 647 | } |
| 648 | #endif | ||
| 649 | |||
| 650 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
| 651 | if (dl_task(rq->curr)) | ||
| 652 | check_preempt_curr_dl(rq, p, 0); | ||
| 653 | else | ||
| 654 | resched_curr(rq); | ||
| 666 | 655 | ||
| 656 | #ifdef CONFIG_SMP | ||
| 667 | /* | 657 | /* |
| 668 | * Queueing this task back might have overloaded rq, check if we need | 658 | * Queueing this task back might have overloaded rq, check if we need |
| 669 | * to kick someone away. | 659 | * to kick someone away. |
| @@ -735,9 +725,8 @@ static void update_curr_dl(struct rq *rq) | |||
| 735 | return; | 725 | return; |
| 736 | } | 726 | } |
| 737 | 727 | ||
| 738 | /* kick cpufreq (see the comment in linux/cpufreq.h). */ | 728 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ |
| 739 | if (cpu_of(rq) == smp_processor_id()) | 729 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); |
| 740 | cpufreq_trigger_update(rq_clock(rq)); | ||
| 741 | 730 | ||
| 742 | schedstat_set(curr->se.statistics.exec_max, | 731 | schedstat_set(curr->se.statistics.exec_max, |
| 743 | max(curr->se.statistics.exec_max, delta_exec)); | 732 | max(curr->se.statistics.exec_max, delta_exec)); |
| @@ -798,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | |||
| 798 | if (dl_rq->earliest_dl.curr == 0 || | 787 | if (dl_rq->earliest_dl.curr == 0 || |
| 799 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { | 788 | dl_time_before(deadline, dl_rq->earliest_dl.curr)) { |
| 800 | dl_rq->earliest_dl.curr = deadline; | 789 | dl_rq->earliest_dl.curr = deadline; |
| 801 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); | 790 | cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); |
| 802 | } | 791 | } |
| 803 | } | 792 | } |
| 804 | 793 | ||
| @@ -813,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | |||
| 813 | if (!dl_rq->dl_nr_running) { | 802 | if (!dl_rq->dl_nr_running) { |
| 814 | dl_rq->earliest_dl.curr = 0; | 803 | dl_rq->earliest_dl.curr = 0; |
| 815 | dl_rq->earliest_dl.next = 0; | 804 | dl_rq->earliest_dl.next = 0; |
| 816 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | 805 | cpudl_clear(&rq->rd->cpudl, rq->cpu); |
| 817 | } else { | 806 | } else { |
| 818 | struct rb_node *leftmost = dl_rq->rb_leftmost; | 807 | struct rb_node *leftmost = dl_rq->rb_leftmost; |
| 819 | struct sched_dl_entity *entry; | 808 | struct sched_dl_entity *entry; |
| 820 | 809 | ||
| 821 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); | 810 | entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); |
| 822 | dl_rq->earliest_dl.curr = entry->deadline; | 811 | dl_rq->earliest_dl.curr = entry->deadline; |
| 823 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); | 812 | cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); |
| 824 | } | 813 | } |
| 825 | } | 814 | } |
| 826 | 815 | ||
| @@ -1671,7 +1660,7 @@ static void rq_online_dl(struct rq *rq) | |||
| 1671 | 1660 | ||
| 1672 | cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); | 1661 | cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); |
| 1673 | if (rq->dl.dl_nr_running > 0) | 1662 | if (rq->dl.dl_nr_running > 0) |
| 1674 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); | 1663 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); |
| 1675 | } | 1664 | } |
| 1676 | 1665 | ||
| 1677 | /* Assumes rq->lock is held */ | 1666 | /* Assumes rq->lock is held */ |
| @@ -1680,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq) | |||
| 1680 | if (rq->dl.overloaded) | 1669 | if (rq->dl.overloaded) |
| 1681 | dl_clear_overload(rq); | 1670 | dl_clear_overload(rq); |
| 1682 | 1671 | ||
| 1683 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | 1672 | cpudl_clear(&rq->rd->cpudl, rq->cpu); |
| 1684 | cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); | 1673 | cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); |
| 1685 | } | 1674 | } |
| 1686 | 1675 | ||
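The cpudl changes above split the old cpudl_set(cp, cpu, deadline, is_valid) convention into two entry points, so callers no longer pass a 0/0 sentinel to take a CPU out of the heap. The real cpudl is a max-heap keyed by deadline; the sketch below only models the interface split with a flat per-CPU table:

#include <stdint.h>
#include <stdbool.h>

#define NR_CPUS_MODEL 8

struct cpudl_model {
	uint64_t deadline[NR_CPUS_MODEL];
	bool     has_dl[NR_CPUS_MODEL];
};

/* New-style API: installing and clearing a deadline are separate calls. */
static void cpudl_set_model(struct cpudl_model *cp, int cpu, uint64_t dl)
{
	cp->deadline[cpu] = dl;
	cp->has_dl[cpu] = true;
}

static void cpudl_clear_model(struct cpudl_model *cp, int cpu)
{
	cp->has_dl[cpu] = false;
	cp->deadline[cpu] = 0;
}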
| @@ -1723,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
| 1723 | */ | 1712 | */ |
| 1724 | static void switched_to_dl(struct rq *rq, struct task_struct *p) | 1713 | static void switched_to_dl(struct rq *rq, struct task_struct *p) |
| 1725 | { | 1714 | { |
| 1715 | |||
| 1716 | /* If p is not queued we will update its parameters at next wakeup. */ | ||
| 1717 | if (!task_on_rq_queued(p)) | ||
| 1718 | return; | ||
| 1719 | |||
| 1720 | /* | ||
| 1721 | * If p is boosted we already updated its params in | ||
| 1722 | * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), | ||
| 1723 | * p's deadline being now already after rq_clock(rq). | ||
| 1724 | */ | ||
| 1726 | if (dl_time_before(p->dl.deadline, rq_clock(rq))) | 1725 | if (dl_time_before(p->dl.deadline, rq_clock(rq))) |
| 1727 | setup_new_dl_entity(&p->dl, &p->dl); | 1726 | setup_new_dl_entity(&p->dl); |
| 1728 | 1727 | ||
| 1729 | if (task_on_rq_queued(p) && rq->curr != p) { | 1728 | if (rq->curr != p) { |
| 1730 | #ifdef CONFIG_SMP | 1729 | #ifdef CONFIG_SMP |
| 1731 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) | 1730 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) |
| 1732 | queue_push_tasks(rq); | 1731 | queue_push_tasks(rq); |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..fa178b62ea79 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 369 | 369 | ||
| 370 | #define P(F) \ | 370 | #define P(F) \ |
| 371 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) | 371 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) |
| 372 | #define P_SCHEDSTAT(F) \ | ||
| 373 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) | ||
| 372 | #define PN(F) \ | 374 | #define PN(F) \ |
| 373 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | 375 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
| 376 | #define PN_SCHEDSTAT(F) \ | ||
| 377 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) | ||
| 374 | 378 | ||
| 375 | if (!se) | 379 | if (!se) |
| 376 | return; | 380 | return; |
| @@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 378 | PN(se->exec_start); | 382 | PN(se->exec_start); |
| 379 | PN(se->vruntime); | 383 | PN(se->vruntime); |
| 380 | PN(se->sum_exec_runtime); | 384 | PN(se->sum_exec_runtime); |
| 381 | #ifdef CONFIG_SCHEDSTATS | ||
| 382 | if (schedstat_enabled()) { | 385 | if (schedstat_enabled()) { |
| 383 | PN(se->statistics.wait_start); | 386 | PN_SCHEDSTAT(se->statistics.wait_start); |
| 384 | PN(se->statistics.sleep_start); | 387 | PN_SCHEDSTAT(se->statistics.sleep_start); |
| 385 | PN(se->statistics.block_start); | 388 | PN_SCHEDSTAT(se->statistics.block_start); |
| 386 | PN(se->statistics.sleep_max); | 389 | PN_SCHEDSTAT(se->statistics.sleep_max); |
| 387 | PN(se->statistics.block_max); | 390 | PN_SCHEDSTAT(se->statistics.block_max); |
| 388 | PN(se->statistics.exec_max); | 391 | PN_SCHEDSTAT(se->statistics.exec_max); |
| 389 | PN(se->statistics.slice_max); | 392 | PN_SCHEDSTAT(se->statistics.slice_max); |
| 390 | PN(se->statistics.wait_max); | 393 | PN_SCHEDSTAT(se->statistics.wait_max); |
| 391 | PN(se->statistics.wait_sum); | 394 | PN_SCHEDSTAT(se->statistics.wait_sum); |
| 392 | P(se->statistics.wait_count); | 395 | P_SCHEDSTAT(se->statistics.wait_count); |
| 393 | } | 396 | } |
| 394 | #endif | ||
| 395 | P(se->load.weight); | 397 | P(se->load.weight); |
| 396 | #ifdef CONFIG_SMP | 398 | #ifdef CONFIG_SMP |
| 397 | P(se->avg.load_avg); | 399 | P(se->avg.load_avg); |
| 398 | P(se->avg.util_avg); | 400 | P(se->avg.util_avg); |
| 399 | #endif | 401 | #endif |
| 402 | |||
| 403 | #undef PN_SCHEDSTAT | ||
| 400 | #undef PN | 404 | #undef PN |
| 405 | #undef P_SCHEDSTAT | ||
| 401 | #undef P | 406 | #undef P |
| 402 | } | 407 | } |
| 403 | #endif | 408 | #endif |
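The new P_SCHEDSTAT()/PN_SCHEDSTAT() printers above rely on schedstat_val(), which is expected to evaluate to the field when CONFIG_SCHEDSTATS is built in and to a constant 0 otherwise, which is what lets the surrounding #ifdef go away. A user-space approximation of that macro arrangement (SCHEDSTATS_ENABLED plays the role of CONFIG_SCHEDSTATS):

#include <stdio.h>

#define SCHEDSTATS_ENABLED 1

struct stats_model { long long wait_max; };
struct entity_model {
#if SCHEDSTATS_ENABLED
	struct stats_model statistics;
#endif
};

#if SCHEDSTATS_ENABLED
#define schedstat_val_model(var)	(var)
#else
#define schedstat_val_model(var)	0	/* field reference drops out */
#endif

#define P_SCHEDSTAT_MODEL(F) \
	printf("  .%-30s: %lld\n", #F, (long long)schedstat_val_model(F))

int main(void)
{
	struct entity_model se = { .statistics = { .wait_max = 123456 } };

	P_SCHEDSTAT_MODEL(se.statistics.wait_max);
	return 0;
}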
| @@ -410,7 +415,8 @@ static char *task_group_path(struct task_group *tg) | |||
| 410 | if (autogroup_path(tg, group_path, PATH_MAX)) | 415 | if (autogroup_path(tg, group_path, PATH_MAX)) |
| 411 | return group_path; | 416 | return group_path; |
| 412 | 417 | ||
| 413 | return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | 418 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); |
| 419 | return group_path; | ||
| 414 | } | 420 | } |
| 415 | #endif | 421 | #endif |
| 416 | 422 | ||
| @@ -429,9 +435,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 429 | p->prio); | 435 | p->prio); |
| 430 | 436 | ||
| 431 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | 437 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
| 432 | SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), | 438 | SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), |
| 433 | SPLIT_NS(p->se.sum_exec_runtime), | 439 | SPLIT_NS(p->se.sum_exec_runtime), |
| 434 | SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); | 440 | SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); |
| 435 | 441 | ||
| 436 | #ifdef CONFIG_NUMA_BALANCING | 442 | #ifdef CONFIG_NUMA_BALANCING |
| 437 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); | 443 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); |
| @@ -626,9 +632,7 @@ do { \ | |||
| 626 | #undef P64 | 632 | #undef P64 |
| 627 | #endif | 633 | #endif |
| 628 | 634 | ||
| 629 | #ifdef CONFIG_SCHEDSTATS | 635 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); |
| 630 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); | ||
| 631 | |||
| 632 | if (schedstat_enabled()) { | 636 | if (schedstat_enabled()) { |
| 633 | P(yld_count); | 637 | P(yld_count); |
| 634 | P(sched_count); | 638 | P(sched_count); |
| @@ -636,9 +640,8 @@ do { \ | |||
| 636 | P(ttwu_count); | 640 | P(ttwu_count); |
| 637 | P(ttwu_local); | 641 | P(ttwu_local); |
| 638 | } | 642 | } |
| 639 | |||
| 640 | #undef P | 643 | #undef P |
| 641 | #endif | 644 | |
| 642 | spin_lock_irqsave(&sched_debug_lock, flags); | 645 | spin_lock_irqsave(&sched_debug_lock, flags); |
| 643 | print_cfs_stats(m, cpu); | 646 | print_cfs_stats(m, cpu); |
| 644 | print_rt_stats(m, cpu); | 647 | print_rt_stats(m, cpu); |
| @@ -868,10 +871,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 868 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | 871 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) |
| 869 | #define P(F) \ | 872 | #define P(F) \ |
| 870 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | 873 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) |
| 874 | #define P_SCHEDSTAT(F) \ | ||
| 875 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) | ||
| 871 | #define __PN(F) \ | 876 | #define __PN(F) \ |
| 872 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | 877 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) |
| 873 | #define PN(F) \ | 878 | #define PN(F) \ |
| 874 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | 879 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) |
| 880 | #define PN_SCHEDSTAT(F) \ | ||
| 881 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F))) | ||
| 875 | 882 | ||
| 876 | PN(se.exec_start); | 883 | PN(se.exec_start); |
| 877 | PN(se.vruntime); | 884 | PN(se.vruntime); |
| @@ -881,37 +888,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 881 | 888 | ||
| 882 | P(se.nr_migrations); | 889 | P(se.nr_migrations); |
| 883 | 890 | ||
| 884 | #ifdef CONFIG_SCHEDSTATS | ||
| 885 | if (schedstat_enabled()) { | 891 | if (schedstat_enabled()) { |
| 886 | u64 avg_atom, avg_per_cpu; | 892 | u64 avg_atom, avg_per_cpu; |
| 887 | 893 | ||
| 888 | PN(se.statistics.sum_sleep_runtime); | 894 | PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); |
| 889 | PN(se.statistics.wait_start); | 895 | PN_SCHEDSTAT(se.statistics.wait_start); |
| 890 | PN(se.statistics.sleep_start); | 896 | PN_SCHEDSTAT(se.statistics.sleep_start); |
| 891 | PN(se.statistics.block_start); | 897 | PN_SCHEDSTAT(se.statistics.block_start); |
| 892 | PN(se.statistics.sleep_max); | 898 | PN_SCHEDSTAT(se.statistics.sleep_max); |
| 893 | PN(se.statistics.block_max); | 899 | PN_SCHEDSTAT(se.statistics.block_max); |
| 894 | PN(se.statistics.exec_max); | 900 | PN_SCHEDSTAT(se.statistics.exec_max); |
| 895 | PN(se.statistics.slice_max); | 901 | PN_SCHEDSTAT(se.statistics.slice_max); |
| 896 | PN(se.statistics.wait_max); | 902 | PN_SCHEDSTAT(se.statistics.wait_max); |
| 897 | PN(se.statistics.wait_sum); | 903 | PN_SCHEDSTAT(se.statistics.wait_sum); |
| 898 | P(se.statistics.wait_count); | 904 | P_SCHEDSTAT(se.statistics.wait_count); |
| 899 | PN(se.statistics.iowait_sum); | 905 | PN_SCHEDSTAT(se.statistics.iowait_sum); |
| 900 | P(se.statistics.iowait_count); | 906 | P_SCHEDSTAT(se.statistics.iowait_count); |
| 901 | P(se.statistics.nr_migrations_cold); | 907 | P_SCHEDSTAT(se.statistics.nr_migrations_cold); |
| 902 | P(se.statistics.nr_failed_migrations_affine); | 908 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); |
| 903 | P(se.statistics.nr_failed_migrations_running); | 909 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); |
| 904 | P(se.statistics.nr_failed_migrations_hot); | 910 | P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); |
| 905 | P(se.statistics.nr_forced_migrations); | 911 | P_SCHEDSTAT(se.statistics.nr_forced_migrations); |
| 906 | P(se.statistics.nr_wakeups); | 912 | P_SCHEDSTAT(se.statistics.nr_wakeups); |
| 907 | P(se.statistics.nr_wakeups_sync); | 913 | P_SCHEDSTAT(se.statistics.nr_wakeups_sync); |
| 908 | P(se.statistics.nr_wakeups_migrate); | 914 | P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); |
| 909 | P(se.statistics.nr_wakeups_local); | 915 | P_SCHEDSTAT(se.statistics.nr_wakeups_local); |
| 910 | P(se.statistics.nr_wakeups_remote); | 916 | P_SCHEDSTAT(se.statistics.nr_wakeups_remote); |
| 911 | P(se.statistics.nr_wakeups_affine); | 917 | P_SCHEDSTAT(se.statistics.nr_wakeups_affine); |
| 912 | P(se.statistics.nr_wakeups_affine_attempts); | 918 | P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); |
| 913 | P(se.statistics.nr_wakeups_passive); | 919 | P_SCHEDSTAT(se.statistics.nr_wakeups_passive); |
| 914 | P(se.statistics.nr_wakeups_idle); | 920 | P_SCHEDSTAT(se.statistics.nr_wakeups_idle); |
| 915 | 921 | ||
| 916 | avg_atom = p->se.sum_exec_runtime; | 922 | avg_atom = p->se.sum_exec_runtime; |
| 917 | if (nr_switches) | 923 | if (nr_switches) |
| @@ -930,7 +936,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 930 | __PN(avg_atom); | 936 | __PN(avg_atom); |
| 931 | __PN(avg_per_cpu); | 937 | __PN(avg_per_cpu); |
| 932 | } | 938 | } |
| 933 | #endif | 939 | |
| 934 | __P(nr_switches); | 940 | __P(nr_switches); |
| 935 | SEQ_printf(m, "%-45s:%21Ld\n", | 941 | SEQ_printf(m, "%-45s:%21Ld\n", |
| 936 | "nr_voluntary_switches", (long long)p->nvcsw); | 942 | "nr_voluntary_switches", (long long)p->nvcsw); |
| @@ -947,8 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 947 | #endif | 953 | #endif |
| 948 | P(policy); | 954 | P(policy); |
| 949 | P(prio); | 955 | P(prio); |
| 956 | #undef PN_SCHEDSTAT | ||
| 950 | #undef PN | 957 | #undef PN |
| 951 | #undef __PN | 958 | #undef __PN |
| 959 | #undef P_SCHEDSTAT | ||
| 952 | #undef P | 960 | #undef P |
| 953 | #undef __P | 961 | #undef __P |
| 954 | 962 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..d941c97dfbc3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | |||
| 114 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | 114 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; |
| 115 | #endif | 115 | #endif |
| 116 | 116 | ||
| 117 | /* | ||
| 118 | * The margin used when comparing utilization with CPU capacity: | ||
| 119 | * util * 1024 < capacity * margin | ||
| 120 | */ | ||
| 121 | unsigned int capacity_margin = 1280; /* ~20% */ | ||
| 122 | |||
| 117 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 123 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
| 118 | { | 124 | { |
| 119 | lw->weight += inc; | 125 | lw->weight += inc; |
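capacity_margin is a fixed-point value against SCHED_CAPACITY_SCALE (1024), so 1280 corresponds to 1280/1024 = 1.25, i.e. roughly 20% of the CPU's capacity is kept as headroom. Which side of the inequality the margin lands on depends on the caller and is not part of this hunk; the sketch below just works the arithmetic under the "utilization must stay about 20% below capacity" reading, so treat the helper name and the comparison direction as assumptions:

#include <stdbool.h>

#define SCHED_CAPACITY_SCALE_MODEL	1024UL
static unsigned long capacity_margin_model = 1280;	/* ~20% */

/* True if 'util' leaves the configured margin of 'capacity' free. */
static bool fits_capacity_model(unsigned long util, unsigned long capacity)
{
	/* util * 1280 < capacity * 1024  <=>  util < ~0.8 * capacity */
	return util * capacity_margin_model <
	       capacity * SCHED_CAPACITY_SCALE_MODEL;
}

/* Worked example for capacity == 1024: 819 * 1280 = 1048320 < 1048576,
 * so util 819 still fits, while 820 * 1280 = 1049600 does not. */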
| @@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
| 256 | 262 | ||
| 257 | static inline struct task_struct *task_of(struct sched_entity *se) | 263 | static inline struct task_struct *task_of(struct sched_entity *se) |
| 258 | { | 264 | { |
| 259 | #ifdef CONFIG_SCHED_DEBUG | 265 | SCHED_WARN_ON(!entity_is_task(se)); |
| 260 | WARN_ON_ONCE(!entity_is_task(se)); | ||
| 261 | #endif | ||
| 262 | return container_of(se, struct task_struct, se); | 266 | return container_of(se, struct task_struct, se); |
| 263 | } | 267 | } |
| 264 | 268 | ||
| @@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a, | |||
| 456 | 460 | ||
| 457 | static void update_min_vruntime(struct cfs_rq *cfs_rq) | 461 | static void update_min_vruntime(struct cfs_rq *cfs_rq) |
| 458 | { | 462 | { |
| 463 | struct sched_entity *curr = cfs_rq->curr; | ||
| 464 | |||
| 459 | u64 vruntime = cfs_rq->min_vruntime; | 465 | u64 vruntime = cfs_rq->min_vruntime; |
| 460 | 466 | ||
| 461 | if (cfs_rq->curr) | 467 | if (curr) { |
| 462 | vruntime = cfs_rq->curr->vruntime; | 468 | if (curr->on_rq) |
| 469 | vruntime = curr->vruntime; | ||
| 470 | else | ||
| 471 | curr = NULL; | ||
| 472 | } | ||
| 463 | 473 | ||
| 464 | if (cfs_rq->rb_leftmost) { | 474 | if (cfs_rq->rb_leftmost) { |
| 465 | struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, | 475 | struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, |
| 466 | struct sched_entity, | 476 | struct sched_entity, |
| 467 | run_node); | 477 | run_node); |
| 468 | 478 | ||
| 469 | if (!cfs_rq->curr) | 479 | if (!curr) |
| 470 | vruntime = se->vruntime; | 480 | vruntime = se->vruntime; |
| 471 | else | 481 | else |
| 472 | vruntime = min_vruntime(vruntime, se->vruntime); | 482 | vruntime = min_vruntime(vruntime, se->vruntime); |
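The reworked update_min_vruntime() above only lets cfs_rq->curr contribute while it is still on the runqueue, otherwise it falls back to the leftmost queued entity alone; the function then folds the result into cfs_rq->min_vruntime monotonically (that final step is outside the lines shown). A compact model of the candidate selection:

#include <stdint.h>
#include <stdbool.h>

struct se_model { uint64_t vruntime; bool on_rq; };

static uint64_t pick_min_vruntime_model(uint64_t cur_min,
					const struct se_model *curr,
					const struct se_model *leftmost)
{
	uint64_t vruntime = cur_min;

	if (curr && !curr->on_rq)	/* curr being dequeued: ignore it */
		curr = NULL;
	if (curr)
		vruntime = curr->vruntime;

	if (leftmost) {
		if (!curr)
			vruntime = leftmost->vruntime;
		else if (leftmost->vruntime < vruntime)
			vruntime = leftmost->vruntime;
	}
	return vruntime;
}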
| @@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 656 | } | 666 | } |
| 657 | 667 | ||
| 658 | #ifdef CONFIG_SMP | 668 | #ifdef CONFIG_SMP |
| 659 | static int select_idle_sibling(struct task_struct *p, int cpu); | 669 | static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); |
| 660 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
| 661 | 671 | ||
| 662 | /* | 672 | /* |
| @@ -680,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
| 680 | * will definitely be updated (after enqueue). | 690 | * will definitely be updated (after enqueue). |
| 681 | */ | 691 | */ |
| 682 | sa->period_contrib = 1023; | 692 | sa->period_contrib = 1023; |
| 683 | sa->load_avg = scale_load_down(se->load.weight); | 693 | /* |
| 694 | * Tasks are initialized with full load to be seen as heavy tasks until | ||
| 695 | * they get a chance to stabilize to their real load level. | ||
| 696 | * Group entities are initialized with zero load to reflect the fact that | ||
| 697 | * nothing has been attached to the task group yet. | ||
| 698 | */ | ||
| 699 | if (entity_is_task(se)) | ||
| 700 | sa->load_avg = scale_load_down(se->load.weight); | ||
| 684 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; | 701 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; |
| 685 | /* | 702 | /* |
| 686 | * At this point, util_avg won't be used in select_task_rq_fair anyway | 703 | * At this point, util_avg won't be used in select_task_rq_fair anyway |
| @@ -726,7 +743,6 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
| 726 | struct sched_avg *sa = &se->avg; | 743 | struct sched_avg *sa = &se->avg; |
| 727 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; | 744 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; |
| 728 | u64 now = cfs_rq_clock_task(cfs_rq); | 745 | u64 now = cfs_rq_clock_task(cfs_rq); |
| 729 | int tg_update; | ||
| 730 | 746 | ||
| 731 | if (cap > 0) { | 747 | if (cap > 0) { |
| 732 | if (cfs_rq->avg.util_avg != 0) { | 748 | if (cfs_rq->avg.util_avg != 0) { |
| @@ -759,10 +775,9 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
| 759 | } | 775 | } |
| 760 | } | 776 | } |
| 761 | 777 | ||
| 762 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 778 | update_cfs_rq_load_avg(now, cfs_rq, false); |
| 763 | attach_entity_load_avg(cfs_rq, se); | 779 | attach_entity_load_avg(cfs_rq, se); |
| 764 | if (tg_update) | 780 | update_tg_load_avg(cfs_rq, false); |
| 765 | update_tg_load_avg(cfs_rq, false); | ||
| 766 | } | 781 | } |
| 767 | 782 | ||
| 768 | #else /* !CONFIG_SMP */ | 783 | #else /* !CONFIG_SMP */ |
| @@ -799,7 +814,7 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
| 799 | max(delta_exec, curr->statistics.exec_max)); | 814 | max(delta_exec, curr->statistics.exec_max)); |
| 800 | 815 | ||
| 801 | curr->sum_exec_runtime += delta_exec; | 816 | curr->sum_exec_runtime += delta_exec; |
| 802 | schedstat_add(cfs_rq, exec_clock, delta_exec); | 817 | schedstat_add(cfs_rq->exec_clock, delta_exec); |
| 803 | 818 | ||
| 804 | curr->vruntime += calc_delta_fair(delta_exec, curr); | 819 | curr->vruntime += calc_delta_fair(delta_exec, curr); |
| 805 | update_min_vruntime(cfs_rq); | 820 | update_min_vruntime(cfs_rq); |
| @@ -820,26 +835,34 @@ static void update_curr_fair(struct rq *rq) | |||
| 820 | update_curr(cfs_rq_of(&rq->curr->se)); | 835 | update_curr(cfs_rq_of(&rq->curr->se)); |
| 821 | } | 836 | } |
| 822 | 837 | ||
| 823 | #ifdef CONFIG_SCHEDSTATS | ||
| 824 | static inline void | 838 | static inline void |
| 825 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 839 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 826 | { | 840 | { |
| 827 | u64 wait_start = rq_clock(rq_of(cfs_rq)); | 841 | u64 wait_start, prev_wait_start; |
| 842 | |||
| 843 | if (!schedstat_enabled()) | ||
| 844 | return; | ||
| 845 | |||
| 846 | wait_start = rq_clock(rq_of(cfs_rq)); | ||
| 847 | prev_wait_start = schedstat_val(se->statistics.wait_start); | ||
| 828 | 848 | ||
| 829 | if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && | 849 | if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && |
| 830 | likely(wait_start > se->statistics.wait_start)) | 850 | likely(wait_start > prev_wait_start)) |
| 831 | wait_start -= se->statistics.wait_start; | 851 | wait_start -= prev_wait_start; |
| 832 | 852 | ||
| 833 | se->statistics.wait_start = wait_start; | 853 | schedstat_set(se->statistics.wait_start, wait_start); |
| 834 | } | 854 | } |
| 835 | 855 | ||
| 836 | static void | 856 | static inline void |
| 837 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 857 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 838 | { | 858 | { |
| 839 | struct task_struct *p; | 859 | struct task_struct *p; |
| 840 | u64 delta; | 860 | u64 delta; |
| 841 | 861 | ||
| 842 | delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; | 862 | if (!schedstat_enabled()) |
| 863 | return; | ||
| 864 | |||
| 865 | delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); | ||
| 843 | 866 | ||
| 844 | if (entity_is_task(se)) { | 867 | if (entity_is_task(se)) { |
| 845 | p = task_of(se); | 868 | p = task_of(se); |
| @@ -849,35 +872,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 849 | * time stamp can be adjusted to accumulate wait time | 872 | * time stamp can be adjusted to accumulate wait time |
| 850 | * prior to migration. | 873 | * prior to migration. |
| 851 | */ | 874 | */ |
| 852 | se->statistics.wait_start = delta; | 875 | schedstat_set(se->statistics.wait_start, delta); |
| 853 | return; | 876 | return; |
| 854 | } | 877 | } |
| 855 | trace_sched_stat_wait(p, delta); | 878 | trace_sched_stat_wait(p, delta); |
| 856 | } | 879 | } |
| 857 | 880 | ||
| 858 | se->statistics.wait_max = max(se->statistics.wait_max, delta); | 881 | schedstat_set(se->statistics.wait_max, |
| 859 | se->statistics.wait_count++; | 882 | max(schedstat_val(se->statistics.wait_max), delta)); |
| 860 | se->statistics.wait_sum += delta; | 883 | schedstat_inc(se->statistics.wait_count); |
| 861 | se->statistics.wait_start = 0; | 884 | schedstat_add(se->statistics.wait_sum, delta); |
| 885 | schedstat_set(se->statistics.wait_start, 0); | ||
| 886 | } | ||
| 887 | |||
| 888 | static inline void | ||
| 889 | update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 890 | { | ||
| 891 | struct task_struct *tsk = NULL; | ||
| 892 | u64 sleep_start, block_start; | ||
| 893 | |||
| 894 | if (!schedstat_enabled()) | ||
| 895 | return; | ||
| 896 | |||
| 897 | sleep_start = schedstat_val(se->statistics.sleep_start); | ||
| 898 | block_start = schedstat_val(se->statistics.block_start); | ||
| 899 | |||
| 900 | if (entity_is_task(se)) | ||
| 901 | tsk = task_of(se); | ||
| 902 | |||
| 903 | if (sleep_start) { | ||
| 904 | u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; | ||
| 905 | |||
| 906 | if ((s64)delta < 0) | ||
| 907 | delta = 0; | ||
| 908 | |||
| 909 | if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) | ||
| 910 | schedstat_set(se->statistics.sleep_max, delta); | ||
| 911 | |||
| 912 | schedstat_set(se->statistics.sleep_start, 0); | ||
| 913 | schedstat_add(se->statistics.sum_sleep_runtime, delta); | ||
| 914 | |||
| 915 | if (tsk) { | ||
| 916 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
| 917 | trace_sched_stat_sleep(tsk, delta); | ||
| 918 | } | ||
| 919 | } | ||
| 920 | if (block_start) { | ||
| 921 | u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; | ||
| 922 | |||
| 923 | if ((s64)delta < 0) | ||
| 924 | delta = 0; | ||
| 925 | |||
| 926 | if (unlikely(delta > schedstat_val(se->statistics.block_max))) | ||
| 927 | schedstat_set(se->statistics.block_max, delta); | ||
| 928 | |||
| 929 | schedstat_set(se->statistics.block_start, 0); | ||
| 930 | schedstat_add(se->statistics.sum_sleep_runtime, delta); | ||
| 931 | |||
| 932 | if (tsk) { | ||
| 933 | if (tsk->in_iowait) { | ||
| 934 | schedstat_add(se->statistics.iowait_sum, delta); | ||
| 935 | schedstat_inc(se->statistics.iowait_count); | ||
| 936 | trace_sched_stat_iowait(tsk, delta); | ||
| 937 | } | ||
| 938 | |||
| 939 | trace_sched_stat_blocked(tsk, delta); | ||
| 940 | |||
| 941 | /* | ||
| 942 | * Blocking time is in units of nanosecs, so shift by | ||
| 943 | * 20 to get a milliseconds-range estimation of the | ||
| 944 | * amount of time that the task spent sleeping: | ||
| 945 | */ | ||
| 946 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
| 947 | profile_hits(SLEEP_PROFILING, | ||
| 948 | (void *)get_wchan(tsk), | ||
| 949 | delta >> 20); | ||
| 950 | } | ||
| 951 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
| 952 | } | ||
| 953 | } | ||
| 862 | } | 954 | } |
| 863 | 955 | ||
| 864 | /* | 956 | /* |
| 865 | * Task is being enqueued - update stats: | 957 | * Task is being enqueued - update stats: |
| 866 | */ | 958 | */ |
| 867 | static inline void | 959 | static inline void |
| 868 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 960 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
| 869 | { | 961 | { |
| 962 | if (!schedstat_enabled()) | ||
| 963 | return; | ||
| 964 | |||
| 870 | /* | 965 | /* |
| 871 | * Are we enqueueing a waiting task? (for current tasks | 966 | * Are we enqueueing a waiting task? (for current tasks |
| 872 | * a dequeue/enqueue event is a NOP) | 967 | * a dequeue/enqueue event is a NOP) |
| 873 | */ | 968 | */ |
| 874 | if (se != cfs_rq->curr) | 969 | if (se != cfs_rq->curr) |
| 875 | update_stats_wait_start(cfs_rq, se); | 970 | update_stats_wait_start(cfs_rq, se); |
| 971 | |||
| 972 | if (flags & ENQUEUE_WAKEUP) | ||
| 973 | update_stats_enqueue_sleeper(cfs_rq, se); | ||
| 876 | } | 974 | } |
| 877 | 975 | ||
| 878 | static inline void | 976 | static inline void |
| 879 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 977 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
| 880 | { | 978 | { |
| 979 | |||
| 980 | if (!schedstat_enabled()) | ||
| 981 | return; | ||
| 982 | |||
| 881 | /* | 983 | /* |
| 882 | * Mark the end of the wait period if dequeueing a | 984 | * Mark the end of the wait period if dequeueing a |
| 883 | * waiting task: | 985 | * waiting task: |
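The pattern running through this fair.c rework is visible above: instead of compiling the statistics helpers away under #ifdef CONFIG_SCHEDSTATS, each helper starts with an early "if (!schedstat_enabled()) return;" and every field access goes through the schedstat_*() wrappers, so a single function body serves both configurations. A small stand-alone illustration of that guard style (the static-branch machinery behind schedstat_enabled() is reduced to a plain flag):

#include <stdbool.h>
#include <stdint.h>

static bool schedstats_enabled_flag;	/* stand-in for the static key */

struct wait_stats_model { uint64_t wait_start, wait_sum; };

static void update_stats_wait_start_model(struct wait_stats_model *st,
					  uint64_t now)
{
	if (!schedstats_enabled_flag)	/* early out replaces the #ifdef */
		return;
	st->wait_start = now;
}

static void update_stats_wait_end_model(struct wait_stats_model *st,
					uint64_t now)
{
	if (!schedstats_enabled_flag)
		return;
	st->wait_sum += now - st->wait_start;
	st->wait_start = 0;
}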
| @@ -885,40 +987,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 885 | if (se != cfs_rq->curr) | 987 | if (se != cfs_rq->curr) |
| 886 | update_stats_wait_end(cfs_rq, se); | 988 | update_stats_wait_end(cfs_rq, se); |
| 887 | 989 | ||
| 888 | if (flags & DEQUEUE_SLEEP) { | 990 | if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { |
| 889 | if (entity_is_task(se)) { | 991 | struct task_struct *tsk = task_of(se); |
| 890 | struct task_struct *tsk = task_of(se); | ||
| 891 | 992 | ||
| 892 | if (tsk->state & TASK_INTERRUPTIBLE) | 993 | if (tsk->state & TASK_INTERRUPTIBLE) |
| 893 | se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); | 994 | schedstat_set(se->statistics.sleep_start, |
| 894 | if (tsk->state & TASK_UNINTERRUPTIBLE) | 995 | rq_clock(rq_of(cfs_rq))); |
| 895 | se->statistics.block_start = rq_clock(rq_of(cfs_rq)); | 996 | if (tsk->state & TASK_UNINTERRUPTIBLE) |
| 896 | } | 997 | schedstat_set(se->statistics.block_start, |
| 998 | rq_clock(rq_of(cfs_rq))); | ||
| 897 | } | 999 | } |
| 898 | |||
| 899 | } | ||
| 900 | #else | ||
| 901 | static inline void | ||
| 902 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 903 | { | ||
| 904 | } | 1000 | } |
| 905 | 1001 | ||
| 906 | static inline void | ||
| 907 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 908 | { | ||
| 909 | } | ||
| 910 | |||
| 911 | static inline void | ||
| 912 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 913 | { | ||
| 914 | } | ||
| 915 | |||
| 916 | static inline void | ||
| 917 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | ||
| 918 | { | ||
| 919 | } | ||
| 920 | #endif | ||
| 921 | |||
| 922 | /* | 1002 | /* |
| 923 | * We are picking a new current task - update its stats: | 1003 | * We are picking a new current task - update its stats: |
| 924 | */ | 1004 | */ |
| @@ -1513,8 +1593,16 @@ balance: | |||
| 1513 | * One idle CPU per node is evaluated for a task numa move. | 1593 | * One idle CPU per node is evaluated for a task numa move. |
| 1514 | * Call select_idle_sibling to maybe find a better one. | 1594 | * Call select_idle_sibling to maybe find a better one. |
| 1515 | */ | 1595 | */ |
| 1516 | if (!cur) | 1596 | if (!cur) { |
| 1517 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | 1597 | /* |
| 1598 | * select_idle_siblings() uses a per-cpu cpumask that | ||
| 1599 | * can be used from IRQ context. | ||
| 1600 | */ | ||
| 1601 | local_irq_disable(); | ||
| 1602 | env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, | ||
| 1603 | env->dst_cpu); | ||
| 1604 | local_irq_enable(); | ||
| 1605 | } | ||
| 1518 | 1606 | ||
| 1519 | assign: | 1607 | assign: |
| 1520 | task_numa_assign(env, cur, imp); | 1608 | task_numa_assign(env, cur, imp); |
| @@ -2292,7 +2380,7 @@ void task_numa_work(struct callback_head *work) | |||
| 2292 | unsigned long nr_pte_updates = 0; | 2380 | unsigned long nr_pte_updates = 0; |
| 2293 | long pages, virtpages; | 2381 | long pages, virtpages; |
| 2294 | 2382 | ||
| 2295 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 2383 | SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); |
| 2296 | 2384 | ||
| 2297 | work->next = work; /* protect against double add */ | 2385 | work->next = work; /* protect against double add */ |
| 2298 | /* | 2386 | /* |
| @@ -2803,9 +2891,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
| 2803 | } | 2891 | } |
| 2804 | 2892 | ||
| 2805 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2893 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 2806 | /* | 2894 | /** |
| 2807 | * Updating tg's load_avg is necessary before update_cfs_share (which is done) | 2895 | * update_tg_load_avg - update the tg's load avg |
| 2808 | * and effective_load (which is not done because it is too costly). | 2896 | * @cfs_rq: the cfs_rq whose avg changed |
| 2897 | * @force: update regardless of how small the difference | ||
| 2898 | * | ||
| 2899 | * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. | ||
| 2900 | * However, because tg->load_avg is a global value there are performance | ||
| 2901 | * considerations. | ||
| 2902 | * | ||
| 2903 | * In order to avoid having to look at the other cfs_rq's, we use a | ||
| 2904 | * differential update where we store the last value we propagated. This in | ||
| 2905 | * turn allows skipping updates if the differential is 'small'. | ||
| 2906 | * | ||
| 2907 | * Updating tg's load_avg is necessary before update_cfs_share() (which is | ||
| 2908 | * done) and effective_load() (which is not done because it is too costly). | ||
| 2809 | */ | 2909 | */ |
| 2810 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | 2910 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) |
| 2811 | { | 2911 | { |
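The kernel-doc added above describes a differential update: the shared tg->load_avg is only written when this cfs_rq's contribution has drifted far enough from the last value it propagated, keeping cross-CPU traffic on the global counter rare. A user-space sketch of that scheme; the 1/64 threshold matches the check in the function body, which is not shown in this hunk, so treat it as an assumption, and the kernel uses atomic_long_add() where the model does a plain add:

#include <stdlib.h>

struct cfs_rq_model {
	long load_avg;			/* this cfs_rq's current average */
	long tg_load_avg_contrib;	/* last value propagated to the tg */
};

struct tg_model { long load_avg; };	/* shared, group-wide sum */

static void update_tg_load_avg_model(struct tg_model *tg,
				     struct cfs_rq_model *cfs_rq, int force)
{
	long delta = cfs_rq->load_avg - cfs_rq->tg_load_avg_contrib;

	/* Only propagate when the drift is "large enough" (assumed ~1/64). */
	if (force || labs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
		tg->load_avg += delta;
		cfs_rq->tg_load_avg_contrib = cfs_rq->load_avg;
	}
}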
| @@ -2875,12 +2975,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} | |||
| 2875 | 2975 | ||
| 2876 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 2976 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) |
| 2877 | { | 2977 | { |
| 2878 | struct rq *rq = rq_of(cfs_rq); | 2978 | if (&this_rq()->cfs == cfs_rq) { |
| 2879 | int cpu = cpu_of(rq); | ||
| 2880 | |||
| 2881 | if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { | ||
| 2882 | unsigned long max = rq->cpu_capacity_orig; | ||
| 2883 | |||
| 2884 | /* | 2979 | /* |
| 2885 | * There are a few boundary cases this might miss but it should | 2980 | * There are a few boundary cases this might miss but it should |
| 2886 | * get called often enough that that should (hopefully) not be | 2981 | * get called often enough that that should (hopefully) not be |
| @@ -2897,8 +2992,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
| 2897 | * | 2992 | * |
| 2898 | * See cpu_util(). | 2993 | * See cpu_util(). |
| 2899 | */ | 2994 | */ |
| 2900 | cpufreq_update_util(rq_clock(rq), | 2995 | cpufreq_update_util(rq_of(cfs_rq), 0); |
| 2901 | min(cfs_rq->avg.util_avg, max), max); | ||
| 2902 | } | 2996 | } |
| 2903 | } | 2997 | } |
| 2904 | 2998 | ||
| @@ -2931,10 +3025,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
| 2931 | * | 3025 | * |
| 2932 | * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. | 3026 | * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. |
| 2933 | * | 3027 | * |
| 2934 | * Returns true if the load decayed or we removed utilization. It is expected | 3028 | * Returns true if the load decayed or we removed load. |
| 2935 | * that one calls update_tg_load_avg() on this condition, but after you've | 3029 | * |
| 2936 | * modified the cfs_rq avg (attach/detach), such that we propagate the new | 3030 | * Since both these conditions indicate a changed cfs_rq->avg.load we should |
| 2937 | * avg up. | 3031 | * call update_tg_load_avg() when this function returns true. |
| 2938 | */ | 3032 | */ |
| 2939 | static inline int | 3033 | static inline int |
| 2940 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | 3034 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) |
| @@ -3159,10 +3253,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
| 3159 | 3253 | ||
| 3160 | static inline void update_load_avg(struct sched_entity *se, int not_used) | 3254 | static inline void update_load_avg(struct sched_entity *se, int not_used) |
| 3161 | { | 3255 | { |
| 3162 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3256 | cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); |
| 3163 | struct rq *rq = rq_of(cfs_rq); | ||
| 3164 | |||
| 3165 | cpufreq_trigger_update(rq_clock(rq)); | ||
| 3166 | } | 3257 | } |
| 3167 | 3258 | ||
| 3168 | static inline void | 3259 | static inline void |
| @@ -3183,68 +3274,6 @@ static inline int idle_balance(struct rq *rq) | |||
| 3183 | 3274 | ||
| 3184 | #endif /* CONFIG_SMP */ | 3275 | #endif /* CONFIG_SMP */ |
| 3185 | 3276 | ||
| 3186 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 3187 | { | ||
| 3188 | #ifdef CONFIG_SCHEDSTATS | ||
| 3189 | struct task_struct *tsk = NULL; | ||
| 3190 | |||
| 3191 | if (entity_is_task(se)) | ||
| 3192 | tsk = task_of(se); | ||
| 3193 | |||
| 3194 | if (se->statistics.sleep_start) { | ||
| 3195 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; | ||
| 3196 | |||
| 3197 | if ((s64)delta < 0) | ||
| 3198 | delta = 0; | ||
| 3199 | |||
| 3200 | if (unlikely(delta > se->statistics.sleep_max)) | ||
| 3201 | se->statistics.sleep_max = delta; | ||
| 3202 | |||
| 3203 | se->statistics.sleep_start = 0; | ||
| 3204 | se->statistics.sum_sleep_runtime += delta; | ||
| 3205 | |||
| 3206 | if (tsk) { | ||
| 3207 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
| 3208 | trace_sched_stat_sleep(tsk, delta); | ||
| 3209 | } | ||
| 3210 | } | ||
| 3211 | if (se->statistics.block_start) { | ||
| 3212 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; | ||
| 3213 | |||
| 3214 | if ((s64)delta < 0) | ||
| 3215 | delta = 0; | ||
| 3216 | |||
| 3217 | if (unlikely(delta > se->statistics.block_max)) | ||
| 3218 | se->statistics.block_max = delta; | ||
| 3219 | |||
| 3220 | se->statistics.block_start = 0; | ||
| 3221 | se->statistics.sum_sleep_runtime += delta; | ||
| 3222 | |||
| 3223 | if (tsk) { | ||
| 3224 | if (tsk->in_iowait) { | ||
| 3225 | se->statistics.iowait_sum += delta; | ||
| 3226 | se->statistics.iowait_count++; | ||
| 3227 | trace_sched_stat_iowait(tsk, delta); | ||
| 3228 | } | ||
| 3229 | |||
| 3230 | trace_sched_stat_blocked(tsk, delta); | ||
| 3231 | |||
| 3232 | /* | ||
| 3233 | * Blocking time is in units of nanosecs, so shift by | ||
| 3234 | * 20 to get a milliseconds-range estimation of the | ||
| 3235 | * amount of time that the task spent sleeping: | ||
| 3236 | */ | ||
| 3237 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
| 3238 | profile_hits(SLEEP_PROFILING, | ||
| 3239 | (void *)get_wchan(tsk), | ||
| 3240 | delta >> 20); | ||
| 3241 | } | ||
| 3242 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
| 3243 | } | ||
| 3244 | } | ||
| 3245 | #endif | ||
| 3246 | } | ||
| 3247 | |||
| 3248 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3277 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 3249 | { | 3278 | { |
| 3250 | #ifdef CONFIG_SCHED_DEBUG | 3279 | #ifdef CONFIG_SCHED_DEBUG |
| @@ -3254,7 +3283,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 3254 | d = -d; | 3283 | d = -d; |
| 3255 | 3284 | ||
| 3256 | if (d > 3*sysctl_sched_latency) | 3285 | if (d > 3*sysctl_sched_latency) |
| 3257 | schedstat_inc(cfs_rq, nr_spread_over); | 3286 | schedstat_inc(cfs_rq->nr_spread_over); |
| 3258 | #endif | 3287 | #endif |
| 3259 | } | 3288 | } |
| 3260 | 3289 | ||
| @@ -3371,17 +3400,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 3371 | account_entity_enqueue(cfs_rq, se); | 3400 | account_entity_enqueue(cfs_rq, se); |
| 3372 | update_cfs_shares(cfs_rq); | 3401 | update_cfs_shares(cfs_rq); |
| 3373 | 3402 | ||
| 3374 | if (flags & ENQUEUE_WAKEUP) { | 3403 | if (flags & ENQUEUE_WAKEUP) |
| 3375 | place_entity(cfs_rq, se, 0); | 3404 | place_entity(cfs_rq, se, 0); |
| 3376 | if (schedstat_enabled()) | ||
| 3377 | enqueue_sleeper(cfs_rq, se); | ||
| 3378 | } | ||
| 3379 | 3405 | ||
| 3380 | check_schedstat_required(); | 3406 | check_schedstat_required(); |
| 3381 | if (schedstat_enabled()) { | 3407 | update_stats_enqueue(cfs_rq, se, flags); |
| 3382 | update_stats_enqueue(cfs_rq, se); | 3408 | check_spread(cfs_rq, se); |
| 3383 | check_spread(cfs_rq, se); | ||
| 3384 | } | ||
| 3385 | if (!curr) | 3409 | if (!curr) |
| 3386 | __enqueue_entity(cfs_rq, se); | 3410 | __enqueue_entity(cfs_rq, se); |
| 3387 | se->on_rq = 1; | 3411 | se->on_rq = 1; |
| @@ -3448,8 +3472,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 3448 | update_curr(cfs_rq); | 3472 | update_curr(cfs_rq); |
| 3449 | dequeue_entity_load_avg(cfs_rq, se); | 3473 | dequeue_entity_load_avg(cfs_rq, se); |
| 3450 | 3474 | ||
| 3451 | if (schedstat_enabled()) | 3475 | update_stats_dequeue(cfs_rq, se, flags); |
| 3452 | update_stats_dequeue(cfs_rq, se, flags); | ||
| 3453 | 3476 | ||
| 3454 | clear_buddies(cfs_rq, se); | 3477 | clear_buddies(cfs_rq, se); |
| 3455 | 3478 | ||
| @@ -3459,9 +3482,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 3459 | account_entity_dequeue(cfs_rq, se); | 3482 | account_entity_dequeue(cfs_rq, se); |
| 3460 | 3483 | ||
| 3461 | /* | 3484 | /* |
| 3462 | * Normalize the entity after updating the min_vruntime because the | 3485 | * Normalize after update_curr(); which will also have moved |
| 3463 | * update can refer to the ->curr item and we need to reflect this | 3486 | * min_vruntime if @se is the one holding it back. But before doing |
| 3464 | * movement in our normalized position. | 3487 | * update_min_vruntime() again, which will discount @se's position and |
| 3488 | * can move min_vruntime forward still more. | ||
| 3465 | */ | 3489 | */ |
| 3466 | if (!(flags & DEQUEUE_SLEEP)) | 3490 | if (!(flags & DEQUEUE_SLEEP)) |
| 3467 | se->vruntime -= cfs_rq->min_vruntime; | 3491 | se->vruntime -= cfs_rq->min_vruntime; |
| @@ -3469,8 +3493,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 3469 | /* return excess runtime on last dequeue */ | 3493 | /* return excess runtime on last dequeue */ |
| 3470 | return_cfs_rq_runtime(cfs_rq); | 3494 | return_cfs_rq_runtime(cfs_rq); |
| 3471 | 3495 | ||
| 3472 | update_min_vruntime(cfs_rq); | ||
| 3473 | update_cfs_shares(cfs_rq); | 3496 | update_cfs_shares(cfs_rq); |
| 3497 | |||
| 3498 | /* | ||
| 3499 | * Now advance min_vruntime if @se was the entity holding it back, | ||
| 3500 | * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be | ||
| 3501 | * put back on, and if we advance min_vruntime, we'll be placed back | ||
| 3502 | * further than we started -- ie. we'll be penalized. | ||
| 3503 | */ | ||
| 3504 | if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) | ||
| 3505 | update_min_vruntime(cfs_rq); | ||
| 3474 | } | 3506 | } |
| 3475 | 3507 | ||
| 3476 | /* | 3508 | /* |
| @@ -3523,25 +3555,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 3523 | * a CPU. So account for the time it spent waiting on the | 3555 | * a CPU. So account for the time it spent waiting on the |
| 3524 | * runqueue. | 3556 | * runqueue. |
| 3525 | */ | 3557 | */ |
| 3526 | if (schedstat_enabled()) | 3558 | update_stats_wait_end(cfs_rq, se); |
| 3527 | update_stats_wait_end(cfs_rq, se); | ||
| 3528 | __dequeue_entity(cfs_rq, se); | 3559 | __dequeue_entity(cfs_rq, se); |
| 3529 | update_load_avg(se, 1); | 3560 | update_load_avg(se, 1); |
| 3530 | } | 3561 | } |
| 3531 | 3562 | ||
| 3532 | update_stats_curr_start(cfs_rq, se); | 3563 | update_stats_curr_start(cfs_rq, se); |
| 3533 | cfs_rq->curr = se; | 3564 | cfs_rq->curr = se; |
| 3534 | #ifdef CONFIG_SCHEDSTATS | 3565 | |
| 3535 | /* | 3566 | /* |
| 3536 | * Track our maximum slice length, if the CPU's load is at | 3567 | * Track our maximum slice length, if the CPU's load is at |
| 3537 | * least twice that of our own weight (i.e. dont track it | 3568 | * least twice that of our own weight (i.e. dont track it |
| 3538 | * when there are only lesser-weight tasks around): | 3569 | * when there are only lesser-weight tasks around): |
| 3539 | */ | 3570 | */ |
| 3540 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { | 3571 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
| 3541 | se->statistics.slice_max = max(se->statistics.slice_max, | 3572 | schedstat_set(se->statistics.slice_max, |
| 3542 | se->sum_exec_runtime - se->prev_sum_exec_runtime); | 3573 | max((u64)schedstat_val(se->statistics.slice_max), |
| 3574 | se->sum_exec_runtime - se->prev_sum_exec_runtime)); | ||
| 3543 | } | 3575 | } |
| 3544 | #endif | 3576 | |
| 3545 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 3577 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
| 3546 | } | 3578 | } |
| 3547 | 3579 | ||
| @@ -3620,13 +3652,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
| 3620 | /* throttle cfs_rqs exceeding runtime */ | 3652 | /* throttle cfs_rqs exceeding runtime */ |
| 3621 | check_cfs_rq_runtime(cfs_rq); | 3653 | check_cfs_rq_runtime(cfs_rq); |
| 3622 | 3654 | ||
| 3623 | if (schedstat_enabled()) { | 3655 | check_spread(cfs_rq, prev); |
| 3624 | check_spread(cfs_rq, prev); | ||
| 3625 | if (prev->on_rq) | ||
| 3626 | update_stats_wait_start(cfs_rq, prev); | ||
| 3627 | } | ||
| 3628 | 3656 | ||
| 3629 | if (prev->on_rq) { | 3657 | if (prev->on_rq) { |
| 3658 | update_stats_wait_start(cfs_rq, prev); | ||
| 3630 | /* Put 'current' back into the tree. */ | 3659 | /* Put 'current' back into the tree. */ |
| 3631 | __enqueue_entity(cfs_rq, prev); | 3660 | __enqueue_entity(cfs_rq, prev); |
| 3632 | /* in !on_rq case, update occurred at dequeue */ | 3661 | /* in !on_rq case, update occurred at dequeue */ |
| @@ -4456,9 +4485,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
| 4456 | struct sched_entity *se = &p->se; | 4485 | struct sched_entity *se = &p->se; |
| 4457 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 4486 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 4458 | 4487 | ||
| 4459 | WARN_ON(task_rq(p) != rq); | 4488 | SCHED_WARN_ON(task_rq(p) != rq); |
| 4460 | 4489 | ||
| 4461 | if (cfs_rq->nr_running > 1) { | 4490 | if (rq->cfs.h_nr_running > 1) { |
| 4462 | u64 slice = sched_slice(cfs_rq, se); | 4491 | u64 slice = sched_slice(cfs_rq, se); |
| 4463 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | 4492 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; |
| 4464 | s64 delta = slice - ran; | 4493 | s64 delta = slice - ran; |
| @@ -4509,6 +4538,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 4509 | struct cfs_rq *cfs_rq; | 4538 | struct cfs_rq *cfs_rq; |
| 4510 | struct sched_entity *se = &p->se; | 4539 | struct sched_entity *se = &p->se; |
| 4511 | 4540 | ||
| 4541 | /* | ||
| 4542 | * If in_iowait is set, the code below may not trigger any cpufreq | ||
| 4543 | * utilization updates, so do it here explicitly with the IOWAIT flag | ||
| 4544 | * passed. | ||
| 4545 | */ | ||
| 4546 | if (p->in_iowait) | ||
| 4547 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); | ||
| 4548 | |||
| 4512 | for_each_sched_entity(se) { | 4549 | for_each_sched_entity(se) { |
| 4513 | if (se->on_rq) | 4550 | if (se->on_rq) |
| 4514 | break; | 4551 | break; |
| @@ -4605,6 +4642,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 4605 | } | 4642 | } |
| 4606 | 4643 | ||
| 4607 | #ifdef CONFIG_SMP | 4644 | #ifdef CONFIG_SMP |
| 4645 | |||
| 4646 | /* Working cpumask for: load_balance, load_balance_newidle. */ | ||
| 4647 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | ||
| 4648 | DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); | ||
| 4649 | |||
| 4608 | #ifdef CONFIG_NO_HZ_COMMON | 4650 | #ifdef CONFIG_NO_HZ_COMMON |
| 4609 | /* | 4651 | /* |
| 4610 | * per rq 'load' array crap; XXX kill this. | 4652 | * per rq 'load' array crap; XXX kill this. |
| @@ -5006,9 +5048,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 5006 | * wl = S * s'_i; see (2) | 5048 | * wl = S * s'_i; see (2) |
| 5007 | */ | 5049 | */ |
| 5008 | if (W > 0 && w < W) | 5050 | if (W > 0 && w < W) |
| 5009 | wl = (w * (long)tg->shares) / W; | 5051 | wl = (w * (long)scale_load_down(tg->shares)) / W; |
| 5010 | else | 5052 | else |
| 5011 | wl = tg->shares; | 5053 | wl = scale_load_down(tg->shares); |
| 5012 | 5054 | ||
| 5013 | /* | 5055 | /* |
| 5014 | * Per the above, wl is the new se->load.weight value; since | 5056 | * Per the above, wl is the new se->load.weight value; since |
| @@ -5091,18 +5133,18 @@ static int wake_wide(struct task_struct *p) | |||
| 5091 | return 1; | 5133 | return 1; |
| 5092 | } | 5134 | } |
| 5093 | 5135 | ||
| 5094 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 5136 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, |
| 5137 | int prev_cpu, int sync) | ||
| 5095 | { | 5138 | { |
| 5096 | s64 this_load, load; | 5139 | s64 this_load, load; |
| 5097 | s64 this_eff_load, prev_eff_load; | 5140 | s64 this_eff_load, prev_eff_load; |
| 5098 | int idx, this_cpu, prev_cpu; | 5141 | int idx, this_cpu; |
| 5099 | struct task_group *tg; | 5142 | struct task_group *tg; |
| 5100 | unsigned long weight; | 5143 | unsigned long weight; |
| 5101 | int balanced; | 5144 | int balanced; |
| 5102 | 5145 | ||
| 5103 | idx = sd->wake_idx; | 5146 | idx = sd->wake_idx; |
| 5104 | this_cpu = smp_processor_id(); | 5147 | this_cpu = smp_processor_id(); |
| 5105 | prev_cpu = task_cpu(p); | ||
| 5106 | load = source_load(prev_cpu, idx); | 5148 | load = source_load(prev_cpu, idx); |
| 5107 | this_load = target_load(this_cpu, idx); | 5149 | this_load = target_load(this_cpu, idx); |
| 5108 | 5150 | ||
| @@ -5146,13 +5188,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
| 5146 | 5188 | ||
| 5147 | balanced = this_eff_load <= prev_eff_load; | 5189 | balanced = this_eff_load <= prev_eff_load; |
| 5148 | 5190 | ||
| 5149 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); | 5191 | schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); |
| 5150 | 5192 | ||
| 5151 | if (!balanced) | 5193 | if (!balanced) |
| 5152 | return 0; | 5194 | return 0; |
| 5153 | 5195 | ||
| 5154 | schedstat_inc(sd, ttwu_move_affine); | 5196 | schedstat_inc(sd->ttwu_move_affine); |
| 5155 | schedstat_inc(p, se.statistics.nr_wakeups_affine); | 5197 | schedstat_inc(p->se.statistics.nr_wakeups_affine); |
| 5156 | 5198 | ||
| 5157 | return 1; | 5199 | return 1; |
| 5158 | } | 5200 | } |
| @@ -5228,6 +5270,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 5228 | int shallowest_idle_cpu = -1; | 5270 | int shallowest_idle_cpu = -1; |
| 5229 | int i; | 5271 | int i; |
| 5230 | 5272 | ||
| 5273 | /* Check if we have any choice: */ | ||
| 5274 | if (group->group_weight == 1) | ||
| 5275 | return cpumask_first(sched_group_cpus(group)); | ||
| 5276 | |||
| 5231 | /* Traverse only the allowed CPUs */ | 5277 | /* Traverse only the allowed CPUs */ |
| 5232 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { | 5278 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
| 5233 | if (idle_cpu(i)) { | 5279 | if (idle_cpu(i)) { |
| @@ -5265,64 +5311,242 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 5265 | } | 5311 | } |
| 5266 | 5312 | ||
| 5267 | /* | 5313 | /* |
| 5268 | * Try and locate an idle CPU in the sched_domain. | 5314 | * Implement a for_each_cpu() variant that starts the scan at a given cpu |
| 5315 | * (@start), and wraps around. | ||
| 5316 | * | ||
| 5317 | * This is used to scan for idle CPUs; such that not all CPUs looking for an | ||
| 5318 | * idle CPU find the same CPU. The down-side is that tasks tend to cycle | ||
| 5319 | * through the LLC domain. | ||
| 5320 | * | ||
| 5321 | * tbench in particular is found to be sensitive to this. | ||
| 5269 | */ | 5322 | */ |
| 5270 | static int select_idle_sibling(struct task_struct *p, int target) | 5323 | |
| 5324 | static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) | ||
| 5325 | { | ||
| 5326 | int next; | ||
| 5327 | |||
| 5328 | again: | ||
| 5329 | next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); | ||
| 5330 | |||
| 5331 | if (*wrapped) { | ||
| 5332 | if (next >= start) | ||
| 5333 | return nr_cpumask_bits; | ||
| 5334 | } else { | ||
| 5335 | if (next >= nr_cpumask_bits) { | ||
| 5336 | *wrapped = 1; | ||
| 5337 | n = -1; | ||
| 5338 | goto again; | ||
| 5339 | } | ||
| 5340 | } | ||
| 5341 | |||
| 5342 | return next; | ||
| 5343 | } | ||
| 5344 | |||
| 5345 | #define for_each_cpu_wrap(cpu, mask, start, wrap) \ | ||
| 5346 | for ((wrap) = 0, (cpu) = (start)-1; \ | ||
| 5347 | (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ | ||
| 5348 | (cpu) < nr_cpumask_bits; ) | ||
| 5349 | |||
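A quick way to see the wrap-around behaviour of for_each_cpu_wrap() is a small userspace model of cpumask_next_wrap(); the cpumask is replaced by a plain 64-bit word and NR_BITS stands in for nr_cpumask_bits, so everything below is illustrative rather than kernel API.

	/* Userspace sketch of the wrap-around scan behind for_each_cpu_wrap(). */
	#include <stdio.h>

	#define NR_BITS 64

	/* find_next_bit() stand-in over a plain 64-bit mask */
	static int next_set_bit(unsigned long long mask, int n)
	{
		for (int i = n; i < NR_BITS; i++)
			if (mask & (1ULL << i))
				return i;
		return NR_BITS;
	}

	static int next_wrap(int n, unsigned long long mask, int start, int *wrapped)
	{
		int next;
	again:
		next = next_set_bit(mask, n + 1);
		if (*wrapped) {
			if (next >= start)
				return NR_BITS;		/* scanned all the way around */
		} else if (next >= NR_BITS) {
			*wrapped = 1;			/* fell off the end: restart at bit 0 */
			n = -1;
			goto again;
		}
		return next;
	}

	int main(void)
	{
		unsigned long long mask = 0xf0f;	/* "CPUs" 0-3 and 8-11 */
		int start = 9, wrap = 0;

		for (int cpu = start - 1;
		     (cpu = next_wrap(cpu, mask, start, &wrap)) < NR_BITS; )
			printf("%d ", cpu);		/* prints: 9 10 11 0 1 2 3 8 */
		printf("\n");
		return 0;
	}

Starting each waker at a different CPU spreads the idle-CPU search across the LLC instead of piling every wakeup onto the first idle CPU in the mask.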
| 5350 | #ifdef CONFIG_SCHED_SMT | ||
| 5351 | |||
| 5352 | static inline void set_idle_cores(int cpu, int val) | ||
| 5353 | { | ||
| 5354 | struct sched_domain_shared *sds; | ||
| 5355 | |||
| 5356 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
| 5357 | if (sds) | ||
| 5358 | WRITE_ONCE(sds->has_idle_cores, val); | ||
| 5359 | } | ||
| 5360 | |||
| 5361 | static inline bool test_idle_cores(int cpu, bool def) | ||
| 5362 | { | ||
| 5363 | struct sched_domain_shared *sds; | ||
| 5364 | |||
| 5365 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
| 5366 | if (sds) | ||
| 5367 | return READ_ONCE(sds->has_idle_cores); | ||
| 5368 | |||
| 5369 | return def; | ||
| 5370 | } | ||
| 5371 | |||
| 5372 | /* | ||
| 5373 | * Scans the local SMT mask to see if the entire core is idle, and records this | ||
| 5374 | * information in sd_llc_shared->has_idle_cores. | ||
| 5375 | * | ||
| 5376 | * Since SMT siblings share all cache levels, inspecting this limited remote | ||
| 5377 | * state should be fairly cheap. | ||
| 5378 | */ | ||
| 5379 | void __update_idle_core(struct rq *rq) | ||
| 5380 | { | ||
| 5381 | int core = cpu_of(rq); | ||
| 5382 | int cpu; | ||
| 5383 | |||
| 5384 | rcu_read_lock(); | ||
| 5385 | if (test_idle_cores(core, true)) | ||
| 5386 | goto unlock; | ||
| 5387 | |||
| 5388 | for_each_cpu(cpu, cpu_smt_mask(core)) { | ||
| 5389 | if (cpu == core) | ||
| 5390 | continue; | ||
| 5391 | |||
| 5392 | if (!idle_cpu(cpu)) | ||
| 5393 | goto unlock; | ||
| 5394 | } | ||
| 5395 | |||
| 5396 | set_idle_cores(core, 1); | ||
| 5397 | unlock: | ||
| 5398 | rcu_read_unlock(); | ||
| 5399 | } | ||
| 5400 | |||
| 5401 | /* | ||
| 5402 | * Scan the entire LLC domain for idle cores; this dynamically switches off if | ||
| 5403 | * there are no idle cores left in the system; tracked through | ||
| 5404 | * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. | ||
| 5405 | */ | ||
| 5406 | static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) | ||
| 5407 | { | ||
| 5408 | struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); | ||
| 5409 | int core, cpu, wrap; | ||
| 5410 | |||
| 5411 | if (!static_branch_likely(&sched_smt_present)) | ||
| 5412 | return -1; | ||
| 5413 | |||
| 5414 | if (!test_idle_cores(target, false)) | ||
| 5415 | return -1; | ||
| 5416 | |||
| 5417 | cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); | ||
| 5418 | |||
| 5419 | for_each_cpu_wrap(core, cpus, target, wrap) { | ||
| 5420 | bool idle = true; | ||
| 5421 | |||
| 5422 | for_each_cpu(cpu, cpu_smt_mask(core)) { | ||
| 5423 | cpumask_clear_cpu(cpu, cpus); | ||
| 5424 | if (!idle_cpu(cpu)) | ||
| 5425 | idle = false; | ||
| 5426 | } | ||
| 5427 | |||
| 5428 | if (idle) | ||
| 5429 | return core; | ||
| 5430 | } | ||
| 5431 | |||
| 5432 | /* | ||
| 5433 | * Failed to find an idle core; stop looking for one. | ||
| 5434 | */ | ||
| 5435 | set_idle_cores(target, 0); | ||
| 5436 | |||
| 5437 | return -1; | ||
| 5438 | } | ||
| 5439 | |||
| 5440 | /* | ||
| 5441 | * Scan the local SMT mask for idle CPUs. | ||
| 5442 | */ | ||
| 5443 | static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) | ||
| 5444 | { | ||
| 5445 | int cpu; | ||
| 5446 | |||
| 5447 | if (!static_branch_likely(&sched_smt_present)) | ||
| 5448 | return -1; | ||
| 5449 | |||
| 5450 | for_each_cpu(cpu, cpu_smt_mask(target)) { | ||
| 5451 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | ||
| 5452 | continue; | ||
| 5453 | if (idle_cpu(cpu)) | ||
| 5454 | return cpu; | ||
| 5455 | } | ||
| 5456 | |||
| 5457 | return -1; | ||
| 5458 | } | ||
| 5459 | |||
| 5460 | #else /* CONFIG_SCHED_SMT */ | ||
| 5461 | |||
| 5462 | static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) | ||
| 5463 | { | ||
| 5464 | return -1; | ||
| 5465 | } | ||
| 5466 | |||
| 5467 | static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) | ||
| 5468 | { | ||
| 5469 | return -1; | ||
| 5470 | } | ||
| 5471 | |||
| 5472 | #endif /* CONFIG_SCHED_SMT */ | ||
| 5473 | |||
| 5474 | /* | ||
| 5475 | * Scan the LLC domain for idle CPUs; this is dynamically regulated by | ||
| 5476 | * comparing the average scan cost (tracked in sd->avg_scan_cost) against the | ||
| 5477 | * average idle time for this rq (as found in rq->avg_idle). | ||
| 5478 | */ | ||
| 5479 | static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) | ||
| 5480 | { | ||
| 5481 | struct sched_domain *this_sd; | ||
| 5482 | u64 avg_cost, avg_idle = this_rq()->avg_idle; | ||
| 5483 | u64 time, cost; | ||
| 5484 | s64 delta; | ||
| 5485 | int cpu, wrap; | ||
| 5486 | |||
| 5487 | this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); | ||
| 5488 | if (!this_sd) | ||
| 5489 | return -1; | ||
| 5490 | |||
| 5491 | avg_cost = this_sd->avg_scan_cost; | ||
| 5492 | |||
| 5493 | /* | ||
| 5494 | * Due to large variance we need a large fuzz factor; hackbench in | ||
| 5495 | * particular is sensitive here. | ||
| 5496 | */ | ||
| 5497 | if ((avg_idle / 512) < avg_cost) | ||
| 5498 | return -1; | ||
| 5499 | |||
| 5500 | time = local_clock(); | ||
| 5501 | |||
| 5502 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { | ||
| 5503 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | ||
| 5504 | continue; | ||
| 5505 | if (idle_cpu(cpu)) | ||
| 5506 | break; | ||
| 5507 | } | ||
| 5508 | |||
| 5509 | time = local_clock() - time; | ||
| 5510 | cost = this_sd->avg_scan_cost; | ||
| 5511 | delta = (s64)(time - cost) / 8; | ||
| 5512 | this_sd->avg_scan_cost += delta; | ||
| 5513 | |||
| 5514 | return cpu; | ||
| 5515 | } | ||
| 5516 | |||
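The avg_idle/512 gate in select_idle_cpu() is easier to read with numbers plugged in. The values below are invented, but both quantities are in nanoseconds (rq->avg_idle and sd->avg_scan_cost come from the scheduler clock), and avg_scan_cost itself decays as an EWMA with delta/8 as in the code above.

	/* Illustrative check of the select_idle_cpu() scan throttle (values invented). */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long avg_idle = 500000;	/* ns of average idle time on this rq */
		unsigned long long avg_cost = 2000;	/* ns, EWMA of previous LLC scan cost */

		/* 500000 / 512 ~= 976 ns < 2000 ns: the scan is considered too expensive */
		printf("scan %s\n", (avg_idle / 512) < avg_cost ? "skipped" : "performed");
		return 0;
	}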
| 5517 | /* | ||
| 5518 | * Try and locate an idle core/thread in the LLC cache domain. | ||
| 5519 | */ | ||
| 5520 | static int select_idle_sibling(struct task_struct *p, int prev, int target) | ||
| 5271 | { | 5521 | { |
| 5272 | struct sched_domain *sd; | 5522 | struct sched_domain *sd; |
| 5273 | struct sched_group *sg; | 5523 | int i; |
| 5274 | int i = task_cpu(p); | ||
| 5275 | 5524 | ||
| 5276 | if (idle_cpu(target)) | 5525 | if (idle_cpu(target)) |
| 5277 | return target; | 5526 | return target; |
| 5278 | 5527 | ||
| 5279 | /* | 5528 | /* |
| 5280 | * If the prevous cpu is cache affine and idle, don't be stupid. | 5529 | * If the previous cpu is cache affine and idle, don't be stupid. |
| 5281 | */ | 5530 | */ |
| 5282 | if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) | 5531 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) |
| 5283 | return i; | 5532 | return prev; |
| 5284 | 5533 | ||
| 5285 | /* | ||
| 5286 | * Otherwise, iterate the domains and find an eligible idle cpu. | ||
| 5287 | * | ||
| 5288 | * A completely idle sched group at higher domains is more | ||
| 5289 | * desirable than an idle group at a lower level, because lower | ||
| 5290 | * domains have smaller groups and usually share hardware | ||
| 5291 | * resources which causes tasks to contend on them, e.g. x86 | ||
| 5292 | * hyperthread siblings in the lowest domain (SMT) can contend | ||
| 5293 | * on the shared cpu pipeline. | ||
| 5294 | * | ||
| 5295 | * However, while we prefer idle groups at higher domains | ||
| 5296 | * finding an idle cpu at the lowest domain is still better than | ||
| 5297 | * returning 'target', which we've already established, isn't | ||
| 5298 | * idle. | ||
| 5299 | */ | ||
| 5300 | sd = rcu_dereference(per_cpu(sd_llc, target)); | 5534 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
| 5301 | for_each_lower_domain(sd) { | 5535 | if (!sd) |
| 5302 | sg = sd->groups; | 5536 | return target; |
| 5303 | do { | 5537 | |
| 5304 | if (!cpumask_intersects(sched_group_cpus(sg), | 5538 | i = select_idle_core(p, sd, target); |
| 5305 | tsk_cpus_allowed(p))) | 5539 | if ((unsigned)i < nr_cpumask_bits) |
| 5306 | goto next; | 5540 | return i; |
| 5307 | 5541 | ||
| 5308 | /* Ensure the entire group is idle */ | 5542 | i = select_idle_cpu(p, sd, target); |
| 5309 | for_each_cpu(i, sched_group_cpus(sg)) { | 5543 | if ((unsigned)i < nr_cpumask_bits) |
| 5310 | if (i == target || !idle_cpu(i)) | 5544 | return i; |
| 5311 | goto next; | 5545 | |
| 5312 | } | 5546 | i = select_idle_smt(p, sd, target); |
| 5547 | if ((unsigned)i < nr_cpumask_bits) | ||
| 5548 | return i; | ||
| 5313 | 5549 | ||
| 5314 | /* | ||
| 5315 | * It doesn't matter which cpu we pick, the | ||
| 5316 | * whole group is idle. | ||
| 5317 | */ | ||
| 5318 | target = cpumask_first_and(sched_group_cpus(sg), | ||
| 5319 | tsk_cpus_allowed(p)); | ||
| 5320 | goto done; | ||
| 5321 | next: | ||
| 5322 | sg = sg->next; | ||
| 5323 | } while (sg != sd->groups); | ||
| 5324 | } | ||
| 5325 | done: | ||
| 5326 | return target; | 5550 | return target; |
| 5327 | } | 5551 | } |
| 5328 | 5552 | ||
| @@ -5360,6 +5584,32 @@ static int cpu_util(int cpu) | |||
| 5360 | return (util >= capacity) ? capacity : util; | 5584 | return (util >= capacity) ? capacity : util; |
| 5361 | } | 5585 | } |
| 5362 | 5586 | ||
| 5587 | static inline int task_util(struct task_struct *p) | ||
| 5588 | { | ||
| 5589 | return p->se.avg.util_avg; | ||
| 5590 | } | ||
| 5591 | |||
| 5592 | /* | ||
| 5593 | * Disable WAKE_AFFINE in the case where task @p doesn't fit in the | ||
| 5594 | * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. | ||
| 5595 | * | ||
| 5596 | * In that case WAKE_AFFINE doesn't make sense and we'll let | ||
| 5597 | * BALANCE_WAKE sort things out. | ||
| 5598 | */ | ||
| 5599 | static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) | ||
| 5600 | { | ||
| 5601 | long min_cap, max_cap; | ||
| 5602 | |||
| 5603 | min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); | ||
| 5604 | max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; | ||
| 5605 | |||
| 5606 | /* Minimum capacity is close to max, no need to abort wake_affine */ | ||
| 5607 | if (max_cap - min_cap < max_cap >> 3) | ||
| 5608 | return 0; | ||
| 5609 | |||
| 5610 | return min_cap * 1024 < task_util(p) * capacity_margin; | ||
| 5611 | } | ||
| 5612 | |||
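A userspace sketch of the wake_cap() decision, which matters mostly on asymmetric (big.LITTLE-style) systems; the capacities and utilizations are invented, and capacity_margin is assumed to be the 1280 (~20% headroom) value used elsewhere in fair.c.

	/* Model of wake_cap(): non-zero means "skip wake_affine, the task may not
	 * fit the smaller CPU"; all numbers below are illustrative. */
	#include <stdio.h>

	#define CAPACITY_MARGIN 1280	/* assumed value of capacity_margin (~20%) */

	static int wake_cap_model(long min_cap, long max_cap, long task_util)
	{
		/* Capacities nearly symmetric (gap < max/8): never abort wake_affine. */
		if (max_cap - min_cap < max_cap >> 3)
			return 0;

		/* Task fits only if its utilization plus margin stays within the smaller CPU. */
		return min_cap * 1024 < task_util * CAPACITY_MARGIN;
	}

	int main(void)
	{
		/* little CPU capacity 430, big CPU capacity 1024 */
		printf("%d\n", wake_cap_model(430, 1024, 100));	/* 0: light task fits */
		printf("%d\n", wake_cap_model(430, 1024, 400));	/* 1: too big, fall back to BALANCE_WAKE */
		return 0;
	}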
| 5363 | /* | 5613 | /* |
| 5364 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 5614 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
| 5365 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, | 5615 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |
| @@ -5383,7 +5633,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 5383 | 5633 | ||
| 5384 | if (sd_flag & SD_BALANCE_WAKE) { | 5634 | if (sd_flag & SD_BALANCE_WAKE) { |
| 5385 | record_wakee(p); | 5635 | record_wakee(p); |
| 5386 | want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | 5636 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) |
| 5637 | && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); | ||
| 5387 | } | 5638 | } |
| 5388 | 5639 | ||
| 5389 | rcu_read_lock(); | 5640 | rcu_read_lock(); |
| @@ -5409,13 +5660,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 5409 | 5660 | ||
| 5410 | if (affine_sd) { | 5661 | if (affine_sd) { |
| 5411 | sd = NULL; /* Prefer wake_affine over balance flags */ | 5662 | sd = NULL; /* Prefer wake_affine over balance flags */ |
| 5412 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) | 5663 | if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) |
| 5413 | new_cpu = cpu; | 5664 | new_cpu = cpu; |
| 5414 | } | 5665 | } |
| 5415 | 5666 | ||
| 5416 | if (!sd) { | 5667 | if (!sd) { |
| 5417 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ | 5668 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ |
| 5418 | new_cpu = select_idle_sibling(p, new_cpu); | 5669 | new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); |
| 5419 | 5670 | ||
| 5420 | } else while (sd) { | 5671 | } else while (sd) { |
| 5421 | struct sched_group *group; | 5672 | struct sched_group *group; |
| @@ -5939,7 +6190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 5939 | * | 6190 | * |
| 5940 | * The adjacency matrix of the resulting graph is given by: | 6191 | * The adjacency matrix of the resulting graph is given by: |
| 5941 | * | 6192 | * |
| 5942 | * log_2 n | 6193 | * log_2 n |
| 5943 | * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) | 6194 | * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) |
| 5944 | * k = 0 | 6195 | * k = 0 |
| 5945 | * | 6196 | * |
| @@ -5985,7 +6236,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 5985 | * | 6236 | * |
| 5986 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that | 6237 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that |
| 5987 | * rewrite all of this once again.] | 6238 | * rewrite all of this once again.] |
| 5988 | */ | 6239 | */ |
| 5989 | 6240 | ||
| 5990 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 6241 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
| 5991 | 6242 | ||
| @@ -6133,7 +6384,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 6133 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { | 6384 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
| 6134 | int cpu; | 6385 | int cpu; |
| 6135 | 6386 | ||
| 6136 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 6387 | schedstat_inc(p->se.statistics.nr_failed_migrations_affine); |
| 6137 | 6388 | ||
| 6138 | env->flags |= LBF_SOME_PINNED; | 6389 | env->flags |= LBF_SOME_PINNED; |
| 6139 | 6390 | ||
| @@ -6164,7 +6415,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 6164 | env->flags &= ~LBF_ALL_PINNED; | 6415 | env->flags &= ~LBF_ALL_PINNED; |
| 6165 | 6416 | ||
| 6166 | if (task_running(env->src_rq, p)) { | 6417 | if (task_running(env->src_rq, p)) { |
| 6167 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); | 6418 | schedstat_inc(p->se.statistics.nr_failed_migrations_running); |
| 6168 | return 0; | 6419 | return 0; |
| 6169 | } | 6420 | } |
| 6170 | 6421 | ||
| @@ -6181,13 +6432,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 6181 | if (tsk_cache_hot <= 0 || | 6432 | if (tsk_cache_hot <= 0 || |
| 6182 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 6433 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
| 6183 | if (tsk_cache_hot == 1) { | 6434 | if (tsk_cache_hot == 1) { |
| 6184 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 6435 | schedstat_inc(env->sd->lb_hot_gained[env->idle]); |
| 6185 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 6436 | schedstat_inc(p->se.statistics.nr_forced_migrations); |
| 6186 | } | 6437 | } |
| 6187 | return 1; | 6438 | return 1; |
| 6188 | } | 6439 | } |
| 6189 | 6440 | ||
| 6190 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); | 6441 | schedstat_inc(p->se.statistics.nr_failed_migrations_hot); |
| 6191 | return 0; | 6442 | return 0; |
| 6192 | } | 6443 | } |
| 6193 | 6444 | ||
| @@ -6227,7 +6478,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) | |||
| 6227 | * so we can safely collect stats here rather than | 6478 | * so we can safely collect stats here rather than |
| 6228 | * inside detach_tasks(). | 6479 | * inside detach_tasks(). |
| 6229 | */ | 6480 | */ |
| 6230 | schedstat_inc(env->sd, lb_gained[env->idle]); | 6481 | schedstat_inc(env->sd->lb_gained[env->idle]); |
| 6231 | return p; | 6482 | return p; |
| 6232 | } | 6483 | } |
| 6233 | return NULL; | 6484 | return NULL; |
| @@ -6319,7 +6570,7 @@ next: | |||
| 6319 | * so we can safely collect detach_one_task() stats here rather | 6570 | * so we can safely collect detach_one_task() stats here rather |
| 6320 | * than inside detach_one_task(). | 6571 | * than inside detach_one_task(). |
| 6321 | */ | 6572 | */ |
| 6322 | schedstat_add(env->sd, lb_gained[env->idle], detached); | 6573 | schedstat_add(env->sd->lb_gained[env->idle], detached); |
| 6323 | 6574 | ||
| 6324 | return detached; | 6575 | return detached; |
| 6325 | } | 6576 | } |
| @@ -6647,7 +6898,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6647 | /* | 6898 | /* |
| 6648 | * !SD_OVERLAP domains can assume that child groups | 6899 | * !SD_OVERLAP domains can assume that child groups |
| 6649 | * span the current group. | 6900 | * span the current group. |
| 6650 | */ | 6901 | */ |
| 6651 | 6902 | ||
| 6652 | group = child->groups; | 6903 | group = child->groups; |
| 6653 | do { | 6904 | do { |
| @@ -7147,7 +7398,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 7147 | load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; | 7398 | load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; |
| 7148 | if (load_above_capacity > busiest->group_capacity) { | 7399 | if (load_above_capacity > busiest->group_capacity) { |
| 7149 | load_above_capacity -= busiest->group_capacity; | 7400 | load_above_capacity -= busiest->group_capacity; |
| 7150 | load_above_capacity *= NICE_0_LOAD; | 7401 | load_above_capacity *= scale_load_down(NICE_0_LOAD); |
| 7151 | load_above_capacity /= busiest->group_capacity; | 7402 | load_above_capacity /= busiest->group_capacity; |
| 7152 | } else | 7403 | } else |
| 7153 | load_above_capacity = ~0UL; | 7404 | load_above_capacity = ~0UL; |
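The scale_load_down() change above (like the matching one in effective_load()) only makes a difference on 64-bit kernels, where group-scheduling load carries an extra SCHED_FIXEDPOINT_SHIFT of resolution. The snippet below just illustrates the arithmetic with invented numbers, assuming that shift is 10.

	/* Illustration of the scale_load_down(NICE_0_LOAD) fix with made-up numbers. */
	#include <stdio.h>

	#define SCHED_FIXEDPOINT_SHIFT	10
	#define SCHED_CAPACITY_SCALE	1024UL
	#define NICE_0_LOAD		(1024UL << SCHED_FIXEDPOINT_SHIFT)	/* high-res 64-bit load */
	#define scale_load_down(w)	((w) >> SCHED_FIXEDPOINT_SHIFT)		/* back down to 1024 */

	int main(void)
	{
		unsigned long sum_nr_running = 6, group_capacity = 2048;	/* invented */
		unsigned long load_above = sum_nr_running * SCHED_CAPACITY_SCALE;

		load_above -= group_capacity;
		load_above *= scale_load_down(NICE_0_LOAD);	/* multiply by 1024, not 1048576 */
		load_above /= group_capacity;

		printf("load_above_capacity = %lu\n", load_above);	/* 2048 */
		return 0;
	}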
| @@ -7354,9 +7605,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 7354 | */ | 7605 | */ |
| 7355 | #define MAX_PINNED_INTERVAL 512 | 7606 | #define MAX_PINNED_INTERVAL 512 |
| 7356 | 7607 | ||
| 7357 | /* Working cpumask for load_balance and load_balance_newidle. */ | ||
| 7358 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | ||
| 7359 | |||
| 7360 | static int need_active_balance(struct lb_env *env) | 7608 | static int need_active_balance(struct lb_env *env) |
| 7361 | { | 7609 | { |
| 7362 | struct sched_domain *sd = env->sd; | 7610 | struct sched_domain *sd = env->sd; |
| @@ -7460,7 +7708,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 7460 | 7708 | ||
| 7461 | cpumask_copy(cpus, cpu_active_mask); | 7709 | cpumask_copy(cpus, cpu_active_mask); |
| 7462 | 7710 | ||
| 7463 | schedstat_inc(sd, lb_count[idle]); | 7711 | schedstat_inc(sd->lb_count[idle]); |
| 7464 | 7712 | ||
| 7465 | redo: | 7713 | redo: |
| 7466 | if (!should_we_balance(&env)) { | 7714 | if (!should_we_balance(&env)) { |
| @@ -7470,19 +7718,19 @@ redo: | |||
| 7470 | 7718 | ||
| 7471 | group = find_busiest_group(&env); | 7719 | group = find_busiest_group(&env); |
| 7472 | if (!group) { | 7720 | if (!group) { |
| 7473 | schedstat_inc(sd, lb_nobusyg[idle]); | 7721 | schedstat_inc(sd->lb_nobusyg[idle]); |
| 7474 | goto out_balanced; | 7722 | goto out_balanced; |
| 7475 | } | 7723 | } |
| 7476 | 7724 | ||
| 7477 | busiest = find_busiest_queue(&env, group); | 7725 | busiest = find_busiest_queue(&env, group); |
| 7478 | if (!busiest) { | 7726 | if (!busiest) { |
| 7479 | schedstat_inc(sd, lb_nobusyq[idle]); | 7727 | schedstat_inc(sd->lb_nobusyq[idle]); |
| 7480 | goto out_balanced; | 7728 | goto out_balanced; |
| 7481 | } | 7729 | } |
| 7482 | 7730 | ||
| 7483 | BUG_ON(busiest == env.dst_rq); | 7731 | BUG_ON(busiest == env.dst_rq); |
| 7484 | 7732 | ||
| 7485 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 7733 | schedstat_add(sd->lb_imbalance[idle], env.imbalance); |
| 7486 | 7734 | ||
| 7487 | env.src_cpu = busiest->cpu; | 7735 | env.src_cpu = busiest->cpu; |
| 7488 | env.src_rq = busiest; | 7736 | env.src_rq = busiest; |
| @@ -7589,7 +7837,7 @@ more_balance: | |||
| 7589 | } | 7837 | } |
| 7590 | 7838 | ||
| 7591 | if (!ld_moved) { | 7839 | if (!ld_moved) { |
| 7592 | schedstat_inc(sd, lb_failed[idle]); | 7840 | schedstat_inc(sd->lb_failed[idle]); |
| 7593 | /* | 7841 | /* |
| 7594 | * Increment the failure counter only on periodic balance. | 7842 | * Increment the failure counter only on periodic balance. |
| 7595 | * We do not want newidle balance, which can be very | 7843 | * We do not want newidle balance, which can be very |
| @@ -7672,7 +7920,7 @@ out_all_pinned: | |||
| 7672 | * we can't migrate them. Let the imbalance flag set so parent level | 7920 | * we can't migrate them. Let the imbalance flag set so parent level |
| 7673 | * can try to migrate them. | 7921 | * can try to migrate them. |
| 7674 | */ | 7922 | */ |
| 7675 | schedstat_inc(sd, lb_balanced[idle]); | 7923 | schedstat_inc(sd->lb_balanced[idle]); |
| 7676 | 7924 | ||
| 7677 | sd->nr_balance_failed = 0; | 7925 | sd->nr_balance_failed = 0; |
| 7678 | 7926 | ||
| @@ -7704,11 +7952,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) | |||
| 7704 | } | 7952 | } |
| 7705 | 7953 | ||
| 7706 | static inline void | 7954 | static inline void |
| 7707 | update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) | 7955 | update_next_balance(struct sched_domain *sd, unsigned long *next_balance) |
| 7708 | { | 7956 | { |
| 7709 | unsigned long interval, next; | 7957 | unsigned long interval, next; |
| 7710 | 7958 | ||
| 7711 | interval = get_sd_balance_interval(sd, cpu_busy); | 7959 | /* used by idle balance, so cpu_busy = 0 */ |
| 7960 | interval = get_sd_balance_interval(sd, 0); | ||
| 7712 | next = sd->last_balance + interval; | 7961 | next = sd->last_balance + interval; |
| 7713 | 7962 | ||
| 7714 | if (time_after(*next_balance, next)) | 7963 | if (time_after(*next_balance, next)) |
| @@ -7738,7 +7987,7 @@ static int idle_balance(struct rq *this_rq) | |||
| 7738 | rcu_read_lock(); | 7987 | rcu_read_lock(); |
| 7739 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | 7988 | sd = rcu_dereference_check_sched_domain(this_rq->sd); |
| 7740 | if (sd) | 7989 | if (sd) |
| 7741 | update_next_balance(sd, 0, &next_balance); | 7990 | update_next_balance(sd, &next_balance); |
| 7742 | rcu_read_unlock(); | 7991 | rcu_read_unlock(); |
| 7743 | 7992 | ||
| 7744 | goto out; | 7993 | goto out; |
| @@ -7756,7 +8005,7 @@ static int idle_balance(struct rq *this_rq) | |||
| 7756 | continue; | 8005 | continue; |
| 7757 | 8006 | ||
| 7758 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { | 8007 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { |
| 7759 | update_next_balance(sd, 0, &next_balance); | 8008 | update_next_balance(sd, &next_balance); |
| 7760 | break; | 8009 | break; |
| 7761 | } | 8010 | } |
| 7762 | 8011 | ||
| @@ -7774,7 +8023,7 @@ static int idle_balance(struct rq *this_rq) | |||
| 7774 | curr_cost += domain_cost; | 8023 | curr_cost += domain_cost; |
| 7775 | } | 8024 | } |
| 7776 | 8025 | ||
| 7777 | update_next_balance(sd, 0, &next_balance); | 8026 | update_next_balance(sd, &next_balance); |
| 7778 | 8027 | ||
| 7779 | /* | 8028 | /* |
| 7780 | * Stop searching for tasks to pull if there are | 8029 | * Stop searching for tasks to pull if there are |
| @@ -7864,15 +8113,15 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 7864 | .idle = CPU_IDLE, | 8113 | .idle = CPU_IDLE, |
| 7865 | }; | 8114 | }; |
| 7866 | 8115 | ||
| 7867 | schedstat_inc(sd, alb_count); | 8116 | schedstat_inc(sd->alb_count); |
| 7868 | 8117 | ||
| 7869 | p = detach_one_task(&env); | 8118 | p = detach_one_task(&env); |
| 7870 | if (p) { | 8119 | if (p) { |
| 7871 | schedstat_inc(sd, alb_pushed); | 8120 | schedstat_inc(sd->alb_pushed); |
| 7872 | /* Active balancing done, reset the failure counter. */ | 8121 | /* Active balancing done, reset the failure counter. */ |
| 7873 | sd->nr_balance_failed = 0; | 8122 | sd->nr_balance_failed = 0; |
| 7874 | } else { | 8123 | } else { |
| 7875 | schedstat_inc(sd, alb_failed); | 8124 | schedstat_inc(sd->alb_failed); |
| 7876 | } | 8125 | } |
| 7877 | } | 8126 | } |
| 7878 | rcu_read_unlock(); | 8127 | rcu_read_unlock(); |
| @@ -7964,13 +8213,13 @@ static inline void set_cpu_sd_state_busy(void) | |||
| 7964 | int cpu = smp_processor_id(); | 8213 | int cpu = smp_processor_id(); |
| 7965 | 8214 | ||
| 7966 | rcu_read_lock(); | 8215 | rcu_read_lock(); |
| 7967 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8216 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); |
| 7968 | 8217 | ||
| 7969 | if (!sd || !sd->nohz_idle) | 8218 | if (!sd || !sd->nohz_idle) |
| 7970 | goto unlock; | 8219 | goto unlock; |
| 7971 | sd->nohz_idle = 0; | 8220 | sd->nohz_idle = 0; |
| 7972 | 8221 | ||
| 7973 | atomic_inc(&sd->groups->sgc->nr_busy_cpus); | 8222 | atomic_inc(&sd->shared->nr_busy_cpus); |
| 7974 | unlock: | 8223 | unlock: |
| 7975 | rcu_read_unlock(); | 8224 | rcu_read_unlock(); |
| 7976 | } | 8225 | } |
| @@ -7981,13 +8230,13 @@ void set_cpu_sd_state_idle(void) | |||
| 7981 | int cpu = smp_processor_id(); | 8230 | int cpu = smp_processor_id(); |
| 7982 | 8231 | ||
| 7983 | rcu_read_lock(); | 8232 | rcu_read_lock(); |
| 7984 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8233 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); |
| 7985 | 8234 | ||
| 7986 | if (!sd || sd->nohz_idle) | 8235 | if (!sd || sd->nohz_idle) |
| 7987 | goto unlock; | 8236 | goto unlock; |
| 7988 | sd->nohz_idle = 1; | 8237 | sd->nohz_idle = 1; |
| 7989 | 8238 | ||
| 7990 | atomic_dec(&sd->groups->sgc->nr_busy_cpus); | 8239 | atomic_dec(&sd->shared->nr_busy_cpus); |
| 7991 | unlock: | 8240 | unlock: |
| 7992 | rcu_read_unlock(); | 8241 | rcu_read_unlock(); |
| 7993 | } | 8242 | } |
| @@ -8214,8 +8463,8 @@ end: | |||
| 8214 | static inline bool nohz_kick_needed(struct rq *rq) | 8463 | static inline bool nohz_kick_needed(struct rq *rq) |
| 8215 | { | 8464 | { |
| 8216 | unsigned long now = jiffies; | 8465 | unsigned long now = jiffies; |
| 8466 | struct sched_domain_shared *sds; | ||
| 8217 | struct sched_domain *sd; | 8467 | struct sched_domain *sd; |
| 8218 | struct sched_group_capacity *sgc; | ||
| 8219 | int nr_busy, cpu = rq->cpu; | 8468 | int nr_busy, cpu = rq->cpu; |
| 8220 | bool kick = false; | 8469 | bool kick = false; |
| 8221 | 8470 | ||
| @@ -8243,11 +8492,13 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
| 8243 | return true; | 8492 | return true; |
| 8244 | 8493 | ||
| 8245 | rcu_read_lock(); | 8494 | rcu_read_lock(); |
| 8246 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 8495 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); |
| 8247 | if (sd) { | 8496 | if (sds) { |
| 8248 | sgc = sd->groups->sgc; | 8497 | /* |
| 8249 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 8498 | * XXX: write a coherent comment on why we do this. |
| 8250 | 8499 | * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com | |
| 8500 | */ | ||
| 8501 | nr_busy = atomic_read(&sds->nr_busy_cpus); | ||
| 8251 | if (nr_busy > 1) { | 8502 | if (nr_busy > 1) { |
| 8252 | kick = true; | 8503 | kick = true; |
| 8253 | goto unlock; | 8504 | goto unlock; |
| @@ -8283,7 +8534,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | |||
| 8283 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 8534 | * run_rebalance_domains is triggered when needed from the scheduler tick. |
| 8284 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). | 8535 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). |
| 8285 | */ | 8536 | */ |
| 8286 | static void run_rebalance_domains(struct softirq_action *h) | 8537 | static __latent_entropy void run_rebalance_domains(struct softirq_action *h) |
| 8287 | { | 8538 | { |
| 8288 | struct rq *this_rq = this_rq(); | 8539 | struct rq *this_rq = this_rq(); |
| 8289 | enum cpu_idle_type idle = this_rq->idle_balance ? | 8540 | enum cpu_idle_type idle = this_rq->idle_balance ? |
| @@ -8441,7 +8692,6 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
| 8441 | struct sched_entity *se = &p->se; | 8692 | struct sched_entity *se = &p->se; |
| 8442 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8693 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 8443 | u64 now = cfs_rq_clock_task(cfs_rq); | 8694 | u64 now = cfs_rq_clock_task(cfs_rq); |
| 8444 | int tg_update; | ||
| 8445 | 8695 | ||
| 8446 | if (!vruntime_normalized(p)) { | 8696 | if (!vruntime_normalized(p)) { |
| 8447 | /* | 8697 | /* |
| @@ -8453,10 +8703,9 @@ static void detach_task_cfs_rq(struct task_struct *p) | |||
| 8453 | } | 8703 | } |
| 8454 | 8704 | ||
| 8455 | /* Catch up with the cfs_rq and remove our load when we leave */ | 8705 | /* Catch up with the cfs_rq and remove our load when we leave */ |
| 8456 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 8706 | update_cfs_rq_load_avg(now, cfs_rq, false); |
| 8457 | detach_entity_load_avg(cfs_rq, se); | 8707 | detach_entity_load_avg(cfs_rq, se); |
| 8458 | if (tg_update) | 8708 | update_tg_load_avg(cfs_rq, false); |
| 8459 | update_tg_load_avg(cfs_rq, false); | ||
| 8460 | } | 8709 | } |
| 8461 | 8710 | ||
| 8462 | static void attach_task_cfs_rq(struct task_struct *p) | 8711 | static void attach_task_cfs_rq(struct task_struct *p) |
| @@ -8464,7 +8713,6 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
| 8464 | struct sched_entity *se = &p->se; | 8713 | struct sched_entity *se = &p->se; |
| 8465 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 8714 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 8466 | u64 now = cfs_rq_clock_task(cfs_rq); | 8715 | u64 now = cfs_rq_clock_task(cfs_rq); |
| 8467 | int tg_update; | ||
| 8468 | 8716 | ||
| 8469 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8717 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8470 | /* | 8718 | /* |
| @@ -8475,10 +8723,9 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
| 8475 | #endif | 8723 | #endif |
| 8476 | 8724 | ||
| 8477 | /* Synchronize task with its cfs_rq */ | 8725 | /* Synchronize task with its cfs_rq */ |
| 8478 | tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); | 8726 | update_cfs_rq_load_avg(now, cfs_rq, false); |
| 8479 | attach_entity_load_avg(cfs_rq, se); | 8727 | attach_entity_load_avg(cfs_rq, se); |
| 8480 | if (tg_update) | 8728 | update_tg_load_avg(cfs_rq, false); |
| 8481 | update_tg_load_avg(cfs_rq, false); | ||
| 8482 | 8729 | ||
| 8483 | if (!vruntime_normalized(p)) | 8730 | if (!vruntime_normalized(p)) |
| 8484 | se->vruntime += cfs_rq->min_vruntime; | 8731 | se->vruntime += cfs_rq->min_vruntime; |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9fb873cfc75c..1d8718d5300d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -16,6 +16,9 @@ | |||
| 16 | 16 | ||
| 17 | #include "sched.h" | 17 | #include "sched.h" |
| 18 | 18 | ||
| 19 | /* Linker adds these: start and end of __cpuidle functions */ | ||
| 20 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; | ||
| 21 | |||
| 19 | /** | 22 | /** |
| 20 | * sched_idle_set_state - Record idle state for the current CPU. | 23 | * sched_idle_set_state - Record idle state for the current CPU. |
| 21 | * @idle_state: State to record. | 24 | * @idle_state: State to record. |
| @@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) | |||
| 53 | __setup("hlt", cpu_idle_nopoll_setup); | 56 | __setup("hlt", cpu_idle_nopoll_setup); |
| 54 | #endif | 57 | #endif |
| 55 | 58 | ||
| 56 | static inline int cpu_idle_poll(void) | 59 | static noinline int __cpuidle cpu_idle_poll(void) |
| 57 | { | 60 | { |
| 58 | rcu_idle_enter(); | 61 | rcu_idle_enter(); |
| 59 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 62 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
| @@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void) | |||
| 84 | * | 87 | * |
| 85 | * To use when the cpuidle framework cannot be used. | 88 | * To use when the cpuidle framework cannot be used. |
| 86 | */ | 89 | */ |
| 87 | void default_idle_call(void) | 90 | void __cpuidle default_idle_call(void) |
| 88 | { | 91 | { |
| 89 | if (current_clr_polling_and_test()) { | 92 | if (current_clr_polling_and_test()) { |
| 90 | local_irq_enable(); | 93 | local_irq_enable(); |
| @@ -271,6 +274,12 @@ static void cpu_idle_loop(void) | |||
| 271 | } | 274 | } |
| 272 | } | 275 | } |
| 273 | 276 | ||
| 277 | bool cpu_in_idle(unsigned long pc) | ||
| 278 | { | ||
| 279 | return pc >= (unsigned long)__cpuidle_text_start && | ||
| 280 | pc < (unsigned long)__cpuidle_text_end; | ||
| 281 | } | ||
| 282 | |||
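cpu_in_idle() relies on the idle functions being grouped into one contiguous text range. The two boundary symbols are expected to come from a linker-script helper along the lines of the assumed, simplified sketch below; the real definition would live in include/asm-generic/vmlinux.lds.h, not in this file.

	/* Assumed shape of the linker-script helper that collects __cpuidle functions
	 * and emits the boundary symbols consumed by cpu_in_idle() above. */
	#define CPUIDLE_TEXT						\
			ALIGN_FUNCTION();				\
			VMLINUX_SYMBOL(__cpuidle_text_start) = .;	\
			*(.cpuidle.text)				\
			VMLINUX_SYMBOL(__cpuidle_text_end) = .;

Marking a function __cpuidle then presumably just places it in .cpuidle.text via a section attribute, which is what the noinline __cpuidle annotations above rely on.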
| 274 | void cpu_startup_entry(enum cpuhp_state state) | 283 | void cpu_startup_entry(enum cpuhp_state state) |
| 275 | { | 284 | { |
| 276 | /* | 285 | /* |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 2ce5458bbe1d..5405d3feb112 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
| @@ -27,8 +27,8 @@ static struct task_struct * | |||
| 27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) |
| 28 | { | 28 | { |
| 29 | put_prev_task(rq, prev); | 29 | put_prev_task(rq, prev); |
| 30 | 30 | update_idle_core(rq); | |
| 31 | schedstat_inc(rq, sched_goidle); | 31 | schedstat_inc(rq->sched_goidle); |
| 32 | return rq->idle; | 32 | return rq->idle; |
| 33 | } | 33 | } |
| 34 | 34 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d5690b722691..2516b8df6dbb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -957,9 +957,8 @@ static void update_curr_rt(struct rq *rq) | |||
| 957 | if (unlikely((s64)delta_exec <= 0)) | 957 | if (unlikely((s64)delta_exec <= 0)) |
| 958 | return; | 958 | return; |
| 959 | 959 | ||
| 960 | /* Kick cpufreq (see the comment in linux/cpufreq.h). */ | 960 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ |
| 961 | if (cpu_of(rq) == smp_processor_id()) | 961 | cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); |
| 962 | cpufreq_trigger_update(rq_clock(rq)); | ||
| 963 | 962 | ||
| 964 | schedstat_set(curr->se.statistics.exec_max, | 963 | schedstat_set(curr->se.statistics.exec_max, |
| 965 | max(curr->se.statistics.exec_max, delta_exec)); | 964 | max(curr->se.statistics.exec_max, delta_exec)); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc5114004..055f935d4421 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
| 3 | #include <linux/sched/sysctl.h> | 3 | #include <linux/sched/sysctl.h> |
| 4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
| 5 | #include <linux/u64_stats_sync.h> | ||
| 5 | #include <linux/sched/deadline.h> | 6 | #include <linux/sched/deadline.h> |
| 6 | #include <linux/binfmts.h> | 7 | #include <linux/binfmts.h> |
| 7 | #include <linux/mutex.h> | 8 | #include <linux/mutex.h> |
| @@ -15,6 +16,12 @@ | |||
| 15 | #include "cpudeadline.h" | 16 | #include "cpudeadline.h" |
| 16 | #include "cpuacct.h" | 17 | #include "cpuacct.h" |
| 17 | 18 | ||
| 19 | #ifdef CONFIG_SCHED_DEBUG | ||
| 20 | #define SCHED_WARN_ON(x) WARN_ONCE(x, #x) | ||
| 21 | #else | ||
| 22 | #define SCHED_WARN_ON(x) ((void)(x)) | ||
| 23 | #endif | ||
| 24 | |||
| 18 | struct rq; | 25 | struct rq; |
| 19 | struct cpuidle_state; | 26 | struct cpuidle_state; |
| 20 | 27 | ||
| @@ -565,6 +572,8 @@ struct root_domain { | |||
| 565 | */ | 572 | */ |
| 566 | cpumask_var_t rto_mask; | 573 | cpumask_var_t rto_mask; |
| 567 | struct cpupri cpupri; | 574 | struct cpupri cpupri; |
| 575 | |||
| 576 | unsigned long max_cpu_capacity; | ||
| 568 | }; | 577 | }; |
| 569 | 578 | ||
| 570 | extern struct root_domain def_root_domain; | 579 | extern struct root_domain def_root_domain; |
| @@ -597,7 +606,6 @@ struct rq { | |||
| 597 | #ifdef CONFIG_SMP | 606 | #ifdef CONFIG_SMP |
| 598 | unsigned long last_load_update_tick; | 607 | unsigned long last_load_update_tick; |
| 599 | #endif /* CONFIG_SMP */ | 608 | #endif /* CONFIG_SMP */ |
| 600 | u64 nohz_stamp; | ||
| 601 | unsigned long nohz_flags; | 609 | unsigned long nohz_flags; |
| 602 | #endif /* CONFIG_NO_HZ_COMMON */ | 610 | #endif /* CONFIG_NO_HZ_COMMON */ |
| 603 | #ifdef CONFIG_NO_HZ_FULL | 611 | #ifdef CONFIG_NO_HZ_FULL |
| @@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq) | |||
| 723 | #endif | 731 | #endif |
| 724 | } | 732 | } |
| 725 | 733 | ||
| 734 | |||
| 735 | #ifdef CONFIG_SCHED_SMT | ||
| 736 | |||
| 737 | extern struct static_key_false sched_smt_present; | ||
| 738 | |||
| 739 | extern void __update_idle_core(struct rq *rq); | ||
| 740 | |||
| 741 | static inline void update_idle_core(struct rq *rq) | ||
| 742 | { | ||
| 743 | if (static_branch_unlikely(&sched_smt_present)) | ||
| 744 | __update_idle_core(rq); | ||
| 745 | } | ||
| 746 | |||
| 747 | #else | ||
| 748 | static inline void update_idle_core(struct rq *rq) { } | ||
| 749 | #endif | ||
| 750 | |||
| 726 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 751 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 727 | 752 | ||
| 728 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 753 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
| @@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
| 857 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 882 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
| 858 | DECLARE_PER_CPU(int, sd_llc_size); | 883 | DECLARE_PER_CPU(int, sd_llc_size); |
| 859 | DECLARE_PER_CPU(int, sd_llc_id); | 884 | DECLARE_PER_CPU(int, sd_llc_id); |
| 885 | DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
| 860 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); | 886 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); |
| 861 | DECLARE_PER_CPU(struct sched_domain *, sd_busy); | ||
| 862 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | 887 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); |
| 863 | 888 | ||
| 864 | struct sched_group_capacity { | 889 | struct sched_group_capacity { |
| @@ -870,10 +895,6 @@ struct sched_group_capacity { | |||
| 870 | unsigned int capacity; | 895 | unsigned int capacity; |
| 871 | unsigned long next_update; | 896 | unsigned long next_update; |
| 872 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 897 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
| 873 | /* | ||
| 874 | * Number of busy cpus in this group. | ||
| 875 | */ | ||
| 876 | atomic_t nr_busy_cpus; | ||
| 877 | 898 | ||
| 878 | unsigned long cpumask[0]; /* iteration mask */ | 899 | unsigned long cpumask[0]; /* iteration mask */ |
| 879 | }; | 900 | }; |
| @@ -1000,7 +1021,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
| 1000 | * per-task data have been completed by this moment. | 1021 | * per-task data have been completed by this moment. |
| 1001 | */ | 1022 | */ |
| 1002 | smp_wmb(); | 1023 | smp_wmb(); |
| 1024 | #ifdef CONFIG_THREAD_INFO_IN_TASK | ||
| 1025 | p->cpu = cpu; | ||
| 1026 | #else | ||
| 1003 | task_thread_info(p)->cpu = cpu; | 1027 | task_thread_info(p)->cpu = cpu; |
| 1028 | #endif | ||
| 1004 | p->wake_cpu = cpu; | 1029 | p->wake_cpu = cpu; |
| 1005 | #endif | 1030 | #endif |
| 1006 | } | 1031 | } |
| @@ -1260,6 +1285,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
| 1260 | prev->sched_class->put_prev_task(rq, prev); | 1285 | prev->sched_class->put_prev_task(rq, prev); |
| 1261 | } | 1286 | } |
| 1262 | 1287 | ||
| 1288 | static inline void set_curr_task(struct rq *rq, struct task_struct *curr) | ||
| 1289 | { | ||
| 1290 | curr->sched_class->set_curr_task(rq); | ||
| 1291 | } | ||
| 1292 | |||
| 1263 | #define sched_class_highest (&stop_sched_class) | 1293 | #define sched_class_highest (&stop_sched_class) |
| 1264 | #define for_each_class(class) \ | 1294 | #define for_each_class(class) \ |
| 1265 | for (class = sched_class_highest; class; class = class->next) | 1295 | for (class = sched_class_highest; class; class = class->next) |
| @@ -1290,7 +1320,7 @@ static inline void idle_set_state(struct rq *rq, | |||
| 1290 | 1320 | ||
| 1291 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | 1321 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) |
| 1292 | { | 1322 | { |
| 1293 | WARN_ON(!rcu_read_lock_held()); | 1323 | SCHED_WARN_ON(!rcu_read_lock_held()); |
| 1294 | return rq->idle_state; | 1324 | return rq->idle_state; |
| 1295 | } | 1325 | } |
| 1296 | #else | 1326 | #else |
| @@ -1710,52 +1740,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } | |||
| 1710 | #endif | 1740 | #endif |
| 1711 | 1741 | ||
| 1712 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1742 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| 1743 | struct irqtime { | ||
| 1744 | u64 hardirq_time; | ||
| 1745 | u64 softirq_time; | ||
| 1746 | u64 irq_start_time; | ||
| 1747 | struct u64_stats_sync sync; | ||
| 1748 | }; | ||
| 1713 | 1749 | ||
| 1714 | DECLARE_PER_CPU(u64, cpu_hardirq_time); | 1750 | DECLARE_PER_CPU(struct irqtime, cpu_irqtime); |
| 1715 | DECLARE_PER_CPU(u64, cpu_softirq_time); | ||
| 1716 | |||
| 1717 | #ifndef CONFIG_64BIT | ||
| 1718 | DECLARE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 1719 | |||
| 1720 | static inline void irq_time_write_begin(void) | ||
| 1721 | { | ||
| 1722 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 1723 | smp_wmb(); | ||
| 1724 | } | ||
| 1725 | |||
| 1726 | static inline void irq_time_write_end(void) | ||
| 1727 | { | ||
| 1728 | smp_wmb(); | ||
| 1729 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 1730 | } | ||
| 1731 | 1751 | ||
| 1732 | static inline u64 irq_time_read(int cpu) | 1752 | static inline u64 irq_time_read(int cpu) |
| 1733 | { | 1753 | { |
| 1734 | u64 irq_time; | 1754 | struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); |
| 1735 | unsigned seq; | 1755 | unsigned int seq; |
| 1756 | u64 total; | ||
| 1736 | 1757 | ||
| 1737 | do { | 1758 | do { |
| 1738 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | 1759 | seq = __u64_stats_fetch_begin(&irqtime->sync); |
| 1739 | irq_time = per_cpu(cpu_softirq_time, cpu) + | 1760 | total = irqtime->softirq_time + irqtime->hardirq_time; |
| 1740 | per_cpu(cpu_hardirq_time, cpu); | 1761 | } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); |
| 1741 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
| 1742 | |||
| 1743 | return irq_time; | ||
| 1744 | } | ||
| 1745 | #else /* CONFIG_64BIT */ | ||
| 1746 | static inline void irq_time_write_begin(void) | ||
| 1747 | { | ||
| 1748 | } | ||
| 1749 | |||
| 1750 | static inline void irq_time_write_end(void) | ||
| 1751 | { | ||
| 1752 | } | ||
| 1753 | 1762 | ||
| 1754 | static inline u64 irq_time_read(int cpu) | 1763 | return total; |
| 1755 | { | ||
| 1756 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
| 1757 | } | 1764 | } |
| 1758 | #endif /* CONFIG_64BIT */ | ||
| 1759 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 1765 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 1760 | 1766 | ||
| 1761 | #ifdef CONFIG_CPU_FREQ | 1767 | #ifdef CONFIG_CPU_FREQ |
| @@ -1763,27 +1769,13 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | |||
| 1763 | 1769 | ||
| 1764 | /** | 1770 | /** |
| 1765 | * cpufreq_update_util - Take a note about CPU utilization changes. | 1771 | * cpufreq_update_util - Take a note about CPU utilization changes. |
| 1766 | * @time: Current time. | 1772 | * @rq: Runqueue to carry out the update for. |
| 1767 | * @util: Current utilization. | 1773 | * @flags: Update reason flags. |
| 1768 | * @max: Utilization ceiling. | ||
| 1769 | * | 1774 | * |
| 1770 | * This function is called by the scheduler on every invocation of | 1775 | * This function is called by the scheduler on the CPU whose utilization is |
| 1771 | * update_load_avg() on the CPU whose utilization is being updated. | 1776 | * being updated. |
| 1772 | * | 1777 | * |
| 1773 | * It can only be called from RCU-sched read-side critical sections. | 1778 | * It can only be called from RCU-sched read-side critical sections. |
| 1774 | */ | ||
| 1775 | static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) | ||
| 1776 | { | ||
| 1777 | struct update_util_data *data; | ||
| 1778 | |||
| 1779 | data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); | ||
| 1780 | if (data) | ||
| 1781 | data->func(data, time, util, max); | ||
| 1782 | } | ||
| 1783 | |||
| 1784 | /** | ||
| 1785 | * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. | ||
| 1786 | * @time: Current time. | ||
| 1787 | * | 1779 | * |
| 1788 | * The way cpufreq is currently arranged requires it to evaluate the CPU | 1780 | * The way cpufreq is currently arranged requires it to evaluate the CPU |
| 1789 | * performance state (frequency/voltage) on a regular basis to prevent it from | 1781 | * performance state (frequency/voltage) on a regular basis to prevent it from |
| @@ -1797,13 +1789,23 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo | |||
| 1797 | * but that really is a band-aid. Going forward it should be replaced with | 1789 | * but that really is a band-aid. Going forward it should be replaced with |
| 1798 | * solutions targeted more specifically at RT and DL tasks. | 1790 | * solutions targeted more specifically at RT and DL tasks. |
| 1799 | */ | 1791 | */ |
| 1800 | static inline void cpufreq_trigger_update(u64 time) | 1792 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) |
| 1793 | { | ||
| 1794 | struct update_util_data *data; | ||
| 1795 | |||
| 1796 | data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); | ||
| 1797 | if (data) | ||
| 1798 | data->func(data, rq_clock(rq), flags); | ||
| 1799 | } | ||
| 1800 | |||
| 1801 | static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) | ||
| 1801 | { | 1802 | { |
| 1802 | cpufreq_update_util(time, ULONG_MAX, 0); | 1803 | if (cpu_of(rq) == smp_processor_id()) |
| 1804 | cpufreq_update_util(rq, flags); | ||
| 1803 | } | 1805 | } |
| 1804 | #else | 1806 | #else |
| 1805 | static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} | 1807 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} |
| 1806 | static inline void cpufreq_trigger_update(u64 time) {} | 1808 | static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} |
| 1807 | #endif /* CONFIG_CPU_FREQ */ | 1809 | #endif /* CONFIG_CPU_FREQ */ |
| 1808 | 1810 | ||
| 1809 | #ifdef arch_scale_freq_capacity | 1811 | #ifdef arch_scale_freq_capacity |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 78955cbea31c..34659a853505 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
| @@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
| 29 | if (rq) | 29 | if (rq) |
| 30 | rq->rq_sched_info.run_delay += delta; | 30 | rq->rq_sched_info.run_delay += delta; |
| 31 | } | 31 | } |
| 32 | # define schedstat_enabled() static_branch_unlikely(&sched_schedstats) | 32 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) |
| 33 | # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) | 33 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) |
| 34 | # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) | 34 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) |
| 35 | # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | 35 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) |
| 36 | # define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) | 36 | #define schedstat_val(var) (var) |
| 37 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) | ||
| 37 | 38 | ||
| 38 | #else /* !CONFIG_SCHEDSTATS */ | 39 | #else /* !CONFIG_SCHEDSTATS */ |
| 39 | static inline void | 40 | static inline void |
| @@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
| 45 | static inline void | 46 | static inline void |
| 46 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 47 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) |
| 47 | {} | 48 | {} |
| 48 | # define schedstat_enabled() 0 | 49 | #define schedstat_enabled() 0 |
| 49 | # define schedstat_inc(rq, field) do { } while (0) | 50 | #define schedstat_inc(var) do { } while (0) |
| 50 | # define schedstat_add(rq, field, amt) do { } while (0) | 51 | #define schedstat_add(var, amt) do { } while (0) |
| 51 | # define schedstat_set(var, val) do { } while (0) | 52 | #define schedstat_set(var, val) do { } while (0) |
| 52 | # define schedstat_val(rq, field) 0 | 53 | #define schedstat_val(var) 0 |
| 53 | #endif | 54 | #define schedstat_val_or_zero(var) 0 |
| 55 | #endif /* CONFIG_SCHEDSTATS */ | ||
| 54 | 56 | ||
| 55 | #ifdef CONFIG_SCHED_INFO | 57 | #ifdef CONFIG_SCHED_INFO |
| 56 | static inline void sched_info_reset_dequeued(struct task_struct *t) | 58 | static inline void sched_info_reset_dequeued(struct task_struct *t) |
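The stats.h hunk switches the schedstat macros from an (rq, field) pair to a plain variable and adds schedstat_val_or_zero(). Below is a small userspace model of the new macro shapes, assuming an ordinary boolean in place of the sched_schedstats static key; wait_count and wait_sum are made-up example counters.

        /* sketch of the new-style schedstat macros; userspace model */
        #include <stdio.h>
        #include <stdbool.h>

        static bool schedstats_on = true;       /* stands in for the static key */

        #define schedstat_enabled()        (schedstats_on)
        #define schedstat_inc(var)         do { if (schedstat_enabled()) { (var)++; } } while (0)
        #define schedstat_add(var, amt)    do { if (schedstat_enabled()) { (var) += (amt); } } while (0)
        #define schedstat_set(var, val)    do { if (schedstat_enabled()) { (var) = (val); } } while (0)
        #define schedstat_val(var)         (var)
        #define schedstat_val_or_zero(var) (schedstat_enabled() ? (var) : 0)

        int main(void)
        {
                unsigned long wait_count = 0, wait_sum = 0;

                schedstat_inc(wait_count);
                schedstat_add(wait_sum, 125);
                printf("count=%lu sum=%lu (or-zero=%lu)\n",
                       schedstat_val(wait_count),
                       schedstat_val(wait_sum),
                       schedstat_val_or_zero(wait_sum));
                return 0;
        }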
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f15d6b6a538a..4f7053579fe3 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
| @@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | |||
| 196 | } | 196 | } |
| 197 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | 197 | EXPORT_SYMBOL(prepare_to_wait_exclusive); |
| 198 | 198 | ||
| 199 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | 199 | void init_wait_entry(wait_queue_t *wait, int flags) |
| 200 | { | 200 | { |
| 201 | unsigned long flags; | 201 | wait->flags = flags; |
| 202 | |||
| 203 | if (signal_pending_state(state, current)) | ||
| 204 | return -ERESTARTSYS; | ||
| 205 | |||
| 206 | wait->private = current; | 202 | wait->private = current; |
| 207 | wait->func = autoremove_wake_function; | 203 | wait->func = autoremove_wake_function; |
| 204 | INIT_LIST_HEAD(&wait->task_list); | ||
| 205 | } | ||
| 206 | EXPORT_SYMBOL(init_wait_entry); | ||
| 207 | |||
| 208 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | ||
| 209 | { | ||
| 210 | unsigned long flags; | ||
| 211 | long ret = 0; | ||
| 208 | 212 | ||
| 209 | spin_lock_irqsave(&q->lock, flags); | 213 | spin_lock_irqsave(&q->lock, flags); |
| 210 | if (list_empty(&wait->task_list)) { | 214 | if (unlikely(signal_pending_state(state, current))) { |
| 211 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | 215 | /* |
| 212 | __add_wait_queue_tail(q, wait); | 216 | * Exclusive waiter must not fail if it was selected by wakeup, |
| 213 | else | 217 | * it should "consume" the condition we were waiting for. |
| 214 | __add_wait_queue(q, wait); | 218 | * |
| 219 | * The caller will recheck the condition and return success if | ||
| 220 | * we were already woken up, we can not miss the event because | ||
| 221 | * wakeup locks/unlocks the same q->lock. | ||
| 222 | * | ||
| 223 | * But we need to ensure that set-condition + wakeup after that | ||
| 224 | * can't see us, it should wake up another exclusive waiter if | ||
| 225 | * we fail. | ||
| 226 | */ | ||
| 227 | list_del_init(&wait->task_list); | ||
| 228 | ret = -ERESTARTSYS; | ||
| 229 | } else { | ||
| 230 | if (list_empty(&wait->task_list)) { | ||
| 231 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | ||
| 232 | __add_wait_queue_tail(q, wait); | ||
| 233 | else | ||
| 234 | __add_wait_queue(q, wait); | ||
| 235 | } | ||
| 236 | set_current_state(state); | ||
| 215 | } | 237 | } |
| 216 | set_current_state(state); | ||
| 217 | spin_unlock_irqrestore(&q->lock, flags); | 238 | spin_unlock_irqrestore(&q->lock, flags); |
| 218 | 239 | ||
| 219 | return 0; | 240 | return ret; |
| 220 | } | 241 | } |
| 221 | EXPORT_SYMBOL(prepare_to_wait_event); | 242 | EXPORT_SYMBOL(prepare_to_wait_event); |
| 222 | 243 | ||
| @@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) | |||
| 255 | } | 276 | } |
| 256 | EXPORT_SYMBOL(finish_wait); | 277 | EXPORT_SYMBOL(finish_wait); |
| 257 | 278 | ||
| 258 | /** | ||
| 259 | * abort_exclusive_wait - abort exclusive waiting in a queue | ||
| 260 | * @q: waitqueue waited on | ||
| 261 | * @wait: wait descriptor | ||
| 262 | * @mode: runstate of the waiter to be woken | ||
| 263 | * @key: key to identify a wait bit queue or %NULL | ||
| 264 | * | ||
| 265 | * Sets current thread back to running state and removes | ||
| 266 | * the wait descriptor from the given waitqueue if still | ||
| 267 | * queued. | ||
| 268 | * | ||
| 269 | * Wakes up the next waiter if the caller is concurrently | ||
| 270 | * woken up through the queue. | ||
| 271 | * | ||
| 272 | * This prevents waiter starvation where an exclusive waiter | ||
| 273 | * aborts and is woken up concurrently and no one wakes up | ||
| 274 | * the next waiter. | ||
| 275 | */ | ||
| 276 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | ||
| 277 | unsigned int mode, void *key) | ||
| 278 | { | ||
| 279 | unsigned long flags; | ||
| 280 | |||
| 281 | __set_current_state(TASK_RUNNING); | ||
| 282 | spin_lock_irqsave(&q->lock, flags); | ||
| 283 | if (!list_empty(&wait->task_list)) | ||
| 284 | list_del_init(&wait->task_list); | ||
| 285 | else if (waitqueue_active(q)) | ||
| 286 | __wake_up_locked_key(q, mode, key); | ||
| 287 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 288 | } | ||
| 289 | EXPORT_SYMBOL(abort_exclusive_wait); | ||
| 290 | |||
| 291 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) | 279 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) |
| 292 | { | 280 | { |
| 293 | int ret = default_wake_function(wait, mode, sync, key); | 281 | int ret = default_wake_function(wait, mode, sync, key); |
| @@ -425,20 +413,29 @@ int __sched | |||
| 425 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | 413 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, |
| 426 | wait_bit_action_f *action, unsigned mode) | 414 | wait_bit_action_f *action, unsigned mode) |
| 427 | { | 415 | { |
| 428 | do { | 416 | int ret = 0; |
| 429 | int ret; | ||
| 430 | 417 | ||
| 418 | for (;;) { | ||
| 431 | prepare_to_wait_exclusive(wq, &q->wait, mode); | 419 | prepare_to_wait_exclusive(wq, &q->wait, mode); |
| 432 | if (!test_bit(q->key.bit_nr, q->key.flags)) | 420 | if (test_bit(q->key.bit_nr, q->key.flags)) { |
| 433 | continue; | 421 | ret = action(&q->key, mode); |
| 434 | ret = action(&q->key, mode); | 422 | /* |
| 435 | if (!ret) | 423 | * See the comment in prepare_to_wait_event(). |
| 436 | continue; | 424 | * finish_wait() does not necessarily take wq->lock, |
| 437 | abort_exclusive_wait(wq, &q->wait, mode, &q->key); | 425 | * but test_and_set_bit() implies mb() which pairs with |
| 438 | return ret; | 426 | * smp_mb__after_atomic() before wake_up_page(). |
| 439 | } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); | 427 | */ |
| 440 | finish_wait(wq, &q->wait); | 428 | if (ret) |
| 441 | return 0; | 429 | finish_wait(wq, &q->wait); |
| 430 | } | ||
| 431 | if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { | ||
| 432 | if (!ret) | ||
| 433 | finish_wait(wq, &q->wait); | ||
| 434 | return 0; | ||
| 435 | } else if (ret) { | ||
| 436 | return ret; | ||
| 437 | } | ||
| 438 | } | ||
| 442 | } | 439 | } |
| 443 | EXPORT_SYMBOL(__wait_on_bit_lock); | 440 | EXPORT_SYMBOL(__wait_on_bit_lock); |
| 444 | 441 | ||
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ef6c6c3f9d8a..0db7c8a2afe2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -605,12 +605,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, | |||
| 605 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | 605 | ptrace_event(PTRACE_EVENT_SECCOMP, data); |
| 606 | /* | 606 | /* |
| 607 | * The delivery of a fatal signal during event | 607 | * The delivery of a fatal signal during event |
| 608 | * notification may silently skip tracer notification. | 608 | * notification may silently skip tracer notification, |
| 609 | * Terminating the task now avoids executing a system | 609 | * which could leave us with a potentially unmodified |
| 610 | * call that may not be intended. | 610 | * syscall that the tracer would have liked to have |
| 611 | * changed. Since the process is about to die, we just | ||
| 612 | * force the syscall to be skipped and let the signal | ||
| 613 | * kill the process and correctly handle any tracer exit | ||
| 614 | * notifications. | ||
| 611 | */ | 615 | */ |
| 612 | if (fatal_signal_pending(current)) | 616 | if (fatal_signal_pending(current)) |
| 613 | do_exit(SIGSYS); | 617 | goto skip; |
| 614 | /* Check if the tracer forced the syscall to be skipped. */ | 618 | /* Check if the tracer forced the syscall to be skipped. */ |
| 615 | this_syscall = syscall_get_nr(current, task_pt_regs(current)); | 619 | this_syscall = syscall_get_nr(current, task_pt_regs(current)); |
| 616 | if (this_syscall < 0) | 620 | if (this_syscall < 0) |
diff --git a/kernel/signal.c b/kernel/signal.c index af21afc00d08..75761acc77cf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -3044,6 +3044,11 @@ void kernel_sigaction(int sig, __sighandler_t action) | |||
| 3044 | } | 3044 | } |
| 3045 | EXPORT_SYMBOL(kernel_sigaction); | 3045 | EXPORT_SYMBOL(kernel_sigaction); |
| 3046 | 3046 | ||
| 3047 | void __weak sigaction_compat_abi(struct k_sigaction *act, | ||
| 3048 | struct k_sigaction *oact) | ||
| 3049 | { | ||
| 3050 | } | ||
| 3051 | |||
| 3047 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | 3052 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) |
| 3048 | { | 3053 | { |
| 3049 | struct task_struct *p = current, *t; | 3054 | struct task_struct *p = current, *t; |
| @@ -3059,6 +3064,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | |||
| 3059 | if (oact) | 3064 | if (oact) |
| 3060 | *oact = *k; | 3065 | *oact = *k; |
| 3061 | 3066 | ||
| 3067 | sigaction_compat_abi(act, oact); | ||
| 3068 | |||
| 3062 | if (act) { | 3069 | if (act) { |
| 3063 | sigdelsetmask(&act->sa.sa_mask, | 3070 | sigdelsetmask(&act->sa.sa_mask, |
| 3064 | sigmask(SIGKILL) | sigmask(SIGSTOP)); | 3071 | sigmask(SIGKILL) | sigmask(SIGSTOP)); |
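do_sigaction() now calls a __weak sigaction_compat_abi() stub that an architecture can replace with a strong definition. The userspace sketch below demonstrates only the weak-symbol mechanism with a hypothetical compat_abi_fixup(); it is not the kernel hook itself.

        /* sketch: the __weak default-plus-optional-override pattern */
        #include <stdio.h>

        /* Default no-op, analogous to the weak sigaction_compat_abi() stub.
         * A separate object file could supply a strong definition and the
         * linker would pick that one instead of this fallback. */
        __attribute__((weak)) void compat_abi_fixup(void)
        {
                printf("weak default: no compat fixups needed\n");
        }

        int main(void)
        {
                compat_abi_fixup();
                return 0;
        }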
diff --git a/kernel/smp.c b/kernel/smp.c index 3aa642d39c03..bba3b201668d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/smp.h> | 14 | #include <linux/smp.h> |
| 15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
| 16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
| 17 | #include <linux/hypervisor.h> | ||
| 17 | 18 | ||
| 18 | #include "smpboot.h" | 19 | #include "smpboot.h" |
| 19 | 20 | ||
| @@ -724,3 +725,54 @@ void wake_up_all_idle_cpus(void) | |||
| 724 | preempt_enable(); | 725 | preempt_enable(); |
| 725 | } | 726 | } |
| 726 | EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); | 727 | EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); |
| 728 | |||
| 729 | /** | ||
| 730 | * smp_call_on_cpu - Call a function on a specific cpu | ||
| 731 | * | ||
| 732 | * Used to call a function on a specific cpu and wait for it to return. | ||
| 733 | * Optionally make sure the call is done on a specified physical cpu via vcpu | ||
| 734 | * pinning in order to support virtualized environments. | ||
| 735 | */ | ||
| 736 | struct smp_call_on_cpu_struct { | ||
| 737 | struct work_struct work; | ||
| 738 | struct completion done; | ||
| 739 | int (*func)(void *); | ||
| 740 | void *data; | ||
| 741 | int ret; | ||
| 742 | int cpu; | ||
| 743 | }; | ||
| 744 | |||
| 745 | static void smp_call_on_cpu_callback(struct work_struct *work) | ||
| 746 | { | ||
| 747 | struct smp_call_on_cpu_struct *sscs; | ||
| 748 | |||
| 749 | sscs = container_of(work, struct smp_call_on_cpu_struct, work); | ||
| 750 | if (sscs->cpu >= 0) | ||
| 751 | hypervisor_pin_vcpu(sscs->cpu); | ||
| 752 | sscs->ret = sscs->func(sscs->data); | ||
| 753 | if (sscs->cpu >= 0) | ||
| 754 | hypervisor_pin_vcpu(-1); | ||
| 755 | |||
| 756 | complete(&sscs->done); | ||
| 757 | } | ||
| 758 | |||
| 759 | int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) | ||
| 760 | { | ||
| 761 | struct smp_call_on_cpu_struct sscs = { | ||
| 762 | .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), | ||
| 763 | .func = func, | ||
| 764 | .data = par, | ||
| 765 | .cpu = phys ? cpu : -1, | ||
| 766 | }; | ||
| 767 | |||
| 768 | INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); | ||
| 769 | |||
| 770 | if (cpu >= nr_cpu_ids || !cpu_online(cpu)) | ||
| 771 | return -ENXIO; | ||
| 772 | |||
| 773 | queue_work_on(cpu, system_wq, &sscs.work); | ||
| 774 | wait_for_completion(&sscs.done); | ||
| 775 | |||
| 776 | return sscs.ret; | ||
| 777 | } | ||
| 778 | EXPORT_SYMBOL_GPL(smp_call_on_cpu); | ||
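smp_call_on_cpu() above queues a work item on the target CPU's workqueue and blocks on an on-stack completion until the callback has run. The pthread sketch below models just that "hand a function to a worker, wait for completion" shape in userspace (build with -pthread); there is no CPU or vCPU pinning here, and call_on_thread()/say_hello() are invented names.

        /* sketch: queue a callback on a worker and wait for its result */
        #include <pthread.h>
        #include <stdio.h>

        struct call_on_thread {
                int (*func)(void *);
                void *data;
                int ret;
                pthread_mutex_t lock;
                pthread_cond_t done;
                int finished;
        };

        static void *worker(void *arg)
        {
                struct call_on_thread *c = arg;

                c->ret = c->func(c->data);      /* run the callback "remotely" */
                pthread_mutex_lock(&c->lock);
                c->finished = 1;                /* complete(&done) equivalent */
                pthread_cond_signal(&c->done);
                pthread_mutex_unlock(&c->lock);
                return NULL;
        }

        static int call_on_thread(int (*func)(void *), void *data)
        {
                struct call_on_thread c = {
                        .func = func, .data = data,
                        .lock = PTHREAD_MUTEX_INITIALIZER,
                        .done = PTHREAD_COND_INITIALIZER,
                };
                pthread_t t;

                pthread_create(&t, NULL, worker, &c);
                pthread_mutex_lock(&c.lock);
                while (!c.finished)             /* wait_for_completion() */
                        pthread_cond_wait(&c.done, &c.lock);
                pthread_mutex_unlock(&c.lock);
                pthread_join(t, NULL);
                return c.ret;
        }

        static int say_hello(void *data)
        {
                printf("running with \"%s\" on the worker\n", (const char *)data);
                return 42;
        }

        int main(void)
        {
                printf("ret = %d\n", call_on_thread(say_hello, "hello"));
                return 0;
        }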
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 13bc43d1fb22..4a5c6e73ecd4 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
| @@ -186,6 +186,11 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | |||
| 186 | kfree(td); | 186 | kfree(td); |
| 187 | return PTR_ERR(tsk); | 187 | return PTR_ERR(tsk); |
| 188 | } | 188 | } |
| 189 | /* | ||
| 190 | * Park the thread so that it could start right on the CPU | ||
| 191 | * when it is available. | ||
| 192 | */ | ||
| 193 | kthread_park(tsk); | ||
| 189 | get_task_struct(tsk); | 194 | get_task_struct(tsk); |
| 190 | *per_cpu_ptr(ht->store, cpu) = tsk; | 195 | *per_cpu_ptr(ht->store, cpu) = tsk; |
| 191 | if (ht->create) { | 196 | if (ht->create) { |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..1bf81ef91375 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -78,6 +78,17 @@ static void wakeup_softirqd(void) | |||
| 78 | } | 78 | } |
| 79 | 79 | ||
| 80 | /* | 80 | /* |
| 81 | * If ksoftirqd is scheduled, we do not want to process pending softirqs | ||
| 82 | * right now. Let ksoftirqd handle this at its own rate, to get fairness. | ||
| 83 | */ | ||
| 84 | static bool ksoftirqd_running(void) | ||
| 85 | { | ||
| 86 | struct task_struct *tsk = __this_cpu_read(ksoftirqd); | ||
| 87 | |||
| 88 | return tsk && (tsk->state == TASK_RUNNING); | ||
| 89 | } | ||
| 90 | |||
| 91 | /* | ||
| 81 | * preempt_count and SOFTIRQ_OFFSET usage: | 92 | * preempt_count and SOFTIRQ_OFFSET usage: |
| 82 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving | 93 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving |
| 83 | * softirq processing. | 94 | * softirq processing. |
| @@ -313,7 +324,7 @@ asmlinkage __visible void do_softirq(void) | |||
| 313 | 324 | ||
| 314 | pending = local_softirq_pending(); | 325 | pending = local_softirq_pending(); |
| 315 | 326 | ||
| 316 | if (pending) | 327 | if (pending && !ksoftirqd_running()) |
| 317 | do_softirq_own_stack(); | 328 | do_softirq_own_stack(); |
| 318 | 329 | ||
| 319 | local_irq_restore(flags); | 330 | local_irq_restore(flags); |
| @@ -340,6 +351,9 @@ void irq_enter(void) | |||
| 340 | 351 | ||
| 341 | static inline void invoke_softirq(void) | 352 | static inline void invoke_softirq(void) |
| 342 | { | 353 | { |
| 354 | if (ksoftirqd_running()) | ||
| 355 | return; | ||
| 356 | |||
| 343 | if (!force_irqthreads) { | 357 | if (!force_irqthreads) { |
| 344 | #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK | 358 | #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK |
| 345 | /* | 359 | /* |
| @@ -482,7 +496,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) | |||
| 482 | } | 496 | } |
| 483 | EXPORT_SYMBOL(__tasklet_hi_schedule_first); | 497 | EXPORT_SYMBOL(__tasklet_hi_schedule_first); |
| 484 | 498 | ||
| 485 | static void tasklet_action(struct softirq_action *a) | 499 | static __latent_entropy void tasklet_action(struct softirq_action *a) |
| 486 | { | 500 | { |
| 487 | struct tasklet_struct *list; | 501 | struct tasklet_struct *list; |
| 488 | 502 | ||
| @@ -518,7 +532,7 @@ static void tasklet_action(struct softirq_action *a) | |||
| 518 | } | 532 | } |
| 519 | } | 533 | } |
| 520 | 534 | ||
| 521 | static void tasklet_hi_action(struct softirq_action *a) | 535 | static __latent_entropy void tasklet_hi_action(struct softirq_action *a) |
| 522 | { | 536 | { |
| 523 | struct tasklet_struct *list; | 537 | struct tasklet_struct *list; |
| 524 | 538 | ||
| @@ -700,7 +714,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | |||
| 700 | BUG(); | 714 | BUG(); |
| 701 | } | 715 | } |
| 702 | 716 | ||
| 703 | static void takeover_tasklets(unsigned int cpu) | 717 | static int takeover_tasklets(unsigned int cpu) |
| 704 | { | 718 | { |
| 705 | /* CPU is dead, so no lock needed. */ | 719 | /* CPU is dead, so no lock needed. */ |
| 706 | local_irq_disable(); | 720 | local_irq_disable(); |
| @@ -723,27 +737,12 @@ static void takeover_tasklets(unsigned int cpu) | |||
| 723 | raise_softirq_irqoff(HI_SOFTIRQ); | 737 | raise_softirq_irqoff(HI_SOFTIRQ); |
| 724 | 738 | ||
| 725 | local_irq_enable(); | 739 | local_irq_enable(); |
| 740 | return 0; | ||
| 726 | } | 741 | } |
| 742 | #else | ||
| 743 | #define takeover_tasklets NULL | ||
| 727 | #endif /* CONFIG_HOTPLUG_CPU */ | 744 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 728 | 745 | ||
| 729 | static int cpu_callback(struct notifier_block *nfb, unsigned long action, | ||
| 730 | void *hcpu) | ||
| 731 | { | ||
| 732 | switch (action) { | ||
| 733 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 734 | case CPU_DEAD: | ||
| 735 | case CPU_DEAD_FROZEN: | ||
| 736 | takeover_tasklets((unsigned long)hcpu); | ||
| 737 | break; | ||
| 738 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 739 | } | ||
| 740 | return NOTIFY_OK; | ||
| 741 | } | ||
| 742 | |||
| 743 | static struct notifier_block cpu_nfb = { | ||
| 744 | .notifier_call = cpu_callback | ||
| 745 | }; | ||
| 746 | |||
| 747 | static struct smp_hotplug_thread softirq_threads = { | 746 | static struct smp_hotplug_thread softirq_threads = { |
| 748 | .store = &ksoftirqd, | 747 | .store = &ksoftirqd, |
| 749 | .thread_should_run = ksoftirqd_should_run, | 748 | .thread_should_run = ksoftirqd_should_run, |
| @@ -753,8 +752,8 @@ static struct smp_hotplug_thread softirq_threads = { | |||
| 753 | 752 | ||
| 754 | static __init int spawn_ksoftirqd(void) | 753 | static __init int spawn_ksoftirqd(void) |
| 755 | { | 754 | { |
| 756 | register_cpu_notifier(&cpu_nfb); | 755 | cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, |
| 757 | 756 | takeover_tasklets); | |
| 758 | BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); | 757 | BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); |
| 759 | 758 | ||
| 760 | return 0; | 759 | return 0; |
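The softirq changes gate inline softirq processing on ksoftirqd not already being runnable, so a busy ksoftirqd keeps draining pending work at its own pace instead of being bypassed. A toy C model of that gate follows; struct cpu and the state enum are illustrative stand-ins, not kernel structures.

        /* sketch: defer to the per-cpu thread if it is already runnable */
        #include <stdbool.h>
        #include <stdio.h>

        enum thread_state { SLEEPING, RUNNING };

        struct cpu {
                enum thread_state ksoftirqd_state;
                unsigned int pending;
        };

        static bool ksoftirqd_running(const struct cpu *c)
        {
                return c->ksoftirqd_state == RUNNING;
        }

        static void invoke_softirq(struct cpu *c)
        {
                if (ksoftirqd_running(c)) {
                        /* the thread will drain `pending` at its own pace,
                         * keeping fairness with normal tasks */
                        printf("deferred to ksoftirqd\n");
                        return;
                }
                printf("processing %u pending softirqs inline\n", c->pending);
                c->pending = 0;
        }

        int main(void)
        {
                struct cpu c = { .ksoftirqd_state = RUNNING, .pending = 3 };

                invoke_softirq(&c);             /* deferred */
                c.ksoftirqd_state = SLEEPING;
                invoke_softirq(&c);             /* handled inline */
                return 0;
        }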
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4a1ca5f6da7e..ec9ab2f01489 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -20,7 +20,6 @@ | |||
| 20 | #include <linux/kallsyms.h> | 20 | #include <linux/kallsyms.h> |
| 21 | #include <linux/smpboot.h> | 21 | #include <linux/smpboot.h> |
| 22 | #include <linux/atomic.h> | 22 | #include <linux/atomic.h> |
| 23 | #include <linux/lglock.h> | ||
| 24 | #include <linux/nmi.h> | 23 | #include <linux/nmi.h> |
| 25 | 24 | ||
| 26 | /* | 25 | /* |
| @@ -47,13 +46,9 @@ struct cpu_stopper { | |||
| 47 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | 46 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); |
| 48 | static bool stop_machine_initialized = false; | 47 | static bool stop_machine_initialized = false; |
| 49 | 48 | ||
| 50 | /* | 49 | /* static data for stop_cpus */ |
| 51 | * Avoids a race between stop_two_cpus and global stop_cpus, where | 50 | static DEFINE_MUTEX(stop_cpus_mutex); |
| 52 | * the stoppers could get queued up in reverse order, leading to | 51 | static bool stop_cpus_in_progress; |
| 53 | * system deadlock. Using an lglock means stop_two_cpus remains | ||
| 54 | * relatively cheap. | ||
| 55 | */ | ||
| 56 | DEFINE_STATIC_LGLOCK(stop_cpus_lock); | ||
| 57 | 52 | ||
| 58 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) | 53 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) |
| 59 | { | 54 | { |
| @@ -126,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | |||
| 126 | cpu_stop_init_done(&done, 1); | 121 | cpu_stop_init_done(&done, 1); |
| 127 | if (!cpu_stop_queue_work(cpu, &work)) | 122 | if (!cpu_stop_queue_work(cpu, &work)) |
| 128 | return -ENOENT; | 123 | return -ENOENT; |
| 124 | /* | ||
| 125 | * In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup | ||
| 126 | * cycle by doing a preemption: | ||
| 127 | */ | ||
| 128 | cond_resched(); | ||
| 129 | wait_for_completion(&done.completion); | 129 | wait_for_completion(&done.completion); |
| 130 | return done.ret; | 130 | return done.ret; |
| 131 | } | 131 | } |
| @@ -230,14 +230,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, | |||
| 230 | struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); | 230 | struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); |
| 231 | struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); | 231 | struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); |
| 232 | int err; | 232 | int err; |
| 233 | 233 | retry: | |
| 234 | lg_double_lock(&stop_cpus_lock, cpu1, cpu2); | ||
| 235 | spin_lock_irq(&stopper1->lock); | 234 | spin_lock_irq(&stopper1->lock); |
| 236 | spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); | 235 | spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); |
| 237 | 236 | ||
| 238 | err = -ENOENT; | 237 | err = -ENOENT; |
| 239 | if (!stopper1->enabled || !stopper2->enabled) | 238 | if (!stopper1->enabled || !stopper2->enabled) |
| 240 | goto unlock; | 239 | goto unlock; |
| 240 | /* | ||
| 241 | * Ensure that if we race with __stop_cpus() the stoppers won't get | ||
| 242 | * queued up in reverse order leading to system deadlock. | ||
| 243 | * | ||
| 244 | * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has | ||
| 245 | * queued a work on cpu1 but not on cpu2, we hold both locks. | ||
| 246 | * | ||
| 247 | * It can be falsely true but it is safe to spin until it is cleared, | ||
| 248 | * queue_stop_cpus_work() does everything under preempt_disable(). | ||
| 249 | */ | ||
| 250 | err = -EDEADLK; | ||
| 251 | if (unlikely(stop_cpus_in_progress)) | ||
| 252 | goto unlock; | ||
| 241 | 253 | ||
| 242 | err = 0; | 254 | err = 0; |
| 243 | __cpu_stop_queue_work(stopper1, work1); | 255 | __cpu_stop_queue_work(stopper1, work1); |
| @@ -245,8 +257,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, | |||
| 245 | unlock: | 257 | unlock: |
| 246 | spin_unlock(&stopper2->lock); | 258 | spin_unlock(&stopper2->lock); |
| 247 | spin_unlock_irq(&stopper1->lock); | 259 | spin_unlock_irq(&stopper1->lock); |
| 248 | lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); | ||
| 249 | 260 | ||
| 261 | if (unlikely(err == -EDEADLK)) { | ||
| 262 | while (stop_cpus_in_progress) | ||
| 263 | cpu_relax(); | ||
| 264 | goto retry; | ||
| 265 | } | ||
| 250 | return err; | 266 | return err; |
| 251 | } | 267 | } |
| 252 | /** | 268 | /** |
| @@ -316,9 +332,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | |||
| 316 | return cpu_stop_queue_work(cpu, work_buf); | 332 | return cpu_stop_queue_work(cpu, work_buf); |
| 317 | } | 333 | } |
| 318 | 334 | ||
| 319 | /* static data for stop_cpus */ | ||
| 320 | static DEFINE_MUTEX(stop_cpus_mutex); | ||
| 321 | |||
| 322 | static bool queue_stop_cpus_work(const struct cpumask *cpumask, | 335 | static bool queue_stop_cpus_work(const struct cpumask *cpumask, |
| 323 | cpu_stop_fn_t fn, void *arg, | 336 | cpu_stop_fn_t fn, void *arg, |
| 324 | struct cpu_stop_done *done) | 337 | struct cpu_stop_done *done) |
| @@ -332,7 +345,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, | |||
| 332 | * preempted by a stopper which might wait for other stoppers | 345 | * preempted by a stopper which might wait for other stoppers |
| 333 | * to enter @fn which can lead to deadlock. | 346 | * to enter @fn which can lead to deadlock. |
| 334 | */ | 347 | */ |
| 335 | lg_global_lock(&stop_cpus_lock); | 348 | preempt_disable(); |
| 349 | stop_cpus_in_progress = true; | ||
| 336 | for_each_cpu(cpu, cpumask) { | 350 | for_each_cpu(cpu, cpumask) { |
| 337 | work = &per_cpu(cpu_stopper.stop_work, cpu); | 351 | work = &per_cpu(cpu_stopper.stop_work, cpu); |
| 338 | work->fn = fn; | 352 | work->fn = fn; |
| @@ -341,7 +355,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, | |||
| 341 | if (cpu_stop_queue_work(cpu, work)) | 355 | if (cpu_stop_queue_work(cpu, work)) |
| 342 | queued = true; | 356 | queued = true; |
| 343 | } | 357 | } |
| 344 | lg_global_unlock(&stop_cpus_lock); | 358 | stop_cpus_in_progress = false; |
| 359 | preempt_enable(); | ||
| 345 | 360 | ||
| 346 | return queued; | 361 | return queued; |
| 347 | } | 362 | } |
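With the lglock gone, stop_two_cpus() avoids racing a global stop_cpus() pass by checking a stop_cpus_in_progress flag while holding the two per-CPU locks, and retrying after a cpu_relax() spin if the flag is set. The C11-atomics sketch below shows only that flag-and-retry shape (single-threaded, no real per-CPU locks); queue_all() and queue_two() are made-up names.

        /* sketch: back off and retry while a global queuing pass runs */
        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        static atomic_bool stop_cpus_in_progress;

        /* global pass: sets the flag, queues work everywhere, clears it */
        static void queue_all(void)
        {
                atomic_store(&stop_cpus_in_progress, true);
                /* ... queue per-cpu work in cpu order ... */
                atomic_store(&stop_cpus_in_progress, false);
        }

        /* two-cpu pass: must not interleave with a global pass, or the
         * works could be queued in the opposite order and deadlock */
        static int queue_two(void)
        {
                for (;;) {
                        /* the real code takes both per-cpu locks here */
                        if (!atomic_load(&stop_cpus_in_progress)) {
                                /* ... queue work on the two cpus ... */
                                return 0;
                        }
                        /* drop the locks, spin until the global pass ends */
                        while (atomic_load(&stop_cpus_in_progress))
                                ;               /* cpu_relax() in the kernel */
                }
        }

        int main(void)
        {
                queue_all();
                printf("queue_two -> %d\n", queue_two());
                return 0;
        }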
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2c5e3a8e00d7..635482e60ca3 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -250,3 +250,8 @@ cond_syscall(sys_execveat); | |||
| 250 | 250 | ||
| 251 | /* membarrier */ | 251 | /* membarrier */ |
| 252 | cond_syscall(sys_membarrier); | 252 | cond_syscall(sys_membarrier); |
| 253 | |||
| 254 | /* memory protection keys */ | ||
| 255 | cond_syscall(sys_pkey_mprotect); | ||
| 256 | cond_syscall(sys_pkey_alloc); | ||
| 257 | cond_syscall(sys_pkey_free); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b43d0b27c1fe..706309f9ed84 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -65,6 +65,7 @@ | |||
| 65 | #include <linux/sched/sysctl.h> | 65 | #include <linux/sched/sysctl.h> |
| 66 | #include <linux/kexec.h> | 66 | #include <linux/kexec.h> |
| 67 | #include <linux/bpf.h> | 67 | #include <linux/bpf.h> |
| 68 | #include <linux/mount.h> | ||
| 68 | 69 | ||
| 69 | #include <asm/uaccess.h> | 70 | #include <asm/uaccess.h> |
| 70 | #include <asm/processor.h> | 71 | #include <asm/processor.h> |
| @@ -106,9 +107,8 @@ extern unsigned int core_pipe_limit; | |||
| 106 | extern int pid_max; | 107 | extern int pid_max; |
| 107 | extern int pid_max_min, pid_max_max; | 108 | extern int pid_max_min, pid_max_max; |
| 108 | extern int percpu_pagelist_fraction; | 109 | extern int percpu_pagelist_fraction; |
| 109 | extern int compat_log; | ||
| 110 | extern int latencytop_enabled; | 110 | extern int latencytop_enabled; |
| 111 | extern int sysctl_nr_open_min, sysctl_nr_open_max; | 111 | extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; |
| 112 | #ifndef CONFIG_MMU | 112 | #ifndef CONFIG_MMU |
| 113 | extern int sysctl_nr_trim_pages; | 113 | extern int sysctl_nr_trim_pages; |
| 114 | #endif | 114 | #endif |
| @@ -1084,15 +1084,6 @@ static struct ctl_table kern_table[] = { | |||
| 1084 | .extra1 = &neg_one, | 1084 | .extra1 = &neg_one, |
| 1085 | }, | 1085 | }, |
| 1086 | #endif | 1086 | #endif |
| 1087 | #ifdef CONFIG_COMPAT | ||
| 1088 | { | ||
| 1089 | .procname = "compat-log", | ||
| 1090 | .data = &compat_log, | ||
| 1091 | .maxlen = sizeof (int), | ||
| 1092 | .mode = 0644, | ||
| 1093 | .proc_handler = proc_dointvec, | ||
| 1094 | }, | ||
| 1095 | #endif | ||
| 1096 | #ifdef CONFIG_RT_MUTEXES | 1087 | #ifdef CONFIG_RT_MUTEXES |
| 1097 | { | 1088 | { |
| 1098 | .procname = "max_lock_depth", | 1089 | .procname = "max_lock_depth", |
| @@ -1692,7 +1683,7 @@ static struct ctl_table fs_table[] = { | |||
| 1692 | { | 1683 | { |
| 1693 | .procname = "nr_open", | 1684 | .procname = "nr_open", |
| 1694 | .data = &sysctl_nr_open, | 1685 | .data = &sysctl_nr_open, |
| 1695 | .maxlen = sizeof(int), | 1686 | .maxlen = sizeof(unsigned int), |
| 1696 | .mode = 0644, | 1687 | .mode = 0644, |
| 1697 | .proc_handler = proc_dointvec_minmax, | 1688 | .proc_handler = proc_dointvec_minmax, |
| 1698 | .extra1 = &sysctl_nr_open_min, | 1689 | .extra1 = &sysctl_nr_open_min, |
| @@ -1838,6 +1829,14 @@ static struct ctl_table fs_table[] = { | |||
| 1838 | .mode = 0644, | 1829 | .mode = 0644, |
| 1839 | .proc_handler = proc_doulongvec_minmax, | 1830 | .proc_handler = proc_doulongvec_minmax, |
| 1840 | }, | 1831 | }, |
| 1832 | { | ||
| 1833 | .procname = "mount-max", | ||
| 1834 | .data = &sysctl_mount_max, | ||
| 1835 | .maxlen = sizeof(unsigned int), | ||
| 1836 | .mode = 0644, | ||
| 1837 | .proc_handler = proc_dointvec_minmax, | ||
| 1838 | .extra1 = &one, | ||
| 1839 | }, | ||
| 1841 | { } | 1840 | { } |
| 1842 | }; | 1841 | }; |
| 1843 | 1842 | ||
| @@ -2140,6 +2139,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, | |||
| 2140 | return 0; | 2139 | return 0; |
| 2141 | } | 2140 | } |
| 2142 | 2141 | ||
| 2142 | static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, | ||
| 2143 | int *valp, | ||
| 2144 | int write, void *data) | ||
| 2145 | { | ||
| 2146 | if (write) { | ||
| 2147 | if (*negp) | ||
| 2148 | return -EINVAL; | ||
| 2149 | *valp = *lvalp; | ||
| 2150 | } else { | ||
| 2151 | unsigned int val = *valp; | ||
| 2152 | *lvalp = (unsigned long)val; | ||
| 2153 | } | ||
| 2154 | return 0; | ||
| 2155 | } | ||
| 2156 | |||
| 2143 | static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; | 2157 | static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; |
| 2144 | 2158 | ||
| 2145 | static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, | 2159 | static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, |
| @@ -2259,8 +2273,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write, | |||
| 2259 | int proc_dointvec(struct ctl_table *table, int write, | 2273 | int proc_dointvec(struct ctl_table *table, int write, |
| 2260 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2274 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 2261 | { | 2275 | { |
| 2262 | return do_proc_dointvec(table,write,buffer,lenp,ppos, | 2276 | return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); |
| 2263 | NULL,NULL); | 2277 | } |
| 2278 | |||
| 2279 | /** | ||
| 2280 | * proc_douintvec - read a vector of unsigned integers | ||
| 2281 | * @table: the sysctl table | ||
| 2282 | * @write: %TRUE if this is a write to the sysctl file | ||
| 2283 | * @buffer: the user buffer | ||
| 2284 | * @lenp: the size of the user buffer | ||
| 2285 | * @ppos: file position | ||
| 2286 | * | ||
| 2287 | * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer | ||
| 2288 | * values from/to the user buffer, treated as an ASCII string. | ||
| 2289 | * | ||
| 2290 | * Returns 0 on success. | ||
| 2291 | */ | ||
| 2292 | int proc_douintvec(struct ctl_table *table, int write, | ||
| 2293 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2294 | { | ||
| 2295 | return do_proc_dointvec(table, write, buffer, lenp, ppos, | ||
| 2296 | do_proc_douintvec_conv, NULL); | ||
| 2264 | } | 2297 | } |
| 2265 | 2298 | ||
| 2266 | /* | 2299 | /* |
| @@ -2858,6 +2891,12 @@ int proc_dointvec(struct ctl_table *table, int write, | |||
| 2858 | return -ENOSYS; | 2891 | return -ENOSYS; |
| 2859 | } | 2892 | } |
| 2860 | 2893 | ||
| 2894 | int proc_douintvec(struct ctl_table *table, int write, | ||
| 2895 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2896 | { | ||
| 2897 | return -ENOSYS; | ||
| 2898 | } | ||
| 2899 | |||
| 2861 | int proc_dointvec_minmax(struct ctl_table *table, int write, | 2900 | int proc_dointvec_minmax(struct ctl_table *table, int write, |
| 2862 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2901 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 2863 | { | 2902 | { |
| @@ -2903,6 +2942,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, | |||
| 2903 | * exception granted :-) | 2942 | * exception granted :-) |
| 2904 | */ | 2943 | */ |
| 2905 | EXPORT_SYMBOL(proc_dointvec); | 2944 | EXPORT_SYMBOL(proc_dointvec); |
| 2945 | EXPORT_SYMBOL(proc_douintvec); | ||
| 2906 | EXPORT_SYMBOL(proc_dointvec_jiffies); | 2946 | EXPORT_SYMBOL(proc_dointvec_jiffies); |
| 2907 | EXPORT_SYMBOL(proc_dointvec_minmax); | 2947 | EXPORT_SYMBOL(proc_dointvec_minmax); |
| 2908 | EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); | 2948 | EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); |
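proc_douintvec() reuses the generic do_proc_dointvec() machinery with a conversion callback that rejects negative input on writes and widens the unsigned value for reads. The userspace sketch below reproduces that conversion logic with a simplified signature; douintvec_conv() is not the kernel helper and its argument order differs.

        /* sketch of the unsigned conversion step behind proc_douintvec() */
        #include <stdbool.h>
        #include <stdio.h>
        #include <errno.h>

        static int douintvec_conv(bool negp, unsigned long lval,
                                  unsigned int *valp, int write,
                                  unsigned long *lvalp)
        {
                if (write) {
                        if (negp)               /* "-1" etc. is invalid */
                                return -EINVAL;
                        *valp = lval;
                } else {
                        *lvalp = (unsigned long)*valp;  /* widen for printing */
                }
                return 0;
        }

        int main(void)
        {
                unsigned int val = 0;
                unsigned long out;

                printf("write 4000000000 -> %d, val=%u\n",
                       douintvec_conv(false, 4000000000UL, &val, 1, NULL), val);
                printf("write -1         -> %d\n",
                       douintvec_conv(true, 1, &val, 1, NULL));
                douintvec_conv(false, 0, &val, 0, &out);
                printf("read             -> %lu\n", out);
                return 0;
        }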
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c3aad685bbc0..12dd190634ab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) | |||
| 542 | static int alarm_timer_create(struct k_itimer *new_timer) | 542 | static int alarm_timer_create(struct k_itimer *new_timer) |
| 543 | { | 543 | { |
| 544 | enum alarmtimer_type type; | 544 | enum alarmtimer_type type; |
| 545 | struct alarm_base *base; | ||
| 546 | 545 | ||
| 547 | if (!alarmtimer_get_rtcdev()) | 546 | if (!alarmtimer_get_rtcdev()) |
| 548 | return -ENOTSUPP; | 547 | return -ENOTSUPP; |
| @@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) | |||
| 551 | return -EPERM; | 550 | return -EPERM; |
| 552 | 551 | ||
| 553 | type = clock2alarm(new_timer->it_clock); | 552 | type = clock2alarm(new_timer->it_clock); |
| 554 | base = &alarm_bases[type]; | ||
| 555 | alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); | 553 | alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); |
| 556 | return 0; | 554 | return 0; |
| 557 | } | 555 | } |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6a5a310a1a53..7e4fad75acaa 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -600,9 +600,18 @@ static void __clocksource_select(bool skipcur) | |||
| 600 | */ | 600 | */ |
| 601 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { | 601 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { |
| 602 | /* Override clocksource cannot be used. */ | 602 | /* Override clocksource cannot be used. */ |
| 603 | pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", | 603 | if (cs->flags & CLOCK_SOURCE_UNSTABLE) { |
| 604 | cs->name); | 604 | pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n", |
| 605 | override_name[0] = 0; | 605 | cs->name); |
| 606 | override_name[0] = 0; | ||
| 607 | } else { | ||
| 608 | /* | ||
| 609 | * The override cannot be currently verified. | ||
| 610 | * Deferring to let the watchdog check. | ||
| 611 | */ | ||
| 612 | pr_info("Override clocksource %s is not currently HRT compatible - deferring\n", | ||
| 613 | cs->name); | ||
| 614 | } | ||
| 606 | } else | 615 | } else |
| 607 | /* Override clocksource can be used. */ | 616 | /* Override clocksource can be used. */ |
| 608 | best = cs; | 617 | best = cs; |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9ba7c820fc23..bb5ec425dfe0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -307,7 +307,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns); | |||
| 307 | */ | 307 | */ |
| 308 | ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) | 308 | ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) |
| 309 | { | 309 | { |
| 310 | ktime_t res = ktime_add(lhs, rhs); | 310 | ktime_t res = ktime_add_unsafe(lhs, rhs); |
| 311 | 311 | ||
| 312 | /* | 312 | /* |
| 313 | * We use KTIME_SEC_MAX here, the maximum timeout which we can | 313 | * We use KTIME_SEC_MAX here, the maximum timeout which we can |
| @@ -703,7 +703,7 @@ static void clock_was_set_work(struct work_struct *work) | |||
| 703 | static DECLARE_WORK(hrtimer_work, clock_was_set_work); | 703 | static DECLARE_WORK(hrtimer_work, clock_was_set_work); |
| 704 | 704 | ||
| 705 | /* | 705 | /* |
| 706 | * Called from timekeeping and resume code to reprogramm the hrtimer | 706 | * Called from timekeeping and resume code to reprogram the hrtimer |
| 707 | * interrupt device on all cpus. | 707 | * interrupt device on all cpus. |
| 708 | */ | 708 | */ |
| 709 | void clock_was_set_delayed(void) | 709 | void clock_was_set_delayed(void) |
| @@ -1241,7 +1241,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, | |||
| 1241 | 1241 | ||
| 1242 | /* | 1242 | /* |
| 1243 | * Note: We clear the running state after enqueue_hrtimer and | 1243 | * Note: We clear the running state after enqueue_hrtimer and |
| 1244 | * we do not reprogramm the event hardware. Happens either in | 1244 | * we do not reprogram the event hardware. Happens either in |
| 1245 | * hrtimer_start_range_ns() or in hrtimer_interrupt() | 1245 | * hrtimer_start_range_ns() or in hrtimer_interrupt() |
| 1246 | * | 1246 | * |
| 1247 | * Note: Because we dropped the cpu_base->lock above, | 1247 | * Note: Because we dropped the cpu_base->lock above, |
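ktime_add_safe() now performs the raw addition via ktime_add_unsafe() and then clamps the result to the maximum if it wrapped. A small standalone C version of that saturating-add idea is sketched below; ktime_add_safe_demo() and the _DEMO constants are local to the example.

        /* sketch: add first (as wrapping unsigned math), then clamp */
        #include <stdint.h>
        #include <stdio.h>

        #define NSEC_PER_SEC 1000000000LL
        #define KTIME_SEC_MAX_DEMO (INT64_MAX / NSEC_PER_SEC)
        #define KTIME_MAX_DEMO (KTIME_SEC_MAX_DEMO * NSEC_PER_SEC)

        static int64_t ktime_add_safe_demo(int64_t lhs, int64_t rhs)
        {
                /* plain wrapping add (the ktime_add_unsafe() step) */
                int64_t res = (int64_t)((uint64_t)lhs + (uint64_t)rhs);

                /* a sum smaller than either operand means it overflowed */
                if (res < 0 || res < lhs || res < rhs)
                        res = KTIME_MAX_DEMO;
                return res;
        }

        int main(void)
        {
                printf("%lld\n", (long long)ktime_add_safe_demo(1, 2));
                printf("%lld\n", (long long)ktime_add_safe_demo(INT64_MAX - 5, 100));
                return 0;
        }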
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 204fdc86863d..3bcb61b52f6c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -186,10 +186,13 @@ static bool check_tick_dependency(atomic_t *dep) | |||
| 186 | return false; | 186 | return false; |
| 187 | } | 187 | } |
| 188 | 188 | ||
| 189 | static bool can_stop_full_tick(struct tick_sched *ts) | 189 | static bool can_stop_full_tick(int cpu, struct tick_sched *ts) |
| 190 | { | 190 | { |
| 191 | WARN_ON_ONCE(!irqs_disabled()); | 191 | WARN_ON_ONCE(!irqs_disabled()); |
| 192 | 192 | ||
| 193 | if (unlikely(!cpu_online(cpu))) | ||
| 194 | return false; | ||
| 195 | |||
| 193 | if (check_tick_dependency(&tick_dep_mask)) | 196 | if (check_tick_dependency(&tick_dep_mask)) |
| 194 | return false; | 197 | return false; |
| 195 | 198 | ||
| @@ -843,7 +846,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) | |||
| 843 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) | 846 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) |
| 844 | return; | 847 | return; |
| 845 | 848 | ||
| 846 | if (can_stop_full_tick(ts)) | 849 | if (can_stop_full_tick(cpu, ts)) |
| 847 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); | 850 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); |
| 848 | else if (ts->tick_stopped) | 851 | else if (ts->tick_stopped) |
| 849 | tick_nohz_restart_sched_tick(ts, ktime_get()); | 852 | tick_nohz_restart_sched_tick(ts, ktime_get()); |
| @@ -908,10 +911,11 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) | |||
| 908 | ktime_t now, expires; | 911 | ktime_t now, expires; |
| 909 | int cpu = smp_processor_id(); | 912 | int cpu = smp_processor_id(); |
| 910 | 913 | ||
| 914 | now = tick_nohz_start_idle(ts); | ||
| 915 | |||
| 911 | if (can_stop_idle_tick(cpu, ts)) { | 916 | if (can_stop_idle_tick(cpu, ts)) { |
| 912 | int was_stopped = ts->tick_stopped; | 917 | int was_stopped = ts->tick_stopped; |
| 913 | 918 | ||
| 914 | now = tick_nohz_start_idle(ts); | ||
| 915 | ts->idle_calls++; | 919 | ts->idle_calls++; |
| 916 | 920 | ||
| 917 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); | 921 | expires = tick_nohz_stop_sched_tick(ts, now, cpu); |
diff --git a/kernel/time/time.c b/kernel/time/time.c index 667b9335f5d6..bd62fb8e8e77 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
| @@ -780,7 +780,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, | |||
| 780 | { | 780 | { |
| 781 | struct timespec64 res; | 781 | struct timespec64 res; |
| 782 | 782 | ||
| 783 | set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, | 783 | set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, |
| 784 | lhs.tv_nsec + rhs.tv_nsec); | 784 | lhs.tv_nsec + rhs.tv_nsec); |
| 785 | 785 | ||
| 786 | if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { | 786 | if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3b65746c7f15..37dec7e3db43 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -401,7 +401,13 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) | |||
| 401 | do { | 401 | do { |
| 402 | seq = raw_read_seqcount_latch(&tkf->seq); | 402 | seq = raw_read_seqcount_latch(&tkf->seq); |
| 403 | tkr = tkf->base + (seq & 0x01); | 403 | tkr = tkf->base + (seq & 0x01); |
| 404 | now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); | 404 | now = ktime_to_ns(tkr->base); |
| 405 | |||
| 406 | now += timekeeping_delta_to_ns(tkr, | ||
| 407 | clocksource_delta( | ||
| 408 | tkr->read(tkr->clock), | ||
| 409 | tkr->cycle_last, | ||
| 410 | tkr->mask)); | ||
| 405 | } while (read_seqcount_retry(&tkf->seq, seq)); | 411 | } while (read_seqcount_retry(&tkf->seq, seq)); |
| 406 | 412 | ||
| 407 | return now; | 413 | return now; |
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd65236712..ca9fb800336b 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c | |||
| @@ -23,7 +23,9 @@ | |||
| 23 | 23 | ||
| 24 | #include "timekeeping_internal.h" | 24 | #include "timekeeping_internal.h" |
| 25 | 25 | ||
| 26 | static unsigned int sleep_time_bin[32] = {0}; | 26 | #define NUM_BINS 32 |
| 27 | |||
| 28 | static unsigned int sleep_time_bin[NUM_BINS] = {0}; | ||
| 27 | 29 | ||
| 28 | static int tk_debug_show_sleep_time(struct seq_file *s, void *data) | 30 | static int tk_debug_show_sleep_time(struct seq_file *s, void *data) |
| 29 | { | 31 | { |
| @@ -69,6 +71,11 @@ late_initcall(tk_debug_sleep_time_init); | |||
| 69 | 71 | ||
| 70 | void tk_debug_account_sleep_time(struct timespec64 *t) | 72 | void tk_debug_account_sleep_time(struct timespec64 *t) |
| 71 | { | 73 | { |
| 72 | sleep_time_bin[fls(t->tv_sec)]++; | 74 | /* Cap bin index so we don't overflow the array */ |
| 75 | int bin = min(fls(t->tv_sec), NUM_BINS-1); | ||
| 76 | |||
| 77 | sleep_time_bin[bin]++; | ||
| 78 | pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec, | ||
| 79 | t->tv_nsec / NSEC_PER_MSEC); | ||
| 73 | } | 80 | } |
| 74 | 81 | ||
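The timekeeping_debug fix indexes the sleep-time histogram by fls() of the suspended seconds and clamps the index so a very long (or corrupted) sleep time cannot write past the 32-entry array. The sketch below shows the same clamp in userspace, using __builtin_clzll() as a stand-in for the kernel's fls().

        /* sketch: fls()-based histogram bin with the added clamp */
        #include <stdio.h>

        #define NUM_BINS 32

        static unsigned int sleep_time_bin[NUM_BINS];

        static int fls_demo(unsigned long long v)  /* last set bit, 1-based */
        {
                return v ? 64 - __builtin_clzll(v) : 0;
        }

        static void account_sleep_time(long long tv_sec)
        {
                int bin = fls_demo(tv_sec);

                if (bin > NUM_BINS - 1)         /* the added clamp */
                        bin = NUM_BINS - 1;
                sleep_time_bin[bin]++;
        }

        int main(void)
        {
                account_sleep_time(3);          /* lands in bin 2 */
                account_sleep_time(1LL << 40);  /* bin 41 without the clamp */
                printf("bin[2]=%u bin[31]=%u\n",
                       sleep_time_bin[2], sleep_time_bin[31]);
                return 0;
        }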
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 32bf6f75a8fe..2d47980a1bc4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
| @@ -1633,7 +1633,7 @@ static inline void __run_timers(struct timer_base *base) | |||
| 1633 | /* | 1633 | /* |
| 1634 | * This function runs timers and the timer-tq in bottom half context. | 1634 | * This function runs timers and the timer-tq in bottom half context. |
| 1635 | */ | 1635 | */ |
| 1636 | static void run_timer_softirq(struct softirq_action *h) | 1636 | static __latent_entropy void run_timer_softirq(struct softirq_action *h) |
| 1637 | { | 1637 | { |
| 1638 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); | 1638 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); |
| 1639 | 1639 | ||
diff --git a/kernel/torture.c b/kernel/torture.c index 75961b3decfe..0d887eb62856 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
| @@ -43,6 +43,7 @@ | |||
| 43 | #include <linux/stat.h> | 43 | #include <linux/stat.h> |
| 44 | #include <linux/slab.h> | 44 | #include <linux/slab.h> |
| 45 | #include <linux/trace_clock.h> | 45 | #include <linux/trace_clock.h> |
| 46 | #include <linux/ktime.h> | ||
| 46 | #include <asm/byteorder.h> | 47 | #include <asm/byteorder.h> |
| 47 | #include <linux/torture.h> | 48 | #include <linux/torture.h> |
| 48 | 49 | ||
| @@ -446,9 +447,8 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); | |||
| 446 | * Variables for auto-shutdown. This allows "lights out" torture runs | 447 | * Variables for auto-shutdown. This allows "lights out" torture runs |
| 447 | * to be fully scripted. | 448 | * to be fully scripted. |
| 448 | */ | 449 | */ |
| 449 | static int shutdown_secs; /* desired test duration in seconds. */ | ||
| 450 | static struct task_struct *shutdown_task; | 450 | static struct task_struct *shutdown_task; |
| 451 | static unsigned long shutdown_time; /* jiffies to system shutdown. */ | 451 | static ktime_t shutdown_time; /* time to system shutdown. */ |
| 452 | static void (*torture_shutdown_hook)(void); | 452 | static void (*torture_shutdown_hook)(void); |
| 453 | 453 | ||
| 454 | /* | 454 | /* |
| @@ -471,20 +471,20 @@ EXPORT_SYMBOL_GPL(torture_shutdown_absorb); | |||
| 471 | */ | 471 | */ |
| 472 | static int torture_shutdown(void *arg) | 472 | static int torture_shutdown(void *arg) |
| 473 | { | 473 | { |
| 474 | long delta; | 474 | ktime_t ktime_snap; |
| 475 | unsigned long jiffies_snap; | ||
| 476 | 475 | ||
| 477 | VERBOSE_TOROUT_STRING("torture_shutdown task started"); | 476 | VERBOSE_TOROUT_STRING("torture_shutdown task started"); |
| 478 | jiffies_snap = jiffies; | 477 | ktime_snap = ktime_get(); |
| 479 | while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && | 478 | while (ktime_before(ktime_snap, shutdown_time) && |
| 480 | !torture_must_stop()) { | 479 | !torture_must_stop()) { |
| 481 | delta = shutdown_time - jiffies_snap; | ||
| 482 | if (verbose) | 480 | if (verbose) |
| 483 | pr_alert("%s" TORTURE_FLAG | 481 | pr_alert("%s" TORTURE_FLAG |
| 484 | "torture_shutdown task: %lu jiffies remaining\n", | 482 | "torture_shutdown task: %llu ms remaining\n", |
| 485 | torture_type, delta); | 483 | torture_type, |
| 486 | schedule_timeout_interruptible(delta); | 484 | ktime_ms_delta(shutdown_time, ktime_snap)); |
| 487 | jiffies_snap = jiffies; | 485 | set_current_state(TASK_INTERRUPTIBLE); |
| 486 | schedule_hrtimeout(&shutdown_time, HRTIMER_MODE_ABS); | ||
| 487 | ktime_snap = ktime_get(); | ||
| 488 | } | 488 | } |
| 489 | if (torture_must_stop()) { | 489 | if (torture_must_stop()) { |
| 490 | torture_kthread_stopping("torture_shutdown"); | 490 | torture_kthread_stopping("torture_shutdown"); |
| @@ -511,10 +511,9 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) | |||
| 511 | { | 511 | { |
| 512 | int ret = 0; | 512 | int ret = 0; |
| 513 | 513 | ||
| 514 | shutdown_secs = ssecs; | ||
| 515 | torture_shutdown_hook = cleanup; | 514 | torture_shutdown_hook = cleanup; |
| 516 | if (shutdown_secs > 0) { | 515 | if (ssecs > 0) { |
| 517 | shutdown_time = jiffies + shutdown_secs * HZ; | 516 | shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); |
| 518 | ret = torture_create_kthread(torture_shutdown, NULL, | 517 | ret = torture_create_kthread(torture_shutdown, NULL, |
| 519 | shutdown_task); | 518 | shutdown_task); |
| 520 | } | 519 | } |
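The torture shutdown thread now tracks an absolute ktime_t deadline and sleeps with schedule_hrtimeout(..., HRTIMER_MODE_ABS) instead of recomputing relative jiffies on every pass. The POSIX sketch below shows the equivalent userspace pattern with clock_nanosleep() and TIMER_ABSTIME; the two-second deadline is an arbitrary example value.

        /* sketch: wait for an absolute deadline rather than chaining
         * relative sleeps, so interruptions and drift do not accumulate */
        #define _POSIX_C_SOURCE 200809L
        #include <stdio.h>
        #include <time.h>

        int main(void)
        {
                struct timespec deadline;

                clock_gettime(CLOCK_MONOTONIC, &deadline);
                deadline.tv_sec += 2;           /* "shutdown" two seconds out */

                /* retry with the same absolute deadline if interrupted */
                while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
                                       &deadline, NULL) != 0)
                        ;

                printf("deadline reached\n");
                return 0;
        }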
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f4b86e8ca1e7..2a96b063d659 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER | |||
| 24 | help | 24 | help |
| 25 | See Documentation/trace/ftrace-design.txt | 25 | See Documentation/trace/ftrace-design.txt |
| 26 | 26 | ||
| 27 | config HAVE_FUNCTION_GRAPH_FP_TEST | ||
| 28 | bool | ||
| 29 | help | ||
| 30 | See Documentation/trace/ftrace-design.txt | ||
| 31 | |||
| 32 | config HAVE_DYNAMIC_FTRACE | 27 | config HAVE_DYNAMIC_FTRACE |
| 33 | bool | 28 | bool |
| 34 | help | 29 | help |
| @@ -221,6 +216,41 @@ config SCHED_TRACER | |||
| 221 | This tracer tracks the latency of the highest priority task | 216 | This tracer tracks the latency of the highest priority task |
| 222 | to be scheduled in, starting from the point it has woken up. | 217 | to be scheduled in, starting from the point it has woken up. |
| 223 | 218 | ||
| 219 | config HWLAT_TRACER | ||
| 220 | bool "Tracer to detect hardware latencies (like SMIs)" | ||
| 221 | select GENERIC_TRACER | ||
| 222 | help | ||
| 223 | This tracer, when enabled, will create one or more kernel threads, | ||
| 224 | depending on what the cpumask file is set to, with each thread | ||
| 225 | spinning in a loop looking for interruptions caused by | ||
| 226 | something other than the kernel. For example, if a | ||
| 227 | System Management Interrupt (SMI) takes a noticeable amount of | ||
| 228 | time, this tracer will detect it. This is useful for testing | ||
| 229 | if a system is reliable for Real Time tasks. | ||
| 230 | |||
| 231 | Some files are created in the tracing directory when this | ||
| 232 | is enabled: | ||
| 233 | |||
| 234 | hwlat_detector/width - time in usecs for how long to spin for | ||
| 235 | hwlat_detector/window - time in usecs between the start of each | ||
| 236 | iteration | ||
| 237 | |||
| 238 | A kernel thread is created that will spin with interrupts disabled | ||
| 239 | for "width" microseconds in every "window" cycle. It will not spin | ||
| 240 | for "window - width" microseconds, where the system can | ||
| 241 | continue to operate. | ||
| 242 | |||
| 243 | The output will appear in the trace and trace_pipe files. | ||
| 244 | |||
| 245 | When the tracer is not running, it has no effect on the system, | ||
| 246 | but when it is running, it can cause the system to be | ||
| 247 | periodically non-responsive. Do not run this tracer on a | ||
| 248 | production system. | ||
| 249 | |||
| 250 | To enable this tracer, echo "hwlat" into the current_tracer | ||
| 251 | file. Every time a latency is greater than tracing_thresh, it will | ||
| 252 | be recorded into the ring buffer. | ||
| 253 | |||
| 224 | config ENABLE_DEFAULT_TRACERS | 254 | config ENABLE_DEFAULT_TRACERS |
| 225 | bool "Trace process context switches and events" | 255 | bool "Trace process context switches and events" |
| 226 | depends on !GENERIC_TRACER | 256 | depends on !GENERIC_TRACER |
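As the HWLAT_TRACER help text above describes, the detector spins with interrupts disabled for "width" microseconds per "window" and records any gap between consecutive timestamp reads that exceeds the threshold. The userspace sketch below only mimics the measurement loop (it cannot disable interrupts, so it will also report ordinary preemption); the 0.5 s width and 100 us threshold are arbitrary example values.

        /* sketch: busy-spin reading the clock, report large gaps */
        #define _POSIX_C_SOURCE 200809L
        #include <stdio.h>
        #include <time.h>

        static long long now_ns(void)
        {
                struct timespec ts;

                clock_gettime(CLOCK_MONOTONIC, &ts);
                return ts.tv_sec * 1000000000LL + ts.tv_nsec;
        }

        int main(void)
        {
                const long long width_ns = 500000000LL;  /* spin for 0.5 s */
                const long long thresh_ns = 100000;      /* 100 us */
                long long start = now_ns(), last = start, t;

                while ((t = now_ns()) - start < width_ns) {
                        if (t - last > thresh_ns)
                                printf("gap of %lld us\n", (t - last) / 1000);
                        last = t;
                }
                return 0;
        }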
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d0a1617b52b4..e57980845549 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -1,8 +1,4 @@ | |||
| 1 | 1 | ||
| 2 | # We are fully aware of the dangers of __builtin_return_address() | ||
| 3 | FRAME_CFLAGS := $(call cc-disable-warning,frame-address) | ||
| 4 | KBUILD_CFLAGS += $(FRAME_CFLAGS) | ||
| 5 | |||
| 6 | # Do not instrument the tracer itself: | 2 | # Do not instrument the tracer itself: |
| 7 | 3 | ||
| 8 | ifdef CONFIG_FUNCTION_TRACER | 4 | ifdef CONFIG_FUNCTION_TRACER |
| @@ -41,6 +37,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o | |||
| 41 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o | 37 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o |
| 42 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o | 38 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o |
| 43 | obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o | 39 | obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o |
| 40 | obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o | ||
| 44 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o | 41 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o |
| 45 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o | 42 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o |
| 46 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 43 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7598e6ca817a..dbafc5df03f3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
| 223 | what |= MASK_TC_BIT(op_flags, META); | 223 | what |= MASK_TC_BIT(op_flags, META); |
| 224 | what |= MASK_TC_BIT(op_flags, PREFLUSH); | 224 | what |= MASK_TC_BIT(op_flags, PREFLUSH); |
| 225 | what |= MASK_TC_BIT(op_flags, FUA); | 225 | what |= MASK_TC_BIT(op_flags, FUA); |
| 226 | if (op == REQ_OP_DISCARD) | 226 | if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) |
| 227 | what |= BLK_TC_ACT(BLK_TC_DISCARD); | 227 | what |= BLK_TC_ACT(BLK_TC_DISCARD); |
| 228 | if (op == REQ_OP_FLUSH) | 228 | if (op == REQ_OP_FLUSH) |
| 229 | what |= BLK_TC_ACT(BLK_TC_FLUSH); | 229 | what |= BLK_TC_ACT(BLK_TC_FLUSH); |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b20438fdb029..5dcb99281259 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com | 1 | /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com |
| 2 | * Copyright (c) 2016 Facebook | ||
| 2 | * | 3 | * |
| 3 | * This program is free software; you can redistribute it and/or | 4 | * This program is free software; you can redistribute it and/or |
| 4 | * modify it under the terms of version 2 of the GNU General Public | 5 | * modify it under the terms of version 2 of the GNU General Public |
| @@ -8,6 +9,7 @@ | |||
| 8 | #include <linux/types.h> | 9 | #include <linux/types.h> |
| 9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
| 10 | #include <linux/bpf.h> | 11 | #include <linux/bpf.h> |
| 12 | #include <linux/bpf_perf_event.h> | ||
| 11 | #include <linux/filter.h> | 13 | #include <linux/filter.h> |
| 12 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
| 13 | #include <linux/ctype.h> | 15 | #include <linux/ctype.h> |
| @@ -59,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) | |||
| 59 | } | 61 | } |
| 60 | EXPORT_SYMBOL_GPL(trace_call_bpf); | 62 | EXPORT_SYMBOL_GPL(trace_call_bpf); |
| 61 | 63 | ||
| 62 | static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 64 | BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) |
| 63 | { | 65 | { |
| 64 | void *dst = (void *) (long) r1; | 66 | int ret; |
| 65 | int ret, size = (int) r2; | ||
| 66 | void *unsafe_ptr = (void *) (long) r3; | ||
| 67 | 67 | ||
| 68 | ret = probe_kernel_read(dst, unsafe_ptr, size); | 68 | ret = probe_kernel_read(dst, unsafe_ptr, size); |
| 69 | if (unlikely(ret < 0)) | 69 | if (unlikely(ret < 0)) |
| @@ -81,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = { | |||
| 81 | .arg3_type = ARG_ANYTHING, | 81 | .arg3_type = ARG_ANYTHING, |
| 82 | }; | 82 | }; |
| 83 | 83 | ||
| 84 | static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 84 | BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, |
| 85 | u32, size) | ||
| 85 | { | 86 | { |
| 86 | void *unsafe_ptr = (void *) (long) r1; | ||
| 87 | void *src = (void *) (long) r2; | ||
| 88 | int size = (int) r3; | ||
| 89 | |||
| 90 | /* | 87 | /* |
| 91 | * Ensure we're in user context which is safe for the helper to | 88 | * Ensure we're in user context which is safe for the helper to |
| 92 | * run. This helper has no business in a kthread. | 89 | * run. This helper has no business in a kthread. |
| @@ -128,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) | |||
| 128 | * limited trace_printk() | 125 | * limited trace_printk() |
| 129 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed | 126 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed |
| 130 | */ | 127 | */ |
| 131 | static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | 128 | BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, |
| 129 | u64, arg2, u64, arg3) | ||
| 132 | { | 130 | { |
| 133 | char *fmt = (char *) (long) r1; | ||
| 134 | bool str_seen = false; | 131 | bool str_seen = false; |
| 135 | int mod[3] = {}; | 132 | int mod[3] = {}; |
| 136 | int fmt_cnt = 0; | 133 | int fmt_cnt = 0; |
| @@ -176,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | |||
| 176 | 173 | ||
| 177 | switch (fmt_cnt) { | 174 | switch (fmt_cnt) { |
| 178 | case 1: | 175 | case 1: |
| 179 | unsafe_addr = r3; | 176 | unsafe_addr = arg1; |
| 180 | r3 = (long) buf; | 177 | arg1 = (long) buf; |
| 181 | break; | 178 | break; |
| 182 | case 2: | 179 | case 2: |
| 183 | unsafe_addr = r4; | 180 | unsafe_addr = arg2; |
| 184 | r4 = (long) buf; | 181 | arg2 = (long) buf; |
| 185 | break; | 182 | break; |
| 186 | case 3: | 183 | case 3: |
| 187 | unsafe_addr = r5; | 184 | unsafe_addr = arg3; |
| 188 | r5 = (long) buf; | 185 | arg3 = (long) buf; |
| 189 | break; | 186 | break; |
| 190 | } | 187 | } |
| 191 | buf[0] = 0; | 188 | buf[0] = 0; |
| @@ -207,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | |||
| 207 | } | 204 | } |
| 208 | 205 | ||
| 209 | return __trace_printk(1/* fake ip will not be printed */, fmt, | 206 | return __trace_printk(1/* fake ip will not be printed */, fmt, |
| 210 | mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, | 207 | mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, |
| 211 | mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, | 208 | mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, |
| 212 | mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); | 209 | mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); |
| 213 | } | 210 | } |
| 214 | 211 | ||
| 215 | static const struct bpf_func_proto bpf_trace_printk_proto = { | 212 | static const struct bpf_func_proto bpf_trace_printk_proto = { |
| @@ -231,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) | |||
| 231 | return &bpf_trace_printk_proto; | 228 | return &bpf_trace_printk_proto; |
| 232 | } | 229 | } |
| 233 | 230 | ||
| 234 | static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) | 231 | BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) |
| 235 | { | 232 | { |
| 236 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 237 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 233 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
| 238 | unsigned int cpu = smp_processor_id(); | 234 | unsigned int cpu = smp_processor_id(); |
| 239 | u64 index = flags & BPF_F_INDEX_MASK; | 235 | u64 index = flags & BPF_F_INDEX_MASK; |
| @@ -310,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | |||
| 310 | return 0; | 306 | return 0; |
| 311 | } | 307 | } |
| 312 | 308 | ||
| 313 | static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) | 309 | BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, |
| 310 | u64, flags, void *, data, u64, size) | ||
| 314 | { | 311 | { |
| 315 | struct pt_regs *regs = (struct pt_regs *)(long) r1; | ||
| 316 | struct bpf_map *map = (struct bpf_map *)(long) r2; | ||
| 317 | void *data = (void *)(long) r4; | ||
| 318 | struct perf_raw_record raw = { | 312 | struct perf_raw_record raw = { |
| 319 | .frag = { | 313 | .frag = { |
| 320 | .size = size, | 314 | .size = size, |
| @@ -365,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, | |||
| 365 | return __bpf_perf_event_output(regs, map, flags, &raw); | 359 | return __bpf_perf_event_output(regs, map, flags, &raw); |
| 366 | } | 360 | } |
| 367 | 361 | ||
| 368 | static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 362 | BPF_CALL_0(bpf_get_current_task) |
| 369 | { | 363 | { |
| 370 | return (long) current; | 364 | return (long) current; |
| 371 | } | 365 | } |
| @@ -376,6 +370,31 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { | |||
| 376 | .ret_type = RET_INTEGER, | 370 | .ret_type = RET_INTEGER, |
| 377 | }; | 371 | }; |
| 378 | 372 | ||
| 373 | BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) | ||
| 374 | { | ||
| 375 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
| 376 | struct cgroup *cgrp; | ||
| 377 | |||
| 378 | if (unlikely(in_interrupt())) | ||
| 379 | return -EINVAL; | ||
| 380 | if (unlikely(idx >= array->map.max_entries)) | ||
| 381 | return -E2BIG; | ||
| 382 | |||
| 383 | cgrp = READ_ONCE(array->ptrs[idx]); | ||
| 384 | if (unlikely(!cgrp)) | ||
| 385 | return -EAGAIN; | ||
| 386 | |||
| 387 | return task_under_cgroup_hierarchy(current, cgrp); | ||
| 388 | } | ||
| 389 | |||
| 390 | static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { | ||
| 391 | .func = bpf_current_task_under_cgroup, | ||
| 392 | .gpl_only = false, | ||
| 393 | .ret_type = RET_INTEGER, | ||
| 394 | .arg1_type = ARG_CONST_MAP_PTR, | ||
| 395 | .arg2_type = ARG_ANYTHING, | ||
| 396 | }; | ||
| 397 | |||
| 379 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | 398 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) |
| 380 | { | 399 | { |
| 381 | switch (func_id) { | 400 | switch (func_id) { |
| @@ -407,6 +426,10 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | |||
| 407 | return &bpf_perf_event_read_proto; | 426 | return &bpf_perf_event_read_proto; |
| 408 | case BPF_FUNC_probe_write_user: | 427 | case BPF_FUNC_probe_write_user: |
| 409 | return bpf_get_probe_write_proto(); | 428 | return bpf_get_probe_write_proto(); |
| 429 | case BPF_FUNC_current_task_under_cgroup: | ||
| 430 | return &bpf_current_task_under_cgroup_proto; | ||
| 431 | case BPF_FUNC_get_prandom_u32: | ||
| 432 | return &bpf_get_prandom_u32_proto; | ||
| 410 | default: | 433 | default: |
| 411 | return NULL; | 434 | return NULL; |
| 412 | } | 435 | } |
| @@ -447,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = { | |||
| 447 | .type = BPF_PROG_TYPE_KPROBE, | 470 | .type = BPF_PROG_TYPE_KPROBE, |
| 448 | }; | 471 | }; |
| 449 | 472 | ||
| 450 | static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) | 473 | BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, |
| 474 | u64, flags, void *, data, u64, size) | ||
| 451 | { | 475 | { |
| 476 | struct pt_regs *regs = *(struct pt_regs **)tp_buff; | ||
| 477 | |||
| 452 | /* | 478 | /* |
| 453 | * r1 points to perf tracepoint buffer where first 8 bytes are hidden | 479 | * r1 points to perf tracepoint buffer where first 8 bytes are hidden |
| 454 | * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it | 480 | * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it |
| 455 | * from there and call the same bpf_perf_event_output() helper | 481 | * from there and call the same bpf_perf_event_output() helper inline. |
| 456 | */ | 482 | */ |
| 457 | u64 ctx = *(long *)(uintptr_t)r1; | 483 | return ____bpf_perf_event_output(regs, map, flags, data, size); |
| 458 | |||
| 459 | return bpf_perf_event_output(ctx, r2, index, r4, size); | ||
| 460 | } | 484 | } |
| 461 | 485 | ||
| 462 | static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { | 486 | static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { |
| @@ -470,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { | |||
| 470 | .arg5_type = ARG_CONST_STACK_SIZE, | 494 | .arg5_type = ARG_CONST_STACK_SIZE, |
| 471 | }; | 495 | }; |
| 472 | 496 | ||
| 473 | static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 497 | BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, |
| 498 | u64, flags) | ||
| 474 | { | 499 | { |
| 475 | u64 ctx = *(long *)(uintptr_t)r1; | 500 | struct pt_regs *regs = *(struct pt_regs **)tp_buff; |
| 476 | 501 | ||
| 477 | return bpf_get_stackid(ctx, r2, r3, r4, r5); | 502 | /* |
| 503 | * Same comment as in bpf_perf_event_output_tp(), only that this time | ||
| 504 | * the other helper's function body cannot be inlined due to being | ||
| 505 | * external, thus we need to call raw helper function. | ||
| 506 | */ | ||
| 507 | return bpf_get_stackid((unsigned long) regs, (unsigned long) map, | ||
| 508 | flags, 0, 0); | ||
| 478 | } | 509 | } |
| 479 | 510 | ||
| 480 | static const struct bpf_func_proto bpf_get_stackid_proto_tp = { | 511 | static const struct bpf_func_proto bpf_get_stackid_proto_tp = { |
| @@ -520,10 +551,69 @@ static struct bpf_prog_type_list tracepoint_tl = { | |||
| 520 | .type = BPF_PROG_TYPE_TRACEPOINT, | 551 | .type = BPF_PROG_TYPE_TRACEPOINT, |
| 521 | }; | 552 | }; |
| 522 | 553 | ||
| 554 | static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, | ||
| 555 | enum bpf_reg_type *reg_type) | ||
| 556 | { | ||
| 557 | if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) | ||
| 558 | return false; | ||
| 559 | if (type != BPF_READ) | ||
| 560 | return false; | ||
| 561 | if (off % size != 0) | ||
| 562 | return false; | ||
| 563 | if (off == offsetof(struct bpf_perf_event_data, sample_period)) { | ||
| 564 | if (size != sizeof(u64)) | ||
| 565 | return false; | ||
| 566 | } else { | ||
| 567 | if (size != sizeof(long)) | ||
| 568 | return false; | ||
| 569 | } | ||
| 570 | return true; | ||
| 571 | } | ||
| 572 | |||
| 573 | static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, | ||
| 574 | int src_reg, int ctx_off, | ||
| 575 | struct bpf_insn *insn_buf, | ||
| 576 | struct bpf_prog *prog) | ||
| 577 | { | ||
| 578 | struct bpf_insn *insn = insn_buf; | ||
| 579 | |||
| 580 | switch (ctx_off) { | ||
| 581 | case offsetof(struct bpf_perf_event_data, sample_period): | ||
| 582 | BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); | ||
| 583 | |||
| 584 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | ||
| 585 | data), dst_reg, src_reg, | ||
| 586 | offsetof(struct bpf_perf_event_data_kern, data)); | ||
| 587 | *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, | ||
| 588 | offsetof(struct perf_sample_data, period)); | ||
| 589 | break; | ||
| 590 | default: | ||
| 591 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | ||
| 592 | regs), dst_reg, src_reg, | ||
| 593 | offsetof(struct bpf_perf_event_data_kern, regs)); | ||
| 594 | *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); | ||
| 595 | break; | ||
| 596 | } | ||
| 597 | |||
| 598 | return insn - insn_buf; | ||
| 599 | } | ||
| 600 | |||
| 601 | static const struct bpf_verifier_ops perf_event_prog_ops = { | ||
| 602 | .get_func_proto = tp_prog_func_proto, | ||
| 603 | .is_valid_access = pe_prog_is_valid_access, | ||
| 604 | .convert_ctx_access = pe_prog_convert_ctx_access, | ||
| 605 | }; | ||
| 606 | |||
| 607 | static struct bpf_prog_type_list perf_event_tl = { | ||
| 608 | .ops = &perf_event_prog_ops, | ||
| 609 | .type = BPF_PROG_TYPE_PERF_EVENT, | ||
| 610 | }; | ||
| 611 | |||
| 523 | static int __init register_kprobe_prog_ops(void) | 612 | static int __init register_kprobe_prog_ops(void) |
| 524 | { | 613 | { |
| 525 | bpf_register_prog_type(&kprobe_tl); | 614 | bpf_register_prog_type(&kprobe_tl); |
| 526 | bpf_register_prog_type(&tracepoint_tl); | 615 | bpf_register_prog_type(&tracepoint_tl); |
| 616 | bpf_register_prog_type(&perf_event_tl); | ||
| 527 | return 0; | 617 | return 0; |
| 528 | } | 618 | } |
| 529 | late_initcall(register_kprobe_prog_ops); | 619 | late_initcall(register_kprobe_prog_ops); |
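The bpf_trace.c changes above convert each tracing helper from an open-coded function taking five raw u64 registers to the BPF_CALL_n() wrappers, which generate the register shim automatically and leave a typed, inlinable body with a "____" prefix (this is why bpf_perf_event_output_tp() can call ____bpf_perf_event_output() directly). A minimal sketch of the pattern, under the assumption that it mirrors what the macro expands to; the helper name my_map_flags and its body are made up for illustration:

/* Sketch only: typed body plus u64-register shim, roughly what
 * BPF_CALL_2(my_map_flags, struct bpf_map *, map, u64, flags) provides. */
static __always_inline u64 ____my_map_flags(struct bpf_map *map, u64 flags)
{
	/* Typed arguments: no manual (void *)(unsigned long) casts needed. */
	return flags & BPF_F_INDEX_MASK;
}

static u64 my_map_flags(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	/* The shim keeps the five-u64 calling convention the BPF core
	 * expects and forwards to the typed body above. */
	return ____my_map_flags((struct bpf_map *)(unsigned long) r1, r2);
}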
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84752c8e28b5..2050a7652a86 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, | |||
| 872 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 872 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 873 | static int profile_graph_entry(struct ftrace_graph_ent *trace) | 873 | static int profile_graph_entry(struct ftrace_graph_ent *trace) |
| 874 | { | 874 | { |
| 875 | int index = trace->depth; | ||
| 876 | |||
| 875 | function_profile_call(trace->func, 0, NULL, NULL); | 877 | function_profile_call(trace->func, 0, NULL, NULL); |
| 878 | |||
| 879 | if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) | ||
| 880 | current->ret_stack[index].subtime = 0; | ||
| 881 | |||
| 876 | return 1; | 882 | return 1; |
| 877 | } | 883 | } |
| 878 | 884 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dade4c9559cc..8696ce6bf2f6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void) | |||
| 1047 | * | 1047 | * |
| 1048 | * Shows real state of the ring buffer if it is enabled or not. | 1048 | * Shows real state of the ring buffer if it is enabled or not. |
| 1049 | */ | 1049 | */ |
| 1050 | static int tracer_tracing_is_on(struct trace_array *tr) | 1050 | int tracer_tracing_is_on(struct trace_array *tr) |
| 1051 | { | 1051 | { |
| 1052 | if (tr->trace_buffer.buffer) | 1052 | if (tr->trace_buffer.buffer) |
| 1053 | return ring_buffer_record_is_on(tr->trace_buffer.buffer); | 1053 | return ring_buffer_record_is_on(tr->trace_buffer.buffer); |
| @@ -4123,6 +4123,30 @@ static const char readme_msg[] = | |||
| 4123 | "\t\t\t traces\n" | 4123 | "\t\t\t traces\n" |
| 4124 | #endif | 4124 | #endif |
| 4125 | #endif /* CONFIG_STACK_TRACER */ | 4125 | #endif /* CONFIG_STACK_TRACER */ |
| 4126 | #ifdef CONFIG_KPROBE_EVENT | ||
| 4127 | " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" | ||
| 4128 | "\t\t\t Write into this file to define/undefine new trace events.\n" | ||
| 4129 | #endif | ||
| 4130 | #ifdef CONFIG_UPROBE_EVENT | ||
| 4131 | " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" | ||
| 4132 | "\t\t\t Write into this file to define/undefine new trace events.\n" | ||
| 4133 | #endif | ||
| 4134 | #if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT) | ||
| 4135 | "\t accepts: event-definitions (one definition per line)\n" | ||
| 4136 | "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n" | ||
| 4137 | "\t -:[<group>/]<event>\n" | ||
| 4138 | #ifdef CONFIG_KPROBE_EVENT | ||
| 4139 | "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" | ||
| 4140 | #endif | ||
| 4141 | #ifdef CONFIG_UPROBE_EVENT | ||
| 4142 | "\t place: <path>:<offset>\n" | ||
| 4143 | #endif | ||
| 4144 | "\t args: <name>=fetcharg[:type]\n" | ||
| 4145 | "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" | ||
| 4146 | "\t $stack<index>, $stack, $retval, $comm\n" | ||
| 4147 | "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n" | ||
| 4148 | "\t b<bit-width>@<bit-offset>/<container-size>\n" | ||
| 4149 | #endif | ||
| 4126 | " events/\t\t- Directory containing all trace event subsystems:\n" | 4150 | " events/\t\t- Directory containing all trace event subsystems:\n" |
| 4127 | " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" | 4151 | " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" |
| 4128 | " events/<system>/\t- Directory containing all trace events for <system>:\n" | 4152 | " events/<system>/\t- Directory containing all trace events for <system>:\n" |
| @@ -4945,7 +4969,7 @@ out: | |||
| 4945 | return ret; | 4969 | return ret; |
| 4946 | } | 4970 | } |
| 4947 | 4971 | ||
| 4948 | #ifdef CONFIG_TRACER_MAX_TRACE | 4972 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
| 4949 | 4973 | ||
| 4950 | static ssize_t | 4974 | static ssize_t |
| 4951 | tracing_max_lat_read(struct file *filp, char __user *ubuf, | 4975 | tracing_max_lat_read(struct file *filp, char __user *ubuf, |
| @@ -5124,19 +5148,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, | |||
| 5124 | struct trace_iterator *iter = filp->private_data; | 5148 | struct trace_iterator *iter = filp->private_data; |
| 5125 | ssize_t sret; | 5149 | ssize_t sret; |
| 5126 | 5150 | ||
| 5127 | /* return any leftover data */ | ||
| 5128 | sret = trace_seq_to_user(&iter->seq, ubuf, cnt); | ||
| 5129 | if (sret != -EBUSY) | ||
| 5130 | return sret; | ||
| 5131 | |||
| 5132 | trace_seq_init(&iter->seq); | ||
| 5133 | |||
| 5134 | /* | 5151 | /* |
| 5135 | * Avoid more than one consumer on a single file descriptor | 5152 | * Avoid more than one consumer on a single file descriptor |
| 5136 | * This is just a matter of traces coherency, the ring buffer itself | 5153 | * This is just a matter of traces coherency, the ring buffer itself |
| 5137 | * is protected. | 5154 | * is protected. |
| 5138 | */ | 5155 | */ |
| 5139 | mutex_lock(&iter->mutex); | 5156 | mutex_lock(&iter->mutex); |
| 5157 | |||
| 5158 | /* return any leftover data */ | ||
| 5159 | sret = trace_seq_to_user(&iter->seq, ubuf, cnt); | ||
| 5160 | if (sret != -EBUSY) | ||
| 5161 | goto out; | ||
| 5162 | |||
| 5163 | trace_seq_init(&iter->seq); | ||
| 5164 | |||
| 5140 | if (iter->trace->read) { | 5165 | if (iter->trace->read) { |
| 5141 | sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); | 5166 | sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); |
| 5142 | if (sret) | 5167 | if (sret) |
| @@ -5867,7 +5892,7 @@ static const struct file_operations tracing_thresh_fops = { | |||
| 5867 | .llseek = generic_file_llseek, | 5892 | .llseek = generic_file_llseek, |
| 5868 | }; | 5893 | }; |
| 5869 | 5894 | ||
| 5870 | #ifdef CONFIG_TRACER_MAX_TRACE | 5895 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
| 5871 | static const struct file_operations tracing_max_lat_fops = { | 5896 | static const struct file_operations tracing_max_lat_fops = { |
| 5872 | .open = tracing_open_generic, | 5897 | .open = tracing_open_generic, |
| 5873 | .read = tracing_max_lat_read, | 5898 | .read = tracing_max_lat_read, |
| @@ -6163,9 +6188,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 6163 | return -EBUSY; | 6188 | return -EBUSY; |
| 6164 | #endif | 6189 | #endif |
| 6165 | 6190 | ||
| 6166 | if (splice_grow_spd(pipe, &spd)) | ||
| 6167 | return -ENOMEM; | ||
| 6168 | |||
| 6169 | if (*ppos & (PAGE_SIZE - 1)) | 6191 | if (*ppos & (PAGE_SIZE - 1)) |
| 6170 | return -EINVAL; | 6192 | return -EINVAL; |
| 6171 | 6193 | ||
| @@ -6175,6 +6197,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 6175 | len &= PAGE_MASK; | 6197 | len &= PAGE_MASK; |
| 6176 | } | 6198 | } |
| 6177 | 6199 | ||
| 6200 | if (splice_grow_spd(pipe, &spd)) | ||
| 6201 | return -ENOMEM; | ||
| 6202 | |||
| 6178 | again: | 6203 | again: |
| 6179 | trace_access_lock(iter->cpu_file); | 6204 | trace_access_lock(iter->cpu_file); |
| 6180 | entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); | 6205 | entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); |
| @@ -6232,19 +6257,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 6232 | /* did we read anything? */ | 6257 | /* did we read anything? */ |
| 6233 | if (!spd.nr_pages) { | 6258 | if (!spd.nr_pages) { |
| 6234 | if (ret) | 6259 | if (ret) |
| 6235 | return ret; | 6260 | goto out; |
| 6236 | 6261 | ||
| 6262 | ret = -EAGAIN; | ||
| 6237 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) | 6263 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) |
| 6238 | return -EAGAIN; | 6264 | goto out; |
| 6239 | 6265 | ||
| 6240 | ret = wait_on_pipe(iter, true); | 6266 | ret = wait_on_pipe(iter, true); |
| 6241 | if (ret) | 6267 | if (ret) |
| 6242 | return ret; | 6268 | goto out; |
| 6243 | 6269 | ||
| 6244 | goto again; | 6270 | goto again; |
| 6245 | } | 6271 | } |
| 6246 | 6272 | ||
| 6247 | ret = splice_to_pipe(pipe, &spd); | 6273 | ret = splice_to_pipe(pipe, &spd); |
| 6274 | out: | ||
| 6248 | splice_shrink_spd(&spd); | 6275 | splice_shrink_spd(&spd); |
| 6249 | 6276 | ||
| 6250 | return ret; | 6277 | return ret; |
| @@ -7195,7 +7222,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) | |||
| 7195 | 7222 | ||
| 7196 | create_trace_options_dir(tr); | 7223 | create_trace_options_dir(tr); |
| 7197 | 7224 | ||
| 7198 | #ifdef CONFIG_TRACER_MAX_TRACE | 7225 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) |
| 7199 | trace_create_file("tracing_max_latency", 0644, d_tracer, | 7226 | trace_create_file("tracing_max_latency", 0644, d_tracer, |
| 7200 | &tr->max_latency, &tracing_max_lat_fops); | 7227 | &tr->max_latency, &tracing_max_lat_fops); |
| 7201 | #endif | 7228 | #endif |
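The readme_msg lines added above document the dynamic-event syntax (p|r[:[<group>/]<event>] <place> [<args>]) directly in the README file. As a hedged usage sketch, a definition in that format could be installed from user space as below; the probed symbol, group/event names, register fetcharg, and tracefs mount point are illustrative assumptions, and the x64 fetch type is the hexadecimal alias introduced later in this series in trace_probe.c:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical probe: attach to do_sys_open and record its first
	 * argument (x86-64 %di) using the hexadecimal x64 fetch type. */
	const char *def = "p:mygrp/open_entry do_sys_open dfd=%di:x64\n";
	int fd = open("/sys/kernel/debug/tracing/kprobe_events",
		      O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("kprobe_events");
		return 1;
	}
	if (write(fd, def, strlen(def)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Reading the trace buffer afterwards would then show the recorded dfd values in hex, per the "0x%x" print format added in trace_probe.c.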
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f783df416726..fd24b1f9ac43 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -38,6 +38,7 @@ enum trace_type { | |||
| 38 | TRACE_USER_STACK, | 38 | TRACE_USER_STACK, |
| 39 | TRACE_BLK, | 39 | TRACE_BLK, |
| 40 | TRACE_BPUTS, | 40 | TRACE_BPUTS, |
| 41 | TRACE_HWLAT, | ||
| 41 | 42 | ||
| 42 | __TRACE_LAST_TYPE, | 43 | __TRACE_LAST_TYPE, |
| 43 | }; | 44 | }; |
| @@ -213,6 +214,8 @@ struct trace_array { | |||
| 213 | */ | 214 | */ |
| 214 | struct trace_buffer max_buffer; | 215 | struct trace_buffer max_buffer; |
| 215 | bool allocated_snapshot; | 216 | bool allocated_snapshot; |
| 217 | #endif | ||
| 218 | #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) | ||
| 216 | unsigned long max_latency; | 219 | unsigned long max_latency; |
| 217 | #endif | 220 | #endif |
| 218 | struct trace_pid_list __rcu *filtered_pids; | 221 | struct trace_pid_list __rcu *filtered_pids; |
| @@ -326,6 +329,7 @@ extern void __ftrace_bad_type(void); | |||
| 326 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ | 329 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ |
| 327 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ | 330 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ |
| 328 | IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ | 331 | IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ |
| 332 | IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ | ||
| 329 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ | 333 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ |
| 330 | TRACE_MMIO_RW); \ | 334 | TRACE_MMIO_RW); \ |
| 331 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ | 335 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ |
| @@ -571,6 +575,7 @@ void tracing_reset_current(int cpu); | |||
| 571 | void tracing_reset_all_online_cpus(void); | 575 | void tracing_reset_all_online_cpus(void); |
| 572 | int tracing_open_generic(struct inode *inode, struct file *filp); | 576 | int tracing_open_generic(struct inode *inode, struct file *filp); |
| 573 | bool tracing_is_disabled(void); | 577 | bool tracing_is_disabled(void); |
| 578 | int tracer_tracing_is_on(struct trace_array *tr); | ||
| 574 | struct dentry *trace_create_file(const char *name, | 579 | struct dentry *trace_create_file(const char *name, |
| 575 | umode_t mode, | 580 | umode_t mode, |
| 576 | struct dentry *parent, | 581 | struct dentry *parent, |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 5c30efcda5e6..d1cc37e78f99 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
| @@ -322,3 +322,30 @@ FTRACE_ENTRY(branch, trace_branch, | |||
| 322 | FILTER_OTHER | 322 | FILTER_OTHER |
| 323 | ); | 323 | ); |
| 324 | 324 | ||
| 325 | |||
| 326 | FTRACE_ENTRY(hwlat, hwlat_entry, | ||
| 327 | |||
| 328 | TRACE_HWLAT, | ||
| 329 | |||
| 330 | F_STRUCT( | ||
| 331 | __field( u64, duration ) | ||
| 332 | __field( u64, outer_duration ) | ||
| 333 | __field( u64, nmi_total_ts ) | ||
| 334 | __field_struct( struct timespec, timestamp ) | ||
| 335 | __field_desc( long, timestamp, tv_sec ) | ||
| 336 | __field_desc( long, timestamp, tv_nsec ) | ||
| 337 | __field( unsigned int, nmi_count ) | ||
| 338 | __field( unsigned int, seqnum ) | ||
| 339 | ), | ||
| 340 | |||
| 341 | F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", | ||
| 342 | __entry->seqnum, | ||
| 343 | __entry->tv_sec, | ||
| 344 | __entry->tv_nsec, | ||
| 345 | __entry->duration, | ||
| 346 | __entry->outer_duration, | ||
| 347 | __entry->nmi_total_ts, | ||
| 348 | __entry->nmi_count), | ||
| 349 | |||
| 350 | FILTER_OTHER | ||
| 351 | ); | ||
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a975571cde24..6721a1e89f39 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c | |||
| @@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = { | |||
| 1028 | static struct event_command trigger_traceoff_cmd = { | 1028 | static struct event_command trigger_traceoff_cmd = { |
| 1029 | .name = "traceoff", | 1029 | .name = "traceoff", |
| 1030 | .trigger_type = ETT_TRACE_ONOFF, | 1030 | .trigger_type = ETT_TRACE_ONOFF, |
| 1031 | .flags = EVENT_CMD_FL_POST_TRIGGER, | ||
| 1031 | .func = event_trigger_callback, | 1032 | .func = event_trigger_callback, |
| 1032 | .reg = register_trigger, | 1033 | .reg = register_trigger, |
| 1033 | .unreg = unregister_trigger, | 1034 | .unreg = unregister_trigger, |
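The one-line flag addition above marks the traceoff trigger as a post-trigger command, so the event that fires the trigger can itself be recorded before tracing is switched off. For context, such a trigger is armed through the event's trigger file; a hedged sketch, where the sched/sched_switch event path and the one-shot count are illustrative assumptions:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Arm a one-shot traceoff trigger on sched/sched_switch. */
int arm_traceoff_trigger(void)
{
	const char *cmd = "traceoff:1\n";
	int fd = open("/sys/kernel/debug/tracing/events/sched/sched_switch/trigger",
		      O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, cmd, strlen(cmd)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}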
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7363ccf79512..4e480e870474 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, | |||
| 119 | /* Add a function return address to the trace stack on thread info.*/ | 119 | /* Add a function return address to the trace stack on thread info.*/ |
| 120 | int | 120 | int |
| 121 | ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | 121 | ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, |
| 122 | unsigned long frame_pointer) | 122 | unsigned long frame_pointer, unsigned long *retp) |
| 123 | { | 123 | { |
| 124 | unsigned long long calltime; | 124 | unsigned long long calltime; |
| 125 | int index; | 125 | int index; |
| @@ -170,8 +170,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
| 170 | current->ret_stack[index].ret = ret; | 170 | current->ret_stack[index].ret = ret; |
| 171 | current->ret_stack[index].func = func; | 171 | current->ret_stack[index].func = func; |
| 172 | current->ret_stack[index].calltime = calltime; | 172 | current->ret_stack[index].calltime = calltime; |
| 173 | current->ret_stack[index].subtime = 0; | 173 | #ifdef HAVE_FUNCTION_GRAPH_FP_TEST |
| 174 | current->ret_stack[index].fp = frame_pointer; | 174 | current->ret_stack[index].fp = frame_pointer; |
| 175 | #endif | ||
| 176 | #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR | ||
| 177 | current->ret_stack[index].retp = retp; | ||
| 178 | #endif | ||
| 175 | *depth = current->curr_ret_stack; | 179 | *depth = current->curr_ret_stack; |
| 176 | 180 | ||
| 177 | return 0; | 181 | return 0; |
| @@ -204,7 +208,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, | |||
| 204 | return; | 208 | return; |
| 205 | } | 209 | } |
| 206 | 210 | ||
| 207 | #if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) | 211 | #ifdef HAVE_FUNCTION_GRAPH_FP_TEST |
| 208 | /* | 212 | /* |
| 209 | * The arch may choose to record the frame pointer used | 213 | * The arch may choose to record the frame pointer used |
| 210 | * and check it here to make sure that it is what we expect it | 214 | * and check it here to make sure that it is what we expect it |
| @@ -279,6 +283,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) | |||
| 279 | return ret; | 283 | return ret; |
| 280 | } | 284 | } |
| 281 | 285 | ||
| 286 | /** | ||
| 287 | * ftrace_graph_ret_addr - convert a potentially modified stack return address | ||
| 288 | * to its original value | ||
| 289 | * | ||
| 290 | * This function can be called by stack unwinding code to convert a found stack | ||
| 291 | * return address ('ret') to its original value, in case the function graph | ||
| 292 | * tracer has modified it to be 'return_to_handler'. If the address hasn't | ||
| 293 | * been modified, the unchanged value of 'ret' is returned. | ||
| 294 | * | ||
| 295 | * 'idx' is a state variable which should be initialized by the caller to zero | ||
| 296 | * before the first call. | ||
| 297 | * | ||
| 298 | * 'retp' is a pointer to the return address on the stack. It's ignored if | ||
| 299 | * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. | ||
| 300 | */ | ||
| 301 | #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR | ||
| 302 | unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, | ||
| 303 | unsigned long ret, unsigned long *retp) | ||
| 304 | { | ||
| 305 | int index = task->curr_ret_stack; | ||
| 306 | int i; | ||
| 307 | |||
| 308 | if (ret != (unsigned long)return_to_handler) | ||
| 309 | return ret; | ||
| 310 | |||
| 311 | if (index < -1) | ||
| 312 | index += FTRACE_NOTRACE_DEPTH; | ||
| 313 | |||
| 314 | if (index < 0) | ||
| 315 | return ret; | ||
| 316 | |||
| 317 | for (i = 0; i <= index; i++) | ||
| 318 | if (task->ret_stack[i].retp == retp) | ||
| 319 | return task->ret_stack[i].ret; | ||
| 320 | |||
| 321 | return ret; | ||
| 322 | } | ||
| 323 | #else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ | ||
| 324 | unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, | ||
| 325 | unsigned long ret, unsigned long *retp) | ||
| 326 | { | ||
| 327 | int task_idx; | ||
| 328 | |||
| 329 | if (ret != (unsigned long)return_to_handler) | ||
| 330 | return ret; | ||
| 331 | |||
| 332 | task_idx = task->curr_ret_stack; | ||
| 333 | |||
| 334 | if (!task->ret_stack || task_idx < *idx) | ||
| 335 | return ret; | ||
| 336 | |||
| 337 | task_idx -= *idx; | ||
| 338 | (*idx)++; | ||
| 339 | |||
| 340 | return task->ret_stack[task_idx].ret; | ||
| 341 | } | ||
| 342 | #endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ | ||
| 343 | |||
| 282 | int __trace_graph_entry(struct trace_array *tr, | 344 | int __trace_graph_entry(struct trace_array *tr, |
| 283 | struct ftrace_graph_ent *trace, | 345 | struct ftrace_graph_ent *trace, |
| 284 | unsigned long flags, | 346 | unsigned long flags, |
| @@ -1120,6 +1182,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
| 1120 | trace_seq_puts(s, "/* "); | 1182 | trace_seq_puts(s, "/* "); |
| 1121 | 1183 | ||
| 1122 | switch (iter->ent->type) { | 1184 | switch (iter->ent->type) { |
| 1185 | case TRACE_BPUTS: | ||
| 1186 | ret = trace_print_bputs_msg_only(iter); | ||
| 1187 | if (ret != TRACE_TYPE_HANDLED) | ||
| 1188 | return ret; | ||
| 1189 | break; | ||
| 1123 | case TRACE_BPRINT: | 1190 | case TRACE_BPRINT: |
| 1124 | ret = trace_print_bprintk_msg_only(iter); | 1191 | ret = trace_print_bprintk_msg_only(iter); |
| 1125 | if (ret != TRACE_TYPE_HANDLED) | 1192 | if (ret != TRACE_TYPE_HANDLED) |
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c new file mode 100644 index 000000000000..b97286c48735 --- /dev/null +++ b/kernel/trace/trace_hwlat.c | |||
| @@ -0,0 +1,633 @@ | |||
| 1 | /* | ||
| 2 | * trace_hwlatdetect.c - A simple Hardware Latency detector. | ||
| 3 | * | ||
| 4 | * Use this tracer to detect large system latencies induced by the behavior of | ||
| 5 | * certain underlying system hardware or firmware, independent of Linux itself. | ||
| 6 | * The code was developed originally to detect the presence of SMIs on Intel | ||
| 7 | * and AMD systems, although there is no dependency upon x86 herein. | ||
| 8 | * | ||
| 9 | * The classical example usage of this tracer is in detecting the presence of | ||
| 10 | * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a | ||
| 11 | * somewhat special form of hardware interrupt spawned from earlier CPU debug | ||
| 12 | * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge | ||
| 13 | * LPC (or other device) to generate a special interrupt under certain | ||
| 14 | * circumstances, for example, upon expiration of a special SMI timer device, | ||
| 15 | * due to certain external thermal readings, on certain I/O address accesses, | ||
| 16 | * and other situations. An SMI hits a special CPU pin, triggers a special | ||
| 17 | * SMI mode (complete with special memory map), and the OS is unaware. | ||
| 18 | * | ||
| 19 | * Although certain hardware-inducing latencies are necessary (for example, | ||
| 20 | * a modern system often requires an SMI handler for correct thermal control | ||
| 21 | * and remote management) they can wreak havoc upon any OS-level performance | ||
| 22 | * guarantees toward low-latency, especially when the OS is not even made | ||
| 23 | * aware of the presence of these interrupts. For this reason, we need a | ||
| 24 | * somewhat brute force mechanism to detect these interrupts. In this case, | ||
| 25 | * we do it by hogging all of the CPU(s) for configurable timer intervals, | ||
| 26 | * sampling the built-in CPU timer, looking for discontiguous readings. | ||
| 27 | * | ||
| 28 | * WARNING: This implementation necessarily introduces latencies. Therefore, | ||
| 29 | * you should NEVER use this tracer while running in a production | ||
| 30 | * environment requiring any kind of low-latency performance | ||
| 31 | * guarantee(s). | ||
| 32 | * | ||
| 33 | * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com> | ||
| 34 | * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com> | ||
| 35 | * | ||
| 36 | * Includes useful feedback from Clark Williams <clark@redhat.com> | ||
| 37 | * | ||
| 38 | * This file is licensed under the terms of the GNU General Public | ||
| 39 | * License version 2. This program is licensed "as is" without any | ||
| 40 | * warranty of any kind, whether express or implied. | ||
| 41 | */ | ||
| 42 | #include <linux/kthread.h> | ||
| 43 | #include <linux/tracefs.h> | ||
| 44 | #include <linux/uaccess.h> | ||
| 45 | #include <linux/cpumask.h> | ||
| 46 | #include <linux/delay.h> | ||
| 47 | #include "trace.h" | ||
| 48 | |||
| 49 | static struct trace_array *hwlat_trace; | ||
| 50 | |||
| 51 | #define U64STR_SIZE 22 /* 20 digits max */ | ||
| 52 | |||
| 53 | #define BANNER "hwlat_detector: " | ||
| 54 | #define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ | ||
| 55 | #define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ | ||
| 56 | #define DEFAULT_LAT_THRESHOLD 10 /* 10us */ | ||
| 57 | |||
| 58 | /* sampling thread */ | ||
| 59 | static struct task_struct *hwlat_kthread; | ||
| 60 | |||
| 61 | static struct dentry *hwlat_sample_width; /* sample width us */ | ||
| 62 | static struct dentry *hwlat_sample_window; /* sample window us */ | ||
| 63 | |||
| 64 | /* Save the previous tracing_thresh value */ | ||
| 65 | static unsigned long save_tracing_thresh; | ||
| 66 | |||
| 67 | /* NMI timestamp counters */ | ||
| 68 | static u64 nmi_ts_start; | ||
| 69 | static u64 nmi_total_ts; | ||
| 70 | static int nmi_count; | ||
| 71 | static int nmi_cpu; | ||
| 72 | |||
| 73 | /* Tells NMIs to call back to the hwlat tracer to record timestamps */ | ||
| 74 | bool trace_hwlat_callback_enabled; | ||
| 75 | |||
| 76 | /* If the user changed threshold, remember it */ | ||
| 77 | static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; | ||
| 78 | |||
| 79 | /* Individual latency samples are stored here when detected. */ | ||
| 80 | struct hwlat_sample { | ||
| 81 | u64 seqnum; /* unique sequence */ | ||
| 82 | u64 duration; /* delta */ | ||
| 83 | u64 outer_duration; /* delta (outer loop) */ | ||
| 84 | u64 nmi_total_ts; /* Total time spent in NMIs */ | ||
| 85 | struct timespec timestamp; /* wall time */ | ||
| 86 | int nmi_count; /* # NMIs during this sample */ | ||
| 87 | }; | ||
| 88 | |||
| 89 | /* keep the global state somewhere. */ | ||
| 90 | static struct hwlat_data { | ||
| 91 | |||
| 92 | struct mutex lock; /* protect changes */ | ||
| 93 | |||
| 94 | u64 count; /* total since reset */ | ||
| 95 | |||
| 96 | u64 sample_window; /* total sampling window (on+off) */ | ||
| 97 | u64 sample_width; /* active sampling portion of window */ | ||
| 98 | |||
| 99 | } hwlat_data = { | ||
| 100 | .sample_window = DEFAULT_SAMPLE_WINDOW, | ||
| 101 | .sample_width = DEFAULT_SAMPLE_WIDTH, | ||
| 102 | }; | ||
| 103 | |||
| 104 | static void trace_hwlat_sample(struct hwlat_sample *sample) | ||
| 105 | { | ||
| 106 | struct trace_array *tr = hwlat_trace; | ||
| 107 | struct trace_event_call *call = &event_hwlat; | ||
| 108 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
| 109 | struct ring_buffer_event *event; | ||
| 110 | struct hwlat_entry *entry; | ||
| 111 | unsigned long flags; | ||
| 112 | int pc; | ||
| 113 | |||
| 114 | pc = preempt_count(); | ||
| 115 | local_save_flags(flags); | ||
| 116 | |||
| 117 | event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), | ||
| 118 | flags, pc); | ||
| 119 | if (!event) | ||
| 120 | return; | ||
| 121 | entry = ring_buffer_event_data(event); | ||
| 122 | entry->seqnum = sample->seqnum; | ||
| 123 | entry->duration = sample->duration; | ||
| 124 | entry->outer_duration = sample->outer_duration; | ||
| 125 | entry->timestamp = sample->timestamp; | ||
| 126 | entry->nmi_total_ts = sample->nmi_total_ts; | ||
| 127 | entry->nmi_count = sample->nmi_count; | ||
| 128 | |||
| 129 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
| 130 | __buffer_unlock_commit(buffer, event); | ||
| 131 | } | ||
| 132 | |||
| 133 | /* Macros to encapsulate the time capturing infrastructure */ | ||
| 134 | #define time_type u64 | ||
| 135 | #define time_get() trace_clock_local() | ||
| 136 | #define time_to_us(x) div_u64(x, 1000) | ||
| 137 | #define time_sub(a, b) ((a) - (b)) | ||
| 138 | #define init_time(a, b) (a = b) | ||
| 139 | #define time_u64(a) a | ||
| 140 | |||
| 141 | void trace_hwlat_callback(bool enter) | ||
| 142 | { | ||
| 143 | if (smp_processor_id() != nmi_cpu) | ||
| 144 | return; | ||
| 145 | |||
| 146 | /* | ||
| 147 | * Currently trace_clock_local() calls sched_clock() and the | ||
| 148 | * generic version is not NMI safe. | ||
| 149 | */ | ||
| 150 | if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { | ||
| 151 | if (enter) | ||
| 152 | nmi_ts_start = time_get(); | ||
| 153 | else | ||
| 154 | nmi_total_ts = time_get() - nmi_ts_start; | ||
| 155 | } | ||
| 156 | |||
| 157 | if (enter) | ||
| 158 | nmi_count++; | ||
| 159 | } | ||
| 160 | |||
| 161 | /** | ||
| 162 | * get_sample - sample the CPU TSC and look for likely hardware latencies | ||
| 163 | * | ||
| 164 | * Used to repeatedly capture the CPU TSC (or similar), looking for potential | ||
| 165 | * hardware-induced latency. Called with interrupts disabled and with | ||
| 166 | * hwlat_data.lock held. | ||
| 167 | */ | ||
| 168 | static int get_sample(void) | ||
| 169 | { | ||
| 170 | struct trace_array *tr = hwlat_trace; | ||
| 171 | time_type start, t1, t2, last_t2; | ||
| 172 | s64 diff, total, last_total = 0; | ||
| 173 | u64 sample = 0; | ||
| 174 | u64 thresh = tracing_thresh; | ||
| 175 | u64 outer_sample = 0; | ||
| 176 | int ret = -1; | ||
| 177 | |||
| 178 | do_div(thresh, NSEC_PER_USEC); /* modifies thresh value */ | ||
| 179 | |||
| 180 | nmi_cpu = smp_processor_id(); | ||
| 181 | nmi_total_ts = 0; | ||
| 182 | nmi_count = 0; | ||
| 183 | /* Make sure NMIs see this first */ | ||
| 184 | barrier(); | ||
| 185 | |||
| 186 | trace_hwlat_callback_enabled = true; | ||
| 187 | |||
| 188 | init_time(last_t2, 0); | ||
| 189 | start = time_get(); /* start timestamp */ | ||
| 190 | |||
| 191 | do { | ||
| 192 | |||
| 193 | t1 = time_get(); /* we'll look for a discontinuity */ | ||
| 194 | t2 = time_get(); | ||
| 195 | |||
| 196 | if (time_u64(last_t2)) { | ||
| 197 | /* Check the delta from outer loop (t2 to next t1) */ | ||
| 198 | diff = time_to_us(time_sub(t1, last_t2)); | ||
| 199 | /* This shouldn't happen */ | ||
| 200 | if (diff < 0) { | ||
| 201 | pr_err(BANNER "time running backwards\n"); | ||
| 202 | goto out; | ||
| 203 | } | ||
| 204 | if (diff > outer_sample) | ||
| 205 | outer_sample = diff; | ||
| 206 | } | ||
| 207 | last_t2 = t2; | ||
| 208 | |||
| 209 | total = time_to_us(time_sub(t2, start)); /* sample width */ | ||
| 210 | |||
| 211 | /* Check for possible overflows */ | ||
| 212 | if (total < last_total) { | ||
| 213 | pr_err("Time total overflowed\n"); | ||
| 214 | break; | ||
| 215 | } | ||
| 216 | last_total = total; | ||
| 217 | |||
| 218 | /* This checks the inner loop (t1 to t2) */ | ||
| 219 | diff = time_to_us(time_sub(t2, t1)); /* current diff */ | ||
| 220 | |||
| 221 | /* This shouldn't happen */ | ||
| 222 | if (diff < 0) { | ||
| 223 | pr_err(BANNER "time running backwards\n"); | ||
| 224 | goto out; | ||
| 225 | } | ||
| 226 | |||
| 227 | if (diff > sample) | ||
| 228 | sample = diff; /* only want highest value */ | ||
| 229 | |||
| 230 | } while (total <= hwlat_data.sample_width); | ||
| 231 | |||
| 232 | barrier(); /* finish the above in the view for NMIs */ | ||
| 233 | trace_hwlat_callback_enabled = false; | ||
| 234 | barrier(); /* Make sure nmi_total_ts is no longer updated */ | ||
| 235 | |||
| 236 | ret = 0; | ||
| 237 | |||
| 238 | /* If we exceed the threshold value, we have found a hardware latency */ | ||
| 239 | if (sample > thresh || outer_sample > thresh) { | ||
| 240 | struct hwlat_sample s; | ||
| 241 | |||
| 242 | ret = 1; | ||
| 243 | |||
| 244 | /* We read in microseconds */ | ||
| 245 | if (nmi_total_ts) | ||
| 246 | do_div(nmi_total_ts, NSEC_PER_USEC); | ||
| 247 | |||
| 248 | hwlat_data.count++; | ||
| 249 | s.seqnum = hwlat_data.count; | ||
| 250 | s.duration = sample; | ||
| 251 | s.outer_duration = outer_sample; | ||
| 252 | s.timestamp = CURRENT_TIME; | ||
| 253 | s.nmi_total_ts = nmi_total_ts; | ||
| 254 | s.nmi_count = nmi_count; | ||
| 255 | trace_hwlat_sample(&s); | ||
| 256 | |||
| 257 | /* Keep a running maximum ever recorded hardware latency */ | ||
| 258 | if (sample > tr->max_latency) | ||
| 259 | tr->max_latency = sample; | ||
| 260 | } | ||
| 261 | |||
| 262 | out: | ||
| 263 | return ret; | ||
| 264 | } | ||
| 265 | |||
| 266 | static struct cpumask save_cpumask; | ||
| 267 | static bool disable_migrate; | ||
| 268 | |||
| 269 | static void move_to_next_cpu(void) | ||
| 270 | { | ||
| 271 | static struct cpumask *current_mask; | ||
| 272 | int next_cpu; | ||
| 273 | |||
| 274 | if (disable_migrate) | ||
| 275 | return; | ||
| 276 | |||
| 277 | /* Just pick the first CPU on first iteration */ | ||
| 278 | if (!current_mask) { | ||
| 279 | current_mask = &save_cpumask; | ||
| 280 | get_online_cpus(); | ||
| 281 | cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); | ||
| 282 | put_online_cpus(); | ||
| 283 | next_cpu = cpumask_first(current_mask); | ||
| 284 | goto set_affinity; | ||
| 285 | } | ||
| 286 | |||
| 287 | /* | ||
| 288 | * If for some reason the user modifies the CPU affinity | ||
| 289 | * of this thread, then stop migrating for the duration | ||
| 290 | * of the current test. | ||
| 291 | */ | ||
| 292 | if (!cpumask_equal(current_mask, &current->cpus_allowed)) | ||
| 293 | goto disable; | ||
| 294 | |||
| 295 | get_online_cpus(); | ||
| 296 | cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); | ||
| 297 | next_cpu = cpumask_next(smp_processor_id(), current_mask); | ||
| 298 | put_online_cpus(); | ||
| 299 | |||
| 300 | if (next_cpu >= nr_cpu_ids) | ||
| 301 | next_cpu = cpumask_first(current_mask); | ||
| 302 | |||
| 303 | set_affinity: | ||
| 304 | if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ | ||
| 305 | goto disable; | ||
| 306 | |||
| 307 | cpumask_clear(current_mask); | ||
| 308 | cpumask_set_cpu(next_cpu, current_mask); | ||
| 309 | |||
| 310 | sched_setaffinity(0, current_mask); | ||
| 311 | return; | ||
| 312 | |||
| 313 | disable: | ||
| 314 | disable_migrate = true; | ||
| 315 | } | ||
| 316 | |||
| 317 | /* | ||
| 318 | * kthread_fn - The CPU time sampling/hardware latency detection kernel thread | ||
| 319 | * | ||
| 320 | * Used to periodically sample the CPU TSC via a call to get_sample. We | ||
| 321 | * disable interrupts, which does (intentionally) introduce latency since we | ||
| 322 | * need to ensure nothing else might be running (and thus preempting). | ||
| 323 | * Obviously this should never be used in production environments. | ||
| 324 | * | ||
| 325 | * Currently this runs on whichever CPU it was scheduled on, even though | ||
| 326 | * most real-world hardware latency situations occur across several CPUs; | ||
| 327 | * we might later generalize this if we find there are any actual | ||
| 328 | * systems with alternate SMI delivery or other hardware latencies. | ||
| 329 | */ | ||
| 330 | static int kthread_fn(void *data) | ||
| 331 | { | ||
| 332 | u64 interval; | ||
| 333 | |||
| 334 | while (!kthread_should_stop()) { | ||
| 335 | |||
| 336 | move_to_next_cpu(); | ||
| 337 | |||
| 338 | local_irq_disable(); | ||
| 339 | get_sample(); | ||
| 340 | local_irq_enable(); | ||
| 341 | |||
| 342 | mutex_lock(&hwlat_data.lock); | ||
| 343 | interval = hwlat_data.sample_window - hwlat_data.sample_width; | ||
| 344 | mutex_unlock(&hwlat_data.lock); | ||
| 345 | |||
| 346 | do_div(interval, USEC_PER_MSEC); /* modifies interval value */ | ||
| 347 | |||
| 348 | /* Always sleep for at least 1ms */ | ||
| 349 | if (interval < 1) | ||
| 350 | interval = 1; | ||
| 351 | |||
| 352 | if (msleep_interruptible(interval)) | ||
| 353 | break; | ||
| 354 | } | ||
| 355 | |||
| 356 | return 0; | ||
| 357 | } | ||
| 358 | |||
| 359 | /** | ||
| 360 | * start_kthread - Kick off the hardware latency sampling/detector kthread | ||
| 361 | * | ||
| 362 | * This starts the kernel thread that will sit and sample the CPU timestamp | ||
| 363 | * counter (TSC or similar) and look for potential hardware latencies. | ||
| 364 | */ | ||
| 365 | static int start_kthread(struct trace_array *tr) | ||
| 366 | { | ||
| 367 | struct task_struct *kthread; | ||
| 368 | |||
| 369 | kthread = kthread_create(kthread_fn, NULL, "hwlatd"); | ||
| 370 | if (IS_ERR(kthread)) { | ||
| 371 | pr_err(BANNER "could not start sampling thread\n"); | ||
| 372 | return -ENOMEM; | ||
| 373 | } | ||
| 374 | hwlat_kthread = kthread; | ||
| 375 | wake_up_process(kthread); | ||
| 376 | |||
| 377 | return 0; | ||
| 378 | } | ||
| 379 | |||
| 380 | /** | ||
| 381 | * stop_kthread - Inform the hardware latency sampling/detector kthread to stop | ||
| 382 | * | ||
| 383 | * This kicks the running hardware latency sampling/detector kernel thread and | ||
| 384 | * tells it to stop sampling now. Use this on unload and at system shutdown. | ||
| 385 | */ | ||
| 386 | static void stop_kthread(void) | ||
| 387 | { | ||
| 388 | if (!hwlat_kthread) | ||
| 389 | return; | ||
| 390 | kthread_stop(hwlat_kthread); | ||
| 391 | hwlat_kthread = NULL; | ||
| 392 | } | ||
| 393 | |||
| 394 | /* | ||
| 395 | * hwlat_read - Wrapper read function for reading both window and width | ||
| 396 | * @filp: The active open file structure | ||
| 397 | * @ubuf: The userspace provided buffer to read value into | ||
| 398 | * @cnt: The maximum number of bytes to read | ||
| 399 | * @ppos: The current "file" position | ||
| 400 | * | ||
| 401 | * This function provides a generic read implementation for the global state | ||
| 402 | * "hwlat_data" structure filesystem entries. | ||
| 403 | */ | ||
| 404 | static ssize_t hwlat_read(struct file *filp, char __user *ubuf, | ||
| 405 | size_t cnt, loff_t *ppos) | ||
| 406 | { | ||
| 407 | char buf[U64STR_SIZE]; | ||
| 408 | u64 *entry = filp->private_data; | ||
| 409 | u64 val; | ||
| 410 | int len; | ||
| 411 | |||
| 412 | if (!entry) | ||
| 413 | return -EFAULT; | ||
| 414 | |||
| 415 | if (cnt > sizeof(buf)) | ||
| 416 | cnt = sizeof(buf); | ||
| 417 | |||
| 418 | val = *entry; | ||
| 419 | |||
| 420 | len = snprintf(buf, sizeof(buf), "%llu\n", val); | ||
| 421 | |||
| 422 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); | ||
| 423 | } | ||
| 424 | |||
| 425 | /** | ||
| 426 | * hwlat_width_write - Write function for "width" entry | ||
| 427 | * @filp: The active open file structure | ||
| 428 | * @ubuf: The user buffer that contains the value to write | ||
| 429 | * @cnt: The maximum number of bytes to write to "file" | ||
| 430 | * @ppos: The current position in @file | ||
| 431 | * | ||
| 432 | * This function provides a write implementation for the "width" interface | ||
| 433 | * to the hardware latency detector. It can be used to configure | ||
| 434 | * how many us of the total window we will actively sample for any | ||
| 435 | * hardware-induced latency periods. Obviously, it is not possible to | ||
| 436 | * sample constantly and have the system respond to a sample reader, or, | ||
| 437 | * worse, without having the system appear to have gone out to lunch. It | ||
| 438 | * is enforced that width is less than the total window size. | ||
| 439 | */ | ||
| 440 | static ssize_t | ||
| 441 | hwlat_width_write(struct file *filp, const char __user *ubuf, | ||
| 442 | size_t cnt, loff_t *ppos) | ||
| 443 | { | ||
| 444 | u64 val; | ||
| 445 | int err; | ||
| 446 | |||
| 447 | err = kstrtoull_from_user(ubuf, cnt, 10, &val); | ||
| 448 | if (err) | ||
| 449 | return err; | ||
| 450 | |||
| 451 | mutex_lock(&hwlat_data.lock); | ||
| 452 | if (val < hwlat_data.sample_window) | ||
| 453 | hwlat_data.sample_width = val; | ||
| 454 | else | ||
| 455 | err = -EINVAL; | ||
| 456 | mutex_unlock(&hwlat_data.lock); | ||
| 457 | |||
| 458 | if (err) | ||
| 459 | return err; | ||
| 460 | |||
| 461 | return cnt; | ||
| 462 | } | ||
| 463 | |||
| 464 | /** | ||
| 465 | * hwlat_window_write - Write function for "window" entry | ||
| 466 | * @filp: The active open file structure | ||
| 467 | * @ubuf: The user buffer that contains the value to write | ||
| 468 | * @cnt: The maximum number of bytes to write to "file" | ||
| 469 | * @ppos: The current position in @file | ||
| 470 | * | ||
| 471 | * This function provides a write implementation for the "window" interface | ||
| 472 | * to the hardware latency detector. The window is the total time | ||
| 473 | * in us that will be considered one sample period. Conceptually, windows | ||
| 474 | * occur back-to-back and contain a sample width period during which | ||
| 475 | * actual sampling occurs. Can be used to write a new total window size. It | ||
| 476 | * is enforced that any value written must be greater than the sample width | ||
| 477 | * size, or an error results. | ||
| 478 | */ | ||
| 479 | static ssize_t | ||
| 480 | hwlat_window_write(struct file *filp, const char __user *ubuf, | ||
| 481 | size_t cnt, loff_t *ppos) | ||
| 482 | { | ||
| 483 | u64 val; | ||
| 484 | int err; | ||
| 485 | |||
| 486 | err = kstrtoull_from_user(ubuf, cnt, 10, &val); | ||
| 487 | if (err) | ||
| 488 | return err; | ||
| 489 | |||
| 490 | mutex_lock(&hwlat_data.lock); | ||
| 491 | if (hwlat_data.sample_width < val) | ||
| 492 | hwlat_data.sample_window = val; | ||
| 493 | else | ||
| 494 | err = -EINVAL; | ||
| 495 | mutex_unlock(&hwlat_data.lock); | ||
| 496 | |||
| 497 | if (err) | ||
| 498 | return err; | ||
| 499 | |||
| 500 | return cnt; | ||
| 501 | } | ||
| 502 | |||
| 503 | static const struct file_operations width_fops = { | ||
| 504 | .open = tracing_open_generic, | ||
| 505 | .read = hwlat_read, | ||
| 506 | .write = hwlat_width_write, | ||
| 507 | }; | ||
| 508 | |||
| 509 | static const struct file_operations window_fops = { | ||
| 510 | .open = tracing_open_generic, | ||
| 511 | .read = hwlat_read, | ||
| 512 | .write = hwlat_window_write, | ||
| 513 | }; | ||
| 514 | |||
| 515 | /** | ||
| 516 | * init_tracefs - A function to initialize the tracefs interface files | ||
| 517 | * | ||
| 518 | * This function creates entries in tracefs for "hwlat_detector". | ||
| 519 | * It creates the hwlat_detector directory in the tracing directory, | ||
| 520 | * and within that directory is the count, width and window files to | ||
| 521 | * change and view those values. | ||
| 522 | */ | ||
| 523 | static int init_tracefs(void) | ||
| 524 | { | ||
| 525 | struct dentry *d_tracer; | ||
| 526 | struct dentry *top_dir; | ||
| 527 | |||
| 528 | d_tracer = tracing_init_dentry(); | ||
| 529 | if (IS_ERR(d_tracer)) | ||
| 530 | return -ENOMEM; | ||
| 531 | |||
| 532 | top_dir = tracefs_create_dir("hwlat_detector", d_tracer); | ||
| 533 | if (!top_dir) | ||
| 534 | return -ENOMEM; | ||
| 535 | |||
| 536 | hwlat_sample_window = tracefs_create_file("window", 0640, | ||
| 537 | top_dir, | ||
| 538 | &hwlat_data.sample_window, | ||
| 539 | &window_fops); | ||
| 540 | if (!hwlat_sample_window) | ||
| 541 | goto err; | ||
| 542 | |||
| 543 | hwlat_sample_width = tracefs_create_file("width", 0644, | ||
| 544 | top_dir, | ||
| 545 | &hwlat_data.sample_width, | ||
| 546 | &width_fops); | ||
| 547 | if (!hwlat_sample_width) | ||
| 548 | goto err; | ||
| 549 | |||
| 550 | return 0; | ||
| 551 | |||
| 552 | err: | ||
| 553 | tracefs_remove_recursive(top_dir); | ||
| 554 | return -ENOMEM; | ||
| 555 | } | ||
| 556 | |||
| 557 | static void hwlat_tracer_start(struct trace_array *tr) | ||
| 558 | { | ||
| 559 | int err; | ||
| 560 | |||
| 561 | err = start_kthread(tr); | ||
| 562 | if (err) | ||
| 563 | pr_err(BANNER "Cannot start hwlat kthread\n"); | ||
| 564 | } | ||
| 565 | |||
| 566 | static void hwlat_tracer_stop(struct trace_array *tr) | ||
| 567 | { | ||
| 568 | stop_kthread(); | ||
| 569 | } | ||
| 570 | |||
| 571 | static bool hwlat_busy; | ||
| 572 | |||
| 573 | static int hwlat_tracer_init(struct trace_array *tr) | ||
| 574 | { | ||
| 575 | /* Only allow one instance to enable this */ | ||
| 576 | if (hwlat_busy) | ||
| 577 | return -EBUSY; | ||
| 578 | |||
| 579 | hwlat_trace = tr; | ||
| 580 | |||
| 581 | disable_migrate = false; | ||
| 582 | hwlat_data.count = 0; | ||
| 583 | tr->max_latency = 0; | ||
| 584 | save_tracing_thresh = tracing_thresh; | ||
| 585 | |||
| 586 | /* tracing_thresh is in nsecs, we speak in usecs */ | ||
| 587 | if (!tracing_thresh) | ||
| 588 | tracing_thresh = last_tracing_thresh; | ||
| 589 | |||
| 590 | if (tracer_tracing_is_on(tr)) | ||
| 591 | hwlat_tracer_start(tr); | ||
| 592 | |||
| 593 | hwlat_busy = true; | ||
| 594 | |||
| 595 | return 0; | ||
| 596 | } | ||
| 597 | |||
| 598 | static void hwlat_tracer_reset(struct trace_array *tr) | ||
| 599 | { | ||
| 600 | stop_kthread(); | ||
| 601 | |||
| 602 | /* the tracing threshold is static between runs */ | ||
| 603 | last_tracing_thresh = tracing_thresh; | ||
| 604 | |||
| 605 | tracing_thresh = save_tracing_thresh; | ||
| 606 | hwlat_busy = false; | ||
| 607 | } | ||
| 608 | |||
| 609 | static struct tracer hwlat_tracer __read_mostly = | ||
| 610 | { | ||
| 611 | .name = "hwlat", | ||
| 612 | .init = hwlat_tracer_init, | ||
| 613 | .reset = hwlat_tracer_reset, | ||
| 614 | .start = hwlat_tracer_start, | ||
| 615 | .stop = hwlat_tracer_stop, | ||
| 616 | .allow_instances = true, | ||
| 617 | }; | ||
| 618 | |||
| 619 | __init static int init_hwlat_tracer(void) | ||
| 620 | { | ||
| 621 | int ret; | ||
| 622 | |||
| 623 | mutex_init(&hwlat_data.lock); | ||
| 624 | |||
| 625 | ret = register_tracer(&hwlat_tracer); | ||
| 626 | if (ret) | ||
| 627 | return ret; | ||
| 628 | |||
| 629 | init_tracefs(); | ||
| 630 | |||
| 631 | return 0; | ||
| 632 | } | ||
| 633 | late_initcall(init_hwlat_tracer); | ||
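Taken together, the new hwlat tracer is driven entirely through tracefs: init_tracefs() creates hwlat_detector/width and hwlat_detector/window (both in microseconds), and selecting "hwlat" as the current tracer starts the sampling kthread (provided the ring buffer is on, per hwlat_tracer_init()). A hedged user-space sketch; the tracefs mount point is an assumption, and leaving tracing_thresh at zero lets the tracer fall back to its 10us default:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

#define TRACEFS "/sys/kernel/debug/tracing/"

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* Defaults shown explicitly: 0.5s of sampling per 1s window. */
	write_str(TRACEFS "hwlat_detector/width", "500000\n");
	write_str(TRACEFS "hwlat_detector/window", "1000000\n");
	write_str(TRACEFS "current_tracer", "hwlat\n");
	return 0;
}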
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9aedb0b06683..eb6c9f1d3a93 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -253,6 +253,10 @@ static const struct fetch_type kprobes_fetch_type_table[] = { | |||
| 253 | ASSIGN_FETCH_TYPE(s16, u16, 1), | 253 | ASSIGN_FETCH_TYPE(s16, u16, 1), |
| 254 | ASSIGN_FETCH_TYPE(s32, u32, 1), | 254 | ASSIGN_FETCH_TYPE(s32, u32, 1), |
| 255 | ASSIGN_FETCH_TYPE(s64, u64, 1), | 255 | ASSIGN_FETCH_TYPE(s64, u64, 1), |
| 256 | ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), | ||
| 257 | ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), | ||
| 258 | ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), | ||
| 259 | ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), | ||
| 256 | 260 | ||
| 257 | ASSIGN_FETCH_TYPE_END | 261 | ASSIGN_FETCH_TYPE_END |
| 258 | }; | 262 | }; |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0bb9cf2d53e6..3fc20422c166 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = { | |||
| 1098 | .funcs = &trace_user_stack_funcs, | 1098 | .funcs = &trace_user_stack_funcs, |
| 1099 | }; | 1099 | }; |
| 1100 | 1100 | ||
| 1101 | /* TRACE_HWLAT */ | ||
| 1102 | static enum print_line_t | ||
| 1103 | trace_hwlat_print(struct trace_iterator *iter, int flags, | ||
| 1104 | struct trace_event *event) | ||
| 1105 | { | ||
| 1106 | struct trace_entry *entry = iter->ent; | ||
| 1107 | struct trace_seq *s = &iter->seq; | ||
| 1108 | struct hwlat_entry *field; | ||
| 1109 | |||
| 1110 | trace_assign_type(field, entry); | ||
| 1111 | |||
| 1112 | trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", | ||
| 1113 | field->seqnum, | ||
| 1114 | field->duration, | ||
| 1115 | field->outer_duration, | ||
| 1116 | field->timestamp.tv_sec, | ||
| 1117 | field->timestamp.tv_nsec); | ||
| 1118 | |||
| 1119 | if (field->nmi_count) { | ||
| 1120 | /* | ||
| 1121 | * The generic sched_clock() is not NMI safe, thus | ||
| 1122 | * we only record the count and not the time. | ||
| 1123 | */ | ||
| 1124 | if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) | ||
| 1125 | trace_seq_printf(s, " nmi-total:%llu", | ||
| 1126 | field->nmi_total_ts); | ||
| 1127 | trace_seq_printf(s, " nmi-count:%u", | ||
| 1128 | field->nmi_count); | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | trace_seq_putc(s, '\n'); | ||
| 1132 | |||
| 1133 | return trace_handle_return(s); | ||
| 1134 | } | ||
| 1135 | |||
| 1136 | |||
| 1137 | static enum print_line_t | ||
| 1138 | trace_hwlat_raw(struct trace_iterator *iter, int flags, | ||
| 1139 | struct trace_event *event) | ||
| 1140 | { | ||
| 1141 | struct hwlat_entry *field; | ||
| 1142 | struct trace_seq *s = &iter->seq; | ||
| 1143 | |||
| 1144 | trace_assign_type(field, iter->ent); | ||
| 1145 | |||
| 1146 | trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", | ||
| 1147 | field->duration, | ||
| 1148 | field->outer_duration, | ||
| 1149 | field->timestamp.tv_sec, | ||
| 1150 | field->timestamp.tv_nsec, | ||
| 1151 | field->seqnum); | ||
| 1152 | |||
| 1153 | return trace_handle_return(s); | ||
| 1154 | } | ||
| 1155 | |||
| 1156 | static struct trace_event_functions trace_hwlat_funcs = { | ||
| 1157 | .trace = trace_hwlat_print, | ||
| 1158 | .raw = trace_hwlat_raw, | ||
| 1159 | }; | ||
| 1160 | |||
| 1161 | static struct trace_event trace_hwlat_event = { | ||
| 1162 | .type = TRACE_HWLAT, | ||
| 1163 | .funcs = &trace_hwlat_funcs, | ||
| 1164 | }; | ||
| 1165 | |||
| 1101 | /* TRACE_BPUTS */ | 1166 | /* TRACE_BPUTS */ |
| 1102 | static enum print_line_t | 1167 | static enum print_line_t |
| 1103 | trace_bputs_print(struct trace_iterator *iter, int flags, | 1168 | trace_bputs_print(struct trace_iterator *iter, int flags, |
| @@ -1233,6 +1298,7 @@ static struct trace_event *events[] __initdata = { | |||
| 1233 | &trace_bputs_event, | 1298 | &trace_bputs_event, |
| 1234 | &trace_bprint_event, | 1299 | &trace_bprint_event, |
| 1235 | &trace_print_event, | 1300 | &trace_print_event, |
| 1301 | &trace_hwlat_event, | ||
| 1236 | NULL | 1302 | NULL |
| 1237 | }; | 1303 | }; |
| 1238 | 1304 | ||
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 74e80a582c28..8c0553d9afd3 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
| @@ -36,24 +36,28 @@ const char *reserved_field_names[] = { | |||
| 36 | }; | 36 | }; |
| 37 | 37 | ||
| 38 | /* Printing in basic type function template */ | 38 | /* Printing in basic type function template */ |
| 39 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ | 39 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \ |
| 40 | int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ | 40 | int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \ |
| 41 | void *data, void *ent) \ | 41 | void *data, void *ent) \ |
| 42 | { \ | 42 | { \ |
| 43 | trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ | 43 | trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ |
| 44 | return !trace_seq_has_overflowed(s); \ | 44 | return !trace_seq_has_overflowed(s); \ |
| 45 | } \ | 45 | } \ |
| 46 | const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ | 46 | const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ |
| 47 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); | 47 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); |
| 48 | 48 | ||
| 49 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") | 49 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u") |
| 50 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") | 50 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") |
| 51 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") | 51 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u") |
| 52 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") | 52 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu") |
| 53 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") | 53 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, s8, "%d") |
| 54 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") | 54 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d") |
| 55 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") | 55 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d") |
| 56 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") | 56 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld") |
| 57 | DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x") | ||
| 58 | DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") | ||
| 59 | DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") | ||
| 60 | DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") | ||
| 57 | 61 | ||
| 58 | /* Print type function for string type */ | 62 | /* Print type function for string type */ |
| 59 | int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, | 63 | int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, |
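
The extra "tname" parameter lets the x-types reuse the unsigned fetch machinery while printing a different format: u32 now prints decimal and x32 keeps the old hex output. A simplified userspace rendering of the macro, with printf standing in for trace_seq_printf:

    /* Simplified stand-in for DEFINE_BASIC_PRINT_TYPE_FUNC: tname names the
     * function, type is the C type actually read, fmt is the output format. */
    #include <stdio.h>
    #include <stdint.h>

    typedef uint32_t u32;

    #define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt)              \
    static void print_type_##tname(const char *name, void *data)        \
    {                                                                    \
            printf(" %s=" fmt, name, *(type *)data);                     \
    }

    DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u")    /* decimal after this patch */
    DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")  /* hex alias of u32 */

    int main(void)
    {
            u32 v = 48879;

            print_type_u32("val", &v);   /* " val=48879" */
            print_type_x32("val", &v);   /* " val=0xbeef" */
            printf("\n");
            return 0;
    }
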
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 45400ca5ded1..0c0ae54d44c6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
| @@ -149,6 +149,11 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(s8); | |||
| 149 | DECLARE_BASIC_PRINT_TYPE_FUNC(s16); | 149 | DECLARE_BASIC_PRINT_TYPE_FUNC(s16); |
| 150 | DECLARE_BASIC_PRINT_TYPE_FUNC(s32); | 150 | DECLARE_BASIC_PRINT_TYPE_FUNC(s32); |
| 151 | DECLARE_BASIC_PRINT_TYPE_FUNC(s64); | 151 | DECLARE_BASIC_PRINT_TYPE_FUNC(s64); |
| 152 | DECLARE_BASIC_PRINT_TYPE_FUNC(x8); | ||
| 153 | DECLARE_BASIC_PRINT_TYPE_FUNC(x16); | ||
| 154 | DECLARE_BASIC_PRINT_TYPE_FUNC(x32); | ||
| 155 | DECLARE_BASIC_PRINT_TYPE_FUNC(x64); | ||
| 156 | |||
| 152 | DECLARE_BASIC_PRINT_TYPE_FUNC(string); | 157 | DECLARE_BASIC_PRINT_TYPE_FUNC(string); |
| 153 | 158 | ||
| 154 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type | 159 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type |
| @@ -203,7 +208,7 @@ DEFINE_FETCH_##method(u32) \ | |||
| 203 | DEFINE_FETCH_##method(u64) | 208 | DEFINE_FETCH_##method(u64) |
| 204 | 209 | ||
| 205 | /* Default (unsigned long) fetch type */ | 210 | /* Default (unsigned long) fetch type */ |
| 206 | #define __DEFAULT_FETCH_TYPE(t) u##t | 211 | #define __DEFAULT_FETCH_TYPE(t) x##t |
| 207 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | 212 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) |
| 208 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | 213 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) |
| 209 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | 214 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) |
| @@ -234,6 +239,10 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ | |||
| 234 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | 239 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ |
| 235 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | 240 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) |
| 236 | 241 | ||
| 242 | /* If ptype is an alias of atype, use this macro (show atype in format) */ | ||
| 243 | #define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \ | ||
| 244 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype) | ||
| 245 | |||
| 237 | #define ASSIGN_FETCH_TYPE_END {} | 246 | #define ASSIGN_FETCH_TYPE_END {} |
| 238 | 247 | ||
| 239 | #define FETCH_TYPE_STRING 0 | 248 | #define FETCH_TYPE_STRING 0 |
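
Switching __DEFAULT_FETCH_TYPE from u##t to x##t means an argument with no explicit type annotation now defaults to the hexadecimal alias: on a 64-bit kernel DEFAULT_FETCH_TYPE_STR becomes "x64" instead of "u64". A small sketch of the token pasting and stringification, with BITS_PER_LONG hard-coded as an assumption:

    /* Sketch of how DEFAULT_FETCH_TYPE_STR is built; BITS_PER_LONG assumed 64. */
    #include <stdio.h>

    #define BITS_PER_LONG 64

    #define __stringify_1(x) #x
    #define __stringify(x)   __stringify_1(x)

    #define __DEFAULT_FETCH_TYPE(t) x##t           /* was u##t before this patch */
    #define _DEFAULT_FETCH_TYPE(t)  __DEFAULT_FETCH_TYPE(t)
    #define DEFAULT_FETCH_TYPE      _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
    #define DEFAULT_FETCH_TYPE_STR  __stringify(DEFAULT_FETCH_TYPE)

    int main(void)
    {
            printf("%s\n", DEFAULT_FETCH_TYPE_STR);  /* prints "x64" */
            return 0;
    }
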
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b2b6efc083a4..5e10395da88e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -610,8 +610,7 @@ static int perf_sysenter_enable(struct trace_event_call *call) | |||
| 610 | if (!sys_perf_refcount_enter) | 610 | if (!sys_perf_refcount_enter) |
| 611 | ret = register_trace_sys_enter(perf_syscall_enter, NULL); | 611 | ret = register_trace_sys_enter(perf_syscall_enter, NULL); |
| 612 | if (ret) { | 612 | if (ret) { |
| 613 | pr_info("event trace: Could not activate" | 613 | pr_info("event trace: Could not activate syscall entry trace point"); |
| 614 | "syscall entry trace point"); | ||
| 615 | } else { | 614 | } else { |
| 616 | set_bit(num, enabled_perf_enter_syscalls); | 615 | set_bit(num, enabled_perf_enter_syscalls); |
| 617 | sys_perf_refcount_enter++; | 616 | sys_perf_refcount_enter++; |
| @@ -682,8 +681,7 @@ static int perf_sysexit_enable(struct trace_event_call *call) | |||
| 682 | if (!sys_perf_refcount_exit) | 681 | if (!sys_perf_refcount_exit) |
| 683 | ret = register_trace_sys_exit(perf_syscall_exit, NULL); | 682 | ret = register_trace_sys_exit(perf_syscall_exit, NULL); |
| 684 | if (ret) { | 683 | if (ret) { |
| 685 | pr_info("event trace: Could not activate" | 684 | pr_info("event trace: Could not activate syscall exit trace point"); |
| 686 | "syscall exit trace point"); | ||
| 687 | } else { | 685 | } else { |
| 688 | set_bit(num, enabled_perf_exit_syscalls); | 686 | set_bit(num, enabled_perf_exit_syscalls); |
| 689 | sys_perf_refcount_exit++; | 687 | sys_perf_refcount_exit++; |
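
The two pr_info() fixes above address a classic adjacent-string-literal bug: splitting the message across two literals without a trailing space glues the words together. A tiny illustration:

    /* Adjacent string literals concatenate verbatim; without a trailing space
     * the original message read "...activatesyscall entry trace point". */
    #include <stdio.h>

    int main(void)
    {
            printf("event trace: Could not activate"
                   "syscall entry trace point\n");   /* ...activatesyscall... */
            printf("event trace: Could not activate syscall entry trace point\n");
            return 0;
    }
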
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c53485441c88..0913693caf6e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -211,6 +211,10 @@ static const struct fetch_type uprobes_fetch_type_table[] = { | |||
| 211 | ASSIGN_FETCH_TYPE(s16, u16, 1), | 211 | ASSIGN_FETCH_TYPE(s16, u16, 1), |
| 212 | ASSIGN_FETCH_TYPE(s32, u32, 1), | 212 | ASSIGN_FETCH_TYPE(s32, u32, 1), |
| 213 | ASSIGN_FETCH_TYPE(s64, u64, 1), | 213 | ASSIGN_FETCH_TYPE(s64, u64, 1), |
| 214 | ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), | ||
| 215 | ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), | ||
| 216 | ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), | ||
| 217 | ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), | ||
| 214 | 218 | ||
| 215 | ASSIGN_FETCH_TYPE_END | 219 | ASSIGN_FETCH_TYPE_END |
| 216 | }; | 220 | }; |
| @@ -427,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv) | |||
| 427 | pr_info("Probe point is not specified.\n"); | 431 | pr_info("Probe point is not specified.\n"); |
| 428 | return -EINVAL; | 432 | return -EINVAL; |
| 429 | } | 433 | } |
| 430 | if (isdigit(argv[1][0])) { | ||
| 431 | pr_info("probe point must be have a filename.\n"); | ||
| 432 | return -EINVAL; | ||
| 433 | } | ||
| 434 | arg = strchr(argv[1], ':'); | 434 | arg = strchr(argv[1], ':'); |
| 435 | if (!arg) { | 435 | if (!arg) { |
| 436 | ret = -EINVAL; | 436 | ret = -EINVAL; |
diff --git a/kernel/ucount.c b/kernel/ucount.c new file mode 100644 index 000000000000..9d20d5dd298a --- /dev/null +++ b/kernel/ucount.c | |||
| @@ -0,0 +1,235 @@ | |||
| 1 | /* | ||
| 2 | * This program is free software; you can redistribute it and/or | ||
| 3 | * modify it under the terms of the GNU General Public License as | ||
| 4 | * published by the Free Software Foundation, version 2 of the | ||
| 5 | * License. | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/stat.h> | ||
| 9 | #include <linux/sysctl.h> | ||
| 10 | #include <linux/slab.h> | ||
| 11 | #include <linux/hash.h> | ||
| 12 | #include <linux/user_namespace.h> | ||
| 13 | |||
| 14 | #define UCOUNTS_HASHTABLE_BITS 10 | ||
| 15 | static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; | ||
| 16 | static DEFINE_SPINLOCK(ucounts_lock); | ||
| 17 | |||
| 18 | #define ucounts_hashfn(ns, uid) \ | ||
| 19 | hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \ | ||
| 20 | UCOUNTS_HASHTABLE_BITS) | ||
| 21 | #define ucounts_hashentry(ns, uid) \ | ||
| 22 | (ucounts_hashtable + ucounts_hashfn(ns, uid)) | ||
| 23 | |||
| 24 | |||
| 25 | #ifdef CONFIG_SYSCTL | ||
| 26 | static struct ctl_table_set * | ||
| 27 | set_lookup(struct ctl_table_root *root) | ||
| 28 | { | ||
| 29 | return ¤t_user_ns()->set; | ||
| 30 | } | ||
| 31 | |||
| 32 | static int set_is_seen(struct ctl_table_set *set) | ||
| 33 | { | ||
| 34 | return ¤t_user_ns()->set == set; | ||
| 35 | } | ||
| 36 | |||
| 37 | static int set_permissions(struct ctl_table_header *head, | ||
| 38 | struct ctl_table *table) | ||
| 39 | { | ||
| 40 | struct user_namespace *user_ns = | ||
| 41 | container_of(head->set, struct user_namespace, set); | ||
| 42 | int mode; | ||
| 43 | |||
| 44 | /* Allow users with CAP_SYS_RESOURCE unrestrained access */ | ||
| 45 | if (ns_capable(user_ns, CAP_SYS_RESOURCE)) | ||
| 46 | mode = (table->mode & S_IRWXU) >> 6; | ||
| 47 | else | ||
| 48 | /* Allow all others at most read-only access */ | ||
| 49 | mode = table->mode & S_IROTH; | ||
| 50 | return (mode << 6) | (mode << 3) | mode; | ||
| 51 | } | ||
| 52 | |||
| 53 | static struct ctl_table_root set_root = { | ||
| 54 | .lookup = set_lookup, | ||
| 55 | .permissions = set_permissions, | ||
| 56 | }; | ||
| 57 | |||
| 58 | static int zero = 0; | ||
| 59 | static int int_max = INT_MAX; | ||
| 60 | #define UCOUNT_ENTRY(name) \ | ||
| 61 | { \ | ||
| 62 | .procname = name, \ | ||
| 63 | .maxlen = sizeof(int), \ | ||
| 64 | .mode = 0644, \ | ||
| 65 | .proc_handler = proc_dointvec_minmax, \ | ||
| 66 | .extra1 = &zero, \ | ||
| 67 | .extra2 = &int_max, \ | ||
| 68 | } | ||
| 69 | static struct ctl_table user_table[] = { | ||
| 70 | UCOUNT_ENTRY("max_user_namespaces"), | ||
| 71 | UCOUNT_ENTRY("max_pid_namespaces"), | ||
| 72 | UCOUNT_ENTRY("max_uts_namespaces"), | ||
| 73 | UCOUNT_ENTRY("max_ipc_namespaces"), | ||
| 74 | UCOUNT_ENTRY("max_net_namespaces"), | ||
| 75 | UCOUNT_ENTRY("max_mnt_namespaces"), | ||
| 76 | UCOUNT_ENTRY("max_cgroup_namespaces"), | ||
| 77 | { } | ||
| 78 | }; | ||
| 79 | #endif /* CONFIG_SYSCTL */ | ||
| 80 | |||
| 81 | bool setup_userns_sysctls(struct user_namespace *ns) | ||
| 82 | { | ||
| 83 | #ifdef CONFIG_SYSCTL | ||
| 84 | struct ctl_table *tbl; | ||
| 85 | setup_sysctl_set(&ns->set, &set_root, set_is_seen); | ||
| 86 | tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); | ||
| 87 | if (tbl) { | ||
| 88 | int i; | ||
| 89 | for (i = 0; i < UCOUNT_COUNTS; i++) { | ||
| 90 | tbl[i].data = &ns->ucount_max[i]; | ||
| 91 | } | ||
| 92 | ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl); | ||
| 93 | } | ||
| 94 | if (!ns->sysctls) { | ||
| 95 | kfree(tbl); | ||
| 96 | retire_sysctl_set(&ns->set); | ||
| 97 | return false; | ||
| 98 | } | ||
| 99 | #endif | ||
| 100 | return true; | ||
| 101 | } | ||
| 102 | |||
| 103 | void retire_userns_sysctls(struct user_namespace *ns) | ||
| 104 | { | ||
| 105 | #ifdef CONFIG_SYSCTL | ||
| 106 | struct ctl_table *tbl; | ||
| 107 | |||
| 108 | tbl = ns->sysctls->ctl_table_arg; | ||
| 109 | unregister_sysctl_table(ns->sysctls); | ||
| 110 | retire_sysctl_set(&ns->set); | ||
| 111 | kfree(tbl); | ||
| 112 | #endif | ||
| 113 | } | ||
| 114 | |||
| 115 | static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) | ||
| 116 | { | ||
| 117 | struct ucounts *ucounts; | ||
| 118 | |||
| 119 | hlist_for_each_entry(ucounts, hashent, node) { | ||
| 120 | if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) | ||
| 121 | return ucounts; | ||
| 122 | } | ||
| 123 | return NULL; | ||
| 124 | } | ||
| 125 | |||
| 126 | static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) | ||
| 127 | { | ||
| 128 | struct hlist_head *hashent = ucounts_hashentry(ns, uid); | ||
| 129 | struct ucounts *ucounts, *new; | ||
| 130 | |||
| 131 | spin_lock(&ucounts_lock); | ||
| 132 | ucounts = find_ucounts(ns, uid, hashent); | ||
| 133 | if (!ucounts) { | ||
| 134 | spin_unlock(&ucounts_lock); | ||
| 135 | |||
| 136 | new = kzalloc(sizeof(*new), GFP_KERNEL); | ||
| 137 | if (!new) | ||
| 138 | return NULL; | ||
| 139 | |||
| 140 | new->ns = ns; | ||
| 141 | new->uid = uid; | ||
| 142 | atomic_set(&new->count, 0); | ||
| 143 | |||
| 144 | spin_lock(&ucounts_lock); | ||
| 145 | ucounts = find_ucounts(ns, uid, hashent); | ||
| 146 | if (ucounts) { | ||
| 147 | kfree(new); | ||
| 148 | } else { | ||
| 149 | hlist_add_head(&new->node, hashent); | ||
| 150 | ucounts = new; | ||
| 151 | } | ||
| 152 | } | ||
| 153 | if (!atomic_add_unless(&ucounts->count, 1, INT_MAX)) | ||
| 154 | ucounts = NULL; | ||
| 155 | spin_unlock(&ucounts_lock); | ||
| 156 | return ucounts; | ||
| 157 | } | ||
| 158 | |||
| 159 | static void put_ucounts(struct ucounts *ucounts) | ||
| 160 | { | ||
| 161 | if (atomic_dec_and_test(&ucounts->count)) { | ||
| 162 | spin_lock(&ucounts_lock); | ||
| 163 | hlist_del_init(&ucounts->node); | ||
| 164 | spin_unlock(&ucounts_lock); | ||
| 165 | |||
| 166 | kfree(ucounts); | ||
| 167 | } | ||
| 168 | } | ||
| 169 | |||
| 170 | static inline bool atomic_inc_below(atomic_t *v, int u) | ||
| 171 | { | ||
| 172 | int c, old; | ||
| 173 | c = atomic_read(v); | ||
| 174 | for (;;) { | ||
| 175 | if (unlikely(c >= u)) | ||
| 176 | return false; | ||
| 177 | old = atomic_cmpxchg(v, c, c+1); | ||
| 178 | if (likely(old == c)) | ||
| 179 | return true; | ||
| 180 | c = old; | ||
| 181 | } | ||
| 182 | } | ||
| 183 | |||
| 184 | struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, | ||
| 185 | enum ucount_type type) | ||
| 186 | { | ||
| 187 | struct ucounts *ucounts, *iter, *bad; | ||
| 188 | struct user_namespace *tns; | ||
| 189 | ucounts = get_ucounts(ns, uid); | ||
| 190 | for (iter = ucounts; iter; iter = tns->ucounts) { | ||
| 191 | int max; | ||
| 192 | tns = iter->ns; | ||
| 193 | max = READ_ONCE(tns->ucount_max[type]); | ||
| 194 | if (!atomic_inc_below(&iter->ucount[type], max)) | ||
| 195 | goto fail; | ||
| 196 | } | ||
| 197 | return ucounts; | ||
| 198 | fail: | ||
| 199 | bad = iter; | ||
| 200 | for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) | ||
| 201 | atomic_dec(&iter->ucount[type]); | ||
| 202 | |||
| 203 | put_ucounts(ucounts); | ||
| 204 | return NULL; | ||
| 205 | } | ||
| 206 | |||
| 207 | void dec_ucount(struct ucounts *ucounts, enum ucount_type type) | ||
| 208 | { | ||
| 209 | struct ucounts *iter; | ||
| 210 | for (iter = ucounts; iter; iter = iter->ns->ucounts) { | ||
| 211 | int dec = atomic_dec_if_positive(&iter->ucount[type]); | ||
| 212 | WARN_ON_ONCE(dec < 0); | ||
| 213 | } | ||
| 214 | put_ucounts(ucounts); | ||
| 215 | } | ||
| 216 | |||
| 217 | static __init int user_namespace_sysctl_init(void) | ||
| 218 | { | ||
| 219 | #ifdef CONFIG_SYSCTL | ||
| 220 | static struct ctl_table_header *user_header; | ||
| 221 | static struct ctl_table empty[1]; | ||
| 222 | /* | ||
| 223 | * It is necessary to register the user directory in the | ||
| 224 | * default set so that registrations in the child sets work | ||
| 225 | * properly. | ||
| 226 | */ | ||
| 227 | user_header = register_sysctl("user", empty); | ||
| 228 | BUG_ON(!user_header); | ||
| 229 | BUG_ON(!setup_userns_sysctls(&init_user_ns)); | ||
| 230 | #endif | ||
| 231 | return 0; | ||
| 232 | } | ||
| 233 | subsys_initcall(user_namespace_sysctl_init); | ||
| 234 | |||
| 235 | |||
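
The per-namespace limits in ucount.c hinge on atomic_inc_below(): a compare-and-swap loop that increments a counter only while it stays below the maximum, so inc_ucount() can charge a whole hierarchy of ucounts without holding a lock. A userspace sketch of the same loop using C11 atomics; the names mirror the kernel function but the program is illustrative:

    /* Userspace rendition of atomic_inc_below(): increment *v only if the
     * current value is still below the limit u. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool atomic_inc_below(atomic_int *v, int u)
    {
            int c = atomic_load(v);

            for (;;) {
                    if (c >= u)
                            return false;
                    /* On failure, c is reloaded with the current value. */
                    if (atomic_compare_exchange_weak(v, &c, c + 1))
                            return true;
            }
    }

    int main(void)
    {
            atomic_int count = 2;

            printf("%d\n", atomic_inc_below(&count, 3));   /* 1: 2 -> 3 */
            printf("%d\n", atomic_inc_below(&count, 3));   /* 0: limit reached */
            return 0;
    }
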
diff --git a/kernel/uid16.c b/kernel/uid16.c index d58cc4d8f0d1..cc40793464e3 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
| @@ -117,7 +117,7 @@ static int groups16_to_user(old_gid_t __user *grouplist, | |||
| 117 | kgid_t kgid; | 117 | kgid_t kgid; |
| 118 | 118 | ||
| 119 | for (i = 0; i < group_info->ngroups; i++) { | 119 | for (i = 0; i < group_info->ngroups; i++) { |
| 120 | kgid = GROUP_AT(group_info, i); | 120 | kgid = group_info->gid[i]; |
| 121 | group = high2lowgid(from_kgid_munged(user_ns, kgid)); | 121 | group = high2lowgid(from_kgid_munged(user_ns, kgid)); |
| 122 | if (put_user(group, grouplist+i)) | 122 | if (put_user(group, grouplist+i)) |
| 123 | return -EFAULT; | 123 | return -EFAULT; |
| @@ -142,7 +142,7 @@ static int groups16_from_user(struct group_info *group_info, | |||
| 142 | if (!gid_valid(kgid)) | 142 | if (!gid_valid(kgid)) |
| 143 | return -EINVAL; | 143 | return -EINVAL; |
| 144 | 144 | ||
| 145 | GROUP_AT(group_info, i) = kgid; | 145 | group_info->gid[i] = kgid; |
| 146 | } | 146 | } |
| 147 | 147 | ||
| 148 | return 0; | 148 | return 0; |
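
The GROUP_AT() accessor disappears here because, elsewhere in this series, group_info stores its GIDs in a single flexible array instead of two-level indirect blocks, so callers index ->gid[] directly. A reduced userspace model of the new layout; kgid_t is a stand-in typedef and the struct is an assumption based on that change:

    /* Reduced model of the post-patch struct group_info: one allocation,
     * direct indexing, no GROUP_AT() macro. */
    #include <stdio.h>
    #include <stdlib.h>

    typedef unsigned int kgid_t;

    struct group_info {
            int ngroups;
            kgid_t gid[];           /* flexible array member */
    };

    int main(void)
    {
            int i, n = 3;
            struct group_info *gi = malloc(sizeof(*gi) + n * sizeof(gi->gid[0]));

            if (!gi)
                    return 1;
            gi->ngroups = n;
            for (i = 0; i < n; i++)
                    gi->gid[i] = 1000 + i;      /* was GROUP_AT(gi, i) = ... */
            for (i = 0; i < n; i++)
                    printf("%u\n", gi->gid[i]);
            free(gi);
            return 0;
    }
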
diff --git a/kernel/up.c b/kernel/up.c index 1760bf3d1463..ee81ac9af4ca 100644 --- a/kernel/up.c +++ b/kernel/up.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
| 7 | #include <linux/export.h> | 7 | #include <linux/export.h> |
| 8 | #include <linux/smp.h> | 8 | #include <linux/smp.h> |
| 9 | #include <linux/hypervisor.h> | ||
| 9 | 10 | ||
| 10 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, | 11 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, |
| 11 | int wait) | 12 | int wait) |
| @@ -82,3 +83,20 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
| 82 | preempt_enable(); | 83 | preempt_enable(); |
| 83 | } | 84 | } |
| 84 | EXPORT_SYMBOL(on_each_cpu_cond); | 85 | EXPORT_SYMBOL(on_each_cpu_cond); |
| 86 | |||
| 87 | int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) | ||
| 88 | { | ||
| 89 | int ret; | ||
| 90 | |||
| 91 | if (cpu != 0) | ||
| 92 | return -ENXIO; | ||
| 93 | |||
| 94 | if (phys) | ||
| 95 | hypervisor_pin_vcpu(0); | ||
| 96 | ret = func(par); | ||
| 97 | if (phys) | ||
| 98 | hypervisor_pin_vcpu(-1); | ||
| 99 | |||
| 100 | return ret; | ||
| 101 | } | ||
| 102 | EXPORT_SYMBOL_GPL(smp_call_on_cpu); | ||
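
The UP stub gives smp_call_on_cpu() the same contract as the SMP version: run func(par) on the requested CPU (only CPU 0 exists here) and, when phys is true, pin the vCPU to its physical CPU around the call. A hedged kernel-style usage sketch; the callback and its payload are invented for illustration:

    /* Hypothetical caller: run a sampling function on CPU 0 without
     * physical pinning.  read_counter() is made up. */
    #include <linux/jiffies.h>
    #include <linux/smp.h>

    static int read_counter(void *arg)
    {
            unsigned long *out = arg;

            *out = jiffies;                 /* placeholder work done on CPU 0 */
            return 0;
    }

    static int sample_on_cpu0(unsigned long *out)
    {
            /* phys=false: no hypervisor_pin_vcpu() round trip */
            return smp_call_on_cpu(0, read_counter, out, false);
    }
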
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 68f594212759..86b7854fec8e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex); | |||
| 29 | static bool new_idmap_permitted(const struct file *file, | 29 | static bool new_idmap_permitted(const struct file *file, |
| 30 | struct user_namespace *ns, int cap_setid, | 30 | struct user_namespace *ns, int cap_setid, |
| 31 | struct uid_gid_map *map); | 31 | struct uid_gid_map *map); |
| 32 | static void free_user_ns(struct work_struct *work); | ||
| 33 | |||
| 34 | static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) | ||
| 35 | { | ||
| 36 | return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); | ||
| 37 | } | ||
| 38 | |||
| 39 | static void dec_user_namespaces(struct ucounts *ucounts) | ||
| 40 | { | ||
| 41 | return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); | ||
| 42 | } | ||
| 32 | 43 | ||
| 33 | static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) | 44 | static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) |
| 34 | { | 45 | { |
| @@ -62,10 +73,16 @@ int create_user_ns(struct cred *new) | |||
| 62 | struct user_namespace *ns, *parent_ns = new->user_ns; | 73 | struct user_namespace *ns, *parent_ns = new->user_ns; |
| 63 | kuid_t owner = new->euid; | 74 | kuid_t owner = new->euid; |
| 64 | kgid_t group = new->egid; | 75 | kgid_t group = new->egid; |
| 65 | int ret; | 76 | struct ucounts *ucounts; |
| 77 | int ret, i; | ||
| 66 | 78 | ||
| 79 | ret = -ENOSPC; | ||
| 67 | if (parent_ns->level > 32) | 80 | if (parent_ns->level > 32) |
| 68 | return -EUSERS; | 81 | goto fail; |
| 82 | |||
| 83 | ucounts = inc_user_namespaces(parent_ns, owner); | ||
| 84 | if (!ucounts) | ||
| 85 | goto fail; | ||
| 69 | 86 | ||
| 70 | /* | 87 | /* |
| 71 | * Verify that we can not violate the policy of which files | 88 | * Verify that we can not violate the policy of which files |
| @@ -73,26 +90,27 @@ int create_user_ns(struct cred *new) | |||
| 73 | * by verifing that the root directory is at the root of the | 90 | * by verifing that the root directory is at the root of the |
| 74 | * mount namespace which allows all files to be accessed. | 91 | * mount namespace which allows all files to be accessed. |
| 75 | */ | 92 | */ |
| 93 | ret = -EPERM; | ||
| 76 | if (current_chrooted()) | 94 | if (current_chrooted()) |
| 77 | return -EPERM; | 95 | goto fail_dec; |
| 78 | 96 | ||
| 79 | /* The creator needs a mapping in the parent user namespace | 97 | /* The creator needs a mapping in the parent user namespace |
| 80 | * or else we won't be able to reasonably tell userspace who | 98 | * or else we won't be able to reasonably tell userspace who |
| 81 | * created a user_namespace. | 99 | * created a user_namespace. |
| 82 | */ | 100 | */ |
| 101 | ret = -EPERM; | ||
| 83 | if (!kuid_has_mapping(parent_ns, owner) || | 102 | if (!kuid_has_mapping(parent_ns, owner) || |
| 84 | !kgid_has_mapping(parent_ns, group)) | 103 | !kgid_has_mapping(parent_ns, group)) |
| 85 | return -EPERM; | 104 | goto fail_dec; |
| 86 | 105 | ||
| 106 | ret = -ENOMEM; | ||
| 87 | ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); | 107 | ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); |
| 88 | if (!ns) | 108 | if (!ns) |
| 89 | return -ENOMEM; | 109 | goto fail_dec; |
| 90 | 110 | ||
| 91 | ret = ns_alloc_inum(&ns->ns); | 111 | ret = ns_alloc_inum(&ns->ns); |
| 92 | if (ret) { | 112 | if (ret) |
| 93 | kmem_cache_free(user_ns_cachep, ns); | 113 | goto fail_free; |
| 94 | return ret; | ||
| 95 | } | ||
| 96 | ns->ns.ops = &userns_operations; | 114 | ns->ns.ops = &userns_operations; |
| 97 | 115 | ||
| 98 | atomic_set(&ns->count, 1); | 116 | atomic_set(&ns->count, 1); |
| @@ -101,18 +119,37 @@ int create_user_ns(struct cred *new) | |||
| 101 | ns->level = parent_ns->level + 1; | 119 | ns->level = parent_ns->level + 1; |
| 102 | ns->owner = owner; | 120 | ns->owner = owner; |
| 103 | ns->group = group; | 121 | ns->group = group; |
| 122 | INIT_WORK(&ns->work, free_user_ns); | ||
| 123 | for (i = 0; i < UCOUNT_COUNTS; i++) { | ||
| 124 | ns->ucount_max[i] = INT_MAX; | ||
| 125 | } | ||
| 126 | ns->ucounts = ucounts; | ||
| 104 | 127 | ||
| 105 | /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ | 128 | /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ |
| 106 | mutex_lock(&userns_state_mutex); | 129 | mutex_lock(&userns_state_mutex); |
| 107 | ns->flags = parent_ns->flags; | 130 | ns->flags = parent_ns->flags; |
| 108 | mutex_unlock(&userns_state_mutex); | 131 | mutex_unlock(&userns_state_mutex); |
| 109 | 132 | ||
| 110 | set_cred_user_ns(new, ns); | ||
| 111 | |||
| 112 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 133 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
| 113 | init_rwsem(&ns->persistent_keyring_register_sem); | 134 | init_rwsem(&ns->persistent_keyring_register_sem); |
| 114 | #endif | 135 | #endif |
| 136 | ret = -ENOMEM; | ||
| 137 | if (!setup_userns_sysctls(ns)) | ||
| 138 | goto fail_keyring; | ||
| 139 | |||
| 140 | set_cred_user_ns(new, ns); | ||
| 115 | return 0; | 141 | return 0; |
| 142 | fail_keyring: | ||
| 143 | #ifdef CONFIG_PERSISTENT_KEYRINGS | ||
| 144 | key_put(ns->persistent_keyring_register); | ||
| 145 | #endif | ||
| 146 | ns_free_inum(&ns->ns); | ||
| 147 | fail_free: | ||
| 148 | kmem_cache_free(user_ns_cachep, ns); | ||
| 149 | fail_dec: | ||
| 150 | dec_user_namespaces(ucounts); | ||
| 151 | fail: | ||
| 152 | return ret; | ||
| 116 | } | 153 | } |
| 117 | 154 | ||
| 118 | int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) | 155 | int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) |
| @@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) | |||
| 135 | return err; | 172 | return err; |
| 136 | } | 173 | } |
| 137 | 174 | ||
| 138 | void free_user_ns(struct user_namespace *ns) | 175 | static void free_user_ns(struct work_struct *work) |
| 139 | { | 176 | { |
| 140 | struct user_namespace *parent; | 177 | struct user_namespace *parent, *ns = |
| 178 | container_of(work, struct user_namespace, work); | ||
| 141 | 179 | ||
| 142 | do { | 180 | do { |
| 181 | struct ucounts *ucounts = ns->ucounts; | ||
| 143 | parent = ns->parent; | 182 | parent = ns->parent; |
| 183 | retire_userns_sysctls(ns); | ||
| 144 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 184 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
| 145 | key_put(ns->persistent_keyring_register); | 185 | key_put(ns->persistent_keyring_register); |
| 146 | #endif | 186 | #endif |
| 147 | ns_free_inum(&ns->ns); | 187 | ns_free_inum(&ns->ns); |
| 148 | kmem_cache_free(user_ns_cachep, ns); | 188 | kmem_cache_free(user_ns_cachep, ns); |
| 189 | dec_user_namespaces(ucounts); | ||
| 149 | ns = parent; | 190 | ns = parent; |
| 150 | } while (atomic_dec_and_test(&parent->count)); | 191 | } while (atomic_dec_and_test(&parent->count)); |
| 151 | } | 192 | } |
| 152 | EXPORT_SYMBOL(free_user_ns); | 193 | |
| 194 | void __put_user_ns(struct user_namespace *ns) | ||
| 195 | { | ||
| 196 | schedule_work(&ns->work); | ||
| 197 | } | ||
| 198 | EXPORT_SYMBOL(__put_user_ns); | ||
| 153 | 199 | ||
| 154 | static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) | 200 | static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) |
| 155 | { | 201 | { |
| @@ -1004,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) | |||
| 1004 | return commit_creds(cred); | 1050 | return commit_creds(cred); |
| 1005 | } | 1051 | } |
| 1006 | 1052 | ||
| 1053 | struct ns_common *ns_get_owner(struct ns_common *ns) | ||
| 1054 | { | ||
| 1055 | struct user_namespace *my_user_ns = current_user_ns(); | ||
| 1056 | struct user_namespace *owner, *p; | ||
| 1057 | |||
| 1058 | /* See if the owner is in the current user namespace */ | ||
| 1059 | owner = p = ns->ops->owner(ns); | ||
| 1060 | for (;;) { | ||
| 1061 | if (!p) | ||
| 1062 | return ERR_PTR(-EPERM); | ||
| 1063 | if (p == my_user_ns) | ||
| 1064 | break; | ||
| 1065 | p = p->parent; | ||
| 1066 | } | ||
| 1067 | |||
| 1068 | return &get_user_ns(owner)->ns; | ||
| 1069 | } | ||
| 1070 | |||
| 1071 | static struct user_namespace *userns_owner(struct ns_common *ns) | ||
| 1072 | { | ||
| 1073 | return to_user_ns(ns)->parent; | ||
| 1074 | } | ||
| 1075 | |||
| 1007 | const struct proc_ns_operations userns_operations = { | 1076 | const struct proc_ns_operations userns_operations = { |
| 1008 | .name = "user", | 1077 | .name = "user", |
| 1009 | .type = CLONE_NEWUSER, | 1078 | .type = CLONE_NEWUSER, |
| 1010 | .get = userns_get, | 1079 | .get = userns_get, |
| 1011 | .put = userns_put, | 1080 | .put = userns_put, |
| 1012 | .install = userns_install, | 1081 | .install = userns_install, |
| 1082 | .owner = userns_owner, | ||
| 1083 | .get_parent = ns_get_owner, | ||
| 1013 | }; | 1084 | }; |
| 1014 | 1085 | ||
| 1015 | static __init int user_namespaces_init(void) | 1086 | static __init int user_namespaces_init(void) |
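
With the ucount hook in place, create_user_ns() now fails with ENOSPC once the caller's user namespace has exhausted /proc/sys/user/max_user_namespaces (the nesting-depth check was folded into the same error). A userspace sketch that observes the limit; lowering the sysctl needs privilege, so this is illustrative only:

    /* Illustrative: set max_user_namespaces to 0, then watch
     * unshare(CLONE_NEWUSER) fail with ENOSPC. */
    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <sched.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/sys/user/max_user_namespaces", O_WRONLY);

            if (fd >= 0) {
                    write(fd, "0", 1);      /* requires privilege */
                    close(fd);
            }
            if (unshare(CLONE_NEWUSER) < 0)
                    printf("unshare: %s\n", strerror(errno)); /* ENOSPC once limited */
            return 0;
    }
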
diff --git a/kernel/utsname.c b/kernel/utsname.c index 831ea7108232..6976cd47dcf6 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
| @@ -17,6 +17,16 @@ | |||
| 17 | #include <linux/user_namespace.h> | 17 | #include <linux/user_namespace.h> |
| 18 | #include <linux/proc_ns.h> | 18 | #include <linux/proc_ns.h> |
| 19 | 19 | ||
| 20 | static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) | ||
| 21 | { | ||
| 22 | return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); | ||
| 23 | } | ||
| 24 | |||
| 25 | static void dec_uts_namespaces(struct ucounts *ucounts) | ||
| 26 | { | ||
| 27 | dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); | ||
| 28 | } | ||
| 29 | |||
| 20 | static struct uts_namespace *create_uts_ns(void) | 30 | static struct uts_namespace *create_uts_ns(void) |
| 21 | { | 31 | { |
| 22 | struct uts_namespace *uts_ns; | 32 | struct uts_namespace *uts_ns; |
| @@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, | |||
| 36 | struct uts_namespace *old_ns) | 46 | struct uts_namespace *old_ns) |
| 37 | { | 47 | { |
| 38 | struct uts_namespace *ns; | 48 | struct uts_namespace *ns; |
| 49 | struct ucounts *ucounts; | ||
| 39 | int err; | 50 | int err; |
| 40 | 51 | ||
| 52 | err = -ENOSPC; | ||
| 53 | ucounts = inc_uts_namespaces(user_ns); | ||
| 54 | if (!ucounts) | ||
| 55 | goto fail; | ||
| 56 | |||
| 57 | err = -ENOMEM; | ||
| 41 | ns = create_uts_ns(); | 58 | ns = create_uts_ns(); |
| 42 | if (!ns) | 59 | if (!ns) |
| 43 | return ERR_PTR(-ENOMEM); | 60 | goto fail_dec; |
| 44 | 61 | ||
| 45 | err = ns_alloc_inum(&ns->ns); | 62 | err = ns_alloc_inum(&ns->ns); |
| 46 | if (err) { | 63 | if (err) |
| 47 | kfree(ns); | 64 | goto fail_free; |
| 48 | return ERR_PTR(err); | ||
| 49 | } | ||
| 50 | 65 | ||
| 66 | ns->ucounts = ucounts; | ||
| 51 | ns->ns.ops = &utsns_operations; | 67 | ns->ns.ops = &utsns_operations; |
| 52 | 68 | ||
| 53 | down_read(&uts_sem); | 69 | down_read(&uts_sem); |
| @@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, | |||
| 55 | ns->user_ns = get_user_ns(user_ns); | 71 | ns->user_ns = get_user_ns(user_ns); |
| 56 | up_read(&uts_sem); | 72 | up_read(&uts_sem); |
| 57 | return ns; | 73 | return ns; |
| 74 | |||
| 75 | fail_free: | ||
| 76 | kfree(ns); | ||
| 77 | fail_dec: | ||
| 78 | dec_uts_namespaces(ucounts); | ||
| 79 | fail: | ||
| 80 | return ERR_PTR(err); | ||
| 58 | } | 81 | } |
| 59 | 82 | ||
| 60 | /* | 83 | /* |
| @@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref) | |||
| 85 | struct uts_namespace *ns; | 108 | struct uts_namespace *ns; |
| 86 | 109 | ||
| 87 | ns = container_of(kref, struct uts_namespace, kref); | 110 | ns = container_of(kref, struct uts_namespace, kref); |
| 111 | dec_uts_namespaces(ns->ucounts); | ||
| 88 | put_user_ns(ns->user_ns); | 112 | put_user_ns(ns->user_ns); |
| 89 | ns_free_inum(&ns->ns); | 113 | ns_free_inum(&ns->ns); |
| 90 | kfree(ns); | 114 | kfree(ns); |
| @@ -130,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) | |||
| 130 | return 0; | 154 | return 0; |
| 131 | } | 155 | } |
| 132 | 156 | ||
| 157 | static struct user_namespace *utsns_owner(struct ns_common *ns) | ||
| 158 | { | ||
| 159 | return to_uts_ns(ns)->user_ns; | ||
| 160 | } | ||
| 161 | |||
| 133 | const struct proc_ns_operations utsns_operations = { | 162 | const struct proc_ns_operations utsns_operations = { |
| 134 | .name = "uts", | 163 | .name = "uts", |
| 135 | .type = CLONE_NEWUTS, | 164 | .type = CLONE_NEWUTS, |
| 136 | .get = utsns_get, | 165 | .get = utsns_get, |
| 137 | .put = utsns_put, | 166 | .put = utsns_put, |
| 138 | .install = utsns_install, | 167 | .install = utsns_install, |
| 168 | .owner = utsns_owner, | ||
| 139 | }; | 169 | }; |
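
The new .owner callbacks (userns_owner and utsns_owner above) are consumed by the nsfs NS_GET_USERNS ioctl added elsewhere in this series, letting userspace open the user namespace that owns a given namespace file. A minimal sketch, assuming that ioctl is available on the running kernel:

    /* Minimal sketch: ask nsfs for the user namespace owning our UTS namespace.
     * NS_GET_USERNS comes from the nsfs side of this series (not shown here). */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/nsfs.h>

    int main(void)
    {
            int ns = open("/proc/self/ns/uts", O_RDONLY);
            int owner;

            if (ns < 0)
                    return 1;
            owner = ioctl(ns, NS_GET_USERNS);   /* fd of the owning user ns */
            if (owner < 0)
                    perror("NS_GET_USERNS");
            else
                    close(owner);
            close(ns);
            return 0;
    }
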
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef071ca73fc3..479d840db286 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -2974,6 +2974,31 @@ bool flush_delayed_work(struct delayed_work *dwork) | |||
| 2974 | } | 2974 | } |
| 2975 | EXPORT_SYMBOL(flush_delayed_work); | 2975 | EXPORT_SYMBOL(flush_delayed_work); |
| 2976 | 2976 | ||
| 2977 | static bool __cancel_work(struct work_struct *work, bool is_dwork) | ||
| 2978 | { | ||
| 2979 | unsigned long flags; | ||
| 2980 | int ret; | ||
| 2981 | |||
| 2982 | do { | ||
| 2983 | ret = try_to_grab_pending(work, is_dwork, &flags); | ||
| 2984 | } while (unlikely(ret == -EAGAIN)); | ||
| 2985 | |||
| 2986 | if (unlikely(ret < 0)) | ||
| 2987 | return false; | ||
| 2988 | |||
| 2989 | set_work_pool_and_clear_pending(work, get_work_pool_id(work)); | ||
| 2990 | local_irq_restore(flags); | ||
| 2991 | return ret; | ||
| 2992 | } | ||
| 2993 | |||
| 2994 | /* | ||
| 2995 | * See cancel_delayed_work() | ||
| 2996 | */ | ||
| 2997 | bool cancel_work(struct work_struct *work) | ||
| 2998 | { | ||
| 2999 | return __cancel_work(work, false); | ||
| 3000 | } | ||
| 3001 | |||
| 2977 | /** | 3002 | /** |
| 2978 | * cancel_delayed_work - cancel a delayed work | 3003 | * cancel_delayed_work - cancel a delayed work |
| 2979 | * @dwork: delayed_work to cancel | 3004 | * @dwork: delayed_work to cancel |
| @@ -2992,20 +3017,7 @@ EXPORT_SYMBOL(flush_delayed_work); | |||
| 2992 | */ | 3017 | */ |
| 2993 | bool cancel_delayed_work(struct delayed_work *dwork) | 3018 | bool cancel_delayed_work(struct delayed_work *dwork) |
| 2994 | { | 3019 | { |
| 2995 | unsigned long flags; | 3020 | return __cancel_work(&dwork->work, true); |
| 2996 | int ret; | ||
| 2997 | |||
| 2998 | do { | ||
| 2999 | ret = try_to_grab_pending(&dwork->work, true, &flags); | ||
| 3000 | } while (unlikely(ret == -EAGAIN)); | ||
| 3001 | |||
| 3002 | if (unlikely(ret < 0)) | ||
| 3003 | return false; | ||
| 3004 | |||
| 3005 | set_work_pool_and_clear_pending(&dwork->work, | ||
| 3006 | get_work_pool_id(&dwork->work)); | ||
| 3007 | local_irq_restore(flags); | ||
| 3008 | return ret; | ||
| 3009 | } | 3021 | } |
| 3010 | EXPORT_SYMBOL(cancel_delayed_work); | 3022 | EXPORT_SYMBOL(cancel_delayed_work); |
| 3011 | 3023 | ||
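
Factoring the grab-and-clear sequence into __cancel_work() gives plain work items the same non-sleeping cancel that delayed work already had: cancel_work() returns true if it removed a pending item and false if the work was idle, and unlike cancel_work_sync() it may return while the handler is still executing. A hedged usage sketch; the work item and call sites are invented:

    /* Hypothetical user: drop a queued-but-not-yet-running work item from a
     * context that must not sleep; use cancel_work_sync() at teardown. */
    #include <linux/printk.h>
    #include <linux/workqueue.h>

    static void my_handler(struct work_struct *work)
    {
            /* ... */
    }
    static DECLARE_WORK(my_work, my_handler);

    static void my_fast_path(void)
    {
            if (!cancel_work(&my_work))     /* non-blocking; handler may still run */
                    pr_debug("my_work was idle or already running\n");
    }

    static void my_teardown(void)
    {
            cancel_work_sync(&my_work);     /* guarantees the handler has finished */
    }
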
| @@ -4249,7 +4261,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) | |||
| 4249 | * This function is called without any synchronization and @task | 4261 | * This function is called without any synchronization and @task |
| 4250 | * could be in any state. Be careful with dereferences. | 4262 | * could be in any state. Be careful with dereferences. |
| 4251 | */ | 4263 | */ |
| 4252 | worker = probe_kthread_data(task); | 4264 | worker = kthread_probe_data(task); |
| 4253 | 4265 | ||
| 4254 | /* | 4266 | /* |
| 4255 | * Carefully copy the associated workqueue's workfn and name. Keep | 4267 | * Carefully copy the associated workqueue's workfn and name. Keep |
