aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2017-07-30 05:15:13 -0400
committerIngo Molnar <mingo@kernel.org>2017-07-30 05:15:13 -0400
commitf5db340f19f14a8df9dfd22d71fba1513e9f1f7e (patch)
tree131d3345bc987aee3c922624de816492e7f323a4 /kernel
parentee438ec8f33c5af0d4a4ffb935c5b9272e8c2680 (diff)
parent38115f2f8cec8087d558c062e779c443a01f87d6 (diff)
Merge branch 'perf/urgent' into perf/core, to pick up latest fixes and refresh the tree
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit.c62
-rw-r--r--kernel/audit.h29
-rw-r--r--kernel/auditsc.c12
-rw-r--r--kernel/bpf/arraymap.c55
-rw-r--r--kernel/bpf/cgroup.c37
-rw-r--r--kernel/bpf/core.c49
-rw-r--r--kernel/bpf/hashtab.c21
-rw-r--r--kernel/bpf/inode.c16
-rw-r--r--kernel/bpf/map_in_map.c5
-rw-r--r--kernel/bpf/map_in_map.h1
-rw-r--r--kernel/bpf/syscall.c510
-rw-r--r--kernel/bpf/verifier.c297
-rw-r--r--kernel/cgroup/Makefile1
-rw-r--r--kernel/cgroup/cgroup-internal.h2
-rw-r--r--kernel/cgroup/cgroup-v1.c155
-rw-r--r--kernel/cgroup/cgroup.c155
-rw-r--r--kernel/cgroup/cpuset.c33
-rw-r--r--kernel/cgroup/debug.c357
-rw-r--r--kernel/compat.c398
-rw-r--r--kernel/configs/android-base.config11
-rw-r--r--kernel/configs/android-recommended.config5
-rw-r--r--kernel/cpu.c38
-rw-r--r--kernel/crash_core.c44
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/events/core.c52
-rw-r--r--kernel/exit.c323
-rw-r--r--kernel/extable.c5
-rw-r--r--kernel/fork.c36
-rw-r--r--kernel/futex.c44
-rw-r--r--kernel/groups.c35
-rw-r--r--kernel/irq/affinity.c13
-rw-r--r--kernel/irq/chip.c14
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/internals.h14
-rw-r--r--kernel/irq/irqdesc.c3
-rw-r--r--kernel/irq/irqdomain.c19
-rw-r--r--kernel/irq/manage.c111
-rw-r--r--kernel/irq/pm.c2
-rw-r--r--kernel/kallsyms.c10
-rw-r--r--kernel/kcmp.c57
-rw-r--r--kernel/kexec.c8
-rw-r--r--kernel/kexec_core.c39
-rw-r--r--kernel/kexec_file.c29
-rw-r--r--kernel/kexec_internal.h2
-rw-r--r--kernel/kmod.c56
-rw-r--r--kernel/ksysfs.c4
-rw-r--r--kernel/locking/mutex.c6
-rw-r--r--kernel/locking/qrwlock.c1
-rw-r--r--kernel/locking/qspinlock.c1
-rw-r--r--kernel/locking/qspinlock_paravirt.h3
-rw-r--r--kernel/locking/rtmutex.c1
-rw-r--r--kernel/locking/rwsem-spinlock.c4
-rw-r--r--kernel/memremap.c6
-rw-r--r--kernel/module.c102
-rw-r--r--kernel/pid.c7
-rw-r--r--kernel/power/hibernate.c2
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/power/snapshot.c13
-rw-r--r--kernel/power/suspend.c35
-rw-r--r--kernel/printk/internal.h6
-rw-r--r--kernel/printk/printk.c19
-rw-r--r--kernel/printk/printk_safe.c36
-rw-r--r--kernel/sched/cpufreq_schedutil.c5
-rw-r--r--kernel/sched/cputime.c180
-rw-r--r--kernel/sched/deadline.c14
-rw-r--r--kernel/sched/fair.c32
-rw-r--r--kernel/seccomp.c16
-rw-r--r--kernel/signal.c159
-rw-r--r--kernel/sys.c122
-rw-r--r--kernel/sysctl.c335
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/time/alarmtimer.c4
-rw-r--r--kernel/time/hrtimer.c30
-rw-r--r--kernel/time/posix-cpu-timers.c8
-rw-r--r--kernel/time/posix-stubs.c96
-rw-r--r--kernel/time/posix-timers.c127
-rw-r--r--kernel/time/time.c58
-rw-r--r--kernel/trace/Kconfig22
-rw-r--r--kernel/trace/bpf_trace.c66
-rw-r--r--kernel/trace/ftrace.c411
-rw-r--r--kernel/trace/ring_buffer.c10
-rw-r--r--kernel/trace/trace.c470
-rw-r--r--kernel/trace/trace.h36
-rw-r--r--kernel/trace/trace_events.c66
-rw-r--r--kernel/trace/trace_kprobe.c9
-rw-r--r--kernel/trace/trace_output.c27
-rw-r--r--kernel/trace/trace_sched_switch.c72
-rw-r--r--kernel/trace/trace_stack.c6
-rw-r--r--kernel/watchdog.c289
-rw-r--r--kernel/watchdog_hld.c37
92 files changed, 4246 insertions, 1884 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 72aa080f91f0..4cb8e8b23c6e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
82obj-$(CONFIG_KGDB) += debug/ 82obj-$(CONFIG_KGDB) += debug/
83obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 83obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
84obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o 84obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
85obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o 85obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
86obj-$(CONFIG_SECCOMP) += seccomp.o 86obj-$(CONFIG_SECCOMP) += seccomp.o
87obj-$(CONFIG_RELAY) += relay.o 87obj-$(CONFIG_RELAY) += relay.o
88obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 88obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 4b7d49868ce1..6dd556931739 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -575,12 +575,16 @@ static void kauditd_retry_skb(struct sk_buff *skb)
575 575
576/** 576/**
577 * auditd_reset - Disconnect the auditd connection 577 * auditd_reset - Disconnect the auditd connection
578 * @ac: auditd connection state
578 * 579 *
579 * Description: 580 * Description:
580 * Break the auditd/kauditd connection and move all the queued records into the 581 * Break the auditd/kauditd connection and move all the queued records into the
581 * hold queue in case auditd reconnects. 582 * hold queue in case auditd reconnects. It is important to note that the @ac
583 * pointer should never be dereferenced inside this function as it may be NULL
584 * or invalid, you can only compare the memory address! If @ac is NULL then
585 * the connection will always be reset.
582 */ 586 */
583static void auditd_reset(void) 587static void auditd_reset(const struct auditd_connection *ac)
584{ 588{
585 unsigned long flags; 589 unsigned long flags;
586 struct sk_buff *skb; 590 struct sk_buff *skb;
@@ -590,17 +594,21 @@ static void auditd_reset(void)
590 spin_lock_irqsave(&auditd_conn_lock, flags); 594 spin_lock_irqsave(&auditd_conn_lock, flags);
591 ac_old = rcu_dereference_protected(auditd_conn, 595 ac_old = rcu_dereference_protected(auditd_conn,
592 lockdep_is_held(&auditd_conn_lock)); 596 lockdep_is_held(&auditd_conn_lock));
597 if (ac && ac != ac_old) {
598 /* someone already registered a new auditd connection */
599 spin_unlock_irqrestore(&auditd_conn_lock, flags);
600 return;
601 }
593 rcu_assign_pointer(auditd_conn, NULL); 602 rcu_assign_pointer(auditd_conn, NULL);
594 spin_unlock_irqrestore(&auditd_conn_lock, flags); 603 spin_unlock_irqrestore(&auditd_conn_lock, flags);
595 604
596 if (ac_old) 605 if (ac_old)
597 call_rcu(&ac_old->rcu, auditd_conn_free); 606 call_rcu(&ac_old->rcu, auditd_conn_free);
598 607
599 /* flush all of the main and retry queues to the hold queue */ 608 /* flush the retry queue to the hold queue, but don't touch the main
609 * queue since we need to process that normally for multicast */
600 while ((skb = skb_dequeue(&audit_retry_queue))) 610 while ((skb = skb_dequeue(&audit_retry_queue)))
601 kauditd_hold_skb(skb); 611 kauditd_hold_skb(skb);
602 while ((skb = skb_dequeue(&audit_queue)))
603 kauditd_hold_skb(skb);
604} 612}
605 613
606/** 614/**
@@ -633,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
633 ac = rcu_dereference(auditd_conn); 641 ac = rcu_dereference(auditd_conn);
634 if (!ac) { 642 if (!ac) {
635 rcu_read_unlock(); 643 rcu_read_unlock();
644 kfree_skb(skb);
636 rc = -ECONNREFUSED; 645 rc = -ECONNREFUSED;
637 goto err; 646 goto err;
638 } 647 }
@@ -649,8 +658,8 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
649 return rc; 658 return rc;
650 659
651err: 660err:
652 if (rc == -ECONNREFUSED) 661 if (ac && rc == -ECONNREFUSED)
653 auditd_reset(); 662 auditd_reset(ac);
654 return rc; 663 return rc;
655} 664}
656 665
@@ -795,9 +804,9 @@ static int kauditd_thread(void *dummy)
795 rc = kauditd_send_queue(sk, portid, 804 rc = kauditd_send_queue(sk, portid,
796 &audit_hold_queue, UNICAST_RETRIES, 805 &audit_hold_queue, UNICAST_RETRIES,
797 NULL, kauditd_rehold_skb); 806 NULL, kauditd_rehold_skb);
798 if (rc < 0) { 807 if (ac && rc < 0) {
799 sk = NULL; 808 sk = NULL;
800 auditd_reset(); 809 auditd_reset(ac);
801 goto main_queue; 810 goto main_queue;
802 } 811 }
803 812
@@ -805,9 +814,9 @@ static int kauditd_thread(void *dummy)
805 rc = kauditd_send_queue(sk, portid, 814 rc = kauditd_send_queue(sk, portid,
806 &audit_retry_queue, UNICAST_RETRIES, 815 &audit_retry_queue, UNICAST_RETRIES,
807 NULL, kauditd_hold_skb); 816 NULL, kauditd_hold_skb);
808 if (rc < 0) { 817 if (ac && rc < 0) {
809 sk = NULL; 818 sk = NULL;
810 auditd_reset(); 819 auditd_reset(ac);
811 goto main_queue; 820 goto main_queue;
812 } 821 }
813 822
@@ -815,12 +824,13 @@ main_queue:
815 /* process the main queue - do the multicast send and attempt 824 /* process the main queue - do the multicast send and attempt
816 * unicast, dump failed record sends to the retry queue; if 825 * unicast, dump failed record sends to the retry queue; if
817 * sk == NULL due to previous failures we will just do the 826 * sk == NULL due to previous failures we will just do the
818 * multicast send and move the record to the retry queue */ 827 * multicast send and move the record to the hold queue */
819 rc = kauditd_send_queue(sk, portid, &audit_queue, 1, 828 rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
820 kauditd_send_multicast_skb, 829 kauditd_send_multicast_skb,
821 kauditd_retry_skb); 830 (sk ?
822 if (sk == NULL || rc < 0) 831 kauditd_retry_skb : kauditd_hold_skb));
823 auditd_reset(); 832 if (ac && rc < 0)
833 auditd_reset(ac);
824 sk = NULL; 834 sk = NULL;
825 835
826 /* drop our netns reference, no auditd sends past this line */ 836 /* drop our netns reference, no auditd sends past this line */
@@ -1230,7 +1240,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1230 auditd_pid, 1); 1240 auditd_pid, 1);
1231 1241
1232 /* unregister the auditd connection */ 1242 /* unregister the auditd connection */
1233 auditd_reset(); 1243 auditd_reset(NULL);
1234 } 1244 }
1235 } 1245 }
1236 if (s.mask & AUDIT_STATUS_RATE_LIMIT) { 1246 if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
@@ -1999,22 +2009,10 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1999 2009
2000static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) 2010static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
2001{ 2011{
2002 kernel_cap_t *perm = &name->fcap.permitted; 2012 audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
2003 kernel_cap_t *inh = &name->fcap.inheritable; 2013 audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
2004 int log = 0; 2014 audit_log_format(ab, " cap_fe=%d cap_fver=%x",
2005 2015 name->fcap.fE, name->fcap_ver);
2006 if (!cap_isclear(*perm)) {
2007 audit_log_cap(ab, "cap_fp", perm);
2008 log = 1;
2009 }
2010 if (!cap_isclear(*inh)) {
2011 audit_log_cap(ab, "cap_fi", inh);
2012 log = 1;
2013 }
2014
2015 if (log)
2016 audit_log_format(ab, " cap_fe=%d cap_fver=%x",
2017 name->fcap.fE, name->fcap_ver);
2018} 2016}
2019 2017
2020static inline int audit_copy_fcaps(struct audit_names *name, 2018static inline int audit_copy_fcaps(struct audit_names *name,
diff --git a/kernel/audit.h b/kernel/audit.h
index ddfce2ea4891..b331d9b83f63 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -68,6 +68,7 @@ struct audit_cap_data {
68 unsigned int fE; /* effective bit of file cap */ 68 unsigned int fE; /* effective bit of file cap */
69 kernel_cap_t effective; /* effective set of process */ 69 kernel_cap_t effective; /* effective set of process */
70 }; 70 };
71 kernel_cap_t ambient;
71}; 72};
72 73
73/* When fs/namei.c:getname() is called, we store the pointer in name and bump 74/* When fs/namei.c:getname() is called, we store the pointer in name and bump
@@ -247,13 +248,13 @@ struct audit_netlink_list {
247 struct sk_buff_head q; 248 struct sk_buff_head q;
248}; 249};
249 250
250int audit_send_list(void *); 251int audit_send_list(void *_dest);
251 252
252extern int selinux_audit_rule_update(void); 253extern int selinux_audit_rule_update(void);
253 254
254extern struct mutex audit_filter_mutex; 255extern struct mutex audit_filter_mutex;
255extern int audit_del_rule(struct audit_entry *); 256extern int audit_del_rule(struct audit_entry *entry);
256extern void audit_free_rule_rcu(struct rcu_head *); 257extern void audit_free_rule_rcu(struct rcu_head *head);
257extern struct list_head audit_filter_list[]; 258extern struct list_head audit_filter_list[];
258 259
259extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); 260extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
@@ -301,17 +302,17 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
301#endif /* CONFIG_AUDIT_WATCH */ 302#endif /* CONFIG_AUDIT_WATCH */
302 303
303#ifdef CONFIG_AUDIT_TREE 304#ifdef CONFIG_AUDIT_TREE
304extern struct audit_chunk *audit_tree_lookup(const struct inode *); 305extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
305extern void audit_put_chunk(struct audit_chunk *); 306extern void audit_put_chunk(struct audit_chunk *chunk);
306extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *); 307extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
307extern int audit_make_tree(struct audit_krule *, char *, u32); 308extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
308extern int audit_add_tree_rule(struct audit_krule *); 309extern int audit_add_tree_rule(struct audit_krule *rule);
309extern int audit_remove_tree_rule(struct audit_krule *); 310extern int audit_remove_tree_rule(struct audit_krule *rule);
310extern void audit_trim_trees(void); 311extern void audit_trim_trees(void);
311extern int audit_tag_tree(char *old, char *new); 312extern int audit_tag_tree(char *old, char *new);
312extern const char *audit_tree_path(struct audit_tree *); 313extern const char *audit_tree_path(struct audit_tree *tree);
313extern void audit_put_tree(struct audit_tree *); 314extern void audit_put_tree(struct audit_tree *tree);
314extern void audit_kill_trees(struct list_head *); 315extern void audit_kill_trees(struct list_head *list);
315#else 316#else
316#define audit_remove_tree_rule(rule) BUG() 317#define audit_remove_tree_rule(rule) BUG()
317#define audit_add_tree_rule(rule) -EINVAL 318#define audit_add_tree_rule(rule) -EINVAL
@@ -323,7 +324,7 @@ extern void audit_kill_trees(struct list_head *);
323#define audit_kill_trees(list) BUG() 324#define audit_kill_trees(list) BUG()
324#endif 325#endif
325 326
326extern char *audit_unpack_string(void **, size_t *, size_t); 327extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
327 328
328extern pid_t audit_sig_pid; 329extern pid_t audit_sig_pid;
329extern kuid_t audit_sig_uid; 330extern kuid_t audit_sig_uid;
@@ -333,7 +334,7 @@ extern int audit_filter(int msgtype, unsigned int listtype);
333 334
334#ifdef CONFIG_AUDITSYSCALL 335#ifdef CONFIG_AUDITSYSCALL
335extern int audit_signal_info(int sig, struct task_struct *t); 336extern int audit_signal_info(int sig, struct task_struct *t);
336extern void audit_filter_inodes(struct task_struct *, struct audit_context *); 337extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx);
337extern struct list_head *audit_killed_trees(void); 338extern struct list_head *audit_killed_trees(void);
338#else 339#else
339#define audit_signal_info(s,t) AUDIT_DISABLED 340#define audit_signal_info(s,t) AUDIT_DISABLED
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index bb724baa7ac9..3260ba2312a9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1261,6 +1261,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1261 audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); 1261 audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
1262 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); 1262 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1263 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); 1263 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1264 audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient);
1264 break; 1265 break;
1265 case AUDIT_MMAP: 1266 case AUDIT_MMAP:
1266 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, 1267 audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
@@ -1382,9 +1383,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1382 audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); 1383 audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
1383 audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); 1384 audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
1384 audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); 1385 audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
1385 audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); 1386 audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient);
1386 audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); 1387 audit_log_cap(ab, "pp", &axs->new_pcap.permitted);
1387 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); 1388 audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
1389 audit_log_cap(ab, "pe", &axs->new_pcap.effective);
1390 audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
1388 break; } 1391 break; }
1389 1392
1390 } 1393 }
@@ -2342,10 +2345,12 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2342 ax->old_pcap.permitted = old->cap_permitted; 2345 ax->old_pcap.permitted = old->cap_permitted;
2343 ax->old_pcap.inheritable = old->cap_inheritable; 2346 ax->old_pcap.inheritable = old->cap_inheritable;
2344 ax->old_pcap.effective = old->cap_effective; 2347 ax->old_pcap.effective = old->cap_effective;
2348 ax->old_pcap.ambient = old->cap_ambient;
2345 2349
2346 ax->new_pcap.permitted = new->cap_permitted; 2350 ax->new_pcap.permitted = new->cap_permitted;
2347 ax->new_pcap.inheritable = new->cap_inheritable; 2351 ax->new_pcap.inheritable = new->cap_inheritable;
2348 ax->new_pcap.effective = new->cap_effective; 2352 ax->new_pcap.effective = new->cap_effective;
2353 ax->new_pcap.ambient = new->cap_ambient;
2349 return 0; 2354 return 0;
2350} 2355}
2351 2356
@@ -2364,6 +2369,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
2364 context->capset.cap.effective = new->cap_effective; 2369 context->capset.cap.effective = new->cap_effective;
2365 context->capset.cap.inheritable = new->cap_effective; 2370 context->capset.cap.inheritable = new->cap_effective;
2366 context->capset.cap.permitted = new->cap_permitted; 2371 context->capset.cap.permitted = new->cap_permitted;
2372 context->capset.cap.ambient = new->cap_ambient;
2367 context->type = AUDIT_CAPSET; 2373 context->type = AUDIT_CAPSET;
2368} 2374}
2369 2375
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 172dc8ee0e3b..d771a3872500 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -335,6 +335,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
335} 335}
336 336
337/* only called from syscall */ 337/* only called from syscall */
338int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
339{
340 void **elem, *ptr;
341 int ret = 0;
342
343 if (!map->ops->map_fd_sys_lookup_elem)
344 return -ENOTSUPP;
345
346 rcu_read_lock();
347 elem = array_map_lookup_elem(map, key);
348 if (elem && (ptr = READ_ONCE(*elem)))
349 *value = map->ops->map_fd_sys_lookup_elem(ptr);
350 else
351 ret = -ENOENT;
352 rcu_read_unlock();
353
354 return ret;
355}
356
357/* only called from syscall */
338int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, 358int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
339 void *key, void *value, u64 map_flags) 359 void *key, void *value, u64 map_flags)
340{ 360{
@@ -400,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr)
400 bpf_prog_put(ptr); 420 bpf_prog_put(ptr);
401} 421}
402 422
423static u32 prog_fd_array_sys_lookup_elem(void *ptr)
424{
425 return ((struct bpf_prog *)ptr)->aux->id;
426}
427
403/* decrement refcnt of all bpf_progs that are stored in this map */ 428/* decrement refcnt of all bpf_progs that are stored in this map */
404void bpf_fd_array_map_clear(struct bpf_map *map) 429void bpf_fd_array_map_clear(struct bpf_map *map)
405{ 430{
@@ -418,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = {
418 .map_delete_elem = fd_array_map_delete_elem, 443 .map_delete_elem = fd_array_map_delete_elem,
419 .map_fd_get_ptr = prog_fd_array_get_ptr, 444 .map_fd_get_ptr = prog_fd_array_get_ptr,
420 .map_fd_put_ptr = prog_fd_array_put_ptr, 445 .map_fd_put_ptr = prog_fd_array_put_ptr,
446 .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
421}; 447};
422 448
423static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, 449static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -452,38 +478,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
452static void *perf_event_fd_array_get_ptr(struct bpf_map *map, 478static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
453 struct file *map_file, int fd) 479 struct file *map_file, int fd)
454{ 480{
455 const struct perf_event_attr *attr;
456 struct bpf_event_entry *ee; 481 struct bpf_event_entry *ee;
457 struct perf_event *event; 482 struct perf_event *event;
458 struct file *perf_file; 483 struct file *perf_file;
484 u64 value;
459 485
460 perf_file = perf_event_get(fd); 486 perf_file = perf_event_get(fd);
461 if (IS_ERR(perf_file)) 487 if (IS_ERR(perf_file))
462 return perf_file; 488 return perf_file;
463 489
490 ee = ERR_PTR(-EOPNOTSUPP);
464 event = perf_file->private_data; 491 event = perf_file->private_data;
465 ee = ERR_PTR(-EINVAL); 492 if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
466
467 attr = perf_event_attrs(event);
468 if (IS_ERR(attr) || attr->inherit)
469 goto err_out; 493 goto err_out;
470 494
471 switch (attr->type) { 495 ee = bpf_event_entry_gen(perf_file, map_file);
472 case PERF_TYPE_SOFTWARE: 496 if (ee)
473 if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) 497 return ee;
474 goto err_out; 498 ee = ERR_PTR(-ENOMEM);
475 /* fall-through */
476 case PERF_TYPE_RAW:
477 case PERF_TYPE_HARDWARE:
478 ee = bpf_event_entry_gen(perf_file, map_file);
479 if (ee)
480 return ee;
481 ee = ERR_PTR(-ENOMEM);
482 /* fall-through */
483 default:
484 break;
485 }
486
487err_out: 499err_out:
488 fput(perf_file); 500 fput(perf_file);
489 return ee; 501 return ee;
@@ -599,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
599 .map_delete_elem = fd_array_map_delete_elem, 611 .map_delete_elem = fd_array_map_delete_elem,
600 .map_fd_get_ptr = bpf_map_fd_get_ptr, 612 .map_fd_get_ptr = bpf_map_fd_get_ptr,
601 .map_fd_put_ptr = bpf_map_fd_put_ptr, 613 .map_fd_put_ptr = bpf_map_fd_put_ptr,
614 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
602}; 615};
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ea6033cba947..546113430049 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
236 return ret; 236 return ret;
237} 237}
238EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); 238EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
239
240/**
241 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
242 * @sk: socket to get cgroup from
243 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
244 * sk with connection information (IP addresses, etc.) May not contain
245 * cgroup info if it is a req sock.
246 * @type: The type of program to be exectuted
247 *
248 * socket passed is expected to be of type INET or INET6.
249 *
250 * The program type passed in via @type must be suitable for sock_ops
251 * filtering. No further check is performed to assert that.
252 *
253 * This function will return %-EPERM if any if an attached program was found
254 * and if it returned != 1 during execution. In all other cases, 0 is returned.
255 */
256int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
257 struct bpf_sock_ops_kern *sock_ops,
258 enum bpf_attach_type type)
259{
260 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
261 struct bpf_prog *prog;
262 int ret = 0;
263
264
265 rcu_read_lock();
266
267 prog = rcu_dereference(cgrp->bpf.effective[type]);
268 if (prog)
269 ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
270
271 rcu_read_unlock();
272
273 return ret;
274}
275EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dedf367f59bb..ad5f55922a13 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
763 * 763 *
764 * Decode and execute eBPF instructions. 764 * Decode and execute eBPF instructions.
765 */ 765 */
766static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) 766static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
767 u64 *stack)
767{ 768{
768 u64 stack[MAX_BPF_STACK / sizeof(u64)]; 769 u64 tmp;
769 u64 regs[MAX_BPF_REG], tmp;
770 static const void *jumptable[256] = { 770 static const void *jumptable[256] = {
771 [0 ... 255] = &&default_label, 771 [0 ... 255] = &&default_label,
772 /* Now overwrite non-defaults ... */ 772 /* Now overwrite non-defaults ... */
@@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
824 [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, 824 [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
825 /* Call instruction */ 825 /* Call instruction */
826 [BPF_JMP | BPF_CALL] = &&JMP_CALL, 826 [BPF_JMP | BPF_CALL] = &&JMP_CALL,
827 [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, 827 [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
828 /* Jumps */ 828 /* Jumps */
829 [BPF_JMP | BPF_JA] = &&JMP_JA, 829 [BPF_JMP | BPF_JA] = &&JMP_JA,
830 [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, 830 [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
874#define CONT ({ insn++; goto select_insn; }) 874#define CONT ({ insn++; goto select_insn; })
875#define CONT_JMP ({ insn++; goto select_insn; }) 875#define CONT_JMP ({ insn++; goto select_insn; })
876 876
877 FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
878 ARG1 = (u64) (unsigned long) ctx;
879
880select_insn: 877select_insn:
881 goto *jumptable[insn->code]; 878 goto *jumptable[insn->code];
882 879
@@ -1219,7 +1216,39 @@ load_byte:
1219 WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); 1216 WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
1220 return 0; 1217 return 0;
1221} 1218}
1222STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ 1219STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
1220
1221#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
1222#define DEFINE_BPF_PROG_RUN(stack_size) \
1223static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
1224{ \
1225 u64 stack[stack_size / sizeof(u64)]; \
1226 u64 regs[MAX_BPF_REG]; \
1227\
1228 FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
1229 ARG1 = (u64) (unsigned long) ctx; \
1230 return ___bpf_prog_run(regs, insn, stack); \
1231}
1232
1233#define EVAL1(FN, X) FN(X)
1234#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
1235#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
1236#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
1237#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
1238#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
1239
1240EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
1241EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
1242EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
1243
1244#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
1245
1246static unsigned int (*interpreters[])(const void *ctx,
1247 const struct bpf_insn *insn) = {
1248EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
1249EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
1250EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
1251};
1223 1252
1224bool bpf_prog_array_compatible(struct bpf_array *array, 1253bool bpf_prog_array_compatible(struct bpf_array *array,
1225 const struct bpf_prog *fp) 1254 const struct bpf_prog *fp)
@@ -1268,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
1268 */ 1297 */
1269struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) 1298struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
1270{ 1299{
1271 fp->bpf_func = (void *) __bpf_prog_run; 1300 u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
1301
1302 fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
1272 1303
1273 /* eBPF JITs can rewrite the program in case constant 1304 /* eBPF JITs can rewrite the program in case constant
1274 * blinding is active. However, in case of error during 1305 * blinding is active. However, in case of error during
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 004334ea13ba..4fb463172aa8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1244,6 +1244,26 @@ static void fd_htab_map_free(struct bpf_map *map)
1244} 1244}
1245 1245
1246/* only called from syscall */ 1246/* only called from syscall */
1247int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
1248{
1249 void **ptr;
1250 int ret = 0;
1251
1252 if (!map->ops->map_fd_sys_lookup_elem)
1253 return -ENOTSUPP;
1254
1255 rcu_read_lock();
1256 ptr = htab_map_lookup_elem(map, key);
1257 if (ptr)
1258 *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
1259 else
1260 ret = -ENOENT;
1261 rcu_read_unlock();
1262
1263 return ret;
1264}
1265
1266/* only called from syscall */
1247int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, 1267int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
1248 void *key, void *value, u64 map_flags) 1268 void *key, void *value, u64 map_flags)
1249{ 1269{
@@ -1305,4 +1325,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
1305 .map_delete_elem = htab_map_delete_elem, 1325 .map_delete_elem = htab_map_delete_elem,
1306 .map_fd_get_ptr = bpf_map_fd_get_ptr, 1326 .map_fd_get_ptr = bpf_map_fd_get_ptr,
1307 .map_fd_put_ptr = bpf_map_fd_put_ptr, 1327 .map_fd_put_ptr = bpf_map_fd_put_ptr,
1328 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
1308}; 1329};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9bbd33497d3d..e833ed914358 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode)
377 bpf_any_put(inode->i_private, type); 377 bpf_any_put(inode->i_private, type);
378} 378}
379 379
380/*
381 * Display the mount options in /proc/mounts.
382 */
383static int bpf_show_options(struct seq_file *m, struct dentry *root)
384{
385 umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
386
387 if (mode != S_IRWXUGO)
388 seq_printf(m, ",mode=%o", mode);
389 return 0;
390}
391
380static const struct super_operations bpf_super_ops = { 392static const struct super_operations bpf_super_ops = {
381 .statfs = simple_statfs, 393 .statfs = simple_statfs,
382 .drop_inode = generic_delete_inode, 394 .drop_inode = generic_delete_inode,
383 .show_options = generic_show_options, 395 .show_options = bpf_show_options,
384 .evict_inode = bpf_evict_inode, 396 .evict_inode = bpf_evict_inode,
385}; 397};
386 398
@@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
434 struct inode *inode; 446 struct inode *inode;
435 int ret; 447 int ret;
436 448
437 save_mount_options(sb, data);
438
439 ret = bpf_parse_options(data, &opts); 449 ret = bpf_parse_options(data, &opts);
440 if (ret) 450 if (ret)
441 return ret; 451 return ret;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 59bcdf821ae4..1da574612bea 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr)
95 */ 95 */
96 bpf_map_put(ptr); 96 bpf_map_put(ptr);
97} 97}
98
99u32 bpf_map_fd_sys_lookup_elem(void *ptr)
100{
101 return ((struct bpf_map *)ptr)->id;
102}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 177fadb689dc..6183db9ec08c 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
19void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, 19void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
20 int ufd); 20 int ufd);
21void bpf_map_fd_put_ptr(void *ptr); 21void bpf_map_fd_put_ptr(void *ptr);
22u32 bpf_map_fd_sys_lookup_elem(void *ptr);
22 23
23#endif 24#endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 265a0d854e33..045646da97cc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -22,8 +22,20 @@
22#include <linux/filter.h> 22#include <linux/filter.h>
23#include <linux/version.h> 23#include <linux/version.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/idr.h>
26
27#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
28 (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
29 (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
30 (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
31#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
32#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
25 33
26DEFINE_PER_CPU(int, bpf_prog_active); 34DEFINE_PER_CPU(int, bpf_prog_active);
35static DEFINE_IDR(prog_idr);
36static DEFINE_SPINLOCK(prog_idr_lock);
37static DEFINE_IDR(map_idr);
38static DEFINE_SPINLOCK(map_idr_lock);
27 39
28int sysctl_unprivileged_bpf_disabled __read_mostly; 40int sysctl_unprivileged_bpf_disabled __read_mostly;
29 41
@@ -114,6 +126,37 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map)
114 free_uid(user); 126 free_uid(user);
115} 127}
116 128
129static int bpf_map_alloc_id(struct bpf_map *map)
130{
131 int id;
132
133 spin_lock_bh(&map_idr_lock);
134 id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
135 if (id > 0)
136 map->id = id;
137 spin_unlock_bh(&map_idr_lock);
138
139 if (WARN_ON_ONCE(!id))
140 return -ENOSPC;
141
142 return id > 0 ? 0 : id;
143}
144
145static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
146{
147 if (do_idr_lock)
148 spin_lock_bh(&map_idr_lock);
149 else
150 __acquire(&map_idr_lock);
151
152 idr_remove(&map_idr, map->id);
153
154 if (do_idr_lock)
155 spin_unlock_bh(&map_idr_lock);
156 else
157 __release(&map_idr_lock);
158}
159
117/* called from workqueue */ 160/* called from workqueue */
118static void bpf_map_free_deferred(struct work_struct *work) 161static void bpf_map_free_deferred(struct work_struct *work)
119{ 162{
@@ -135,14 +178,21 @@ static void bpf_map_put_uref(struct bpf_map *map)
135/* decrement map refcnt and schedule it for freeing via workqueue 178/* decrement map refcnt and schedule it for freeing via workqueue
136 * (unrelying map implementation ops->map_free() might sleep) 179 * (unrelying map implementation ops->map_free() might sleep)
137 */ 180 */
138void bpf_map_put(struct bpf_map *map) 181static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
139{ 182{
140 if (atomic_dec_and_test(&map->refcnt)) { 183 if (atomic_dec_and_test(&map->refcnt)) {
184 /* bpf_map_free_id() must be called first */
185 bpf_map_free_id(map, do_idr_lock);
141 INIT_WORK(&map->work, bpf_map_free_deferred); 186 INIT_WORK(&map->work, bpf_map_free_deferred);
142 schedule_work(&map->work); 187 schedule_work(&map->work);
143 } 188 }
144} 189}
145 190
191void bpf_map_put(struct bpf_map *map)
192{
193 __bpf_map_put(map, true);
194}
195
146void bpf_map_put_with_uref(struct bpf_map *map) 196void bpf_map_put_with_uref(struct bpf_map *map)
147{ 197{
148 bpf_map_put_uref(map); 198 bpf_map_put_uref(map);
@@ -166,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
166 const struct bpf_map *map = filp->private_data; 216 const struct bpf_map *map = filp->private_data;
167 const struct bpf_array *array; 217 const struct bpf_array *array;
168 u32 owner_prog_type = 0; 218 u32 owner_prog_type = 0;
219 u32 owner_jited = 0;
169 220
170 if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { 221 if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
171 array = container_of(map, struct bpf_array, map); 222 array = container_of(map, struct bpf_array, map);
172 owner_prog_type = array->owner_prog_type; 223 owner_prog_type = array->owner_prog_type;
224 owner_jited = array->owner_jited;
173 } 225 }
174 226
175 seq_printf(m, 227 seq_printf(m,
@@ -186,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
186 map->map_flags, 238 map->map_flags,
187 map->pages * 1ULL << PAGE_SHIFT); 239 map->pages * 1ULL << PAGE_SHIFT);
188 240
189 if (owner_prog_type) 241 if (owner_prog_type) {
190 seq_printf(m, "owner_prog_type:\t%u\n", 242 seq_printf(m, "owner_prog_type:\t%u\n",
191 owner_prog_type); 243 owner_prog_type);
244 seq_printf(m, "owner_jited:\t%u\n",
245 owner_jited);
246 }
192} 247}
193#endif 248#endif
194 249
@@ -236,11 +291,22 @@ static int map_create(union bpf_attr *attr)
236 if (err) 291 if (err)
237 goto free_map_nouncharge; 292 goto free_map_nouncharge;
238 293
239 err = bpf_map_new_fd(map); 294 err = bpf_map_alloc_id(map);
240 if (err < 0) 295 if (err)
241 /* failed to allocate fd */
242 goto free_map; 296 goto free_map;
243 297
298 err = bpf_map_new_fd(map);
299 if (err < 0) {
300 /* failed to allocate fd.
301 * bpf_map_put() is needed because the above
302 * bpf_map_alloc_id() has published the map
303 * to the userspace and the userspace may
304 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
305 */
306 bpf_map_put(map);
307 return err;
308 }
309
244 trace_bpf_map_create(map, err); 310 trace_bpf_map_create(map, err);
245 return err; 311 return err;
246 312
@@ -295,6 +361,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
295 return map; 361 return map;
296} 362}
297 363
364/* map_idr_lock should have been held */
365static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
366 bool uref)
367{
368 int refold;
369
370 refold = __atomic_add_unless(&map->refcnt, 1, 0);
371
372 if (refold >= BPF_MAX_REFCNT) {
373 __bpf_map_put(map, false);
374 return ERR_PTR(-EBUSY);
375 }
376
377 if (!refold)
378 return ERR_PTR(-ENOENT);
379
380 if (uref)
381 atomic_inc(&map->usercnt);
382
383 return map;
384}
385
298int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) 386int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
299{ 387{
300 return -ENOTSUPP; 388 return -ENOTSUPP;
@@ -322,19 +410,18 @@ static int map_lookup_elem(union bpf_attr *attr)
322 if (IS_ERR(map)) 410 if (IS_ERR(map))
323 return PTR_ERR(map); 411 return PTR_ERR(map);
324 412
325 err = -ENOMEM; 413 key = memdup_user(ukey, map->key_size);
326 key = kmalloc(map->key_size, GFP_USER); 414 if (IS_ERR(key)) {
327 if (!key) 415 err = PTR_ERR(key);
328 goto err_put; 416 goto err_put;
329 417 }
330 err = -EFAULT;
331 if (copy_from_user(key, ukey, map->key_size) != 0)
332 goto free_key;
333 418
334 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 419 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
335 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || 420 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
336 map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) 421 map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
337 value_size = round_up(map->value_size, 8) * num_possible_cpus(); 422 value_size = round_up(map->value_size, 8) * num_possible_cpus();
423 else if (IS_FD_MAP(map))
424 value_size = sizeof(u32);
338 else 425 else
339 value_size = map->value_size; 426 value_size = map->value_size;
340 427
@@ -350,9 +437,10 @@ static int map_lookup_elem(union bpf_attr *attr)
350 err = bpf_percpu_array_copy(map, key, value); 437 err = bpf_percpu_array_copy(map, key, value);
351 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { 438 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
352 err = bpf_stackmap_copy(map, key, value); 439 err = bpf_stackmap_copy(map, key, value);
353 } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || 440 } else if (IS_FD_ARRAY(map)) {
354 map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { 441 err = bpf_fd_array_map_lookup_elem(map, key, value);
355 err = -ENOTSUPP; 442 } else if (IS_FD_HASH(map)) {
443 err = bpf_fd_htab_map_lookup_elem(map, key, value);
356 } else { 444 } else {
357 rcu_read_lock(); 445 rcu_read_lock();
358 ptr = map->ops->map_lookup_elem(map, key); 446 ptr = map->ops->map_lookup_elem(map, key);
@@ -402,14 +490,11 @@ static int map_update_elem(union bpf_attr *attr)
402 if (IS_ERR(map)) 490 if (IS_ERR(map))
403 return PTR_ERR(map); 491 return PTR_ERR(map);
404 492
405 err = -ENOMEM; 493 key = memdup_user(ukey, map->key_size);
406 key = kmalloc(map->key_size, GFP_USER); 494 if (IS_ERR(key)) {
407 if (!key) 495 err = PTR_ERR(key);
408 goto err_put; 496 goto err_put;
409 497 }
410 err = -EFAULT;
411 if (copy_from_user(key, ukey, map->key_size) != 0)
412 goto free_key;
413 498
414 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 499 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
415 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || 500 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -488,14 +573,11 @@ static int map_delete_elem(union bpf_attr *attr)
488 if (IS_ERR(map)) 573 if (IS_ERR(map))
489 return PTR_ERR(map); 574 return PTR_ERR(map);
490 575
491 err = -ENOMEM; 576 key = memdup_user(ukey, map->key_size);
492 key = kmalloc(map->key_size, GFP_USER); 577 if (IS_ERR(key)) {
493 if (!key) 578 err = PTR_ERR(key);
494 goto err_put; 579 goto err_put;
495 580 }
496 err = -EFAULT;
497 if (copy_from_user(key, ukey, map->key_size) != 0)
498 goto free_key;
499 581
500 preempt_disable(); 582 preempt_disable();
501 __this_cpu_inc(bpf_prog_active); 583 __this_cpu_inc(bpf_prog_active);
@@ -507,7 +589,6 @@ static int map_delete_elem(union bpf_attr *attr)
507 589
508 if (!err) 590 if (!err)
509 trace_bpf_map_delete_elem(map, ufd, key); 591 trace_bpf_map_delete_elem(map, ufd, key);
510free_key:
511 kfree(key); 592 kfree(key);
512err_put: 593err_put:
513 fdput(f); 594 fdput(f);
@@ -536,14 +617,11 @@ static int map_get_next_key(union bpf_attr *attr)
536 return PTR_ERR(map); 617 return PTR_ERR(map);
537 618
538 if (ukey) { 619 if (ukey) {
539 err = -ENOMEM; 620 key = memdup_user(ukey, map->key_size);
540 key = kmalloc(map->key_size, GFP_USER); 621 if (IS_ERR(key)) {
541 if (!key) 622 err = PTR_ERR(key);
542 goto err_put; 623 goto err_put;
543 624 }
544 err = -EFAULT;
545 if (copy_from_user(key, ukey, map->key_size) != 0)
546 goto free_key;
547 } else { 625 } else {
548 key = NULL; 626 key = NULL;
549 } 627 }
@@ -650,6 +728,42 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
650 free_uid(user); 728 free_uid(user);
651} 729}
652 730
731static int bpf_prog_alloc_id(struct bpf_prog *prog)
732{
733 int id;
734
735 spin_lock_bh(&prog_idr_lock);
736 id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
737 if (id > 0)
738 prog->aux->id = id;
739 spin_unlock_bh(&prog_idr_lock);
740
741 /* id is in [1, INT_MAX) */
742 if (WARN_ON_ONCE(!id))
743 return -ENOSPC;
744
745 return id > 0 ? 0 : id;
746}
747
748static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
749{
750 /* cBPF to eBPF migrations are currently not in the idr store. */
751 if (!prog->aux->id)
752 return;
753
754 if (do_idr_lock)
755 spin_lock_bh(&prog_idr_lock);
756 else
757 __acquire(&prog_idr_lock);
758
759 idr_remove(&prog_idr, prog->aux->id);
760
761 if (do_idr_lock)
762 spin_unlock_bh(&prog_idr_lock);
763 else
764 __release(&prog_idr_lock);
765}
766
653static void __bpf_prog_put_rcu(struct rcu_head *rcu) 767static void __bpf_prog_put_rcu(struct rcu_head *rcu)
654{ 768{
655 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); 769 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -659,14 +773,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
659 bpf_prog_free(aux->prog); 773 bpf_prog_free(aux->prog);
660} 774}
661 775
662void bpf_prog_put(struct bpf_prog *prog) 776static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
663{ 777{
664 if (atomic_dec_and_test(&prog->aux->refcnt)) { 778 if (atomic_dec_and_test(&prog->aux->refcnt)) {
665 trace_bpf_prog_put_rcu(prog); 779 trace_bpf_prog_put_rcu(prog);
780 /* bpf_prog_free_id() must be called first */
781 bpf_prog_free_id(prog, do_idr_lock);
666 bpf_prog_kallsyms_del(prog); 782 bpf_prog_kallsyms_del(prog);
667 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); 783 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
668 } 784 }
669} 785}
786
787void bpf_prog_put(struct bpf_prog *prog)
788{
789 __bpf_prog_put(prog, true);
790}
670EXPORT_SYMBOL_GPL(bpf_prog_put); 791EXPORT_SYMBOL_GPL(bpf_prog_put);
671 792
672static int bpf_prog_release(struct inode *inode, struct file *filp) 793static int bpf_prog_release(struct inode *inode, struct file *filp)
@@ -748,6 +869,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
748} 869}
749EXPORT_SYMBOL_GPL(bpf_prog_inc); 870EXPORT_SYMBOL_GPL(bpf_prog_inc);
750 871
872/* prog_idr_lock should have been held */
873static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
874{
875 int refold;
876
877 refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0);
878
879 if (refold >= BPF_MAX_REFCNT) {
880 __bpf_prog_put(prog, false);
881 return ERR_PTR(-EBUSY);
882 }
883
884 if (!refold)
885 return ERR_PTR(-ENOENT);
886
887 return prog;
888}
889
751static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) 890static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
752{ 891{
753 struct fd f = fdget(ufd); 892 struct fd f = fdget(ufd);
@@ -815,7 +954,9 @@ static int bpf_prog_load(union bpf_attr *attr)
815 attr->kern_version != LINUX_VERSION_CODE) 954 attr->kern_version != LINUX_VERSION_CODE)
816 return -EINVAL; 955 return -EINVAL;
817 956
818 if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) 957 if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
958 type != BPF_PROG_TYPE_CGROUP_SKB &&
959 !capable(CAP_SYS_ADMIN))
819 return -EPERM; 960 return -EPERM;
820 961
821 /* plain bpf_prog allocation */ 962 /* plain bpf_prog allocation */
@@ -855,11 +996,22 @@ static int bpf_prog_load(union bpf_attr *attr)
855 if (err < 0) 996 if (err < 0)
856 goto free_used_maps; 997 goto free_used_maps;
857 998
858 err = bpf_prog_new_fd(prog); 999 err = bpf_prog_alloc_id(prog);
859 if (err < 0) 1000 if (err)
860 /* failed to allocate fd */
861 goto free_used_maps; 1001 goto free_used_maps;
862 1002
1003 err = bpf_prog_new_fd(prog);
1004 if (err < 0) {
1005 /* failed to allocate fd.
1006 * bpf_prog_put() is needed because the above
1007 * bpf_prog_alloc_id() has published the prog
1008 * to the userspace and the userspace may
1009 * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
1010 */
1011 bpf_prog_put(prog);
1012 return err;
1013 }
1014
863 bpf_prog_kallsyms_add(prog); 1015 bpf_prog_kallsyms_add(prog);
864 trace_bpf_prog_load(prog, err); 1016 trace_bpf_prog_load(prog, err);
865 return err; 1017 return err;
@@ -919,6 +1071,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
919 case BPF_CGROUP_INET_SOCK_CREATE: 1071 case BPF_CGROUP_INET_SOCK_CREATE:
920 ptype = BPF_PROG_TYPE_CGROUP_SOCK; 1072 ptype = BPF_PROG_TYPE_CGROUP_SOCK;
921 break; 1073 break;
1074 case BPF_CGROUP_SOCK_OPS:
1075 ptype = BPF_PROG_TYPE_SOCK_OPS;
1076 break;
922 default: 1077 default:
923 return -EINVAL; 1078 return -EINVAL;
924 } 1079 }
@@ -959,6 +1114,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
959 case BPF_CGROUP_INET_INGRESS: 1114 case BPF_CGROUP_INET_INGRESS:
960 case BPF_CGROUP_INET_EGRESS: 1115 case BPF_CGROUP_INET_EGRESS:
961 case BPF_CGROUP_INET_SOCK_CREATE: 1116 case BPF_CGROUP_INET_SOCK_CREATE:
1117 case BPF_CGROUP_SOCK_OPS:
962 cgrp = cgroup_get_from_fd(attr->target_fd); 1118 cgrp = cgroup_get_from_fd(attr->target_fd);
963 if (IS_ERR(cgrp)) 1119 if (IS_ERR(cgrp))
964 return PTR_ERR(cgrp); 1120 return PTR_ERR(cgrp);
@@ -973,6 +1129,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
973 1129
974 return ret; 1130 return ret;
975} 1131}
1132
976#endif /* CONFIG_CGROUP_BPF */ 1133#endif /* CONFIG_CGROUP_BPF */
977 1134
978#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration 1135#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -997,6 +1154,237 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
997 return ret; 1154 return ret;
998} 1155}
999 1156
1157#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
1158
1159static int bpf_obj_get_next_id(const union bpf_attr *attr,
1160 union bpf_attr __user *uattr,
1161 struct idr *idr,
1162 spinlock_t *lock)
1163{
1164 u32 next_id = attr->start_id;
1165 int err = 0;
1166
1167 if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
1168 return -EINVAL;
1169
1170 if (!capable(CAP_SYS_ADMIN))
1171 return -EPERM;
1172
1173 next_id++;
1174 spin_lock_bh(lock);
1175 if (!idr_get_next(idr, &next_id))
1176 err = -ENOENT;
1177 spin_unlock_bh(lock);
1178
1179 if (!err)
1180 err = put_user(next_id, &uattr->next_id);
1181
1182 return err;
1183}
1184
1185#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
1186
1187static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
1188{
1189 struct bpf_prog *prog;
1190 u32 id = attr->prog_id;
1191 int fd;
1192
1193 if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
1194 return -EINVAL;
1195
1196 if (!capable(CAP_SYS_ADMIN))
1197 return -EPERM;
1198
1199 spin_lock_bh(&prog_idr_lock);
1200 prog = idr_find(&prog_idr, id);
1201 if (prog)
1202 prog = bpf_prog_inc_not_zero(prog);
1203 else
1204 prog = ERR_PTR(-ENOENT);
1205 spin_unlock_bh(&prog_idr_lock);
1206
1207 if (IS_ERR(prog))
1208 return PTR_ERR(prog);
1209
1210 fd = bpf_prog_new_fd(prog);
1211 if (fd < 0)
1212 bpf_prog_put(prog);
1213
1214 return fd;
1215}
1216
1217#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
1218
1219static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
1220{
1221 struct bpf_map *map;
1222 u32 id = attr->map_id;
1223 int fd;
1224
1225 if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
1226 return -EINVAL;
1227
1228 if (!capable(CAP_SYS_ADMIN))
1229 return -EPERM;
1230
1231 spin_lock_bh(&map_idr_lock);
1232 map = idr_find(&map_idr, id);
1233 if (map)
1234 map = bpf_map_inc_not_zero(map, true);
1235 else
1236 map = ERR_PTR(-ENOENT);
1237 spin_unlock_bh(&map_idr_lock);
1238
1239 if (IS_ERR(map))
1240 return PTR_ERR(map);
1241
1242 fd = bpf_map_new_fd(map);
1243 if (fd < 0)
1244 bpf_map_put(map);
1245
1246 return fd;
1247}
1248
1249static int check_uarg_tail_zero(void __user *uaddr,
1250 size_t expected_size,
1251 size_t actual_size)
1252{
1253 unsigned char __user *addr;
1254 unsigned char __user *end;
1255 unsigned char val;
1256 int err;
1257
1258 if (actual_size <= expected_size)
1259 return 0;
1260
1261 addr = uaddr + expected_size;
1262 end = uaddr + actual_size;
1263
1264 for (; addr < end; addr++) {
1265 err = get_user(val, addr);
1266 if (err)
1267 return err;
1268 if (val)
1269 return -E2BIG;
1270 }
1271
1272 return 0;
1273}
1274
1275static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
1276 const union bpf_attr *attr,
1277 union bpf_attr __user *uattr)
1278{
1279 struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
1280 struct bpf_prog_info info = {};
1281 u32 info_len = attr->info.info_len;
1282 char __user *uinsns;
1283 u32 ulen;
1284 int err;
1285
1286 err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
1287 if (err)
1288 return err;
1289 info_len = min_t(u32, sizeof(info), info_len);
1290
1291 if (copy_from_user(&info, uinfo, info_len))
1292 return err;
1293
1294 info.type = prog->type;
1295 info.id = prog->aux->id;
1296
1297 memcpy(info.tag, prog->tag, sizeof(prog->tag));
1298
1299 if (!capable(CAP_SYS_ADMIN)) {
1300 info.jited_prog_len = 0;
1301 info.xlated_prog_len = 0;
1302 goto done;
1303 }
1304
1305 ulen = info.jited_prog_len;
1306 info.jited_prog_len = prog->jited_len;
1307 if (info.jited_prog_len && ulen) {
1308 uinsns = u64_to_user_ptr(info.jited_prog_insns);
1309 ulen = min_t(u32, info.jited_prog_len, ulen);
1310 if (copy_to_user(uinsns, prog->bpf_func, ulen))
1311 return -EFAULT;
1312 }
1313
1314 ulen = info.xlated_prog_len;
1315 info.xlated_prog_len = bpf_prog_size(prog->len);
1316 if (info.xlated_prog_len && ulen) {
1317 uinsns = u64_to_user_ptr(info.xlated_prog_insns);
1318 ulen = min_t(u32, info.xlated_prog_len, ulen);
1319 if (copy_to_user(uinsns, prog->insnsi, ulen))
1320 return -EFAULT;
1321 }
1322
1323done:
1324 if (copy_to_user(uinfo, &info, info_len) ||
1325 put_user(info_len, &uattr->info.info_len))
1326 return -EFAULT;
1327
1328 return 0;
1329}
1330
1331static int bpf_map_get_info_by_fd(struct bpf_map *map,
1332 const union bpf_attr *attr,
1333 union bpf_attr __user *uattr)
1334{
1335 struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
1336 struct bpf_map_info info = {};
1337 u32 info_len = attr->info.info_len;
1338 int err;
1339
1340 err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
1341 if (err)
1342 return err;
1343 info_len = min_t(u32, sizeof(info), info_len);
1344
1345 info.type = map->map_type;
1346 info.id = map->id;
1347 info.key_size = map->key_size;
1348 info.value_size = map->value_size;
1349 info.max_entries = map->max_entries;
1350 info.map_flags = map->map_flags;
1351
1352 if (copy_to_user(uinfo, &info, info_len) ||
1353 put_user(info_len, &uattr->info.info_len))
1354 return -EFAULT;
1355
1356 return 0;
1357}
1358
1359#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
1360
1361static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
1362 union bpf_attr __user *uattr)
1363{
1364 int ufd = attr->info.bpf_fd;
1365 struct fd f;
1366 int err;
1367
1368 if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
1369 return -EINVAL;
1370
1371 f = fdget(ufd);
1372 if (!f.file)
1373 return -EBADFD;
1374
1375 if (f.file->f_op == &bpf_prog_fops)
1376 err = bpf_prog_get_info_by_fd(f.file->private_data, attr,
1377 uattr);
1378 else if (f.file->f_op == &bpf_map_fops)
1379 err = bpf_map_get_info_by_fd(f.file->private_data, attr,
1380 uattr);
1381 else
1382 err = -EINVAL;
1383
1384 fdput(f);
1385 return err;
1386}
1387
1000SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 1388SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
1001{ 1389{
1002 union bpf_attr attr = {}; 1390 union bpf_attr attr = {};
@@ -1016,23 +1404,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
1016 * user-space does not rely on any kernel feature 1404 * user-space does not rely on any kernel feature
1017 * extensions we dont know about yet. 1405 * extensions we dont know about yet.
1018 */ 1406 */
1019 if (size > sizeof(attr)) { 1407 err = check_uarg_tail_zero(uattr, sizeof(attr), size);
1020 unsigned char __user *addr; 1408 if (err)
1021 unsigned char __user *end; 1409 return err;
1022 unsigned char val; 1410 size = min_t(u32, size, sizeof(attr));
1023
1024 addr = (void __user *)uattr + sizeof(attr);
1025 end = (void __user *)uattr + size;
1026
1027 for (; addr < end; addr++) {
1028 err = get_user(val, addr);
1029 if (err)
1030 return err;
1031 if (val)
1032 return -E2BIG;
1033 }
1034 size = sizeof(attr);
1035 }
1036 1411
1037 /* copy attributes from user space, may be less than sizeof(bpf_attr) */ 1412 /* copy attributes from user space, may be less than sizeof(bpf_attr) */
1038 if (copy_from_user(&attr, uattr, size) != 0) 1413 if (copy_from_user(&attr, uattr, size) != 0)
@@ -1074,6 +1449,23 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
1074 case BPF_PROG_TEST_RUN: 1449 case BPF_PROG_TEST_RUN:
1075 err = bpf_prog_test_run(&attr, uattr); 1450 err = bpf_prog_test_run(&attr, uattr);
1076 break; 1451 break;
1452 case BPF_PROG_GET_NEXT_ID:
1453 err = bpf_obj_get_next_id(&attr, uattr,
1454 &prog_idr, &prog_idr_lock);
1455 break;
1456 case BPF_MAP_GET_NEXT_ID:
1457 err = bpf_obj_get_next_id(&attr, uattr,
1458 &map_idr, &map_idr_lock);
1459 break;
1460 case BPF_PROG_GET_FD_BY_ID:
1461 err = bpf_prog_get_fd_by_id(&attr);
1462 break;
1463 case BPF_MAP_GET_FD_BY_ID:
1464 err = bpf_map_get_fd_by_id(&attr);
1465 break;
1466 case BPF_OBJ_GET_INFO_BY_FD:
1467 err = bpf_obj_get_info_by_fd(&attr, uattr);
1468 break;
1077 default: 1469 default:
1078 err = -EINVAL; 1470 err = -EINVAL;
1079 break; 1471 break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8a725697bed..af9e84a4944e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
504{ 504{
505 regs[regno].min_value = BPF_REGISTER_MIN_RANGE; 505 regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
506 regs[regno].max_value = BPF_REGISTER_MAX_RANGE; 506 regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
507 regs[regno].value_from_signed = false;
507 regs[regno].min_align = 0; 508 regs[regno].min_align = 0;
508} 509}
509 510
@@ -546,20 +547,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
546 return 0; 547 return 0;
547} 548}
548 549
549static int bpf_size_to_bytes(int bpf_size)
550{
551 if (bpf_size == BPF_W)
552 return 4;
553 else if (bpf_size == BPF_H)
554 return 2;
555 else if (bpf_size == BPF_B)
556 return 1;
557 else if (bpf_size == BPF_DW)
558 return 8;
559 else
560 return -EINVAL;
561}
562
563static bool is_spillable_regtype(enum bpf_reg_type type) 550static bool is_spillable_regtype(enum bpf_reg_type type)
564{ 551{
565 switch (type) { 552 switch (type) {
@@ -758,15 +745,29 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
758} 745}
759 746
760/* check access to 'struct bpf_context' fields */ 747/* check access to 'struct bpf_context' fields */
761static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, 748static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
762 enum bpf_access_type t, enum bpf_reg_type *reg_type) 749 enum bpf_access_type t, enum bpf_reg_type *reg_type)
763{ 750{
751 struct bpf_insn_access_aux info = {
752 .reg_type = *reg_type,
753 };
754
764 /* for analyzer ctx accesses are already validated and converted */ 755 /* for analyzer ctx accesses are already validated and converted */
765 if (env->analyzer_ops) 756 if (env->analyzer_ops)
766 return 0; 757 return 0;
767 758
768 if (env->prog->aux->ops->is_valid_access && 759 if (env->prog->aux->ops->is_valid_access &&
769 env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { 760 env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
761 /* A non zero info.ctx_field_size indicates that this field is a
762 * candidate for later verifier transformation to load the whole
763 * field and then apply a mask when accessed with a narrower
764 * access than actual ctx access size. A zero info.ctx_field_size
765 * will only allow for whole field access and rejects any other
766 * type of narrower access.
767 */
768 env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
769 *reg_type = info.reg_type;
770
770 /* remember the offset of last byte accessed in ctx */ 771 /* remember the offset of last byte accessed in ctx */
771 if (env->prog->aux->max_ctx_offset < off + size) 772 if (env->prog->aux->max_ctx_offset < off + size)
772 env->prog->aux->max_ctx_offset = off + size; 773 env->prog->aux->max_ctx_offset = off + size;
@@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
777 return -EACCES; 778 return -EACCES;
778} 779}
779 780
780static bool is_pointer_value(struct bpf_verifier_env *env, int regno) 781static bool __is_pointer_value(bool allow_ptr_leaks,
782 const struct bpf_reg_state *reg)
781{ 783{
782 if (env->allow_ptr_leaks) 784 if (allow_ptr_leaks)
783 return false; 785 return false;
784 786
785 switch (env->cur_state.regs[regno].type) { 787 switch (reg->type) {
786 case UNKNOWN_VALUE: 788 case UNKNOWN_VALUE:
787 case CONST_IMM: 789 case CONST_IMM:
788 return false; 790 return false;
@@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
791 } 793 }
792} 794}
793 795
796static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
797{
798 return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
799}
800
794static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, 801static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
795 int off, int size, bool strict) 802 int off, int size, bool strict)
796{ 803{
@@ -868,7 +875,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
868 * if t==write && value_regno==-1, some unknown value is stored into memory 875 * if t==write && value_regno==-1, some unknown value is stored into memory
869 * if t==read && value_regno==-1, don't care what we read from memory 876 * if t==read && value_regno==-1, don't care what we read from memory
870 */ 877 */
871static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, 878static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off,
872 int bpf_size, enum bpf_access_type t, 879 int bpf_size, enum bpf_access_type t,
873 int value_regno) 880 int value_regno)
874{ 881{
@@ -911,7 +918,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
911 verbose("R%d leaks addr into ctx\n", value_regno); 918 verbose("R%d leaks addr into ctx\n", value_regno);
912 return -EACCES; 919 return -EACCES;
913 } 920 }
914 err = check_ctx_access(env, off, size, t, &reg_type); 921 err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
915 if (!err && t == BPF_READ && value_regno >= 0) { 922 if (!err && t == BPF_READ && value_regno >= 0) {
916 mark_reg_unknown_value_and_range(state->regs, 923 mark_reg_unknown_value_and_range(state->regs,
917 value_regno); 924 value_regno);
@@ -926,6 +933,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
926 verbose("invalid stack off=%d size=%d\n", off, size); 933 verbose("invalid stack off=%d size=%d\n", off, size);
927 return -EACCES; 934 return -EACCES;
928 } 935 }
936
937 if (env->prog->aux->stack_depth < -off)
938 env->prog->aux->stack_depth = -off;
939
929 if (t == BPF_WRITE) { 940 if (t == BPF_WRITE) {
930 if (!env->allow_ptr_leaks && 941 if (!env->allow_ptr_leaks &&
931 state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && 942 state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
@@ -968,7 +979,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
968 return err; 979 return err;
969} 980}
970 981
971static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) 982static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
972{ 983{
973 struct bpf_reg_state *regs = env->cur_state.regs; 984 struct bpf_reg_state *regs = env->cur_state.regs;
974 int err; 985 int err;
@@ -995,13 +1006,13 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
995 } 1006 }
996 1007
997 /* check whether atomic_add can read the memory */ 1008 /* check whether atomic_add can read the memory */
998 err = check_mem_access(env, insn->dst_reg, insn->off, 1009 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
999 BPF_SIZE(insn->code), BPF_READ, -1); 1010 BPF_SIZE(insn->code), BPF_READ, -1);
1000 if (err) 1011 if (err)
1001 return err; 1012 return err;
1002 1013
1003 /* check whether atomic_add can write into the same memory */ 1014 /* check whether atomic_add can write into the same memory */
1004 return check_mem_access(env, insn->dst_reg, insn->off, 1015 return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
1005 BPF_SIZE(insn->code), BPF_WRITE, -1); 1016 BPF_SIZE(insn->code), BPF_WRITE, -1);
1006} 1017}
1007 1018
@@ -1037,6 +1048,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1037 return -EACCES; 1048 return -EACCES;
1038 } 1049 }
1039 1050
1051 if (env->prog->aux->stack_depth < -off)
1052 env->prog->aux->stack_depth = -off;
1053
1040 if (meta && meta->raw_mode) { 1054 if (meta && meta->raw_mode) {
1041 meta->access_size = access_size; 1055 meta->access_size = access_size;
1042 meta->regno = regno; 1056 meta->regno = regno;
@@ -1344,8 +1358,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
1344 if (reg->type != PTR_TO_PACKET && 1358 if (reg->type != PTR_TO_PACKET &&
1345 reg->type != PTR_TO_PACKET_END) 1359 reg->type != PTR_TO_PACKET_END)
1346 continue; 1360 continue;
1347 reg->type = UNKNOWN_VALUE; 1361 __mark_reg_unknown_value(state->spilled_regs,
1348 reg->imm = 0; 1362 i / BPF_REG_SIZE);
1349 } 1363 }
1350} 1364}
1351 1365
@@ -1414,7 +1428,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1414 * is inferred from register state. 1428 * is inferred from register state.
1415 */ 1429 */
1416 for (i = 0; i < meta.access_size; i++) { 1430 for (i = 0; i < meta.access_size; i++) {
1417 err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); 1431 err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1);
1418 if (err) 1432 if (err)
1419 return err; 1433 return err;
1420 } 1434 }
@@ -1650,6 +1664,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
1650 return 0; 1664 return 0;
1651} 1665}
1652 1666
1667static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
1668 struct bpf_insn *insn)
1669{
1670 struct bpf_reg_state *regs = env->cur_state.regs;
1671 struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
1672 struct bpf_reg_state *src_reg = &regs[insn->src_reg];
1673 u8 opcode = BPF_OP(insn->code);
1674 s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
1675
1676 /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
1677 if (src_reg->imm > 0 && dst_reg->imm) {
1678 switch (opcode) {
1679 case BPF_ADD:
1680 /* dreg += sreg
1681 * where both have zero upper bits. Adding them
1682 * can only result making one more bit non-zero
1683 * in the larger value.
1684 * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
1685 * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
1686 */
1687 dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
1688 dst_reg->imm--;
1689 break;
1690 case BPF_AND:
1691 /* dreg &= sreg
1692 * AND can not extend zero bits only shrink
1693 * Ex. 0x00..00ffffff
1694 * & 0x0f..ffffffff
1695 * ----------------
1696 * 0x00..00ffffff
1697 */
1698 dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
1699 break;
1700 case BPF_OR:
1701 /* dreg |= sreg
1702 * OR can only extend zero bits
1703 * Ex. 0x00..00ffffff
1704 * | 0x0f..ffffffff
1705 * ----------------
1706 * 0x0f..00ffffff
1707 */
1708 dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
1709 break;
1710 case BPF_SUB:
1711 case BPF_MUL:
1712 case BPF_RSH:
1713 case BPF_LSH:
1714 /* These may be flushed out later */
1715 default:
1716 mark_reg_unknown_value(regs, insn->dst_reg);
1717 }
1718 } else {
1719 mark_reg_unknown_value(regs, insn->dst_reg);
1720 }
1721
1722 dst_reg->type = UNKNOWN_VALUE;
1723 return 0;
1724}
1725
1653static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, 1726static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
1654 struct bpf_insn *insn) 1727 struct bpf_insn *insn)
1655{ 1728{
@@ -1659,6 +1732,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
1659 u8 opcode = BPF_OP(insn->code); 1732 u8 opcode = BPF_OP(insn->code);
1660 u64 dst_imm = dst_reg->imm; 1733 u64 dst_imm = dst_reg->imm;
1661 1734
1735 if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
1736 return evaluate_reg_imm_alu_unknown(env, insn);
1737
1662 /* dst_reg->type == CONST_IMM here. Simulate execution of insns 1738 /* dst_reg->type == CONST_IMM here. Simulate execution of insns
1663 * containing ALU ops. Don't care about overflow or negative 1739 * containing ALU ops. Don't care about overflow or negative
1664 * values, just add/sub/... them; registers are in u64. 1740 * values, just add/sub/... them; registers are in u64.
@@ -1763,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
1763 dst_align = dst_reg->min_align; 1839 dst_align = dst_reg->min_align;
1764 1840
1765 /* We don't know anything about what was done to this register, mark it 1841 /* We don't know anything about what was done to this register, mark it
1766 * as unknown. 1842 * as unknown. Also, if both derived bounds came from signed/unsigned
1843 * mixed compares and one side is unbounded, we cannot really do anything
1844 * with them as boundaries cannot be trusted. Thus, arithmetic of two
1845 * regs of such kind will get invalidated bounds on the dst side.
1767 */ 1846 */
1768 if (min_val == BPF_REGISTER_MIN_RANGE && 1847 if ((min_val == BPF_REGISTER_MIN_RANGE &&
1769 max_val == BPF_REGISTER_MAX_RANGE) { 1848 max_val == BPF_REGISTER_MAX_RANGE) ||
1849 (BPF_SRC(insn->code) == BPF_X &&
1850 ((min_val != BPF_REGISTER_MIN_RANGE &&
1851 max_val == BPF_REGISTER_MAX_RANGE) ||
1852 (min_val == BPF_REGISTER_MIN_RANGE &&
1853 max_val != BPF_REGISTER_MAX_RANGE) ||
1854 (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
1855 dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
1856 (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
1857 dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
1858 regs[insn->dst_reg].value_from_signed !=
1859 regs[insn->src_reg].value_from_signed)) {
1770 reset_reg_range_values(regs, insn->dst_reg); 1860 reset_reg_range_values(regs, insn->dst_reg);
1771 return; 1861 return;
1772 } 1862 }
@@ -1950,9 +2040,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
1950 */ 2040 */
1951 regs[insn->dst_reg].type = CONST_IMM; 2041 regs[insn->dst_reg].type = CONST_IMM;
1952 regs[insn->dst_reg].imm = insn->imm; 2042 regs[insn->dst_reg].imm = insn->imm;
2043 regs[insn->dst_reg].id = 0;
1953 regs[insn->dst_reg].max_value = insn->imm; 2044 regs[insn->dst_reg].max_value = insn->imm;
1954 regs[insn->dst_reg].min_value = insn->imm; 2045 regs[insn->dst_reg].min_value = insn->imm;
1955 regs[insn->dst_reg].min_align = calc_align(insn->imm); 2046 regs[insn->dst_reg].min_align = calc_align(insn->imm);
2047 regs[insn->dst_reg].value_from_signed = false;
1956 } 2048 }
1957 2049
1958 } else if (opcode > BPF_END) { 2050 } else if (opcode > BPF_END) {
@@ -2128,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
2128 struct bpf_reg_state *false_reg, u64 val, 2220 struct bpf_reg_state *false_reg, u64 val,
2129 u8 opcode) 2221 u8 opcode)
2130{ 2222{
2223 bool value_from_signed = true;
2224 bool is_range = true;
2225
2131 switch (opcode) { 2226 switch (opcode) {
2132 case BPF_JEQ: 2227 case BPF_JEQ:
2133 /* If this is false then we know nothing Jon Snow, but if it is 2228 /* If this is false then we know nothing Jon Snow, but if it is
2134 * true then we know for sure. 2229 * true then we know for sure.
2135 */ 2230 */
2136 true_reg->max_value = true_reg->min_value = val; 2231 true_reg->max_value = true_reg->min_value = val;
2232 is_range = false;
2137 break; 2233 break;
2138 case BPF_JNE: 2234 case BPF_JNE:
2139 /* If this is true we know nothing Jon Snow, but if it is false 2235 /* If this is true we know nothing Jon Snow, but if it is false
2140 * we know the value for sure; 2236 * we know the value for sure;
2141 */ 2237 */
2142 false_reg->max_value = false_reg->min_value = val; 2238 false_reg->max_value = false_reg->min_value = val;
2239 is_range = false;
2143 break; 2240 break;
2144 case BPF_JGT: 2241 case BPF_JGT:
2145 /* Unsigned comparison, the minimum value is 0. */ 2242 value_from_signed = false;
2146 false_reg->min_value = 0;
2147 /* fallthrough */ 2243 /* fallthrough */
2148 case BPF_JSGT: 2244 case BPF_JSGT:
2245 if (true_reg->value_from_signed != value_from_signed)
2246 reset_reg_range_values(true_reg, 0);
2247 if (false_reg->value_from_signed != value_from_signed)
2248 reset_reg_range_values(false_reg, 0);
2249 if (opcode == BPF_JGT) {
2250 /* Unsigned comparison, the minimum value is 0. */
2251 false_reg->min_value = 0;
2252 }
2149 /* If this is false then we know the maximum val is val, 2253 /* If this is false then we know the maximum val is val,
2150 * otherwise we know the min val is val+1. 2254 * otherwise we know the min val is val+1.
2151 */ 2255 */
2152 false_reg->max_value = val; 2256 false_reg->max_value = val;
2257 false_reg->value_from_signed = value_from_signed;
2153 true_reg->min_value = val + 1; 2258 true_reg->min_value = val + 1;
2259 true_reg->value_from_signed = value_from_signed;
2154 break; 2260 break;
2155 case BPF_JGE: 2261 case BPF_JGE:
2156 /* Unsigned comparison, the minimum value is 0. */ 2262 value_from_signed = false;
2157 false_reg->min_value = 0;
2158 /* fallthrough */ 2263 /* fallthrough */
2159 case BPF_JSGE: 2264 case BPF_JSGE:
2265 if (true_reg->value_from_signed != value_from_signed)
2266 reset_reg_range_values(true_reg, 0);
2267 if (false_reg->value_from_signed != value_from_signed)
2268 reset_reg_range_values(false_reg, 0);
2269 if (opcode == BPF_JGE) {
2270 /* Unsigned comparison, the minimum value is 0. */
2271 false_reg->min_value = 0;
2272 }
2160 /* If this is false then we know the maximum value is val - 1, 2273 /* If this is false then we know the maximum value is val - 1,
2161 * otherwise we know the mimimum value is val. 2274 * otherwise we know the mimimum value is val.
2162 */ 2275 */
2163 false_reg->max_value = val - 1; 2276 false_reg->max_value = val - 1;
2277 false_reg->value_from_signed = value_from_signed;
2164 true_reg->min_value = val; 2278 true_reg->min_value = val;
2279 true_reg->value_from_signed = value_from_signed;
2165 break; 2280 break;
2166 default: 2281 default:
2167 break; 2282 break;
@@ -2169,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
2169 2284
2170 check_reg_overflow(false_reg); 2285 check_reg_overflow(false_reg);
2171 check_reg_overflow(true_reg); 2286 check_reg_overflow(true_reg);
2287 if (is_range) {
2288 if (__is_pointer_value(false, false_reg))
2289 reset_reg_range_values(false_reg, 0);
2290 if (__is_pointer_value(false, true_reg))
2291 reset_reg_range_values(true_reg, 0);
2292 }
2172} 2293}
2173 2294
2174/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg 2295/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
@@ -2178,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
2178 struct bpf_reg_state *false_reg, u64 val, 2299 struct bpf_reg_state *false_reg, u64 val,
2179 u8 opcode) 2300 u8 opcode)
2180{ 2301{
2302 bool value_from_signed = true;
2303 bool is_range = true;
2304
2181 switch (opcode) { 2305 switch (opcode) {
2182 case BPF_JEQ: 2306 case BPF_JEQ:
2183 /* If this is false then we know nothing Jon Snow, but if it is 2307 /* If this is false then we know nothing Jon Snow, but if it is
2184 * true then we know for sure. 2308 * true then we know for sure.
2185 */ 2309 */
2186 true_reg->max_value = true_reg->min_value = val; 2310 true_reg->max_value = true_reg->min_value = val;
2311 is_range = false;
2187 break; 2312 break;
2188 case BPF_JNE: 2313 case BPF_JNE:
2189 /* If this is true we know nothing Jon Snow, but if it is false 2314 /* If this is true we know nothing Jon Snow, but if it is false
2190 * we know the value for sure; 2315 * we know the value for sure;
2191 */ 2316 */
2192 false_reg->max_value = false_reg->min_value = val; 2317 false_reg->max_value = false_reg->min_value = val;
2318 is_range = false;
2193 break; 2319 break;
2194 case BPF_JGT: 2320 case BPF_JGT:
2195 /* Unsigned comparison, the minimum value is 0. */ 2321 value_from_signed = false;
2196 true_reg->min_value = 0;
2197 /* fallthrough */ 2322 /* fallthrough */
2198 case BPF_JSGT: 2323 case BPF_JSGT:
2324 if (true_reg->value_from_signed != value_from_signed)
2325 reset_reg_range_values(true_reg, 0);
2326 if (false_reg->value_from_signed != value_from_signed)
2327 reset_reg_range_values(false_reg, 0);
2328 if (opcode == BPF_JGT) {
2329 /* Unsigned comparison, the minimum value is 0. */
2330 true_reg->min_value = 0;
2331 }
2199 /* 2332 /*
2200 * If this is false, then the val is <= the register, if it is 2333 * If this is false, then the val is <= the register, if it is
2201 * true the register <= to the val. 2334 * true the register <= to the val.
2202 */ 2335 */
2203 false_reg->min_value = val; 2336 false_reg->min_value = val;
2337 false_reg->value_from_signed = value_from_signed;
2204 true_reg->max_value = val - 1; 2338 true_reg->max_value = val - 1;
2339 true_reg->value_from_signed = value_from_signed;
2205 break; 2340 break;
2206 case BPF_JGE: 2341 case BPF_JGE:
2207 /* Unsigned comparison, the minimum value is 0. */ 2342 value_from_signed = false;
2208 true_reg->min_value = 0;
2209 /* fallthrough */ 2343 /* fallthrough */
2210 case BPF_JSGE: 2344 case BPF_JSGE:
2345 if (true_reg->value_from_signed != value_from_signed)
2346 reset_reg_range_values(true_reg, 0);
2347 if (false_reg->value_from_signed != value_from_signed)
2348 reset_reg_range_values(false_reg, 0);
2349 if (opcode == BPF_JGE) {
2350 /* Unsigned comparison, the minimum value is 0. */
2351 true_reg->min_value = 0;
2352 }
2211 /* If this is false then constant < register, if it is true then 2353 /* If this is false then constant < register, if it is true then
2212 * the register < constant. 2354 * the register < constant.
2213 */ 2355 */
2214 false_reg->min_value = val + 1; 2356 false_reg->min_value = val + 1;
2357 false_reg->value_from_signed = value_from_signed;
2215 true_reg->max_value = val; 2358 true_reg->max_value = val;
2359 true_reg->value_from_signed = value_from_signed;
2216 break; 2360 break;
2217 default: 2361 default:
2218 break; 2362 break;
@@ -2220,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
2220 2364
2221 check_reg_overflow(false_reg); 2365 check_reg_overflow(false_reg);
2222 check_reg_overflow(true_reg); 2366 check_reg_overflow(true_reg);
2367 if (is_range) {
2368 if (__is_pointer_value(false, false_reg))
2369 reset_reg_range_values(false_reg, 0);
2370 if (__is_pointer_value(false, true_reg))
2371 reset_reg_range_values(true_reg, 0);
2372 }
2223} 2373}
2224 2374
2225static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, 2375static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
@@ -2407,6 +2557,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
2407 2557
2408 regs[insn->dst_reg].type = CONST_IMM; 2558 regs[insn->dst_reg].type = CONST_IMM;
2409 regs[insn->dst_reg].imm = imm; 2559 regs[insn->dst_reg].imm = imm;
2560 regs[insn->dst_reg].id = 0;
2410 return 0; 2561 return 0;
2411 } 2562 }
2412 2563
@@ -2826,6 +2977,8 @@ static bool states_equal(struct bpf_verifier_env *env,
2826 return false; 2977 return false;
2827 if (i % BPF_REG_SIZE) 2978 if (i % BPF_REG_SIZE)
2828 continue; 2979 continue;
2980 if (old->stack_slot_type[i] != STACK_SPILL)
2981 continue;
2829 if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], 2982 if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
2830 &cur->spilled_regs[i / BPF_REG_SIZE], 2983 &cur->spilled_regs[i / BPF_REG_SIZE],
2831 sizeof(old->spilled_regs[0]))) 2984 sizeof(old->spilled_regs[0])))
@@ -2987,18 +3140,12 @@ static int do_check(struct bpf_verifier_env *env)
2987 /* check that memory (src_reg + off) is readable, 3140 /* check that memory (src_reg + off) is readable,
2988 * the state of dst_reg will be updated by this func 3141 * the state of dst_reg will be updated by this func
2989 */ 3142 */
2990 err = check_mem_access(env, insn->src_reg, insn->off, 3143 err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
2991 BPF_SIZE(insn->code), BPF_READ, 3144 BPF_SIZE(insn->code), BPF_READ,
2992 insn->dst_reg); 3145 insn->dst_reg);
2993 if (err) 3146 if (err)
2994 return err; 3147 return err;
2995 3148
2996 if (BPF_SIZE(insn->code) != BPF_W &&
2997 BPF_SIZE(insn->code) != BPF_DW) {
2998 insn_idx++;
2999 continue;
3000 }
3001
3002 prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; 3149 prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
3003 3150
3004 if (*prev_src_type == NOT_INIT) { 3151 if (*prev_src_type == NOT_INIT) {
@@ -3026,7 +3173,7 @@ static int do_check(struct bpf_verifier_env *env)
3026 enum bpf_reg_type *prev_dst_type, dst_reg_type; 3173 enum bpf_reg_type *prev_dst_type, dst_reg_type;
3027 3174
3028 if (BPF_MODE(insn->code) == BPF_XADD) { 3175 if (BPF_MODE(insn->code) == BPF_XADD) {
3029 err = check_xadd(env, insn); 3176 err = check_xadd(env, insn_idx, insn);
3030 if (err) 3177 if (err)
3031 return err; 3178 return err;
3032 insn_idx++; 3179 insn_idx++;
@@ -3045,7 +3192,7 @@ static int do_check(struct bpf_verifier_env *env)
3045 dst_reg_type = regs[insn->dst_reg].type; 3192 dst_reg_type = regs[insn->dst_reg].type;
3046 3193
3047 /* check that memory (dst_reg + off) is writeable */ 3194 /* check that memory (dst_reg + off) is writeable */
3048 err = check_mem_access(env, insn->dst_reg, insn->off, 3195 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
3049 BPF_SIZE(insn->code), BPF_WRITE, 3196 BPF_SIZE(insn->code), BPF_WRITE,
3050 insn->src_reg); 3197 insn->src_reg);
3051 if (err) 3198 if (err)
@@ -3074,7 +3221,7 @@ static int do_check(struct bpf_verifier_env *env)
3074 return err; 3221 return err;
3075 3222
3076 /* check that memory (dst_reg + off) is writeable */ 3223 /* check that memory (dst_reg + off) is writeable */
3077 err = check_mem_access(env, insn->dst_reg, insn->off, 3224 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
3078 BPF_SIZE(insn->code), BPF_WRITE, 3225 BPF_SIZE(insn->code), BPF_WRITE,
3079 -1); 3226 -1);
3080 if (err) 3227 if (err)
@@ -3172,7 +3319,8 @@ process_bpf_exit:
3172 insn_idx++; 3319 insn_idx++;
3173 } 3320 }
3174 3321
3175 verbose("processed %d insns\n", insn_processed); 3322 verbose("processed %d insns, stack depth %d\n",
3323 insn_processed, env->prog->aux->stack_depth);
3176 return 0; 3324 return 0;
3177} 3325}
3178 3326
@@ -3372,11 +3520,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
3372static int convert_ctx_accesses(struct bpf_verifier_env *env) 3520static int convert_ctx_accesses(struct bpf_verifier_env *env)
3373{ 3521{
3374 const struct bpf_verifier_ops *ops = env->prog->aux->ops; 3522 const struct bpf_verifier_ops *ops = env->prog->aux->ops;
3523 int i, cnt, size, ctx_field_size, delta = 0;
3375 const int insn_cnt = env->prog->len; 3524 const int insn_cnt = env->prog->len;
3376 struct bpf_insn insn_buf[16], *insn; 3525 struct bpf_insn insn_buf[16], *insn;
3377 struct bpf_prog *new_prog; 3526 struct bpf_prog *new_prog;
3378 enum bpf_access_type type; 3527 enum bpf_access_type type;
3379 int i, cnt, delta = 0; 3528 bool is_narrower_load;
3529 u32 target_size;
3380 3530
3381 if (ops->gen_prologue) { 3531 if (ops->gen_prologue) {
3382 cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, 3532 cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3416,12 +3566,52 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
3416 if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) 3566 if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
3417 continue; 3567 continue;
3418 3568
3419 cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); 3569 ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
3420 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { 3570 size = BPF_LDST_BYTES(insn);
3571
3572 /* If the read access is a narrower load of the field,
3573 * convert to a 4/8-byte load, to minimum program type specific
3574 * convert_ctx_access changes. If conversion is successful,
3575 * we will apply proper mask to the result.
3576 */
3577 is_narrower_load = size < ctx_field_size;
3578 if (is_narrower_load) {
3579 u32 off = insn->off;
3580 u8 size_code;
3581
3582 if (type == BPF_WRITE) {
3583 verbose("bpf verifier narrow ctx access misconfigured\n");
3584 return -EINVAL;
3585 }
3586
3587 size_code = BPF_H;
3588 if (ctx_field_size == 4)
3589 size_code = BPF_W;
3590 else if (ctx_field_size == 8)
3591 size_code = BPF_DW;
3592
3593 insn->off = off & ~(ctx_field_size - 1);
3594 insn->code = BPF_LDX | BPF_MEM | size_code;
3595 }
3596
3597 target_size = 0;
3598 cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
3599 &target_size);
3600 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
3601 (ctx_field_size && !target_size)) {
3421 verbose("bpf verifier is misconfigured\n"); 3602 verbose("bpf verifier is misconfigured\n");
3422 return -EINVAL; 3603 return -EINVAL;
3423 } 3604 }
3424 3605
3606 if (is_narrower_load && size < target_size) {
3607 if (ctx_field_size <= 4)
3608 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
3609 (1 << size * 8) - 1);
3610 else
3611 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
3612 (1 << size * 8) - 1);
3613 }
3614
3425 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); 3615 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
3426 if (!new_prog) 3616 if (!new_prog)
3427 return -ENOMEM; 3617 return -ENOMEM;
@@ -3467,6 +3657,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
3467 * the program array. 3657 * the program array.
3468 */ 3658 */
3469 prog->cb_access = 1; 3659 prog->cb_access = 1;
3660 env->prog->aux->stack_depth = MAX_BPF_STACK;
3470 3661
3471 /* mark bpf_tail_call as different opcode to avoid 3662 /* mark bpf_tail_call as different opcode to avoid
3472 * conditional branch in the interpeter for every normal 3663 * conditional branch in the interpeter for every normal
@@ -3474,7 +3665,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
3474 * that doesn't support bpf_tail_call yet 3665 * that doesn't support bpf_tail_call yet
3475 */ 3666 */
3476 insn->imm = 0; 3667 insn->imm = 0;
3477 insn->code |= BPF_X; 3668 insn->code = BPF_JMP | BPF_TAIL_CALL;
3478 continue; 3669 continue;
3479 } 3670 }
3480 3671
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 387348a40c64..ce693ccb8c58 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
4obj-$(CONFIG_CGROUP_PIDS) += pids.o 4obj-$(CONFIG_CGROUP_PIDS) += pids.o
5obj-$(CONFIG_CGROUP_RDMA) += rdma.o 5obj-$(CONFIG_CGROUP_RDMA) += rdma.o
6obj-$(CONFIG_CPUSETS) += cpuset.o 6obj-$(CONFIG_CPUSETS) += cpuset.o
7obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 00f4d6bf048f..793565c05742 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn);
192int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, 192int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
193 struct kernfs_root *kf_root); 193 struct kernfs_root *kf_root);
194 194
195int cgroup_task_count(const struct cgroup *cgrp);
196
195/* 197/*
196 * namespace.c 198 * namespace.c
197 */ 199 */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 85d75152402d..7bf4b1533f34 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -334,19 +334,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
334/** 334/**
335 * cgroup_task_count - count the number of tasks in a cgroup. 335 * cgroup_task_count - count the number of tasks in a cgroup.
336 * @cgrp: the cgroup in question 336 * @cgrp: the cgroup in question
337 *
338 * Return the number of tasks in the cgroup. The returned number can be
339 * higher than the actual number of tasks due to css_set references from
340 * namespace roots and temporary usages.
341 */ 337 */
342static int cgroup_task_count(const struct cgroup *cgrp) 338int cgroup_task_count(const struct cgroup *cgrp)
343{ 339{
344 int count = 0; 340 int count = 0;
345 struct cgrp_cset_link *link; 341 struct cgrp_cset_link *link;
346 342
347 spin_lock_irq(&css_set_lock); 343 spin_lock_irq(&css_set_lock);
348 list_for_each_entry(link, &cgrp->cset_links, cset_link) 344 list_for_each_entry(link, &cgrp->cset_links, cset_link)
349 count += refcount_read(&link->cset->refcount); 345 count += link->cset->nr_tasks;
350 spin_unlock_irq(&css_set_lock); 346 spin_unlock_irq(&css_set_lock);
351 return count; 347 return count;
352} 348}
@@ -1263,150 +1259,3 @@ static int __init cgroup_no_v1(char *str)
1263 return 1; 1259 return 1;
1264} 1260}
1265__setup("cgroup_no_v1=", cgroup_no_v1); 1261__setup("cgroup_no_v1=", cgroup_no_v1);
1266
1267
1268#ifdef CONFIG_CGROUP_DEBUG
1269static struct cgroup_subsys_state *
1270debug_css_alloc(struct cgroup_subsys_state *parent_css)
1271{
1272 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
1273
1274 if (!css)
1275 return ERR_PTR(-ENOMEM);
1276
1277 return css;
1278}
1279
1280static void debug_css_free(struct cgroup_subsys_state *css)
1281{
1282 kfree(css);
1283}
1284
1285static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
1286 struct cftype *cft)
1287{
1288 return cgroup_task_count(css->cgroup);
1289}
1290
1291static u64 current_css_set_read(struct cgroup_subsys_state *css,
1292 struct cftype *cft)
1293{
1294 return (u64)(unsigned long)current->cgroups;
1295}
1296
1297static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
1298 struct cftype *cft)
1299{
1300 u64 count;
1301
1302 rcu_read_lock();
1303 count = refcount_read(&task_css_set(current)->refcount);
1304 rcu_read_unlock();
1305 return count;
1306}
1307
1308static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
1309{
1310 struct cgrp_cset_link *link;
1311 struct css_set *cset;
1312 char *name_buf;
1313
1314 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
1315 if (!name_buf)
1316 return -ENOMEM;
1317
1318 spin_lock_irq(&css_set_lock);
1319 rcu_read_lock();
1320 cset = rcu_dereference(current->cgroups);
1321 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1322 struct cgroup *c = link->cgrp;
1323
1324 cgroup_name(c, name_buf, NAME_MAX + 1);
1325 seq_printf(seq, "Root %d group %s\n",
1326 c->root->hierarchy_id, name_buf);
1327 }
1328 rcu_read_unlock();
1329 spin_unlock_irq(&css_set_lock);
1330 kfree(name_buf);
1331 return 0;
1332}
1333
1334#define MAX_TASKS_SHOWN_PER_CSS 25
1335static int cgroup_css_links_read(struct seq_file *seq, void *v)
1336{
1337 struct cgroup_subsys_state *css = seq_css(seq);
1338 struct cgrp_cset_link *link;
1339
1340 spin_lock_irq(&css_set_lock);
1341 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
1342 struct css_set *cset = link->cset;
1343 struct task_struct *task;
1344 int count = 0;
1345
1346 seq_printf(seq, "css_set %pK\n", cset);
1347
1348 list_for_each_entry(task, &cset->tasks, cg_list) {
1349 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
1350 goto overflow;
1351 seq_printf(seq, " task %d\n", task_pid_vnr(task));
1352 }
1353
1354 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
1355 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
1356 goto overflow;
1357 seq_printf(seq, " task %d\n", task_pid_vnr(task));
1358 }
1359 continue;
1360 overflow:
1361 seq_puts(seq, " ...\n");
1362 }
1363 spin_unlock_irq(&css_set_lock);
1364 return 0;
1365}
1366
1367static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
1368{
1369 return (!cgroup_is_populated(css->cgroup) &&
1370 !css_has_online_children(&css->cgroup->self));
1371}
1372
1373static struct cftype debug_files[] = {
1374 {
1375 .name = "taskcount",
1376 .read_u64 = debug_taskcount_read,
1377 },
1378
1379 {
1380 .name = "current_css_set",
1381 .read_u64 = current_css_set_read,
1382 },
1383
1384 {
1385 .name = "current_css_set_refcount",
1386 .read_u64 = current_css_set_refcount_read,
1387 },
1388
1389 {
1390 .name = "current_css_set_cg_links",
1391 .seq_show = current_css_set_cg_links_read,
1392 },
1393
1394 {
1395 .name = "cgroup_css_links",
1396 .seq_show = cgroup_css_links_read,
1397 },
1398
1399 {
1400 .name = "releasable",
1401 .read_u64 = releasable_read,
1402 },
1403
1404 { } /* terminate */
1405};
1406
1407struct cgroup_subsys debug_cgrp_subsys = {
1408 .css_alloc = debug_css_alloc,
1409 .css_free = debug_css_free,
1410 .legacy_cftypes = debug_files,
1411};
1412#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8d4e85eae42c..620794a20a33 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -573,6 +573,11 @@ static int css_set_count = 1; /* 1 for init_css_set */
573/** 573/**
574 * css_set_populated - does a css_set contain any tasks? 574 * css_set_populated - does a css_set contain any tasks?
575 * @cset: target css_set 575 * @cset: target css_set
576 *
577 * css_set_populated() should be the same as !!cset->nr_tasks at steady
578 * state. However, css_set_populated() can be called while a task is being
579 * added to or removed from the linked list before the nr_tasks is
580 * properly updated. Hence, we can't just look at ->nr_tasks here.
576 */ 581 */
577static bool css_set_populated(struct css_set *cset) 582static bool css_set_populated(struct css_set *cset)
578{ 583{
@@ -1542,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1542 return len; 1547 return len;
1543} 1548}
1544 1549
1550static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
1551{
1552 char *token;
1553
1554 *root_flags = 0;
1555
1556 if (!data)
1557 return 0;
1558
1559 while ((token = strsep(&data, ",")) != NULL) {
1560 if (!strcmp(token, "nsdelegate")) {
1561 *root_flags |= CGRP_ROOT_NS_DELEGATE;
1562 continue;
1563 }
1564
1565 pr_err("cgroup2: unknown option \"%s\"\n", token);
1566 return -EINVAL;
1567 }
1568
1569 return 0;
1570}
1571
1572static void apply_cgroup_root_flags(unsigned int root_flags)
1573{
1574 if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1575 if (root_flags & CGRP_ROOT_NS_DELEGATE)
1576 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1577 else
1578 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1579 }
1580}
1581
1582static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1583{
1584 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1585 seq_puts(seq, ",nsdelegate");
1586 return 0;
1587}
1588
1545static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) 1589static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1546{ 1590{
1547 pr_err("remount is not allowed\n"); 1591 unsigned int root_flags;
1548 return -EINVAL; 1592 int ret;
1593
1594 ret = parse_cgroup_root_flags(data, &root_flags);
1595 if (ret)
1596 return ret;
1597
1598 apply_cgroup_root_flags(root_flags);
1599 return 0;
1549} 1600}
1550 1601
1551/* 1602/*
@@ -1598,6 +1649,7 @@ static void cgroup_enable_task_cg_lists(void)
1598 css_set_update_populated(cset, true); 1649 css_set_update_populated(cset, true);
1599 list_add_tail(&p->cg_list, &cset->tasks); 1650 list_add_tail(&p->cg_list, &cset->tasks);
1600 get_css_set(cset); 1651 get_css_set(cset);
1652 cset->nr_tasks++;
1601 } 1653 }
1602 spin_unlock(&p->sighand->siglock); 1654 spin_unlock(&p->sighand->siglock);
1603 } while_each_thread(g, p); 1655 } while_each_thread(g, p);
@@ -1784,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1784{ 1836{
1785 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; 1837 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
1786 struct dentry *dentry; 1838 struct dentry *dentry;
1839 int ret;
1787 1840
1788 get_cgroup_ns(ns); 1841 get_cgroup_ns(ns);
1789 1842
@@ -1801,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1801 cgroup_enable_task_cg_lists(); 1854 cgroup_enable_task_cg_lists();
1802 1855
1803 if (fs_type == &cgroup2_fs_type) { 1856 if (fs_type == &cgroup2_fs_type) {
1804 if (data) { 1857 unsigned int root_flags;
1805 pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); 1858
1859 ret = parse_cgroup_root_flags(data, &root_flags);
1860 if (ret) {
1806 put_cgroup_ns(ns); 1861 put_cgroup_ns(ns);
1807 return ERR_PTR(-EINVAL); 1862 return ERR_PTR(ret);
1808 } 1863 }
1864
1809 cgrp_dfl_visible = true; 1865 cgrp_dfl_visible = true;
1810 cgroup_get_live(&cgrp_dfl_root.cgrp); 1866 cgroup_get_live(&cgrp_dfl_root.cgrp);
1811 1867
1812 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, 1868 dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
1813 CGROUP2_SUPER_MAGIC, ns); 1869 CGROUP2_SUPER_MAGIC, ns);
1870 if (!IS_ERR(dentry))
1871 apply_cgroup_root_flags(root_flags);
1814 } else { 1872 } else {
1815 dentry = cgroup1_mount(&cgroup_fs_type, flags, data, 1873 dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
1816 CGROUP_SUPER_MAGIC, ns); 1874 CGROUP_SUPER_MAGIC, ns);
@@ -2064,8 +2122,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
2064 struct css_set *to_cset = cset->mg_dst_cset; 2122 struct css_set *to_cset = cset->mg_dst_cset;
2065 2123
2066 get_css_set(to_cset); 2124 get_css_set(to_cset);
2125 to_cset->nr_tasks++;
2067 css_set_move_task(task, from_cset, to_cset, true); 2126 css_set_move_task(task, from_cset, to_cset, true);
2068 put_css_set_locked(from_cset); 2127 put_css_set_locked(from_cset);
2128 from_cset->nr_tasks--;
2069 } 2129 }
2070 } 2130 }
2071 spin_unlock_irq(&css_set_lock); 2131 spin_unlock_irq(&css_set_lock);
@@ -2355,27 +2415,14 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2355 struct cgroup *dst_cgrp, 2415 struct cgroup *dst_cgrp,
2356 struct kernfs_open_file *of) 2416 struct kernfs_open_file *of)
2357{ 2417{
2358 int ret = 0; 2418 struct super_block *sb = of->file->f_path.dentry->d_sb;
2359 2419 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2360 if (cgroup_on_dfl(dst_cgrp)) { 2420 struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
2361 struct super_block *sb = of->file->f_path.dentry->d_sb; 2421 struct cgroup *src_cgrp, *com_cgrp;
2362 struct cgroup *cgrp; 2422 struct inode *inode;
2363 struct inode *inode; 2423 int ret;
2364
2365 spin_lock_irq(&css_set_lock);
2366 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2367 spin_unlock_irq(&css_set_lock);
2368
2369 while (!cgroup_is_descendant(dst_cgrp, cgrp))
2370 cgrp = cgroup_parent(cgrp);
2371 2424
2372 ret = -ENOMEM; 2425 if (!cgroup_on_dfl(dst_cgrp)) {
2373 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
2374 if (inode) {
2375 ret = inode_permission(inode, MAY_WRITE);
2376 iput(inode);
2377 }
2378 } else {
2379 const struct cred *cred = current_cred(); 2426 const struct cred *cred = current_cred();
2380 const struct cred *tcred = get_task_cred(task); 2427 const struct cred *tcred = get_task_cred(task);
2381 2428
@@ -2383,14 +2430,47 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2383 * even if we're attaching all tasks in the thread group, 2430 * even if we're attaching all tasks in the thread group,
2384 * we only need to check permissions on one of them. 2431 * we only need to check permissions on one of them.
2385 */ 2432 */
2386 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 2433 if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
2387 !uid_eq(cred->euid, tcred->uid) && 2434 uid_eq(cred->euid, tcred->uid) ||
2388 !uid_eq(cred->euid, tcred->suid)) 2435 uid_eq(cred->euid, tcred->suid))
2436 ret = 0;
2437 else
2389 ret = -EACCES; 2438 ret = -EACCES;
2439
2390 put_cred(tcred); 2440 put_cred(tcred);
2441 return ret;
2391 } 2442 }
2392 2443
2393 return ret; 2444 /* find the source cgroup */
2445 spin_lock_irq(&css_set_lock);
2446 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2447 spin_unlock_irq(&css_set_lock);
2448
2449 /* and the common ancestor */
2450 com_cgrp = src_cgrp;
2451 while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
2452 com_cgrp = cgroup_parent(com_cgrp);
2453
2454 /* %current should be authorized to migrate to the common ancestor */
2455 inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
2456 if (!inode)
2457 return -ENOMEM;
2458
2459 ret = inode_permission(inode, MAY_WRITE);
2460 iput(inode);
2461 if (ret)
2462 return ret;
2463
2464 /*
2465 * If namespaces are delegation boundaries, %current must be able
2466 * to see both source and destination cgroups from its namespace.
2467 */
2468 if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
2469 (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
2470 !cgroup_is_descendant(dst_cgrp, root_cgrp)))
2471 return -ENOENT;
2472
2473 return 0;
2394} 2474}
2395 2475
2396/* 2476/*
@@ -2954,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of)
2954static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 3034static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2955 size_t nbytes, loff_t off) 3035 size_t nbytes, loff_t off)
2956{ 3036{
3037 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2957 struct cgroup *cgrp = of->kn->parent->priv; 3038 struct cgroup *cgrp = of->kn->parent->priv;
2958 struct cftype *cft = of->kn->priv; 3039 struct cftype *cft = of->kn->priv;
2959 struct cgroup_subsys_state *css; 3040 struct cgroup_subsys_state *css;
2960 int ret; 3041 int ret;
2961 3042
3043 /*
3044 * If namespaces are delegation boundaries, disallow writes to
3045 * files in an non-init namespace root from inside the namespace
3046 * except for the files explicitly marked delegatable -
3047 * cgroup.procs and cgroup.subtree_control.
3048 */
3049 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
3050 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3051 ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3052 return -EPERM;
3053
2962 if (cft->write) 3054 if (cft->write)
2963 return cft->write(of, buf, nbytes, off); 3055 return cft->write(of, buf, nbytes, off);
2964 3056
@@ -3792,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
3792static struct cftype cgroup_base_files[] = { 3884static struct cftype cgroup_base_files[] = {
3793 { 3885 {
3794 .name = "cgroup.procs", 3886 .name = "cgroup.procs",
3887 .flags = CFTYPE_NS_DELEGATABLE,
3795 .file_offset = offsetof(struct cgroup, procs_file), 3888 .file_offset = offsetof(struct cgroup, procs_file),
3796 .release = cgroup_procs_release, 3889 .release = cgroup_procs_release,
3797 .seq_start = cgroup_procs_start, 3890 .seq_start = cgroup_procs_start,
@@ -3805,6 +3898,7 @@ static struct cftype cgroup_base_files[] = {
3805 }, 3898 },
3806 { 3899 {
3807 .name = "cgroup.subtree_control", 3900 .name = "cgroup.subtree_control",
3901 .flags = CFTYPE_NS_DELEGATABLE,
3808 .seq_show = cgroup_subtree_control_show, 3902 .seq_show = cgroup_subtree_control_show,
3809 .write = cgroup_subtree_control_write, 3903 .write = cgroup_subtree_control_write,
3810 }, 3904 },
@@ -4393,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
4393} 4487}
4394 4488
4395static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { 4489static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4490 .show_options = cgroup_show_options,
4396 .remount_fs = cgroup_remount, 4491 .remount_fs = cgroup_remount,
4397 .mkdir = cgroup_mkdir, 4492 .mkdir = cgroup_mkdir,
4398 .rmdir = cgroup_rmdir, 4493 .rmdir = cgroup_rmdir,
@@ -4789,6 +4884,7 @@ void cgroup_post_fork(struct task_struct *child)
4789 cset = task_css_set(current); 4884 cset = task_css_set(current);
4790 if (list_empty(&child->cg_list)) { 4885 if (list_empty(&child->cg_list)) {
4791 get_css_set(cset); 4886 get_css_set(cset);
4887 cset->nr_tasks++;
4792 css_set_move_task(child, NULL, cset, false); 4888 css_set_move_task(child, NULL, cset, false);
4793 } 4889 }
4794 spin_unlock_irq(&css_set_lock); 4890 spin_unlock_irq(&css_set_lock);
@@ -4838,6 +4934,7 @@ void cgroup_exit(struct task_struct *tsk)
4838 if (!list_empty(&tsk->cg_list)) { 4934 if (!list_empty(&tsk->cg_list)) {
4839 spin_lock_irq(&css_set_lock); 4935 spin_lock_irq(&css_set_lock);
4840 css_set_move_task(tsk, cset, NULL, false); 4936 css_set_move_task(tsk, cset, NULL, false);
4937 cset->nr_tasks--;
4841 spin_unlock_irq(&css_set_lock); 4938 spin_unlock_irq(&css_set_lock);
4842 } else { 4939 } else {
4843 get_css_set(cset); 4940 get_css_set(cset);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ae643412948a..ca8376e5008c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1038,40 +1038,25 @@ static void cpuset_post_attach(void)
1038 * @tsk: the task to change 1038 * @tsk: the task to change
1039 * @newmems: new nodes that the task will be set 1039 * @newmems: new nodes that the task will be set
1040 * 1040 *
1041 * In order to avoid seeing no nodes if the old and new nodes are disjoint, 1041 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
1042 * we structure updates as setting all new allowed nodes, then clearing newly 1042 * and rebind an eventual tasks' mempolicy. If the task is allocating in
1043 * disallowed ones. 1043 * parallel, it might temporarily see an empty intersection, which results in
1044 * a seqlock check and retry before OOM or allocation failure.
1044 */ 1045 */
1045static void cpuset_change_task_nodemask(struct task_struct *tsk, 1046static void cpuset_change_task_nodemask(struct task_struct *tsk,
1046 nodemask_t *newmems) 1047 nodemask_t *newmems)
1047{ 1048{
1048 bool need_loop;
1049
1050 task_lock(tsk); 1049 task_lock(tsk);
1051 /*
1052 * Determine if a loop is necessary if another thread is doing
1053 * read_mems_allowed_begin(). If at least one node remains unchanged and
1054 * tsk does not have a mempolicy, then an empty nodemask will not be
1055 * possible when mems_allowed is larger than a word.
1056 */
1057 need_loop = task_has_mempolicy(tsk) ||
1058 !nodes_intersects(*newmems, tsk->mems_allowed);
1059 1050
1060 if (need_loop) { 1051 local_irq_disable();
1061 local_irq_disable(); 1052 write_seqcount_begin(&tsk->mems_allowed_seq);
1062 write_seqcount_begin(&tsk->mems_allowed_seq);
1063 }
1064 1053
1065 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 1054 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1066 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 1055 mpol_rebind_task(tsk, newmems);
1067
1068 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1069 tsk->mems_allowed = *newmems; 1056 tsk->mems_allowed = *newmems;
1070 1057
1071 if (need_loop) { 1058 write_seqcount_end(&tsk->mems_allowed_seq);
1072 write_seqcount_end(&tsk->mems_allowed_seq); 1059 local_irq_enable();
1073 local_irq_enable();
1074 }
1075 1060
1076 task_unlock(tsk); 1061 task_unlock(tsk);
1077} 1062}
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
new file mode 100644
index 000000000000..dac46af22782
--- /dev/null
+++ b/kernel/cgroup/debug.c
@@ -0,0 +1,357 @@
1/*
2 * Debug controller
3 *
4 * WARNING: This controller is for cgroup core debugging only.
5 * Its interfaces are unstable and subject to changes at any time.
6 */
7#include <linux/ctype.h>
8#include <linux/mm.h>
9#include <linux/slab.h>
10
11#include "cgroup-internal.h"
12
13static struct cgroup_subsys_state *
14debug_css_alloc(struct cgroup_subsys_state *parent_css)
15{
16 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
17
18 if (!css)
19 return ERR_PTR(-ENOMEM);
20
21 return css;
22}
23
24static void debug_css_free(struct cgroup_subsys_state *css)
25{
26 kfree(css);
27}
28
29/*
30 * debug_taskcount_read - return the number of tasks in a cgroup.
31 * @cgrp: the cgroup in question
32 */
33static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
34 struct cftype *cft)
35{
36 return cgroup_task_count(css->cgroup);
37}
38
39static int current_css_set_read(struct seq_file *seq, void *v)
40{
41 struct kernfs_open_file *of = seq->private;
42 struct css_set *cset;
43 struct cgroup_subsys *ss;
44 struct cgroup_subsys_state *css;
45 int i, refcnt;
46
47 if (!cgroup_kn_lock_live(of->kn, false))
48 return -ENODEV;
49
50 spin_lock_irq(&css_set_lock);
51 rcu_read_lock();
52 cset = rcu_dereference(current->cgroups);
53 refcnt = refcount_read(&cset->refcount);
54 seq_printf(seq, "css_set %pK %d", cset, refcnt);
55 if (refcnt > cset->nr_tasks)
56 seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
57 seq_puts(seq, "\n");
58
59 /*
60 * Print the css'es stored in the current css_set.
61 */
62 for_each_subsys(ss, i) {
63 css = cset->subsys[ss->id];
64 if (!css)
65 continue;
66 seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
67 (unsigned long)css, css->id);
68 }
69 rcu_read_unlock();
70 spin_unlock_irq(&css_set_lock);
71 cgroup_kn_unlock(of->kn);
72 return 0;
73}
74
75static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
76 struct cftype *cft)
77{
78 u64 count;
79
80 rcu_read_lock();
81 count = refcount_read(&task_css_set(current)->refcount);
82 rcu_read_unlock();
83 return count;
84}
85
86static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
87{
88 struct cgrp_cset_link *link;
89 struct css_set *cset;
90 char *name_buf;
91
92 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
93 if (!name_buf)
94 return -ENOMEM;
95
96 spin_lock_irq(&css_set_lock);
97 rcu_read_lock();
98 cset = rcu_dereference(current->cgroups);
99 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
100 struct cgroup *c = link->cgrp;
101
102 cgroup_name(c, name_buf, NAME_MAX + 1);
103 seq_printf(seq, "Root %d group %s\n",
104 c->root->hierarchy_id, name_buf);
105 }
106 rcu_read_unlock();
107 spin_unlock_irq(&css_set_lock);
108 kfree(name_buf);
109 return 0;
110}
111
112#define MAX_TASKS_SHOWN_PER_CSS 25
113static int cgroup_css_links_read(struct seq_file *seq, void *v)
114{
115 struct cgroup_subsys_state *css = seq_css(seq);
116 struct cgrp_cset_link *link;
117 int dead_cnt = 0, extra_refs = 0;
118
119 spin_lock_irq(&css_set_lock);
120 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
121 struct css_set *cset = link->cset;
122 struct task_struct *task;
123 int count = 0;
124 int refcnt = refcount_read(&cset->refcount);
125
126 seq_printf(seq, " %d", refcnt);
127 if (refcnt - cset->nr_tasks > 0) {
128 int extra = refcnt - cset->nr_tasks;
129
130 seq_printf(seq, " +%d", extra);
131 /*
132 * Take out the one additional reference in
133 * init_css_set.
134 */
135 if (cset == &init_css_set)
136 extra--;
137 extra_refs += extra;
138 }
139 seq_puts(seq, "\n");
140
141 list_for_each_entry(task, &cset->tasks, cg_list) {
142 if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
143 seq_printf(seq, " task %d\n",
144 task_pid_vnr(task));
145 }
146
147 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
148 if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
149 seq_printf(seq, " task %d\n",
150 task_pid_vnr(task));
151 }
152 /* show # of overflowed tasks */
153 if (count > MAX_TASKS_SHOWN_PER_CSS)
154 seq_printf(seq, " ... (%d)\n",
155 count - MAX_TASKS_SHOWN_PER_CSS);
156
157 if (cset->dead) {
158 seq_puts(seq, " [dead]\n");
159 dead_cnt++;
160 }
161
162 WARN_ON(count != cset->nr_tasks);
163 }
164 spin_unlock_irq(&css_set_lock);
165
166 if (!dead_cnt && !extra_refs)
167 return 0;
168
169 seq_puts(seq, "\n");
170 if (extra_refs)
171 seq_printf(seq, "extra references = %d\n", extra_refs);
172 if (dead_cnt)
173 seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
174
175 return 0;
176}
177
178static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
179{
180 struct kernfs_open_file *of = seq->private;
181 struct cgroup *cgrp;
182 struct cgroup_subsys *ss;
183 struct cgroup_subsys_state *css;
184 char pbuf[16];
185 int i;
186
187 cgrp = cgroup_kn_lock_live(of->kn, false);
188 if (!cgrp)
189 return -ENODEV;
190
191 for_each_subsys(ss, i) {
192 css = rcu_dereference_check(cgrp->subsys[ss->id], true);
193 if (!css)
194 continue;
195
196 pbuf[0] = '\0';
197
198 /* Show the parent CSS if applicable*/
199 if (css->parent)
200 snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
201 css->parent->id);
202 seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
203 (unsigned long)css, css->id,
204 atomic_read(&css->online_cnt), pbuf);
205 }
206
207 cgroup_kn_unlock(of->kn);
208 return 0;
209}
210
211static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
212 u16 mask)
213{
214 struct cgroup_subsys *ss;
215 int ssid;
216 bool first = true;
217
218 seq_printf(seq, "%-17s: ", name);
219 for_each_subsys(ss, ssid) {
220 if (!(mask & (1 << ssid)))
221 continue;
222 if (!first)
223 seq_puts(seq, ", ");
224 seq_puts(seq, ss->name);
225 first = false;
226 }
227 seq_putc(seq, '\n');
228}
229
230static int cgroup_masks_read(struct seq_file *seq, void *v)
231{
232 struct kernfs_open_file *of = seq->private;
233 struct cgroup *cgrp;
234
235 cgrp = cgroup_kn_lock_live(of->kn, false);
236 if (!cgrp)
237 return -ENODEV;
238
239 cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
240 cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
241
242 cgroup_kn_unlock(of->kn);
243 return 0;
244}
245
246static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
247{
248 return (!cgroup_is_populated(css->cgroup) &&
249 !css_has_online_children(&css->cgroup->self));
250}
251
252static struct cftype debug_legacy_files[] = {
253 {
254 .name = "taskcount",
255 .read_u64 = debug_taskcount_read,
256 },
257
258 {
259 .name = "current_css_set",
260 .seq_show = current_css_set_read,
261 .flags = CFTYPE_ONLY_ON_ROOT,
262 },
263
264 {
265 .name = "current_css_set_refcount",
266 .read_u64 = current_css_set_refcount_read,
267 .flags = CFTYPE_ONLY_ON_ROOT,
268 },
269
270 {
271 .name = "current_css_set_cg_links",
272 .seq_show = current_css_set_cg_links_read,
273 .flags = CFTYPE_ONLY_ON_ROOT,
274 },
275
276 {
277 .name = "cgroup_css_links",
278 .seq_show = cgroup_css_links_read,
279 },
280
281 {
282 .name = "cgroup_subsys_states",
283 .seq_show = cgroup_subsys_states_read,
284 },
285
286 {
287 .name = "cgroup_masks",
288 .seq_show = cgroup_masks_read,
289 },
290
291 {
292 .name = "releasable",
293 .read_u64 = releasable_read,
294 },
295
296 { } /* terminate */
297};
298
299static struct cftype debug_files[] = {
300 {
301 .name = "taskcount",
302 .read_u64 = debug_taskcount_read,
303 },
304
305 {
306 .name = "current_css_set",
307 .seq_show = current_css_set_read,
308 .flags = CFTYPE_ONLY_ON_ROOT,
309 },
310
311 {
312 .name = "current_css_set_refcount",
313 .read_u64 = current_css_set_refcount_read,
314 .flags = CFTYPE_ONLY_ON_ROOT,
315 },
316
317 {
318 .name = "current_css_set_cg_links",
319 .seq_show = current_css_set_cg_links_read,
320 .flags = CFTYPE_ONLY_ON_ROOT,
321 },
322
323 {
324 .name = "css_links",
325 .seq_show = cgroup_css_links_read,
326 },
327
328 {
329 .name = "csses",
330 .seq_show = cgroup_subsys_states_read,
331 },
332
333 {
334 .name = "masks",
335 .seq_show = cgroup_masks_read,
336 },
337
338 { } /* terminate */
339};
340
341struct cgroup_subsys debug_cgrp_subsys = {
342 .css_alloc = debug_css_alloc,
343 .css_free = debug_css_free,
344 .legacy_cftypes = debug_legacy_files,
345};
346
347/*
348 * On v2, debug is an implicit controller enabled by "cgroup_debug" boot
349 * parameter.
350 */
351static int __init enable_cgroup_debug(char *str)
352{
353 debug_cgrp_subsys.dfl_cftypes = debug_files;
354 debug_cgrp_subsys.implicit_on_dfl = true;
355 return 1;
356}
357__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/compat.c b/kernel/compat.c
index ebd8bdc3fd68..6f0a0e723a06 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -120,6 +120,50 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
120 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 120 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
121} 121}
122 122
123static int __compat_get_timespec64(struct timespec64 *ts64,
124 const struct compat_timespec __user *cts)
125{
126 struct compat_timespec ts;
127 int ret;
128
129 ret = copy_from_user(&ts, cts, sizeof(ts));
130 if (ret)
131 return -EFAULT;
132
133 ts64->tv_sec = ts.tv_sec;
134 ts64->tv_nsec = ts.tv_nsec;
135
136 return 0;
137}
138
139static int __compat_put_timespec64(const struct timespec64 *ts64,
140 struct compat_timespec __user *cts)
141{
142 struct compat_timespec ts = {
143 .tv_sec = ts64->tv_sec,
144 .tv_nsec = ts64->tv_nsec
145 };
146 return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
147}
148
149int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
150{
151 if (COMPAT_USE_64BIT_TIME)
152 return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
153 else
154 return __compat_get_timespec64(ts, uts);
155}
156EXPORT_SYMBOL_GPL(compat_get_timespec64);
157
158int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
159{
160 if (COMPAT_USE_64BIT_TIME)
161 return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
162 else
163 return __compat_put_timespec64(ts, uts);
164}
165EXPORT_SYMBOL_GPL(compat_put_timespec64);
166
123int compat_get_timeval(struct timeval *tv, const void __user *utv) 167int compat_get_timeval(struct timeval *tv, const void __user *utv)
124{ 168{
125 if (COMPAT_USE_64BIT_TIME) 169 if (COMPAT_USE_64BIT_TIME)
@@ -203,53 +247,6 @@ int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerv
203 return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0; 247 return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0;
204} 248}
205 249
206static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
207{
208 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
209}
210
211COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
212{
213 if (tbuf) {
214 struct tms tms;
215 struct compat_tms tmp;
216
217 do_sys_times(&tms);
218 /* Convert our struct tms to the compat version. */
219 tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
220 tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
221 tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
222 tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
223 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
224 return -EFAULT;
225 }
226 force_successful_syscall_return();
227 return compat_jiffies_to_clock_t(jiffies);
228}
229
230#ifdef __ARCH_WANT_SYS_SIGPENDING
231
232/*
233 * Assumption: old_sigset_t and compat_old_sigset_t are both
234 * types that can be passed to put_user()/get_user().
235 */
236
237COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
238{
239 old_sigset_t s;
240 long ret;
241 mm_segment_t old_fs = get_fs();
242
243 set_fs(KERNEL_DS);
244 ret = sys_sigpending((old_sigset_t __user *) &s);
245 set_fs(old_fs);
246 if (ret == 0)
247 ret = put_user(s, set);
248 return ret;
249}
250
251#endif
252
253#ifdef __ARCH_WANT_SYS_SIGPROCMASK 250#ifdef __ARCH_WANT_SYS_SIGPROCMASK
254 251
255/* 252/*
@@ -304,164 +301,33 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
304 301
305#endif 302#endif
306 303
307COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
308 struct compat_rlimit __user *, rlim)
309{
310 struct rlimit r;
311
312 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
313 __get_user(r.rlim_cur, &rlim->rlim_cur) ||
314 __get_user(r.rlim_max, &rlim->rlim_max))
315 return -EFAULT;
316
317 if (r.rlim_cur == COMPAT_RLIM_INFINITY)
318 r.rlim_cur = RLIM_INFINITY;
319 if (r.rlim_max == COMPAT_RLIM_INFINITY)
320 r.rlim_max = RLIM_INFINITY;
321 return do_prlimit(current, resource, &r, NULL);
322}
323
324#ifdef COMPAT_RLIM_OLD_INFINITY
325
326COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
327 struct compat_rlimit __user *, rlim)
328{
329 struct rlimit r;
330 int ret;
331 mm_segment_t old_fs = get_fs();
332
333 set_fs(KERNEL_DS);
334 ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
335 set_fs(old_fs);
336
337 if (!ret) {
338 if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
339 r.rlim_cur = COMPAT_RLIM_INFINITY;
340 if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
341 r.rlim_max = COMPAT_RLIM_INFINITY;
342
343 if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
344 __put_user(r.rlim_cur, &rlim->rlim_cur) ||
345 __put_user(r.rlim_max, &rlim->rlim_max))
346 return -EFAULT;
347 }
348 return ret;
349}
350
351#endif
352
353COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
354 struct compat_rlimit __user *, rlim)
355{
356 struct rlimit r;
357 int ret;
358
359 ret = do_prlimit(current, resource, NULL, &r);
360 if (!ret) {
361 if (r.rlim_cur > COMPAT_RLIM_INFINITY)
362 r.rlim_cur = COMPAT_RLIM_INFINITY;
363 if (r.rlim_max > COMPAT_RLIM_INFINITY)
364 r.rlim_max = COMPAT_RLIM_INFINITY;
365
366 if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
367 __put_user(r.rlim_cur, &rlim->rlim_cur) ||
368 __put_user(r.rlim_max, &rlim->rlim_max))
369 return -EFAULT;
370 }
371 return ret;
372}
373
374int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) 304int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
375{ 305{
376 if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || 306 struct compat_rusage r32;
377 __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || 307 memset(&r32, 0, sizeof(r32));
378 __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || 308 r32.ru_utime.tv_sec = r->ru_utime.tv_sec;
379 __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || 309 r32.ru_utime.tv_usec = r->ru_utime.tv_usec;
380 __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || 310 r32.ru_stime.tv_sec = r->ru_stime.tv_sec;
381 __put_user(r->ru_maxrss, &ru->ru_maxrss) || 311 r32.ru_stime.tv_usec = r->ru_stime.tv_usec;
382 __put_user(r->ru_ixrss, &ru->ru_ixrss) || 312 r32.ru_maxrss = r->ru_maxrss;
383 __put_user(r->ru_idrss, &ru->ru_idrss) || 313 r32.ru_ixrss = r->ru_ixrss;
384 __put_user(r->ru_isrss, &ru->ru_isrss) || 314 r32.ru_idrss = r->ru_idrss;
385 __put_user(r->ru_minflt, &ru->ru_minflt) || 315 r32.ru_isrss = r->ru_isrss;
386 __put_user(r->ru_majflt, &ru->ru_majflt) || 316 r32.ru_minflt = r->ru_minflt;
387 __put_user(r->ru_nswap, &ru->ru_nswap) || 317 r32.ru_majflt = r->ru_majflt;
388 __put_user(r->ru_inblock, &ru->ru_inblock) || 318 r32.ru_nswap = r->ru_nswap;
389 __put_user(r->ru_oublock, &ru->ru_oublock) || 319 r32.ru_inblock = r->ru_inblock;
390 __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || 320 r32.ru_oublock = r->ru_oublock;
391 __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || 321 r32.ru_msgsnd = r->ru_msgsnd;
392 __put_user(r->ru_nsignals, &ru->ru_nsignals) || 322 r32.ru_msgrcv = r->ru_msgrcv;
393 __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || 323 r32.ru_nsignals = r->ru_nsignals;
394 __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) 324 r32.ru_nvcsw = r->ru_nvcsw;
325 r32.ru_nivcsw = r->ru_nivcsw;
326 if (copy_to_user(ru, &r32, sizeof(r32)))
395 return -EFAULT; 327 return -EFAULT;
396 return 0; 328 return 0;
397} 329}
398 330
399COMPAT_SYSCALL_DEFINE4(wait4,
400 compat_pid_t, pid,
401 compat_uint_t __user *, stat_addr,
402 int, options,
403 struct compat_rusage __user *, ru)
404{
405 if (!ru) {
406 return sys_wait4(pid, stat_addr, options, NULL);
407 } else {
408 struct rusage r;
409 int ret;
410 unsigned int status;
411 mm_segment_t old_fs = get_fs();
412
413 set_fs (KERNEL_DS);
414 ret = sys_wait4(pid,
415 (stat_addr ?
416 (unsigned int __user *) &status : NULL),
417 options, (struct rusage __user *) &r);
418 set_fs (old_fs);
419
420 if (ret > 0) {
421 if (put_compat_rusage(&r, ru))
422 return -EFAULT;
423 if (stat_addr && put_user(status, stat_addr))
424 return -EFAULT;
425 }
426 return ret;
427 }
428}
429
430COMPAT_SYSCALL_DEFINE5(waitid,
431 int, which, compat_pid_t, pid,
432 struct compat_siginfo __user *, uinfo, int, options,
433 struct compat_rusage __user *, uru)
434{
435 siginfo_t info;
436 struct rusage ru;
437 long ret;
438 mm_segment_t old_fs = get_fs();
439
440 memset(&info, 0, sizeof(info));
441
442 set_fs(KERNEL_DS);
443 ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
444 uru ? (struct rusage __user *)&ru : NULL);
445 set_fs(old_fs);
446
447 if ((ret < 0) || (info.si_signo == 0))
448 return ret;
449
450 if (uru) {
451 /* sys_waitid() overwrites everything in ru */
452 if (COMPAT_USE_64BIT_TIME)
453 ret = copy_to_user(uru, &ru, sizeof(ru));
454 else
455 ret = put_compat_rusage(&ru, uru);
456 if (ret)
457 return -EFAULT;
458 }
459
460 BUG_ON(info.si_code & __SI_MASK);
461 info.si_code |= __SI_CHLD;
462 return copy_siginfo_to_user32(uinfo, &info);
463}
464
465static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, 331static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
466 unsigned len, struct cpumask *new_mask) 332 unsigned len, struct cpumask *new_mask)
467{ 333{
@@ -542,6 +408,27 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst,
542 return 0; 408 return 0;
543} 409}
544 410
411int get_compat_itimerspec64(struct itimerspec64 *its,
412 const struct compat_itimerspec __user *uits)
413{
414
415 if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) ||
416 __compat_get_timespec64(&its->it_value, &uits->it_value))
417 return -EFAULT;
418 return 0;
419}
420EXPORT_SYMBOL_GPL(get_compat_itimerspec64);
421
422int put_compat_itimerspec64(const struct itimerspec64 *its,
423 struct compat_itimerspec __user *uits)
424{
425 if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) ||
426 __compat_put_timespec64(&its->it_value, &uits->it_value))
427 return -EFAULT;
428 return 0;
429}
430EXPORT_SYMBOL_GPL(put_compat_itimerspec64);
431
545/* 432/*
546 * We currently only need the following fields from the sigevent 433 * We currently only need the following fields from the sigevent
547 * structure: sigev_value, sigev_signo, sig_notify and (sometimes 434 * structure: sigev_value, sigev_signo, sig_notify and (sometimes
@@ -566,84 +453,59 @@ int get_compat_sigevent(struct sigevent *event,
566long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, 453long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
567 unsigned long bitmap_size) 454 unsigned long bitmap_size)
568{ 455{
569 int i, j;
570 unsigned long m;
571 compat_ulong_t um;
572 unsigned long nr_compat_longs; 456 unsigned long nr_compat_longs;
573 457
574 /* align bitmap up to nearest compat_long_t boundary */ 458 /* align bitmap up to nearest compat_long_t boundary */
575 bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); 459 bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
460 nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
576 461
577 if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) 462 if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
578 return -EFAULT; 463 return -EFAULT;
579 464
580 nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); 465 user_access_begin();
581 466 while (nr_compat_longs > 1) {
582 for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { 467 compat_ulong_t l1, l2;
583 m = 0; 468 unsafe_get_user(l1, umask++, Efault);
584 469 unsafe_get_user(l2, umask++, Efault);
585 for (j = 0; j < sizeof(m)/sizeof(um); j++) { 470 *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1;
586 /* 471 nr_compat_longs -= 2;
587 * We dont want to read past the end of the userspace
588 * bitmap. We must however ensure the end of the
589 * kernel bitmap is zeroed.
590 */
591 if (nr_compat_longs) {
592 nr_compat_longs--;
593 if (__get_user(um, umask))
594 return -EFAULT;
595 } else {
596 um = 0;
597 }
598
599 umask++;
600 m |= (long)um << (j * BITS_PER_COMPAT_LONG);
601 }
602 *mask++ = m;
603 } 472 }
604 473 if (nr_compat_longs)
474 unsafe_get_user(*mask, umask++, Efault);
475 user_access_end();
605 return 0; 476 return 0;
477
478Efault:
479 user_access_end();
480 return -EFAULT;
606} 481}
607 482
608long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, 483long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
609 unsigned long bitmap_size) 484 unsigned long bitmap_size)
610{ 485{
611 int i, j;
612 unsigned long m;
613 compat_ulong_t um;
614 unsigned long nr_compat_longs; 486 unsigned long nr_compat_longs;
615 487
616 /* align bitmap up to nearest compat_long_t boundary */ 488 /* align bitmap up to nearest compat_long_t boundary */
617 bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); 489 bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
490 nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
618 491
619 if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) 492 if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
620 return -EFAULT; 493 return -EFAULT;
621 494
622 nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); 495 user_access_begin();
623 496 while (nr_compat_longs > 1) {
624 for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { 497 unsigned long m = *mask++;
625 m = *mask++; 498 unsafe_put_user((compat_ulong_t)m, umask++, Efault);
626 499 unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault);
627 for (j = 0; j < sizeof(m)/sizeof(um); j++) { 500 nr_compat_longs -= 2;
628 um = m;
629
630 /*
631 * We dont want to write past the end of the userspace
632 * bitmap.
633 */
634 if (nr_compat_longs) {
635 nr_compat_longs--;
636 if (__put_user(um, umask))
637 return -EFAULT;
638 }
639
640 umask++;
641 m >>= 4*sizeof(um);
642 m >>= 4*sizeof(um);
643 }
644 } 501 }
645 502 if (nr_compat_longs)
503 unsafe_put_user((compat_ulong_t)*mask, umask++, Efault);
504 user_access_end();
646 return 0; 505 return 0;
506Efault:
507 user_access_end();
508 return -EFAULT;
647} 509}
648 510
649void 511void
@@ -669,38 +531,6 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
669 } 531 }
670} 532}
671 533
672COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
673 struct compat_siginfo __user *, uinfo,
674 struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
675{
676 compat_sigset_t s32;
677 sigset_t s;
678 struct timespec t;
679 siginfo_t info;
680 long ret;
681
682 if (sigsetsize != sizeof(sigset_t))
683 return -EINVAL;
684
685 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
686 return -EFAULT;
687 sigset_from_compat(&s, &s32);
688
689 if (uts) {
690 if (compat_get_timespec(&t, uts))
691 return -EFAULT;
692 }
693
694 ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
695
696 if (ret > 0 && uinfo) {
697 if (copy_siginfo_to_user32(uinfo, &info))
698 ret = -EFAULT;
699 }
700
701 return ret;
702}
703
704#ifdef CONFIG_NUMA 534#ifdef CONFIG_NUMA
705COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, 535COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
706 compat_uptr_t __user *, pages32, 536 compat_uptr_t __user *, pages32,
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index 26a06e09a5bd..d70829033bb7 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -1,10 +1,13 @@
1# KEEP ALPHABETICALLY SORTED 1# KEEP ALPHABETICALLY SORTED
2# CONFIG_DEVKMEM is not set 2# CONFIG_DEVKMEM is not set
3# CONFIG_DEVMEM is not set 3# CONFIG_DEVMEM is not set
4# CONFIG_FHANDLE is not set
4# CONFIG_INET_LRO is not set 5# CONFIG_INET_LRO is not set
5# CONFIG_MODULES is not set 6# CONFIG_NFSD is not set
7# CONFIG_NFS_FS is not set
6# CONFIG_OABI_COMPAT is not set 8# CONFIG_OABI_COMPAT is not set
7# CONFIG_SYSVIPC is not set 9# CONFIG_SYSVIPC is not set
10# CONFIG_USELIB is not set
8CONFIG_ANDROID=y 11CONFIG_ANDROID=y
9CONFIG_ANDROID_BINDER_IPC=y 12CONFIG_ANDROID_BINDER_IPC=y
10CONFIG_ANDROID_LOW_MEMORY_KILLER=y 13CONFIG_ANDROID_LOW_MEMORY_KILLER=y
@@ -13,6 +16,7 @@ CONFIG_ASHMEM=y
13CONFIG_AUDIT=y 16CONFIG_AUDIT=y
14CONFIG_BLK_DEV_INITRD=y 17CONFIG_BLK_DEV_INITRD=y
15CONFIG_CGROUPS=y 18CONFIG_CGROUPS=y
19CONFIG_CGROUP_BPF=y
16CONFIG_CGROUP_CPUACCT=y 20CONFIG_CGROUP_CPUACCT=y
17CONFIG_CGROUP_DEBUG=y 21CONFIG_CGROUP_DEBUG=y
18CONFIG_CGROUP_FREEZER=y 22CONFIG_CGROUP_FREEZER=y
@@ -23,6 +27,8 @@ CONFIG_EMBEDDED=y
23CONFIG_FB=y 27CONFIG_FB=y
24CONFIG_HARDENED_USERCOPY=y 28CONFIG_HARDENED_USERCOPY=y
25CONFIG_HIGH_RES_TIMERS=y 29CONFIG_HIGH_RES_TIMERS=y
30CONFIG_IKCONFIG=y
31CONFIG_IKCONFIG_PROC=y
26CONFIG_INET6_AH=y 32CONFIG_INET6_AH=y
27CONFIG_INET6_ESP=y 33CONFIG_INET6_ESP=y
28CONFIG_INET6_IPCOMP=y 34CONFIG_INET6_IPCOMP=y
@@ -60,6 +66,9 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y
60CONFIG_IP_NF_TARGET_NETMAP=y 66CONFIG_IP_NF_TARGET_NETMAP=y
61CONFIG_IP_NF_TARGET_REDIRECT=y 67CONFIG_IP_NF_TARGET_REDIRECT=y
62CONFIG_IP_NF_TARGET_REJECT=y 68CONFIG_IP_NF_TARGET_REJECT=y
69CONFIG_MODULES=y
70CONFIG_MODULE_UNLOAD=y
71CONFIG_MODVERSIONS=y
63CONFIG_NET=y 72CONFIG_NET=y
64CONFIG_NETDEVICES=y 73CONFIG_NETDEVICES=y
65CONFIG_NETFILTER=y 74CONFIG_NETFILTER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index 28ee064b6744..946fb92418f7 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -6,13 +6,15 @@
6# CONFIG_NF_CONNTRACK_SIP is not set 6# CONFIG_NF_CONNTRACK_SIP is not set
7# CONFIG_PM_WAKELOCKS_GC is not set 7# CONFIG_PM_WAKELOCKS_GC is not set
8# CONFIG_VT is not set 8# CONFIG_VT is not set
9CONFIG_ARM64_SW_TTBR0_PAN=y
9CONFIG_BACKLIGHT_LCD_SUPPORT=y 10CONFIG_BACKLIGHT_LCD_SUPPORT=y
10CONFIG_BLK_DEV_DM=y 11CONFIG_BLK_DEV_DM=y
11CONFIG_BLK_DEV_LOOP=y 12CONFIG_BLK_DEV_LOOP=y
12CONFIG_BLK_DEV_RAM=y 13CONFIG_BLK_DEV_RAM=y
13CONFIG_BLK_DEV_RAM_SIZE=8192 14CONFIG_BLK_DEV_RAM_SIZE=8192
15CONFIG_CC_STACKPROTECTOR_STRONG=y
14CONFIG_COMPACTION=y 16CONFIG_COMPACTION=y
15CONFIG_STRICT_KERNEL_RWX=y 17CONFIG_CPU_SW_DOMAIN_PAN=y
16CONFIG_DM_CRYPT=y 18CONFIG_DM_CRYPT=y
17CONFIG_DM_UEVENT=y 19CONFIG_DM_UEVENT=y
18CONFIG_DM_VERITY=y 20CONFIG_DM_VERITY=y
@@ -105,6 +107,7 @@ CONFIG_SCHEDSTATS=y
105CONFIG_SMARTJOYPLUS_FF=y 107CONFIG_SMARTJOYPLUS_FF=y
106CONFIG_SND=y 108CONFIG_SND=y
107CONFIG_SOUND=y 109CONFIG_SOUND=y
110CONFIG_STRICT_KERNEL_RWX=y
108CONFIG_SUSPEND_TIME=y 111CONFIG_SUSPEND_TIME=y
109CONFIG_TABLET_USB_ACECAD=y 112CONFIG_TABLET_USB_ACECAD=y
110CONFIG_TABLET_USB_AIPTEK=y 113CONFIG_TABLET_USB_AIPTEK=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b03a32595cfe..eee033134262 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -271,11 +271,26 @@ void cpu_hotplug_enable(void)
271EXPORT_SYMBOL_GPL(cpu_hotplug_enable); 271EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
272#endif /* CONFIG_HOTPLUG_CPU */ 272#endif /* CONFIG_HOTPLUG_CPU */
273 273
274static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
275
274static int bringup_wait_for_ap(unsigned int cpu) 276static int bringup_wait_for_ap(unsigned int cpu)
275{ 277{
276 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 278 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
277 279
280 /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
278 wait_for_completion(&st->done); 281 wait_for_completion(&st->done);
282 if (WARN_ON_ONCE((!cpu_online(cpu))))
283 return -ECANCELED;
284
285 /* Unpark the stopper thread and the hotplug thread of the target cpu */
286 stop_machine_unpark(cpu);
287 kthread_unpark(st->thread);
288
289 /* Should we go further up ? */
290 if (st->target > CPUHP_AP_ONLINE_IDLE) {
291 __cpuhp_kick_ap_work(st);
292 wait_for_completion(&st->done);
293 }
279 return st->result; 294 return st->result;
280} 295}
281 296
@@ -296,9 +311,7 @@ static int bringup_cpu(unsigned int cpu)
296 irq_unlock_sparse(); 311 irq_unlock_sparse();
297 if (ret) 312 if (ret)
298 return ret; 313 return ret;
299 ret = bringup_wait_for_ap(cpu); 314 return bringup_wait_for_ap(cpu);
300 BUG_ON(!cpu_online(cpu));
301 return ret;
302} 315}
303 316
304/* 317/*
@@ -767,31 +780,20 @@ void notify_cpu_starting(unsigned int cpu)
767} 780}
768 781
769/* 782/*
770 * Called from the idle task. We need to set active here, so we can kick off 783 * Called from the idle task. Wake up the controlling task which brings the
771 * the stopper thread and unpark the smpboot threads. If the target state is 784 * stopper and the hotplug thread of the upcoming CPU up and then delegates
772 * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the 785 * the rest of the online bringup to the hotplug thread.
773 * cpu further.
774 */ 786 */
775void cpuhp_online_idle(enum cpuhp_state state) 787void cpuhp_online_idle(enum cpuhp_state state)
776{ 788{
777 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); 789 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
778 unsigned int cpu = smp_processor_id();
779 790
780 /* Happens for the boot cpu */ 791 /* Happens for the boot cpu */
781 if (state != CPUHP_AP_ONLINE_IDLE) 792 if (state != CPUHP_AP_ONLINE_IDLE)
782 return; 793 return;
783 794
784 st->state = CPUHP_AP_ONLINE_IDLE; 795 st->state = CPUHP_AP_ONLINE_IDLE;
785 796 complete(&st->done);
786 /* Unpark the stopper thread and the hotplug thread of this cpu */
787 stop_machine_unpark(cpu);
788 kthread_unpark(st->thread);
789
790 /* Should we go further up ? */
791 if (st->target > CPUHP_AP_ONLINE_IDLE)
792 __cpuhp_kick_ap_work(st);
793 else
794 complete(&st->done);
795} 797}
796 798
797/* Requires cpu_add_remove_lock to be held */ 799/* Requires cpu_add_remove_lock to be held */
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index fcbd568f1e95..6db80fc0810b 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,10 +14,12 @@
14#include <asm/sections.h> 14#include <asm/sections.h>
15 15
16/* vmcoreinfo stuff */ 16/* vmcoreinfo stuff */
17static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 17static unsigned char *vmcoreinfo_data;
18u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; 18static size_t vmcoreinfo_size;
19size_t vmcoreinfo_size; 19u32 *vmcoreinfo_note;
20size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 20
21/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
22static unsigned char *vmcoreinfo_data_safecopy;
21 23
22/* 24/*
23 * parsing the "crashkernel" commandline 25 * parsing the "crashkernel" commandline
@@ -324,8 +326,23 @@ static void update_vmcoreinfo_note(void)
324 final_note(buf); 326 final_note(buf);
325} 327}
326 328
329void crash_update_vmcoreinfo_safecopy(void *ptr)
330{
331 if (ptr)
332 memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
333
334 vmcoreinfo_data_safecopy = ptr;
335}
336
327void crash_save_vmcoreinfo(void) 337void crash_save_vmcoreinfo(void)
328{ 338{
339 if (!vmcoreinfo_note)
340 return;
341
342 /* Use the safe copy to generate vmcoreinfo note if have */
343 if (vmcoreinfo_data_safecopy)
344 vmcoreinfo_data = vmcoreinfo_data_safecopy;
345
329 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); 346 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
330 update_vmcoreinfo_note(); 347 update_vmcoreinfo_note();
331} 348}
@@ -340,7 +357,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
340 r = vscnprintf(buf, sizeof(buf), fmt, args); 357 r = vscnprintf(buf, sizeof(buf), fmt, args);
341 va_end(args); 358 va_end(args);
342 359
343 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); 360 r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
344 361
345 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); 362 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
346 363
@@ -356,11 +373,26 @@ void __weak arch_crash_save_vmcoreinfo(void)
356 373
357phys_addr_t __weak paddr_vmcoreinfo_note(void) 374phys_addr_t __weak paddr_vmcoreinfo_note(void)
358{ 375{
359 return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); 376 return __pa(vmcoreinfo_note);
360} 377}
361 378
362static int __init crash_save_vmcoreinfo_init(void) 379static int __init crash_save_vmcoreinfo_init(void)
363{ 380{
381 vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
382 if (!vmcoreinfo_data) {
383 pr_warn("Memory allocation for vmcoreinfo_data failed\n");
384 return -ENOMEM;
385 }
386
387 vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
388 GFP_KERNEL | __GFP_ZERO);
389 if (!vmcoreinfo_note) {
390 free_page((unsigned long)vmcoreinfo_data);
391 vmcoreinfo_data = NULL;
392 pr_warn("Memory allocation for vmcoreinfo_note failed\n");
393 return -ENOMEM;
394 }
395
364 VMCOREINFO_OSRELEASE(init_uts_ns.name.release); 396 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
365 VMCOREINFO_PAGESIZE(PAGE_SIZE); 397 VMCOREINFO_PAGESIZE(PAGE_SIZE);
366 398
diff --git a/kernel/cred.c b/kernel/cred.c
index 2bc66075740f..ecf03657e71c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
1/* Task credentials management - see Documentation/security/credentials.txt 1/* Task credentials management - see Documentation/security/credentials.rst
2 * 2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c9cdbd396770..426c2ffba16d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3639,10 +3639,10 @@ static inline u64 perf_event_count(struct perf_event *event)
3639 * will not be local and we cannot read them atomically 3639 * will not be local and we cannot read them atomically
3640 * - must not have a pmu::count method 3640 * - must not have a pmu::count method
3641 */ 3641 */
3642u64 perf_event_read_local(struct perf_event *event) 3642int perf_event_read_local(struct perf_event *event, u64 *value)
3643{ 3643{
3644 unsigned long flags; 3644 unsigned long flags;
3645 u64 val; 3645 int ret = 0;
3646 3646
3647 /* 3647 /*
3648 * Disabling interrupts avoids all counter scheduling (context 3648 * Disabling interrupts avoids all counter scheduling (context
@@ -3650,25 +3650,37 @@ u64 perf_event_read_local(struct perf_event *event)
3650 */ 3650 */
3651 local_irq_save(flags); 3651 local_irq_save(flags);
3652 3652
3653 /* If this is a per-task event, it must be for current */
3654 WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3655 event->hw.target != current);
3656
3657 /* If this is a per-CPU event, it must be for this CPU */
3658 WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3659 event->cpu != smp_processor_id());
3660
3661 /* 3653 /*
3662 * It must not be an event with inherit set, we cannot read 3654 * It must not be an event with inherit set, we cannot read
3663 * all child counters from atomic context. 3655 * all child counters from atomic context.
3664 */ 3656 */
3665 WARN_ON_ONCE(event->attr.inherit); 3657 if (event->attr.inherit) {
3658 ret = -EOPNOTSUPP;
3659 goto out;
3660 }
3666 3661
3667 /* 3662 /*
3668 * It must not have a pmu::count method, those are not 3663 * It must not have a pmu::count method, those are not
3669 * NMI safe. 3664 * NMI safe.
3670 */ 3665 */
3671 WARN_ON_ONCE(event->pmu->count); 3666 if (event->pmu->count) {
3667 ret = -EOPNOTSUPP;
3668 goto out;
3669 }
3670
3671 /* If this is a per-task event, it must be for current */
3672 if ((event->attach_state & PERF_ATTACH_TASK) &&
3673 event->hw.target != current) {
3674 ret = -EINVAL;
3675 goto out;
3676 }
3677
3678 /* If this is a per-CPU event, it must be for this CPU */
3679 if (!(event->attach_state & PERF_ATTACH_TASK) &&
3680 event->cpu != smp_processor_id()) {
3681 ret = -EINVAL;
3682 goto out;
3683 }
3672 3684
3673 /* 3685 /*
3674 * If the event is currently on this CPU, its either a per-task event, 3686 * If the event is currently on this CPU, its either a per-task event,
@@ -3678,10 +3690,11 @@ u64 perf_event_read_local(struct perf_event *event)
3678 if (event->oncpu == smp_processor_id()) 3690 if (event->oncpu == smp_processor_id())
3679 event->pmu->read(event); 3691 event->pmu->read(event);
3680 3692
3681 val = local64_read(&event->count); 3693 *value = local64_read(&event->count);
3694out:
3682 local_irq_restore(flags); 3695 local_irq_restore(flags);
3683 3696
3684 return val; 3697 return ret;
3685} 3698}
3686 3699
3687static int perf_event_read(struct perf_event *event, bool group) 3700static int perf_event_read(struct perf_event *event, bool group)
@@ -4372,7 +4385,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value);
4372static int __perf_read_group_add(struct perf_event *leader, 4385static int __perf_read_group_add(struct perf_event *leader,
4373 u64 read_format, u64 *values) 4386 u64 read_format, u64 *values)
4374{ 4387{
4388 struct perf_event_context *ctx = leader->ctx;
4375 struct perf_event *sub; 4389 struct perf_event *sub;
4390 unsigned long flags;
4376 int n = 1; /* skip @nr */ 4391 int n = 1; /* skip @nr */
4377 int ret; 4392 int ret;
4378 4393
@@ -4402,12 +4417,15 @@ static int __perf_read_group_add(struct perf_event *leader,
4402 if (read_format & PERF_FORMAT_ID) 4417 if (read_format & PERF_FORMAT_ID)
4403 values[n++] = primary_event_id(leader); 4418 values[n++] = primary_event_id(leader);
4404 4419
4420 raw_spin_lock_irqsave(&ctx->lock, flags);
4421
4405 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 4422 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4406 values[n++] += perf_event_count(sub); 4423 values[n++] += perf_event_count(sub);
4407 if (read_format & PERF_FORMAT_ID) 4424 if (read_format & PERF_FORMAT_ID)
4408 values[n++] = primary_event_id(sub); 4425 values[n++] = primary_event_id(sub);
4409 } 4426 }
4410 4427
4428 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4411 return 0; 4429 return 0;
4412} 4430}
4413 4431
@@ -8035,12 +8053,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8035 bool is_kprobe, is_tracepoint; 8053 bool is_kprobe, is_tracepoint;
8036 struct bpf_prog *prog; 8054 struct bpf_prog *prog;
8037 8055
8038 if (event->attr.type == PERF_TYPE_HARDWARE ||
8039 event->attr.type == PERF_TYPE_SOFTWARE)
8040 return perf_event_set_bpf_handler(event, prog_fd);
8041
8042 if (event->attr.type != PERF_TYPE_TRACEPOINT) 8056 if (event->attr.type != PERF_TYPE_TRACEPOINT)
8043 return -EINVAL; 8057 return perf_event_set_bpf_handler(event, prog_fd);
8044 8058
8045 if (event->tp_event->prog) 8059 if (event->tp_event->prog)
8046 return -EEXIST; 8060 return -EEXIST;
diff --git a/kernel/exit.c b/kernel/exit.c
index c63226283aef..c5548faa9f37 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,7 +51,6 @@
51#include <linux/task_io_accounting_ops.h> 51#include <linux/task_io_accounting_ops.h>
52#include <linux/tracehook.h> 52#include <linux/tracehook.h>
53#include <linux/fs_struct.h> 53#include <linux/fs_struct.h>
54#include <linux/userfaultfd_k.h>
55#include <linux/init_task.h> 54#include <linux/init_task.h>
56#include <linux/perf_event.h> 55#include <linux/perf_event.h>
57#include <trace/events/sched.h> 56#include <trace/events/sched.h>
@@ -62,6 +61,7 @@
62#include <linux/kcov.h> 61#include <linux/kcov.h>
63#include <linux/random.h> 62#include <linux/random.h>
64#include <linux/rcuwait.h> 63#include <linux/rcuwait.h>
64#include <linux/compat.h>
65 65
66#include <linux/uaccess.h> 66#include <linux/uaccess.h>
67#include <asm/unistd.h> 67#include <asm/unistd.h>
@@ -982,14 +982,21 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
982 return 0; 982 return 0;
983} 983}
984 984
985struct waitid_info {
986 pid_t pid;
987 uid_t uid;
988 int status;
989 int cause;
990};
991
985struct wait_opts { 992struct wait_opts {
986 enum pid_type wo_type; 993 enum pid_type wo_type;
987 int wo_flags; 994 int wo_flags;
988 struct pid *wo_pid; 995 struct pid *wo_pid;
989 996
990 struct siginfo __user *wo_info; 997 struct waitid_info *wo_info;
991 int __user *wo_stat; 998 int wo_stat;
992 struct rusage __user *wo_rusage; 999 struct rusage *wo_rusage;
993 1000
994 wait_queue_entry_t child_wait; 1001 wait_queue_entry_t child_wait;
995 int notask_error; 1002 int notask_error;
@@ -1036,34 +1043,6 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
1036 return 1; 1043 return 1;
1037} 1044}
1038 1045
1039static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1040 pid_t pid, uid_t uid, int why, int status)
1041{
1042 struct siginfo __user *infop;
1043 int retval = wo->wo_rusage
1044 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1045
1046 put_task_struct(p);
1047 infop = wo->wo_info;
1048 if (infop) {
1049 if (!retval)
1050 retval = put_user(SIGCHLD, &infop->si_signo);
1051 if (!retval)
1052 retval = put_user(0, &infop->si_errno);
1053 if (!retval)
1054 retval = put_user((short)why, &infop->si_code);
1055 if (!retval)
1056 retval = put_user(pid, &infop->si_pid);
1057 if (!retval)
1058 retval = put_user(uid, &infop->si_uid);
1059 if (!retval)
1060 retval = put_user(status, &infop->si_status);
1061 }
1062 if (!retval)
1063 retval = pid;
1064 return retval;
1065}
1066
1067/* 1046/*
1068 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold 1047 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
1069 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1048 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
@@ -1072,30 +1051,23 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1072 */ 1051 */
1073static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 1052static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1074{ 1053{
1075 int state, retval, status; 1054 int state, status;
1076 pid_t pid = task_pid_vnr(p); 1055 pid_t pid = task_pid_vnr(p);
1077 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1056 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1078 struct siginfo __user *infop; 1057 struct waitid_info *infop;
1079 1058
1080 if (!likely(wo->wo_flags & WEXITED)) 1059 if (!likely(wo->wo_flags & WEXITED))
1081 return 0; 1060 return 0;
1082 1061
1083 if (unlikely(wo->wo_flags & WNOWAIT)) { 1062 if (unlikely(wo->wo_flags & WNOWAIT)) {
1084 int exit_code = p->exit_code; 1063 status = p->exit_code;
1085 int why;
1086
1087 get_task_struct(p); 1064 get_task_struct(p);
1088 read_unlock(&tasklist_lock); 1065 read_unlock(&tasklist_lock);
1089 sched_annotate_sleep(); 1066 sched_annotate_sleep();
1090 1067 if (wo->wo_rusage)
1091 if ((exit_code & 0x7f) == 0) { 1068 getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1092 why = CLD_EXITED; 1069 put_task_struct(p);
1093 status = exit_code >> 8; 1070 goto out_info;
1094 } else {
1095 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1096 status = exit_code & 0x7f;
1097 }
1098 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1099 } 1071 }
1100 /* 1072 /*
1101 * Move the task's state to DEAD/TRACE, only one thread can do this. 1073 * Move the task's state to DEAD/TRACE, only one thread can do this.
@@ -1168,38 +1140,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1168 spin_unlock_irq(&current->sighand->siglock); 1140 spin_unlock_irq(&current->sighand->siglock);
1169 } 1141 }
1170 1142
1171 retval = wo->wo_rusage 1143 if (wo->wo_rusage)
1172 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1144 getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1173 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1145 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1174 ? p->signal->group_exit_code : p->exit_code; 1146 ? p->signal->group_exit_code : p->exit_code;
1175 if (!retval && wo->wo_stat) 1147 wo->wo_stat = status;
1176 retval = put_user(status, wo->wo_stat);
1177
1178 infop = wo->wo_info;
1179 if (!retval && infop)
1180 retval = put_user(SIGCHLD, &infop->si_signo);
1181 if (!retval && infop)
1182 retval = put_user(0, &infop->si_errno);
1183 if (!retval && infop) {
1184 int why;
1185
1186 if ((status & 0x7f) == 0) {
1187 why = CLD_EXITED;
1188 status >>= 8;
1189 } else {
1190 why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1191 status &= 0x7f;
1192 }
1193 retval = put_user((short)why, &infop->si_code);
1194 if (!retval)
1195 retval = put_user(status, &infop->si_status);
1196 }
1197 if (!retval && infop)
1198 retval = put_user(pid, &infop->si_pid);
1199 if (!retval && infop)
1200 retval = put_user(uid, &infop->si_uid);
1201 if (!retval)
1202 retval = pid;
1203 1148
1204 if (state == EXIT_TRACE) { 1149 if (state == EXIT_TRACE) {
1205 write_lock_irq(&tasklist_lock); 1150 write_lock_irq(&tasklist_lock);
@@ -1216,7 +1161,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1216 if (state == EXIT_DEAD) 1161 if (state == EXIT_DEAD)
1217 release_task(p); 1162 release_task(p);
1218 1163
1219 return retval; 1164out_info:
1165 infop = wo->wo_info;
1166 if (infop) {
1167 if ((status & 0x7f) == 0) {
1168 infop->cause = CLD_EXITED;
1169 infop->status = status >> 8;
1170 } else {
1171 infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1172 infop->status = status & 0x7f;
1173 }
1174 infop->pid = pid;
1175 infop->uid = uid;
1176 }
1177
1178 return pid;
1220} 1179}
1221 1180
1222static int *task_stopped_code(struct task_struct *p, bool ptrace) 1181static int *task_stopped_code(struct task_struct *p, bool ptrace)
@@ -1252,8 +1211,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1252static int wait_task_stopped(struct wait_opts *wo, 1211static int wait_task_stopped(struct wait_opts *wo,
1253 int ptrace, struct task_struct *p) 1212 int ptrace, struct task_struct *p)
1254{ 1213{
1255 struct siginfo __user *infop; 1214 struct waitid_info *infop;
1256 int retval, exit_code, *p_code, why; 1215 int exit_code, *p_code, why;
1257 uid_t uid = 0; /* unneeded, required by compiler */ 1216 uid_t uid = 0; /* unneeded, required by compiler */
1258 pid_t pid; 1217 pid_t pid;
1259 1218
@@ -1298,34 +1257,21 @@ unlock_sig:
1298 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1257 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1299 read_unlock(&tasklist_lock); 1258 read_unlock(&tasklist_lock);
1300 sched_annotate_sleep(); 1259 sched_annotate_sleep();
1260 if (wo->wo_rusage)
1261 getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1262 put_task_struct(p);
1301 1263
1302 if (unlikely(wo->wo_flags & WNOWAIT)) 1264 if (likely(!(wo->wo_flags & WNOWAIT)))
1303 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); 1265 wo->wo_stat = (exit_code << 8) | 0x7f;
1304
1305 retval = wo->wo_rusage
1306 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1307 if (!retval && wo->wo_stat)
1308 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1309 1266
1310 infop = wo->wo_info; 1267 infop = wo->wo_info;
1311 if (!retval && infop) 1268 if (infop) {
1312 retval = put_user(SIGCHLD, &infop->si_signo); 1269 infop->cause = why;
1313 if (!retval && infop) 1270 infop->status = exit_code;
1314 retval = put_user(0, &infop->si_errno); 1271 infop->pid = pid;
1315 if (!retval && infop) 1272 infop->uid = uid;
1316 retval = put_user((short)why, &infop->si_code); 1273 }
1317 if (!retval && infop) 1274 return pid;
1318 retval = put_user(exit_code, &infop->si_status);
1319 if (!retval && infop)
1320 retval = put_user(pid, &infop->si_pid);
1321 if (!retval && infop)
1322 retval = put_user(uid, &infop->si_uid);
1323 if (!retval)
1324 retval = pid;
1325 put_task_struct(p);
1326
1327 BUG_ON(!retval);
1328 return retval;
1329} 1275}
1330 1276
1331/* 1277/*
@@ -1336,7 +1282,7 @@ unlock_sig:
1336 */ 1282 */
1337static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) 1283static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1338{ 1284{
1339 int retval; 1285 struct waitid_info *infop;
1340 pid_t pid; 1286 pid_t pid;
1341 uid_t uid; 1287 uid_t uid;
1342 1288
@@ -1361,22 +1307,20 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1361 get_task_struct(p); 1307 get_task_struct(p);
1362 read_unlock(&tasklist_lock); 1308 read_unlock(&tasklist_lock);
1363 sched_annotate_sleep(); 1309 sched_annotate_sleep();
1310 if (wo->wo_rusage)
1311 getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1312 put_task_struct(p);
1364 1313
1365 if (!wo->wo_info) { 1314 infop = wo->wo_info;
1366 retval = wo->wo_rusage 1315 if (!infop) {
1367 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1316 wo->wo_stat = 0xffff;
1368 put_task_struct(p);
1369 if (!retval && wo->wo_stat)
1370 retval = put_user(0xffff, wo->wo_stat);
1371 if (!retval)
1372 retval = pid;
1373 } else { 1317 } else {
1374 retval = wait_noreap_copyout(wo, p, pid, uid, 1318 infop->cause = CLD_CONTINUED;
1375 CLD_CONTINUED, SIGCONT); 1319 infop->pid = pid;
1376 BUG_ON(retval == 0); 1320 infop->uid = uid;
1321 infop->status = SIGCONT;
1377 } 1322 }
1378 1323 return pid;
1379 return retval;
1380} 1324}
1381 1325
1382/* 1326/*
@@ -1604,8 +1548,8 @@ end:
1604 return retval; 1548 return retval;
1605} 1549}
1606 1550
1607SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1551static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1608 infop, int, options, struct rusage __user *, ru) 1552 int options, struct rusage *ru)
1609{ 1553{
1610 struct wait_opts wo; 1554 struct wait_opts wo;
1611 struct pid *pid = NULL; 1555 struct pid *pid = NULL;
@@ -1643,38 +1587,48 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1643 wo.wo_pid = pid; 1587 wo.wo_pid = pid;
1644 wo.wo_flags = options; 1588 wo.wo_flags = options;
1645 wo.wo_info = infop; 1589 wo.wo_info = infop;
1646 wo.wo_stat = NULL;
1647 wo.wo_rusage = ru; 1590 wo.wo_rusage = ru;
1648 ret = do_wait(&wo); 1591 ret = do_wait(&wo);
1649 1592
1650 if (ret > 0) {
1651 ret = 0;
1652 } else if (infop) {
1653 /*
1654 * For a WNOHANG return, clear out all the fields
1655 * we would set so the user can easily tell the
1656 * difference.
1657 */
1658 if (!ret)
1659 ret = put_user(0, &infop->si_signo);
1660 if (!ret)
1661 ret = put_user(0, &infop->si_errno);
1662 if (!ret)
1663 ret = put_user(0, &infop->si_code);
1664 if (!ret)
1665 ret = put_user(0, &infop->si_pid);
1666 if (!ret)
1667 ret = put_user(0, &infop->si_uid);
1668 if (!ret)
1669 ret = put_user(0, &infop->si_status);
1670 }
1671
1672 put_pid(pid); 1593 put_pid(pid);
1673 return ret; 1594 return ret;
1674} 1595}
1675 1596
1676SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1597SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1677 int, options, struct rusage __user *, ru) 1598 infop, int, options, struct rusage __user *, ru)
1599{
1600 struct rusage r;
1601 struct waitid_info info = {.status = 0};
1602 long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1603 int signo = 0;
1604 if (err > 0) {
1605 signo = SIGCHLD;
1606 err = 0;
1607 }
1608
1609 if (!err) {
1610 if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1611 return -EFAULT;
1612 }
1613 if (!infop)
1614 return err;
1615
1616 user_access_begin();
1617 unsafe_put_user(signo, &infop->si_signo, Efault);
1618 unsafe_put_user(0, &infop->si_errno, Efault);
1619 unsafe_put_user((short)info.cause, &infop->si_code, Efault);
1620 unsafe_put_user(info.pid, &infop->si_pid, Efault);
1621 unsafe_put_user(info.uid, &infop->si_uid, Efault);
1622 unsafe_put_user(info.status, &infop->si_status, Efault);
1623 user_access_end();
1624 return err;
1625Efault:
1626 user_access_end();
1627 return -EFAULT;
1628}
1629
1630long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
1631 struct rusage *ru)
1678{ 1632{
1679 struct wait_opts wo; 1633 struct wait_opts wo;
1680 struct pid *pid = NULL; 1634 struct pid *pid = NULL;
@@ -1685,6 +1639,10 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1685 __WNOTHREAD|__WCLONE|__WALL)) 1639 __WNOTHREAD|__WCLONE|__WALL))
1686 return -EINVAL; 1640 return -EINVAL;
1687 1641
1642 /* -INT_MIN is not defined */
1643 if (upid == INT_MIN)
1644 return -ESRCH;
1645
1688 if (upid == -1) 1646 if (upid == -1)
1689 type = PIDTYPE_MAX; 1647 type = PIDTYPE_MAX;
1690 else if (upid < 0) { 1648 else if (upid < 0) {
@@ -1702,14 +1660,29 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1702 wo.wo_pid = pid; 1660 wo.wo_pid = pid;
1703 wo.wo_flags = options | WEXITED; 1661 wo.wo_flags = options | WEXITED;
1704 wo.wo_info = NULL; 1662 wo.wo_info = NULL;
1705 wo.wo_stat = stat_addr; 1663 wo.wo_stat = 0;
1706 wo.wo_rusage = ru; 1664 wo.wo_rusage = ru;
1707 ret = do_wait(&wo); 1665 ret = do_wait(&wo);
1708 put_pid(pid); 1666 put_pid(pid);
1667 if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
1668 ret = -EFAULT;
1709 1669
1710 return ret; 1670 return ret;
1711} 1671}
1712 1672
1673SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1674 int, options, struct rusage __user *, ru)
1675{
1676 struct rusage r;
1677 long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
1678
1679 if (err > 0) {
1680 if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1681 return -EFAULT;
1682 }
1683 return err;
1684}
1685
1713#ifdef __ARCH_WANT_SYS_WAITPID 1686#ifdef __ARCH_WANT_SYS_WAITPID
1714 1687
1715/* 1688/*
@@ -1722,3 +1695,61 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1722} 1695}
1723 1696
1724#endif 1697#endif
1698
1699#ifdef CONFIG_COMPAT
1700COMPAT_SYSCALL_DEFINE4(wait4,
1701 compat_pid_t, pid,
1702 compat_uint_t __user *, stat_addr,
1703 int, options,
1704 struct compat_rusage __user *, ru)
1705{
1706 struct rusage r;
1707 long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
1708 if (err > 0) {
1709 if (ru && put_compat_rusage(&r, ru))
1710 return -EFAULT;
1711 }
1712 return err;
1713}
1714
1715COMPAT_SYSCALL_DEFINE5(waitid,
1716 int, which, compat_pid_t, pid,
1717 struct compat_siginfo __user *, infop, int, options,
1718 struct compat_rusage __user *, uru)
1719{
1720 struct rusage ru;
1721 struct waitid_info info = {.status = 0};
1722 long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
1723 int signo = 0;
1724 if (err > 0) {
1725 signo = SIGCHLD;
1726 err = 0;
1727 }
1728
1729 if (!err && uru) {
1730 /* kernel_waitid() overwrites everything in ru */
1731 if (COMPAT_USE_64BIT_TIME)
1732 err = copy_to_user(uru, &ru, sizeof(ru));
1733 else
1734 err = put_compat_rusage(&ru, uru);
1735 if (err)
1736 return -EFAULT;
1737 }
1738
1739 if (!infop)
1740 return err;
1741
1742 user_access_begin();
1743 unsafe_put_user(signo, &infop->si_signo, Efault);
1744 unsafe_put_user(0, &infop->si_errno, Efault);
1745 unsafe_put_user((short)info.cause, &infop->si_code, Efault);
1746 unsafe_put_user(info.pid, &infop->si_pid, Efault);
1747 unsafe_put_user(info.uid, &infop->si_uid, Efault);
1748 unsafe_put_user(info.status, &infop->si_status, Efault);
1749 user_access_end();
1750 return err;
1751Efault:
1752 user_access_end();
1753 return -EFAULT;
1754}
1755#endif
diff --git a/kernel/extable.c b/kernel/extable.c
index 0fbdd8582f08..38c2412401a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
55{ 55{
56 const struct exception_table_entry *e; 56 const struct exception_table_entry *e;
57 57
58 e = search_extable(__start___ex_table, __stop___ex_table-1, addr); 58 e = search_extable(__start___ex_table,
59 __stop___ex_table - __start___ex_table, addr);
59 if (!e) 60 if (!e)
60 e = search_module_extables(addr); 61 e = search_module_extables(addr);
61 return e; 62 return e;
@@ -69,7 +70,7 @@ static inline int init_kernel_text(unsigned long addr)
69 return 0; 70 return 0;
70} 71}
71 72
72int core_kernel_text(unsigned long addr) 73int notrace core_kernel_text(unsigned long addr)
73{ 74{
74 if (addr >= (unsigned long)_stext && 75 if (addr >= (unsigned long)_stext &&
75 addr < (unsigned long)_etext) 76 addr < (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index e53770d2bf95..17921b0390b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
205 void *stack; 205 void *stack;
206 int i; 206 int i;
207 207
208 local_irq_disable();
209 for (i = 0; i < NR_CACHED_STACKS; i++) { 208 for (i = 0; i < NR_CACHED_STACKS; i++) {
210 struct vm_struct *s = this_cpu_read(cached_stacks[i]); 209 struct vm_struct *s;
210
211 s = this_cpu_xchg(cached_stacks[i], NULL);
211 212
212 if (!s) 213 if (!s)
213 continue; 214 continue;
214 this_cpu_write(cached_stacks[i], NULL);
215 215
216 tsk->stack_vm_area = s; 216 tsk->stack_vm_area = s;
217 local_irq_enable();
218 return s->addr; 217 return s->addr;
219 } 218 }
220 local_irq_enable();
221 219
222 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, 220 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
223 VMALLOC_START, VMALLOC_END, 221 VMALLOC_START, VMALLOC_END,
@@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk)
245{ 243{
246#ifdef CONFIG_VMAP_STACK 244#ifdef CONFIG_VMAP_STACK
247 if (task_stack_vm_area(tsk)) { 245 if (task_stack_vm_area(tsk)) {
248 unsigned long flags;
249 int i; 246 int i;
250 247
251 local_irq_save(flags);
252 for (i = 0; i < NR_CACHED_STACKS; i++) { 248 for (i = 0; i < NR_CACHED_STACKS; i++) {
253 if (this_cpu_read(cached_stacks[i])) 249 if (this_cpu_cmpxchg(cached_stacks[i],
250 NULL, tsk->stack_vm_area) != NULL)
254 continue; 251 continue;
255 252
256 this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
257 local_irq_restore(flags);
258 return; 253 return;
259 } 254 }
260 local_irq_restore(flags);
261 255
262 vfree_atomic(tsk->stack); 256 vfree_atomic(tsk->stack);
263 return; 257 return;
@@ -326,8 +320,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
326 } 320 }
327 321
328 /* All stack pages belong to the same memcg. */ 322 /* All stack pages belong to the same memcg. */
329 memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, 323 mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
330 account * (THREAD_SIZE / 1024)); 324 account * (THREAD_SIZE / 1024));
331 } else { 325 } else {
332 /* 326 /*
333 * All stack pages are in the same zone and belong to the 327 * All stack pages are in the same zone and belong to the
@@ -338,8 +332,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
338 mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, 332 mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
339 THREAD_SIZE / 1024 * account); 333 THREAD_SIZE / 1024 * account);
340 334
341 memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, 335 mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
342 account * (THREAD_SIZE / 1024)); 336 account * (THREAD_SIZE / 1024));
343 } 337 }
344} 338}
345 339
@@ -560,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
560 set_task_stack_end_magic(tsk); 554 set_task_stack_end_magic(tsk);
561 555
562#ifdef CONFIG_CC_STACKPROTECTOR 556#ifdef CONFIG_CC_STACKPROTECTOR
563 tsk->stack_canary = get_random_long(); 557 tsk->stack_canary = get_random_canary();
564#endif 558#endif
565 559
566 /* 560 /*
@@ -579,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
579 573
580 kcov_task_init(tsk); 574 kcov_task_init(tsk);
581 575
576#ifdef CONFIG_FAULT_INJECTION
577 tsk->fail_nth = 0;
578#endif
579
582 return tsk; 580 return tsk;
583 581
584free_stack: 582free_stack:
@@ -1637,9 +1635,9 @@ static __latent_entropy struct task_struct *copy_process(
1637 prev_cputime_init(&p->prev_cputime); 1635 prev_cputime_init(&p->prev_cputime);
1638 1636
1639#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1637#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1640 seqcount_init(&p->vtime_seqcount); 1638 seqcount_init(&p->vtime.seqcount);
1641 p->vtime_snap = 0; 1639 p->vtime.starttime = 0;
1642 p->vtime_snap_whence = VTIME_INACTIVE; 1640 p->vtime.state = VTIME_INACTIVE;
1643#endif 1641#endif
1644 1642
1645#if defined(SPLIT_RSS_COUNTING) 1643#if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/futex.c b/kernel/futex.c
index d6cf71d08f21..16dbe4c93895 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -212,7 +212,7 @@ struct futex_pi_state {
212 atomic_t refcount; 212 atomic_t refcount;
213 213
214 union futex_key key; 214 union futex_key key;
215}; 215} __randomize_layout;
216 216
217/** 217/**
218 * struct futex_q - The hashed futex queue entry, one per waiting task 218 * struct futex_q - The hashed futex queue entry, one per waiting task
@@ -246,7 +246,7 @@ struct futex_q {
246 struct rt_mutex_waiter *rt_waiter; 246 struct rt_mutex_waiter *rt_waiter;
247 union futex_key *requeue_pi_key; 247 union futex_key *requeue_pi_key;
248 u32 bitset; 248 u32 bitset;
249}; 249} __randomize_layout;
250 250
251static const struct futex_q futex_q_init = { 251static const struct futex_q futex_q_init = {
252 /* list gets initialized in queue_me()*/ 252 /* list gets initialized in queue_me()*/
@@ -488,7 +488,7 @@ static void drop_futex_key_refs(union futex_key *key)
488 * 488 *
489 * Return: a negative error code or 0 489 * Return: a negative error code or 0
490 * 490 *
491 * The key words are stored in *key on success. 491 * The key words are stored in @key on success.
492 * 492 *
493 * For shared mappings, it's (page->index, file_inode(vma->vm_file), 493 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
494 * offset_within_page). For private mappings, it's (uaddr, current->mm). 494 * offset_within_page). For private mappings, it's (uaddr, current->mm).
@@ -1259,9 +1259,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1259 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 1259 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1260 * 1260 *
1261 * Return: 1261 * Return:
1262 * 0 - ready to wait; 1262 * - 0 - ready to wait;
1263 * 1 - acquired the lock; 1263 * - 1 - acquired the lock;
1264 * <0 - error 1264 * - <0 - error
1265 * 1265 *
1266 * The hb->lock and futex_key refs shall be held by the caller. 1266 * The hb->lock and futex_key refs shall be held by the caller.
1267 */ 1267 */
@@ -1717,9 +1717,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1717 * hb1 and hb2 must be held by the caller. 1717 * hb1 and hb2 must be held by the caller.
1718 * 1718 *
1719 * Return: 1719 * Return:
1720 * 0 - failed to acquire the lock atomically; 1720 * - 0 - failed to acquire the lock atomically;
1721 * >0 - acquired the lock, return value is vpid of the top_waiter 1721 * - >0 - acquired the lock, return value is vpid of the top_waiter
1722 * <0 - error 1722 * - <0 - error
1723 */ 1723 */
1724static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1724static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1725 struct futex_hash_bucket *hb1, 1725 struct futex_hash_bucket *hb1,
@@ -1785,8 +1785,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1785 * uaddr2 atomically on behalf of the top waiter. 1785 * uaddr2 atomically on behalf of the top waiter.
1786 * 1786 *
1787 * Return: 1787 * Return:
1788 * >=0 - on success, the number of tasks requeued or woken; 1788 * - >=0 - on success, the number of tasks requeued or woken;
1789 * <0 - on error 1789 * - <0 - on error
1790 */ 1790 */
1791static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1791static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1792 u32 __user *uaddr2, int nr_wake, int nr_requeue, 1792 u32 __user *uaddr2, int nr_wake, int nr_requeue,
@@ -2142,8 +2142,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2142 * be paired with exactly one earlier call to queue_me(). 2142 * be paired with exactly one earlier call to queue_me().
2143 * 2143 *
2144 * Return: 2144 * Return:
2145 * 1 - if the futex_q was still queued (and we removed unqueued it); 2145 * - 1 - if the futex_q was still queued (and we removed unqueued it);
2146 * 0 - if the futex_q was already removed by the waking thread 2146 * - 0 - if the futex_q was already removed by the waking thread
2147 */ 2147 */
2148static int unqueue_me(struct futex_q *q) 2148static int unqueue_me(struct futex_q *q)
2149{ 2149{
@@ -2333,9 +2333,9 @@ static long futex_wait_restart(struct restart_block *restart);
2333 * acquire the lock. Must be called with the hb lock held. 2333 * acquire the lock. Must be called with the hb lock held.
2334 * 2334 *
2335 * Return: 2335 * Return:
2336 * 1 - success, lock taken; 2336 * - 1 - success, lock taken;
2337 * 0 - success, lock not taken; 2337 * - 0 - success, lock not taken;
2338 * <0 - on error (-EFAULT) 2338 * - <0 - on error (-EFAULT)
2339 */ 2339 */
2340static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 2340static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2341{ 2341{
@@ -2422,8 +2422,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
2422 * with no q.key reference on failure. 2422 * with no q.key reference on failure.
2423 * 2423 *
2424 * Return: 2424 * Return:
2425 * 0 - uaddr contains val and hb has been locked; 2425 * - 0 - uaddr contains val and hb has been locked;
2426 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 2426 * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
2427 */ 2427 */
2428static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 2428static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
2429 struct futex_q *q, struct futex_hash_bucket **hb) 2429 struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2895,8 +2895,8 @@ pi_faulted:
2895 * called with the hb lock held. 2895 * called with the hb lock held.
2896 * 2896 *
2897 * Return: 2897 * Return:
2898 * 0 = no early wakeup detected; 2898 * - 0 = no early wakeup detected;
2899 * <0 = -ETIMEDOUT or -ERESTARTNOINTR 2899 * - <0 = -ETIMEDOUT or -ERESTARTNOINTR
2900 */ 2900 */
2901static inline 2901static inline
2902int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 2902int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2968,8 +2968,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2968 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2968 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2969 * 2969 *
2970 * Return: 2970 * Return:
2971 * 0 - On success; 2971 * - 0 - On success;
2972 * <0 - On error 2972 * - <0 - On error
2973 */ 2973 */
2974static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 2974static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2975 u32 val, ktime_t *abs_time, u32 bitset, 2975 u32 val, ktime_t *abs_time, u32 bitset,
diff --git a/kernel/groups.c b/kernel/groups.c
index d09727692a2a..434f6665f187 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -5,6 +5,7 @@
5#include <linux/export.h> 5#include <linux/export.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/sort.h>
8#include <linux/syscalls.h> 9#include <linux/syscalls.h>
9#include <linux/user_namespace.h> 10#include <linux/user_namespace.h>
10#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
@@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info,
76 return 0; 77 return 0;
77} 78}
78 79
79/* a simple Shell sort */ 80static int gid_cmp(const void *_a, const void *_b)
81{
82 kgid_t a = *(kgid_t *)_a;
83 kgid_t b = *(kgid_t *)_b;
84
85 return gid_gt(a, b) - gid_lt(a, b);
86}
87
80static void groups_sort(struct group_info *group_info) 88static void groups_sort(struct group_info *group_info)
81{ 89{
82 int base, max, stride; 90 sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
83 int gidsetsize = group_info->ngroups; 91 gid_cmp, NULL);
84
85 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
86 ; /* nothing */
87 stride /= 3;
88
89 while (stride) {
90 max = gidsetsize - stride;
91 for (base = 0; base < max; base++) {
92 int left = base;
93 int right = left + stride;
94 kgid_t tmp = group_info->gid[right];
95
96 while (left >= 0 && gid_gt(group_info->gid[left], tmp)) {
97 group_info->gid[right] = group_info->gid[left];
98 right = left;
99 left -= stride;
100 }
101 group_info->gid[right] = tmp;
102 }
103 stride /= 3;
104 }
105} 92}
106 93
107/* a simple bsearch */ 94/* a simple bsearch */
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d2747f9c5707..d69bd77252a7 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -110,6 +110,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
110 struct cpumask *masks; 110 struct cpumask *masks;
111 cpumask_var_t nmsk, *node_to_present_cpumask; 111 cpumask_var_t nmsk, *node_to_present_cpumask;
112 112
113 /*
114 * If there aren't any vectors left after applying the pre/post
115 * vectors don't bother with assigning affinity.
116 */
117 if (!affv)
118 return NULL;
119
113 if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) 120 if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
114 return NULL; 121 return NULL;
115 122
@@ -192,15 +199,19 @@ out:
192 199
193/** 200/**
194 * irq_calc_affinity_vectors - Calculate the optimal number of vectors 201 * irq_calc_affinity_vectors - Calculate the optimal number of vectors
202 * @minvec: The minimum number of vectors available
195 * @maxvec: The maximum number of vectors available 203 * @maxvec: The maximum number of vectors available
196 * @affd: Description of the affinity requirements 204 * @affd: Description of the affinity requirements
197 */ 205 */
198int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) 206int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
199{ 207{
200 int resv = affd->pre_vectors + affd->post_vectors; 208 int resv = affd->pre_vectors + affd->post_vectors;
201 int vecs = maxvec - resv; 209 int vecs = maxvec - resv;
202 int ret; 210 int ret;
203 211
212 if (resv > minvec)
213 return 0;
214
204 get_online_cpus(); 215 get_online_cpus();
205 ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; 216 ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
206 put_online_cpus(); 217 put_online_cpus();
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2e30d925a40d..a3cc37c0c85e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -7,7 +7,7 @@
7 * This file contains the core interrupt handling code, for irq-chip 7 * This file contains the core interrupt handling code, for irq-chip
8 * based architectures. 8 * based architectures.
9 * 9 *
10 * Detailed information is available in Documentation/DocBook/genericirq 10 * Detailed information is available in Documentation/core-api/genericirq.rst
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
@@ -170,21 +170,11 @@ static void irq_state_clr_disabled(struct irq_desc *desc)
170 irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); 170 irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
171} 171}
172 172
173static void irq_state_set_disabled(struct irq_desc *desc)
174{
175 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
176}
177
178static void irq_state_clr_masked(struct irq_desc *desc) 173static void irq_state_clr_masked(struct irq_desc *desc)
179{ 174{
180 irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); 175 irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
181} 176}
182 177
183static void irq_state_set_masked(struct irq_desc *desc)
184{
185 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
186}
187
188static void irq_state_clr_started(struct irq_desc *desc) 178static void irq_state_clr_started(struct irq_desc *desc)
189{ 179{
190 irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); 180 irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED);
@@ -234,7 +224,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
234 return IRQ_STARTUP_MANAGED; 224 return IRQ_STARTUP_MANAGED;
235} 225}
236#else 226#else
237static int 227static __always_inline int
238__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) 228__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
239{ 229{
240 return IRQ_STARTUP_NORMAL; 230 return IRQ_STARTUP_NORMAL;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index eb4d3e8945b8..79f987b942b8 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * This file contains the core interrupt handling code. 7 * This file contains the core interrupt handling code.
8 * 8 *
9 * Detailed information is available in Documentation/DocBook/genericirq 9 * Detailed information is available in Documentation/core-api/genericirq.rst
10 * 10 *
11 */ 11 */
12 12
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 9da14d125df4..a2c48058354c 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -227,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
227 return __irqd_to_state(d) & mask; 227 return __irqd_to_state(d) & mask;
228} 228}
229 229
230static inline void irq_state_set_disabled(struct irq_desc *desc)
231{
232 irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
233}
234
235static inline void irq_state_set_masked(struct irq_desc *desc)
236{
237 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
238}
239
230#undef __irqd_to_state 240#undef __irqd_to_state
231 241
232static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) 242static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
@@ -437,7 +447,9 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
437# ifdef CONFIG_IRQ_DOMAIN 447# ifdef CONFIG_IRQ_DOMAIN
438void irq_domain_debugfs_init(struct dentry *root); 448void irq_domain_debugfs_init(struct dentry *root);
439# else 449# else
440static inline void irq_domain_debugfs_init(struct dentry *root); 450static inline void irq_domain_debugfs_init(struct dentry *root)
451{
452}
441# endif 453# endif
442#else /* CONFIG_GENERIC_IRQ_DEBUGFS */ 454#else /* CONFIG_GENERIC_IRQ_DEBUGFS */
443static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) 455static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 948b50e78549..73be2b3909bd 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -4,7 +4,7 @@
4 * 4 *
5 * This file contains the interrupt descriptor management code 5 * This file contains the interrupt descriptor management code
6 * 6 *
7 * Detailed information is available in Documentation/DocBook/genericirq 7 * Detailed information is available in Documentation/core-api/genericirq.rst
8 * 8 *
9 */ 9 */
10#include <linux/irq.h> 10#include <linux/irq.h>
@@ -373,6 +373,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
373 373
374 raw_spin_lock_init(&desc->lock); 374 raw_spin_lock_init(&desc->lock);
375 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 375 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
376 mutex_init(&desc->request_mutex);
376 init_rcu_head(&desc->rcu); 377 init_rcu_head(&desc->rcu);
377 378
378 desc_set_defaults(irq, desc, node, affinity, owner); 379 desc_set_defaults(irq, desc, node, affinity, owner);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 14fe862aa2e3..f1f251479aa6 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,5 +1,6 @@
1#define pr_fmt(fmt) "irq: " fmt 1#define pr_fmt(fmt) "irq: " fmt
2 2
3#include <linux/acpi.h>
3#include <linux/debugfs.h> 4#include <linux/debugfs.h>
4#include <linux/hardirq.h> 5#include <linux/hardirq.h>
5#include <linux/interrupt.h> 6#include <linux/interrupt.h>
@@ -155,6 +156,21 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
155 domain->name = fwid->name; 156 domain->name = fwid->name;
156 break; 157 break;
157 } 158 }
159#ifdef CONFIG_ACPI
160 } else if (is_acpi_device_node(fwnode)) {
161 struct acpi_buffer buf = {
162 .length = ACPI_ALLOCATE_BUFFER,
163 };
164 acpi_handle handle;
165
166 handle = acpi_device_handle(to_acpi_device_node(fwnode));
167 if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) {
168 domain->name = buf.pointer;
169 domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
170 }
171
172 domain->fwnode = fwnode;
173#endif
158 } else if (of_node) { 174 } else if (of_node) {
159 char *name; 175 char *name;
160 176
@@ -1667,8 +1683,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
1667 1683
1668static void debugfs_remove_domain_dir(struct irq_domain *d) 1684static void debugfs_remove_domain_dir(struct irq_domain *d)
1669{ 1685{
1670 if (d->debugfs_file) 1686 debugfs_remove(d->debugfs_file);
1671 debugfs_remove(d->debugfs_file);
1672} 1687}
1673 1688
1674void __init irq_domain_debugfs_init(struct dentry *root) 1689void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5c11c1730ba5..1d1a5b945ab4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1090,6 +1090,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
1090/* 1090/*
1091 * Internal function to register an irqaction - typically used to 1091 * Internal function to register an irqaction - typically used to
1092 * allocate special interrupts that are part of the architecture. 1092 * allocate special interrupts that are part of the architecture.
1093 *
1094 * Locking rules:
1095 *
1096 * desc->request_mutex Provides serialization against a concurrent free_irq()
1097 * chip_bus_lock Provides serialization for slow bus operations
1098 * desc->lock Provides serialization against hard interrupts
1099 *
1100 * chip_bus_lock and desc->lock are sufficient for all other management and
1101 * interrupt related functions. desc->request_mutex solely serializes
1102 * request/free_irq().
1093 */ 1103 */
1094static int 1104static int
1095__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) 1105__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
@@ -1168,7 +1178,34 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1168 new->flags &= ~IRQF_ONESHOT; 1178 new->flags &= ~IRQF_ONESHOT;
1169 1179
1170 /* 1180 /*
1181 * Protects against a concurrent __free_irq() call which might wait
1182 * for synchronize_irq() to complete without holding the optional
1183 * chip bus lock and desc->lock.
1184 */
1185 mutex_lock(&desc->request_mutex);
1186
1187 /*
1188 * Acquire bus lock as the irq_request_resources() callback below
1189 * might rely on the serialization or the magic power management
1190 * functions which are abusing the irq_bus_lock() callback,
1191 */
1192 chip_bus_lock(desc);
1193
1194 /* First installed action requests resources. */
1195 if (!desc->action) {
1196 ret = irq_request_resources(desc);
1197 if (ret) {
1198 pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
1199 new->name, irq, desc->irq_data.chip->name);
1200 goto out_bus_unlock;
1201 }
1202 }
1203
1204 /*
1171 * The following block of code has to be executed atomically 1205 * The following block of code has to be executed atomically
1206 * protected against a concurrent interrupt and any of the other
1207 * management calls which are not serialized via
1208 * desc->request_mutex or the optional bus lock.
1172 */ 1209 */
1173 raw_spin_lock_irqsave(&desc->lock, flags); 1210 raw_spin_lock_irqsave(&desc->lock, flags);
1174 old_ptr = &desc->action; 1211 old_ptr = &desc->action;
@@ -1267,13 +1304,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1267 } 1304 }
1268 1305
1269 if (!shared) { 1306 if (!shared) {
1270 ret = irq_request_resources(desc);
1271 if (ret) {
1272 pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
1273 new->name, irq, desc->irq_data.chip->name);
1274 goto out_unlock;
1275 }
1276
1277 init_waitqueue_head(&desc->wait_for_threads); 1307 init_waitqueue_head(&desc->wait_for_threads);
1278 1308
1279 /* Setup the type (level, edge polarity) if configured: */ 1309 /* Setup the type (level, edge polarity) if configured: */
@@ -1281,10 +1311,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1281 ret = __irq_set_trigger(desc, 1311 ret = __irq_set_trigger(desc,
1282 new->flags & IRQF_TRIGGER_MASK); 1312 new->flags & IRQF_TRIGGER_MASK);
1283 1313
1284 if (ret) { 1314 if (ret)
1285 irq_release_resources(desc);
1286 goto out_unlock; 1315 goto out_unlock;
1287 }
1288 } 1316 }
1289 1317
1290 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ 1318 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
@@ -1347,6 +1375,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1347 } 1375 }
1348 1376
1349 raw_spin_unlock_irqrestore(&desc->lock, flags); 1377 raw_spin_unlock_irqrestore(&desc->lock, flags);
1378 chip_bus_sync_unlock(desc);
1379 mutex_unlock(&desc->request_mutex);
1350 1380
1351 irq_setup_timings(desc, new); 1381 irq_setup_timings(desc, new);
1352 1382
@@ -1378,6 +1408,12 @@ mismatch:
1378out_unlock: 1408out_unlock:
1379 raw_spin_unlock_irqrestore(&desc->lock, flags); 1409 raw_spin_unlock_irqrestore(&desc->lock, flags);
1380 1410
1411 if (!desc->action)
1412 irq_release_resources(desc);
1413out_bus_unlock:
1414 chip_bus_sync_unlock(desc);
1415 mutex_unlock(&desc->request_mutex);
1416
1381out_thread: 1417out_thread:
1382 if (new->thread) { 1418 if (new->thread) {
1383 struct task_struct *t = new->thread; 1419 struct task_struct *t = new->thread;
@@ -1417,9 +1453,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1417 if (retval < 0) 1453 if (retval < 0)
1418 return retval; 1454 return retval;
1419 1455
1420 chip_bus_lock(desc);
1421 retval = __setup_irq(irq, desc, act); 1456 retval = __setup_irq(irq, desc, act);
1422 chip_bus_sync_unlock(desc);
1423 1457
1424 if (retval) 1458 if (retval)
1425 irq_chip_pm_put(&desc->irq_data); 1459 irq_chip_pm_put(&desc->irq_data);
@@ -1443,6 +1477,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1443 if (!desc) 1477 if (!desc)
1444 return NULL; 1478 return NULL;
1445 1479
1480 mutex_lock(&desc->request_mutex);
1446 chip_bus_lock(desc); 1481 chip_bus_lock(desc);
1447 raw_spin_lock_irqsave(&desc->lock, flags); 1482 raw_spin_lock_irqsave(&desc->lock, flags);
1448 1483
@@ -1458,6 +1493,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1458 WARN(1, "Trying to free already-free IRQ %d\n", irq); 1493 WARN(1, "Trying to free already-free IRQ %d\n", irq);
1459 raw_spin_unlock_irqrestore(&desc->lock, flags); 1494 raw_spin_unlock_irqrestore(&desc->lock, flags);
1460 chip_bus_sync_unlock(desc); 1495 chip_bus_sync_unlock(desc);
1496 mutex_unlock(&desc->request_mutex);
1461 return NULL; 1497 return NULL;
1462 } 1498 }
1463 1499
@@ -1475,8 +1511,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1475 if (!desc->action) { 1511 if (!desc->action) {
1476 irq_settings_clr_disable_unlazy(desc); 1512 irq_settings_clr_disable_unlazy(desc);
1477 irq_shutdown(desc); 1513 irq_shutdown(desc);
1478 irq_release_resources(desc);
1479 irq_remove_timings(desc);
1480 } 1514 }
1481 1515
1482#ifdef CONFIG_SMP 1516#ifdef CONFIG_SMP
@@ -1486,6 +1520,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1486#endif 1520#endif
1487 1521
1488 raw_spin_unlock_irqrestore(&desc->lock, flags); 1522 raw_spin_unlock_irqrestore(&desc->lock, flags);
1523 /*
1524 * Drop bus_lock here so the changes which were done in the chip
1525 * callbacks above are synced out to the irq chips which hang
1526 * behind a slow bus (I2C, SPI) before calling synchronize_irq().
1527 *
1528 * Aside of that the bus_lock can also be taken from the threaded
1529 * handler in irq_finalize_oneshot() which results in a deadlock
1530 * because synchronize_irq() would wait forever for the thread to
1531 * complete, which is blocked on the bus lock.
1532 *
1533 * The still held desc->request_mutex() protects against a
1534 * concurrent request_irq() of this irq so the release of resources
1535 * and timing data is properly serialized.
1536 */
1489 chip_bus_sync_unlock(desc); 1537 chip_bus_sync_unlock(desc);
1490 1538
1491 unregister_handler_proc(irq, action); 1539 unregister_handler_proc(irq, action);
@@ -1518,6 +1566,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1518 } 1566 }
1519 } 1567 }
1520 1568
1569 /* Last action releases resources */
1570 if (!desc->action) {
1571 /*
 1572 * Reacquire bus lock as irq_release_resources() might
1573 * require it to deallocate resources over the slow bus.
1574 */
1575 chip_bus_lock(desc);
1576 irq_release_resources(desc);
1577 chip_bus_sync_unlock(desc);
1578 irq_remove_timings(desc);
1579 }
1580
1581 mutex_unlock(&desc->request_mutex);
1582
1521 irq_chip_pm_put(&desc->irq_data); 1583 irq_chip_pm_put(&desc->irq_data);
1522 module_put(desc->owner); 1584 module_put(desc->owner);
1523 kfree(action->secondary); 1585 kfree(action->secondary);
@@ -1674,9 +1736,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1674 return retval; 1736 return retval;
1675 } 1737 }
1676 1738
1677 chip_bus_lock(desc);
1678 retval = __setup_irq(irq, desc, action); 1739 retval = __setup_irq(irq, desc, action);
1679 chip_bus_sync_unlock(desc);
1680 1740
1681 if (retval) { 1741 if (retval) {
1682 irq_chip_pm_put(&desc->irq_data); 1742 irq_chip_pm_put(&desc->irq_data);
@@ -1924,9 +1984,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1924 if (retval < 0) 1984 if (retval < 0)
1925 return retval; 1985 return retval;
1926 1986
1927 chip_bus_lock(desc);
1928 retval = __setup_irq(irq, desc, act); 1987 retval = __setup_irq(irq, desc, act);
1929 chip_bus_sync_unlock(desc);
1930 1988
1931 if (retval) 1989 if (retval)
1932 irq_chip_pm_put(&desc->irq_data); 1990 irq_chip_pm_put(&desc->irq_data);
@@ -1935,9 +1993,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1935} 1993}
1936 1994
1937/** 1995/**
1938 * request_percpu_irq - allocate a percpu interrupt line 1996 * __request_percpu_irq - allocate a percpu interrupt line
1939 * @irq: Interrupt line to allocate 1997 * @irq: Interrupt line to allocate
1940 * @handler: Function to be called when the IRQ occurs. 1998 * @handler: Function to be called when the IRQ occurs.
1999 * @flags: Interrupt type flags (IRQF_TIMER only)
1941 * @devname: An ascii name for the claiming device 2000 * @devname: An ascii name for the claiming device
1942 * @dev_id: A percpu cookie passed back to the handler function 2001 * @dev_id: A percpu cookie passed back to the handler function
1943 * 2002 *
@@ -1950,8 +2009,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1950 * the handler gets called with the interrupted CPU's instance of 2009 * the handler gets called with the interrupted CPU's instance of
1951 * that variable. 2010 * that variable.
1952 */ 2011 */
1953int request_percpu_irq(unsigned int irq, irq_handler_t handler, 2012int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
1954 const char *devname, void __percpu *dev_id) 2013 unsigned long flags, const char *devname,
2014 void __percpu *dev_id)
1955{ 2015{
1956 struct irqaction *action; 2016 struct irqaction *action;
1957 struct irq_desc *desc; 2017 struct irq_desc *desc;
@@ -1965,12 +2025,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1965 !irq_settings_is_per_cpu_devid(desc)) 2025 !irq_settings_is_per_cpu_devid(desc))
1966 return -EINVAL; 2026 return -EINVAL;
1967 2027
2028 if (flags && flags != IRQF_TIMER)
2029 return -EINVAL;
2030
1968 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); 2031 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1969 if (!action) 2032 if (!action)
1970 return -ENOMEM; 2033 return -ENOMEM;
1971 2034
1972 action->handler = handler; 2035 action->handler = handler;
1973 action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; 2036 action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
1974 action->name = devname; 2037 action->name = devname;
1975 action->percpu_dev_id = dev_id; 2038 action->percpu_dev_id = dev_id;
1976 2039
@@ -1980,9 +2043,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1980 return retval; 2043 return retval;
1981 } 2044 }
1982 2045
1983 chip_bus_lock(desc);
1984 retval = __setup_irq(irq, desc, action); 2046 retval = __setup_irq(irq, desc, action);
1985 chip_bus_sync_unlock(desc);
1986 2047
1987 if (retval) { 2048 if (retval) {
1988 irq_chip_pm_put(&desc->irq_data); 2049 irq_chip_pm_put(&desc->irq_data);
@@ -1991,7 +2052,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1991 2052
1992 return retval; 2053 return retval;
1993} 2054}
1994EXPORT_SYMBOL_GPL(request_percpu_irq); 2055EXPORT_SYMBOL_GPL(__request_percpu_irq);
1995 2056
1996/** 2057/**
1997 * irq_get_irqchip_state - returns the irqchip state of a interrupt. 2058 * irq_get_irqchip_state - returns the irqchip state of a interrupt.
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cea1de0161f1..6bd9b58429cc 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc)
149 149
150 /* Pretend that it got disabled ! */ 150 /* Pretend that it got disabled ! */
151 desc->depth++; 151 desc->depth++;
152 irq_state_set_disabled(desc);
153 irq_state_set_masked(desc);
152resume: 154resume:
153 desc->istate &= ~IRQS_SUSPENDED; 155 desc->istate &= ~IRQS_SUSPENDED;
154 __enable_irq(desc); 156 __enable_irq(desc);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6a3b249a2ae1..127e7cfafa55 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -28,12 +28,6 @@
28 28
29#include <asm/sections.h> 29#include <asm/sections.h>
30 30
31#ifdef CONFIG_KALLSYMS_ALL
32#define all_var 1
33#else
34#define all_var 0
35#endif
36
37/* 31/*
38 * These will be re-linked against their real values 32 * These will be re-linked against their real values
39 * during the second link stage. 33 * during the second link stage.
@@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr)
82 76
83static int is_ksym_addr(unsigned long addr) 77static int is_ksym_addr(unsigned long addr)
84{ 78{
85 if (all_var) 79 if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
86 return is_kernel(addr); 80 return is_kernel(addr);
87 81
88 return is_kernel_text(addr) || is_kernel_inittext(addr); 82 return is_kernel_text(addr) || is_kernel_inittext(addr);
@@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
280 if (!symbol_end) { 274 if (!symbol_end) {
281 if (is_kernel_inittext(addr)) 275 if (is_kernel_inittext(addr))
282 symbol_end = (unsigned long)_einittext; 276 symbol_end = (unsigned long)_einittext;
283 else if (all_var) 277 else if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
284 symbol_end = (unsigned long)_end; 278 symbol_end = (unsigned long)_end;
285 else 279 else
286 symbol_end = (unsigned long)_etext; 280 symbol_end = (unsigned long)_etext;
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 3a47fa998fe0..ea34ed8bb952 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -11,6 +11,10 @@
11#include <linux/bug.h> 11#include <linux/bug.h>
12#include <linux/err.h> 12#include <linux/err.h>
13#include <linux/kcmp.h> 13#include <linux/kcmp.h>
14#include <linux/capability.h>
15#include <linux/list.h>
16#include <linux/eventpoll.h>
17#include <linux/file.h>
14 18
15#include <asm/unistd.h> 19#include <asm/unistd.h>
16 20
@@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2)
94 return err; 98 return err;
95} 99}
96 100
101#ifdef CONFIG_EPOLL
102static int kcmp_epoll_target(struct task_struct *task1,
103 struct task_struct *task2,
104 unsigned long idx1,
105 struct kcmp_epoll_slot __user *uslot)
106{
107 struct file *filp, *filp_epoll, *filp_tgt;
108 struct kcmp_epoll_slot slot;
109 struct files_struct *files;
110
111 if (copy_from_user(&slot, uslot, sizeof(slot)))
112 return -EFAULT;
113
114 filp = get_file_raw_ptr(task1, idx1);
115 if (!filp)
116 return -EBADF;
117
118 files = get_files_struct(task2);
119 if (!files)
120 return -EBADF;
121
122 spin_lock(&files->file_lock);
123 filp_epoll = fcheck_files(files, slot.efd);
124 if (filp_epoll)
125 get_file(filp_epoll);
126 else
127 filp_tgt = ERR_PTR(-EBADF);
128 spin_unlock(&files->file_lock);
129 put_files_struct(files);
130
131 if (filp_epoll) {
132 filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
133 fput(filp_epoll);
134 } else
135
136 if (IS_ERR(filp_tgt))
137 return PTR_ERR(filp_tgt);
138
139 return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
140}
141#else
142static int kcmp_epoll_target(struct task_struct *task1,
143 struct task_struct *task2,
144 unsigned long idx1,
145 struct kcmp_epoll_slot __user *uslot)
146{
147 return -EOPNOTSUPP;
148}
149#endif
150
97SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, 151SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
98 unsigned long, idx1, unsigned long, idx2) 152 unsigned long, idx1, unsigned long, idx2)
99{ 153{
@@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
165 ret = -EOPNOTSUPP; 219 ret = -EOPNOTSUPP;
166#endif 220#endif
167 break; 221 break;
222 case KCMP_EPOLL_TFD:
223 ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2);
224 break;
168 default: 225 default:
169 ret = -EINVAL; 226 ret = -EINVAL;
170 break; 227 break;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 980936a90ee6..e62ec4dc6620 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
144 if (ret) 144 if (ret)
145 goto out; 145 goto out;
146 146
147 /*
148 * Some architecture(like S390) may touch the crash memory before
149 * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
150 */
151 ret = kimage_crash_copy_vmcoreinfo(image);
152 if (ret)
153 goto out;
154
147 for (i = 0; i < nr_segments; i++) { 155 for (i = 0; i < nr_segments; i++) {
148 ret = kimage_load_segment(image, &image->segment[i]); 156 ret = kimage_load_segment(image, &image->segment[i]);
149 if (ret) 157 if (ret)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 154ffb489b93..1ae7c41c33c1 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -482,6 +482,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
482 return pages; 482 return pages;
483} 483}
484 484
485int kimage_crash_copy_vmcoreinfo(struct kimage *image)
486{
487 struct page *vmcoreinfo_page;
488 void *safecopy;
489
490 if (image->type != KEXEC_TYPE_CRASH)
491 return 0;
492
493 /*
494 * For kdump, allocate one vmcoreinfo safe copy from the
495 * crash memory. as we have arch_kexec_protect_crashkres()
496 * after kexec syscall, we naturally protect it from write
497 * (even read) access under kernel direct mapping. But on
498 * the other hand, we still need to operate it when crash
499 * happens to generate vmcoreinfo note, hereby we rely on
500 * vmap for this purpose.
501 */
502 vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
503 if (!vmcoreinfo_page) {
504 pr_warn("Could not allocate vmcoreinfo buffer\n");
505 return -ENOMEM;
506 }
507 safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
508 if (!safecopy) {
509 pr_warn("Could not vmap vmcoreinfo buffer\n");
510 return -ENOMEM;
511 }
512
513 image->vmcoreinfo_data_copy = safecopy;
514 crash_update_vmcoreinfo_safecopy(safecopy);
515
516 return 0;
517}
518
485static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) 519static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
486{ 520{
487 if (*image->entry != 0) 521 if (*image->entry != 0)
@@ -569,6 +603,11 @@ void kimage_free(struct kimage *image)
569 if (!image) 603 if (!image)
570 return; 604 return;
571 605
606 if (image->vmcoreinfo_data_copy) {
607 crash_update_vmcoreinfo_safecopy(NULL);
608 vunmap(image->vmcoreinfo_data_copy);
609 }
610
572 kimage_free_extra_pages(image); 611 kimage_free_extra_pages(image);
573 for_each_kimage_entry(image, ptr, entry) { 612 for_each_kimage_entry(image, ptr, entry) {
574 if (entry & IND_INDIRECTION) { 613 if (entry & IND_INDIRECTION) {
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b118735fea9d..9f48f4412297 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -26,13 +26,6 @@
26#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
27#include "kexec_internal.h" 27#include "kexec_internal.h"
28 28
29/*
30 * Declare these symbols weak so that if architecture provides a purgatory,
31 * these will be overridden.
32 */
33char __weak kexec_purgatory[0];
34size_t __weak kexec_purgatory_size = 0;
35
36static int kexec_calculate_store_digests(struct kimage *image); 29static int kexec_calculate_store_digests(struct kimage *image);
37 30
38/* Architectures can provide this probe function */ 31/* Architectures can provide this probe function */
@@ -162,16 +155,10 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
162 } 155 }
163 156
164 if (cmdline_len) { 157 if (cmdline_len) {
165 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); 158 image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
166 if (!image->cmdline_buf) { 159 if (IS_ERR(image->cmdline_buf)) {
167 ret = -ENOMEM; 160 ret = PTR_ERR(image->cmdline_buf);
168 goto out; 161 image->cmdline_buf = NULL;
169 }
170
171 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
172 cmdline_len);
173 if (ret) {
174 ret = -EFAULT;
175 goto out; 162 goto out;
176 } 163 }
177 164
@@ -304,6 +291,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
304 if (ret) 291 if (ret)
305 goto out; 292 goto out;
306 293
294 /*
295 * Some architecture(like S390) may touch the crash memory before
296 * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
297 */
298 ret = kimage_crash_copy_vmcoreinfo(image);
299 if (ret)
300 goto out;
301
307 ret = kexec_calculate_store_digests(image); 302 ret = kexec_calculate_store_digests(image);
308 if (ret) 303 if (ret)
309 goto out; 304 goto out;
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 799a8a452187..50dfcb039a41 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -17,6 +17,8 @@ extern struct mutex kexec_mutex;
17#ifdef CONFIG_KEXEC_FILE 17#ifdef CONFIG_KEXEC_FILE
18#include <linux/purgatory.h> 18#include <linux/purgatory.h>
19void kimage_file_post_load_cleanup(struct kimage *image); 19void kimage_file_post_load_cleanup(struct kimage *image);
20extern char kexec_purgatory[];
21extern size_t kexec_purgatory_size;
20#else /* CONFIG_KEXEC_FILE */ 22#else /* CONFIG_KEXEC_FILE */
21static inline void kimage_file_post_load_cleanup(struct kimage *image) { } 23static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
22#endif /* CONFIG_KEXEC_FILE */ 24#endif /* CONFIG_KEXEC_FILE */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 563f97e2be36..6d016c5d97c8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
45 45
46#include <trace/events/module.h> 46#include <trace/events/module.h>
47 47
48extern int max_threads;
49
50#define CAP_BSET (void *)1 48#define CAP_BSET (void *)1
51#define CAP_PI (void *)2 49#define CAP_PI (void *)2
52 50
@@ -56,6 +54,21 @@ static DEFINE_SPINLOCK(umh_sysctl_lock);
56static DECLARE_RWSEM(umhelper_sem); 54static DECLARE_RWSEM(umhelper_sem);
57 55
58#ifdef CONFIG_MODULES 56#ifdef CONFIG_MODULES
57/*
58 * Assuming:
59 *
60 * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
61 * (u64) THREAD_SIZE * 8UL);
62 *
63 * If you need less than 50 threads would mean we're dealing with systems
64 * smaller than 3200 pages. This assuems you are capable of having ~13M memory,
65 * and this would only be an be an upper limit, after which the OOM killer
66 * would take effect. Systems like these are very unlikely if modules are
67 * enabled.
68 */
69#define MAX_KMOD_CONCURRENT 50
70static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
71static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
59 72
60/* 73/*
61 modprobe_path is set via /proc/sys. 74 modprobe_path is set via /proc/sys.
@@ -127,11 +140,7 @@ int __request_module(bool wait, const char *fmt, ...)
127{ 140{
128 va_list args; 141 va_list args;
129 char module_name[MODULE_NAME_LEN]; 142 char module_name[MODULE_NAME_LEN];
130 unsigned int max_modprobes;
131 int ret; 143 int ret;
132 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
133#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
134 static int kmod_loop_msg;
135 144
136 /* 145 /*
137 * We don't allow synchronous module loading from async. Module 146 * We don't allow synchronous module loading from async. Module
@@ -154,40 +163,25 @@ int __request_module(bool wait, const char *fmt, ...)
154 if (ret) 163 if (ret)
155 return ret; 164 return ret;
156 165
157 /* If modprobe needs a service that is in a module, we get a recursive 166 if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
158 * loop. Limit the number of running kmod threads to max_threads/2 or 167 pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
159 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method 168 atomic_read(&kmod_concurrent_max),
160 * would be to run the parents of this process, counting how many times 169 MAX_KMOD_CONCURRENT, module_name);
161 * kmod was invoked. That would mean accessing the internals of the 170 wait_event_interruptible(kmod_wq,
162 * process tables to get the command line, proc_pid_cmdline is static 171 atomic_dec_if_positive(&kmod_concurrent_max) >= 0);
163 * and it is not worth changing the proc code just to handle this case.
164 * KAO.
165 *
166 * "trace the ppid" is simple, but will fail if someone's
167 * parent exits. I think this is as good as it gets. --RR
168 */
169 max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
170 atomic_inc(&kmod_concurrent);
171 if (atomic_read(&kmod_concurrent) > max_modprobes) {
172 /* We may be blaming an innocent here, but unlikely */
173 if (kmod_loop_msg < 5) {
174 printk(KERN_ERR
175 "request_module: runaway loop modprobe %s\n",
176 module_name);
177 kmod_loop_msg++;
178 }
179 atomic_dec(&kmod_concurrent);
180 return -ENOMEM;
181 } 172 }
182 173
183 trace_module_request(module_name, wait, _RET_IP_); 174 trace_module_request(module_name, wait, _RET_IP_);
184 175
185 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 176 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
186 177
187 atomic_dec(&kmod_concurrent); 178 atomic_inc(&kmod_concurrent_max);
179 wake_up(&kmod_wq);
180
188 return ret; 181 return ret;
189} 182}
190EXPORT_SYMBOL(__request_module); 183EXPORT_SYMBOL(__request_module);
184
191#endif /* CONFIG_MODULES */ 185#endif /* CONFIG_MODULES */
192 186
193static void call_usermodehelper_freeinfo(struct subprocess_info *info) 187static void call_usermodehelper_freeinfo(struct subprocess_info *info)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 23cd70651238..46ba853656f6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
134{ 134{
135 phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); 135 phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
136 return sprintf(buf, "%pa %x\n", &vmcore_base, 136 return sprintf(buf, "%pa %x\n", &vmcore_base,
137 (unsigned int)sizeof(vmcoreinfo_note)); 137 (unsigned int)VMCOREINFO_NOTE_SIZE);
138} 138}
139KERNEL_ATTR_RO(vmcoreinfo); 139KERNEL_ATTR_RO(vmcoreinfo);
140 140
@@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = {
234 NULL 234 NULL
235}; 235};
236 236
237static struct attribute_group kernel_attr_group = { 237static const struct attribute_group kernel_attr_group = {
238 .attrs = kernel_attrs, 238 .attrs = kernel_attrs,
239}; 239};
240 240
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 198527a62149..858a07590e39 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -227,9 +227,9 @@ static void __sched __mutex_lock_slowpath(struct mutex *lock);
227 * (or statically defined) before it can be locked. memset()-ing 227 * (or statically defined) before it can be locked. memset()-ing
228 * the mutex to 0 is not allowed. 228 * the mutex to 0 is not allowed.
229 * 229 *
230 * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging 230 * (The CONFIG_DEBUG_MUTEXES .config option turns on debugging
231 * checks that will enforce the restrictions and will also do 231 * checks that will enforce the restrictions and will also do
232 * deadlock debugging. ) 232 * deadlock debugging)
233 * 233 *
234 * This function is similar to (but not equivalent to) down(). 234 * This function is similar to (but not equivalent to) down().
235 */ 235 */
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index cc3ed0ccdfa2..2655f26ec882 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,6 +20,7 @@
20#include <linux/cpumask.h> 20#include <linux/cpumask.h>
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/spinlock.h>
23#include <asm/qrwlock.h> 24#include <asm/qrwlock.h>
24 25
25/* 26/*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b2caec7315af..fd24153e8a48 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -28,6 +28,7 @@
28#include <linux/percpu.h> 28#include <linux/percpu.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <linux/mutex.h> 30#include <linux/mutex.h>
31#include <linux/prefetch.h>
31#include <asm/byteorder.h> 32#include <asm/byteorder.h>
32#include <asm/qspinlock.h> 33#include <asm/qspinlock.h>
33 34
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e6b2f7ad3e51..4ccfcaae5b89 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void)
193 */ 193 */
194 pv_lock_hash = alloc_large_system_hash("PV qspinlock", 194 pv_lock_hash = alloc_large_system_hash("PV qspinlock",
195 sizeof(struct pv_hash_entry), 195 sizeof(struct pv_hash_entry),
196 pv_hash_size, 0, HASH_EARLY, 196 pv_hash_size, 0,
197 HASH_EARLY | HASH_ZERO,
197 &pv_lock_hash_bits, NULL, 198 &pv_lock_hash_bits, NULL,
198 pv_hash_size, pv_hash_size); 199 pv_hash_size, pv_hash_size);
199} 200}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 78069895032a..649dc9d3951a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
963 return -EDEADLK; 963 return -EDEADLK;
964 964
965 raw_spin_lock(&task->pi_lock); 965 raw_spin_lock(&task->pi_lock);
966 rt_mutex_adjust_prio(task);
967 waiter->task = task; 966 waiter->task = task;
968 waiter->lock = lock; 967 waiter->lock = lock;
969 waiter->prio = task->prio; 968 waiter->prio = task->prio;
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index c65f7989f850..20819df98125 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -231,8 +231,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
231 231
232out_nolock: 232out_nolock:
233 list_del(&waiter.list); 233 list_del(&waiter.list);
234 if (!list_empty(&sem->wait_list)) 234 if (!list_empty(&sem->wait_list) && sem->count >= 0)
235 __rwsem_do_wake(sem, 1); 235 __rwsem_do_wake(sem, 0);
236 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 236 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
237 237
238 return -EINTR; 238 return -EINTR;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 23a6483c3666..124bed776532 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -358,7 +358,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
358 goto err_pfn_remap; 358 goto err_pfn_remap;
359 359
360 mem_hotplug_begin(); 360 mem_hotplug_begin();
361 error = arch_add_memory(nid, align_start, align_size, true); 361 error = arch_add_memory(nid, align_start, align_size, false);
362 if (!error)
363 move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
364 align_start >> PAGE_SHIFT,
365 align_size >> PAGE_SHIFT);
362 mem_hotplug_done(); 366 mem_hotplug_done();
363 if (error) 367 if (error)
364 goto err_add_memory; 368 goto err_add_memory;
diff --git a/kernel/module.c b/kernel/module.c
index 4a3665f8f837..40f983cbea81 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -49,9 +49,7 @@
49#include <linux/rculist.h> 49#include <linux/rculist.h>
50#include <linux/uaccess.h> 50#include <linux/uaccess.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#ifdef CONFIG_STRICT_MODULE_RWX 52#include <linux/set_memory.h>
53#include <asm/set_memory.h>
54#endif
55#include <asm/mmu_context.h> 53#include <asm/mmu_context.h>
56#include <linux/license.h> 54#include <linux/license.h>
57#include <asm/sections.h> 55#include <asm/sections.h>
@@ -302,6 +300,7 @@ int unregister_module_notifier(struct notifier_block *nb)
302EXPORT_SYMBOL(unregister_module_notifier); 300EXPORT_SYMBOL(unregister_module_notifier);
303 301
304struct load_info { 302struct load_info {
303 const char *name;
305 Elf_Ehdr *hdr; 304 Elf_Ehdr *hdr;
306 unsigned long len; 305 unsigned long len;
307 Elf_Shdr *sechdrs; 306 Elf_Shdr *sechdrs;
@@ -602,7 +601,7 @@ static struct module *find_module_all(const char *name, size_t len,
602 601
603 module_assert_mutex_or_preempt(); 602 module_assert_mutex_or_preempt();
604 603
605 list_for_each_entry(mod, &modules, list) { 604 list_for_each_entry_rcu(mod, &modules, list) {
606 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) 605 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
607 continue; 606 continue;
608 if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) 607 if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
@@ -1202,10 +1201,7 @@ static ssize_t store_uevent(struct module_attribute *mattr,
1202 struct module_kobject *mk, 1201 struct module_kobject *mk,
1203 const char *buffer, size_t count) 1202 const char *buffer, size_t count)
1204{ 1203{
1205 enum kobject_action action; 1204 kobject_synth_uevent(&mk->kobj, buffer, count);
1206
1207 if (kobject_action_type(buffer, count, &action) == 0)
1208 kobject_uevent(&mk->kobj, action);
1209 return count; 1205 return count;
1210} 1206}
1211 1207
@@ -1278,12 +1274,13 @@ static u32 resolve_rel_crc(const s32 *crc)
1278 return *(u32 *)((void *)crc + *crc); 1274 return *(u32 *)((void *)crc + *crc);
1279} 1275}
1280 1276
1281static int check_version(Elf_Shdr *sechdrs, 1277static int check_version(const struct load_info *info,
1282 unsigned int versindex,
1283 const char *symname, 1278 const char *symname,
1284 struct module *mod, 1279 struct module *mod,
1285 const s32 *crc) 1280 const s32 *crc)
1286{ 1281{
1282 Elf_Shdr *sechdrs = info->sechdrs;
1283 unsigned int versindex = info->index.vers;
1287 unsigned int i, num_versions; 1284 unsigned int i, num_versions;
1288 struct modversion_info *versions; 1285 struct modversion_info *versions;
1289 1286
@@ -1317,17 +1314,16 @@ static int check_version(Elf_Shdr *sechdrs,
1317 } 1314 }
1318 1315
1319 /* Broken toolchain. Warn once, then let it go.. */ 1316 /* Broken toolchain. Warn once, then let it go.. */
1320 pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); 1317 pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
1321 return 1; 1318 return 1;
1322 1319
1323bad_version: 1320bad_version:
1324 pr_warn("%s: disagrees about version of symbol %s\n", 1321 pr_warn("%s: disagrees about version of symbol %s\n",
1325 mod->name, symname); 1322 info->name, symname);
1326 return 0; 1323 return 0;
1327} 1324}
1328 1325
1329static inline int check_modstruct_version(Elf_Shdr *sechdrs, 1326static inline int check_modstruct_version(const struct load_info *info,
1330 unsigned int versindex,
1331 struct module *mod) 1327 struct module *mod)
1332{ 1328{
1333 const s32 *crc; 1329 const s32 *crc;
@@ -1343,8 +1339,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1343 BUG(); 1339 BUG();
1344 } 1340 }
1345 preempt_enable(); 1341 preempt_enable();
1346 return check_version(sechdrs, versindex, 1342 return check_version(info, VMLINUX_SYMBOL_STR(module_layout),
1347 VMLINUX_SYMBOL_STR(module_layout), mod, crc); 1343 mod, crc);
1348} 1344}
1349 1345
1350/* First part is kernel version, which we ignore if module has crcs. */ 1346/* First part is kernel version, which we ignore if module has crcs. */
@@ -1358,8 +1354,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1358 return strcmp(amagic, bmagic) == 0; 1354 return strcmp(amagic, bmagic) == 0;
1359} 1355}
1360#else 1356#else
1361static inline int check_version(Elf_Shdr *sechdrs, 1357static inline int check_version(const struct load_info *info,
1362 unsigned int versindex,
1363 const char *symname, 1358 const char *symname,
1364 struct module *mod, 1359 struct module *mod,
1365 const s32 *crc) 1360 const s32 *crc)
@@ -1367,8 +1362,7 @@ static inline int check_version(Elf_Shdr *sechdrs,
1367 return 1; 1362 return 1;
1368} 1363}
1369 1364
1370static inline int check_modstruct_version(Elf_Shdr *sechdrs, 1365static inline int check_modstruct_version(const struct load_info *info,
1371 unsigned int versindex,
1372 struct module *mod) 1366 struct module *mod)
1373{ 1367{
1374 return 1; 1368 return 1;
@@ -1404,7 +1398,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
1404 if (!sym) 1398 if (!sym)
1405 goto unlock; 1399 goto unlock;
1406 1400
1407 if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) { 1401 if (!check_version(info, name, mod, crc)) {
1408 sym = ERR_PTR(-EINVAL); 1402 sym = ERR_PTR(-EINVAL);
1409 goto getname; 1403 goto getname;
1410 } 1404 }
@@ -1667,31 +1661,36 @@ static inline void remove_notes_attrs(struct module *mod)
1667} 1661}
1668#endif /* CONFIG_KALLSYMS */ 1662#endif /* CONFIG_KALLSYMS */
1669 1663
1670static void add_usage_links(struct module *mod) 1664static void del_usage_links(struct module *mod)
1671{ 1665{
1672#ifdef CONFIG_MODULE_UNLOAD 1666#ifdef CONFIG_MODULE_UNLOAD
1673 struct module_use *use; 1667 struct module_use *use;
1674 int nowarn;
1675 1668
1676 mutex_lock(&module_mutex); 1669 mutex_lock(&module_mutex);
1677 list_for_each_entry(use, &mod->target_list, target_list) { 1670 list_for_each_entry(use, &mod->target_list, target_list)
1678 nowarn = sysfs_create_link(use->target->holders_dir, 1671 sysfs_remove_link(use->target->holders_dir, mod->name);
1679 &mod->mkobj.kobj, mod->name);
1680 }
1681 mutex_unlock(&module_mutex); 1672 mutex_unlock(&module_mutex);
1682#endif 1673#endif
1683} 1674}
1684 1675
1685static void del_usage_links(struct module *mod) 1676static int add_usage_links(struct module *mod)
1686{ 1677{
1678 int ret = 0;
1687#ifdef CONFIG_MODULE_UNLOAD 1679#ifdef CONFIG_MODULE_UNLOAD
1688 struct module_use *use; 1680 struct module_use *use;
1689 1681
1690 mutex_lock(&module_mutex); 1682 mutex_lock(&module_mutex);
1691 list_for_each_entry(use, &mod->target_list, target_list) 1683 list_for_each_entry(use, &mod->target_list, target_list) {
1692 sysfs_remove_link(use->target->holders_dir, mod->name); 1684 ret = sysfs_create_link(use->target->holders_dir,
1685 &mod->mkobj.kobj, mod->name);
1686 if (ret)
1687 break;
1688 }
1693 mutex_unlock(&module_mutex); 1689 mutex_unlock(&module_mutex);
1690 if (ret)
1691 del_usage_links(mod);
1694#endif 1692#endif
1693 return ret;
1695} 1694}
1696 1695
1697static int module_add_modinfo_attrs(struct module *mod) 1696static int module_add_modinfo_attrs(struct module *mod)
@@ -1802,13 +1801,18 @@ static int mod_sysfs_setup(struct module *mod,
1802 if (err) 1801 if (err)
1803 goto out_unreg_param; 1802 goto out_unreg_param;
1804 1803
1805 add_usage_links(mod); 1804 err = add_usage_links(mod);
1805 if (err)
1806 goto out_unreg_modinfo_attrs;
1807
1806 add_sect_attrs(mod, info); 1808 add_sect_attrs(mod, info);
1807 add_notes_attrs(mod, info); 1809 add_notes_attrs(mod, info);
1808 1810
1809 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1811 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1810 return 0; 1812 return 0;
1811 1813
1814out_unreg_modinfo_attrs:
1815 module_remove_modinfo_attrs(mod);
1812out_unreg_param: 1816out_unreg_param:
1813 module_param_sysfs_remove(mod); 1817 module_param_sysfs_remove(mod);
1814out_unreg_holders: 1818out_unreg_holders:
@@ -2915,9 +2919,15 @@ static int rewrite_section_headers(struct load_info *info, int flags)
2915 info->index.vers = 0; /* Pretend no __versions section! */ 2919 info->index.vers = 0; /* Pretend no __versions section! */
2916 else 2920 else
2917 info->index.vers = find_sec(info, "__versions"); 2921 info->index.vers = find_sec(info, "__versions");
2922 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
2923
2918 info->index.info = find_sec(info, ".modinfo"); 2924 info->index.info = find_sec(info, ".modinfo");
2925 if (!info->index.info)
2926 info->name = "(missing .modinfo section)";
2927 else
2928 info->name = get_modinfo(info, "name");
2919 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; 2929 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2920 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; 2930
2921 return 0; 2931 return 0;
2922} 2932}
2923 2933
@@ -2957,21 +2967,29 @@ static struct module *setup_load_info(struct load_info *info, int flags)
2957 2967
2958 info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); 2968 info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
2959 if (!info->index.mod) { 2969 if (!info->index.mod) {
2960 pr_warn("No module found in object\n"); 2970 pr_warn("%s: No module found in object\n",
2971 info->name ?: "(missing .modinfo name field)");
2961 return ERR_PTR(-ENOEXEC); 2972 return ERR_PTR(-ENOEXEC);
2962 } 2973 }
2963 /* This is temporary: point mod into copy of data. */ 2974 /* This is temporary: point mod into copy of data. */
2964 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2975 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2965 2976
2977 /*
2978 * If we didn't load the .modinfo 'name' field, fall back to
2979 * on-disk struct mod 'name' field.
2980 */
2981 if (!info->name)
2982 info->name = mod->name;
2983
2966 if (info->index.sym == 0) { 2984 if (info->index.sym == 0) {
2967 pr_warn("%s: module has no symbols (stripped?)\n", mod->name); 2985 pr_warn("%s: module has no symbols (stripped?)\n", info->name);
2968 return ERR_PTR(-ENOEXEC); 2986 return ERR_PTR(-ENOEXEC);
2969 } 2987 }
2970 2988
2971 info->index.pcpu = find_pcpusec(info); 2989 info->index.pcpu = find_pcpusec(info);
2972 2990
2973 /* Check module struct version now, before we try to use module. */ 2991 /* Check module struct version now, before we try to use module. */
2974 if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) 2992 if (!check_modstruct_version(info, mod))
2975 return ERR_PTR(-ENOEXEC); 2993 return ERR_PTR(-ENOEXEC);
2976 2994
2977 return mod; 2995 return mod;
@@ -2992,7 +3010,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2992 return err; 3010 return err;
2993 } else if (!same_magic(modmagic, vermagic, info->index.vers)) { 3011 } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
2994 pr_err("%s: version magic '%s' should be '%s'\n", 3012 pr_err("%s: version magic '%s' should be '%s'\n",
2995 mod->name, modmagic, vermagic); 3013 info->name, modmagic, vermagic);
2996 return -ENOEXEC; 3014 return -ENOEXEC;
2997 } 3015 }
2998 3016
@@ -3077,9 +3095,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
3077 mod->trace_events = section_objs(info, "_ftrace_events", 3095 mod->trace_events = section_objs(info, "_ftrace_events",
3078 sizeof(*mod->trace_events), 3096 sizeof(*mod->trace_events),
3079 &mod->num_trace_events); 3097 &mod->num_trace_events);
3080 mod->trace_enums = section_objs(info, "_ftrace_enum_map", 3098 mod->trace_evals = section_objs(info, "_ftrace_eval_map",
3081 sizeof(*mod->trace_enums), 3099 sizeof(*mod->trace_evals),
3082 &mod->num_trace_enums); 3100 &mod->num_trace_evals);
3083#endif 3101#endif
3084#ifdef CONFIG_TRACING 3102#ifdef CONFIG_TRACING
3085 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", 3103 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
@@ -3242,7 +3260,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
3242 3260
3243/* module_blacklist is a comma-separated list of module names */ 3261/* module_blacklist is a comma-separated list of module names */
3244static char *module_blacklist; 3262static char *module_blacklist;
3245static bool blacklisted(char *module_name) 3263static bool blacklisted(const char *module_name)
3246{ 3264{
3247 const char *p; 3265 const char *p;
3248 size_t len; 3266 size_t len;
@@ -3272,7 +3290,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
3272 if (IS_ERR(mod)) 3290 if (IS_ERR(mod))
3273 return mod; 3291 return mod;
3274 3292
3275 if (blacklisted(mod->name)) 3293 if (blacklisted(info->name))
3276 return ERR_PTR(-EPERM); 3294 return ERR_PTR(-EPERM);
3277 3295
3278 err = check_modinfo(mod, info, flags); 3296 err = check_modinfo(mod, info, flags);
@@ -4201,7 +4219,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
4201 goto out; 4219 goto out;
4202 4220
4203 e = search_extable(mod->extable, 4221 e = search_extable(mod->extable,
4204 mod->extable + mod->num_exentries - 1, 4222 mod->num_exentries,
4205 addr); 4223 addr);
4206out: 4224out:
4207 preempt_enable(); 4225 preempt_enable();
diff --git a/kernel/pid.c b/kernel/pid.c
index fd1cde1e4576..731c4e528f4e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
575 */ 575 */
576void __init pidhash_init(void) 576void __init pidhash_init(void)
577{ 577{
578 unsigned int i, pidhash_size; 578 unsigned int pidhash_size;
579 579
580 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 580 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
581 HASH_EARLY | HASH_SMALL, 581 HASH_EARLY | HASH_SMALL | HASH_ZERO,
582 &pidhash_shift, NULL, 582 &pidhash_shift, NULL,
583 0, 4096); 583 0, 4096);
584 pidhash_size = 1U << pidhash_shift; 584 pidhash_size = 1U << pidhash_shift;
585
586 for (i = 0; i < pidhash_size; i++)
587 INIT_HLIST_HEAD(&pid_hash[i]);
588} 585}
589 586
590void __init pidmap_init(void) 587void __init pidmap_init(void)
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a8b978c35a6a..e1914c7b85b1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1108,7 +1108,7 @@ static struct attribute * g[] = {
1108}; 1108};
1109 1109
1110 1110
1111static struct attribute_group attr_group = { 1111static const struct attribute_group attr_group = {
1112 .attrs = g, 1112 .attrs = g,
1113}; 1113};
1114 1114
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d401c21136d1..42bd800a6755 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -705,7 +705,7 @@ static struct attribute * g[] = {
705 NULL, 705 NULL,
706}; 706};
707 707
708static struct attribute_group attr_group = { 708static const struct attribute_group attr_group = {
709 .attrs = g, 709 .attrs = g,
710}; 710};
711 711
diff --git a/kernel/power/process.c b/kernel/power/process.c
index c7209f060eeb..78672d324a6e 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -132,7 +132,7 @@ int freeze_processes(void)
132 if (!pm_freezing) 132 if (!pm_freezing)
133 atomic_inc(&system_freezing_cnt); 133 atomic_inc(&system_freezing_cnt);
134 134
135 pm_wakeup_clear(); 135 pm_wakeup_clear(true);
136 pr_info("Freezing user space processes ... "); 136 pr_info("Freezing user space processes ... ");
137 pm_freezing = true; 137 pm_freezing = true;
138 error = try_to_freeze_tasks(true); 138 error = try_to_freeze_tasks(true);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index fa46606f3356..222317721c5a 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -30,19 +30,17 @@
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/compiler.h> 31#include <linux/compiler.h>
32#include <linux/ktime.h> 32#include <linux/ktime.h>
33#include <linux/set_memory.h>
33 34
34#include <linux/uaccess.h> 35#include <linux/uaccess.h>
35#include <asm/mmu_context.h> 36#include <asm/mmu_context.h>
36#include <asm/pgtable.h> 37#include <asm/pgtable.h>
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include <asm/io.h> 39#include <asm/io.h>
39#ifdef CONFIG_STRICT_KERNEL_RWX
40#include <asm/set_memory.h>
41#endif
42 40
43#include "power.h" 41#include "power.h"
44 42
45#ifdef CONFIG_STRICT_KERNEL_RWX 43#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY)
46static bool hibernate_restore_protection; 44static bool hibernate_restore_protection;
47static bool hibernate_restore_protection_active; 45static bool hibernate_restore_protection_active;
48 46
@@ -77,7 +75,7 @@ static inline void hibernate_restore_protection_begin(void) {}
77static inline void hibernate_restore_protection_end(void) {} 75static inline void hibernate_restore_protection_end(void) {}
78static inline void hibernate_restore_protect_page(void *page_address) {} 76static inline void hibernate_restore_protect_page(void *page_address) {}
79static inline void hibernate_restore_unprotect_page(void *page_address) {} 77static inline void hibernate_restore_unprotect_page(void *page_address) {}
80#endif /* CONFIG_STRICT_KERNEL_RWX */ 78#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */
81 79
82static int swsusp_page_is_free(struct page *); 80static int swsusp_page_is_free(struct page *);
83static void swsusp_set_page_forbidden(struct page *); 81static void swsusp_set_page_forbidden(struct page *);
@@ -1929,8 +1927,7 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
1929 * also be located in the high memory, because of the way in which 1927 * also be located in the high memory, because of the way in which
1930 * copy_data_pages() works. 1928 * copy_data_pages() works.
1931 */ 1929 */
1932static int swsusp_alloc(struct memory_bitmap *orig_bm, 1930static int swsusp_alloc(struct memory_bitmap *copy_bm,
1933 struct memory_bitmap *copy_bm,
1934 unsigned int nr_pages, unsigned int nr_highmem) 1931 unsigned int nr_pages, unsigned int nr_highmem)
1935{ 1932{
1936 if (nr_highmem > 0) { 1933 if (nr_highmem > 0) {
@@ -1976,7 +1973,7 @@ asmlinkage __visible int swsusp_save(void)
1976 return -ENOMEM; 1973 return -ENOMEM;
1977 } 1974 }
1978 1975
1979 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) { 1976 if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
1980 printk(KERN_ERR "PM: Memory allocation failed\n"); 1977 printk(KERN_ERR "PM: Memory allocation failed\n");
1981 return -ENOMEM; 1978 return -ENOMEM;
1982 } 1979 }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 15e6baef5c73..3ecf275d7e44 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -72,6 +72,8 @@ static void freeze_begin(void)
72 72
73static void freeze_enter(void) 73static void freeze_enter(void)
74{ 74{
75 trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true);
76
75 spin_lock_irq(&suspend_freeze_lock); 77 spin_lock_irq(&suspend_freeze_lock);
76 if (pm_wakeup_pending()) 78 if (pm_wakeup_pending())
77 goto out; 79 goto out;
@@ -84,11 +86,9 @@ static void freeze_enter(void)
84 86
85 /* Push all the CPUs into the idle loop. */ 87 /* Push all the CPUs into the idle loop. */
86 wake_up_all_idle_cpus(); 88 wake_up_all_idle_cpus();
87 pr_debug("PM: suspend-to-idle\n");
88 /* Make the current CPU wait so it can enter the idle loop too. */ 89 /* Make the current CPU wait so it can enter the idle loop too. */
89 wait_event(suspend_freeze_wait_head, 90 wait_event(suspend_freeze_wait_head,
90 suspend_freeze_state == FREEZE_STATE_WAKE); 91 suspend_freeze_state == FREEZE_STATE_WAKE);
91 pr_debug("PM: resume from suspend-to-idle\n");
92 92
93 cpuidle_pause(); 93 cpuidle_pause();
94 put_online_cpus(); 94 put_online_cpus();
@@ -98,6 +98,31 @@ static void freeze_enter(void)
98 out: 98 out:
99 suspend_freeze_state = FREEZE_STATE_NONE; 99 suspend_freeze_state = FREEZE_STATE_NONE;
100 spin_unlock_irq(&suspend_freeze_lock); 100 spin_unlock_irq(&suspend_freeze_lock);
101
102 trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false);
103}
104
105static void s2idle_loop(void)
106{
107 pr_debug("PM: suspend-to-idle\n");
108
109 do {
110 freeze_enter();
111
112 if (freeze_ops && freeze_ops->wake)
113 freeze_ops->wake();
114
115 dpm_resume_noirq(PMSG_RESUME);
116 if (freeze_ops && freeze_ops->sync)
117 freeze_ops->sync();
118
119 if (pm_wakeup_pending())
120 break;
121
122 pm_wakeup_clear(false);
123 } while (!dpm_suspend_noirq(PMSG_SUSPEND));
124
125 pr_debug("PM: resume from suspend-to-idle\n");
101} 126}
102 127
103void freeze_wake(void) 128void freeze_wake(void)
@@ -371,10 +396,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
371 * all the devices are suspended. 396 * all the devices are suspended.
372 */ 397 */
373 if (state == PM_SUSPEND_FREEZE) { 398 if (state == PM_SUSPEND_FREEZE) {
374 trace_suspend_resume(TPS("machine_suspend"), state, true); 399 s2idle_loop();
375 freeze_enter(); 400 goto Platform_early_resume;
376 trace_suspend_resume(TPS("machine_suspend"), state, false);
377 goto Platform_wake;
378 } 401 }
379 402
380 error = disable_nonboot_cpus(); 403 error = disable_nonboot_cpus();
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 1db044f808b7..2a7d04049af4 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -18,12 +18,14 @@
18 18
19#ifdef CONFIG_PRINTK 19#ifdef CONFIG_PRINTK
20 20
21#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff 21#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff
22#define PRINTK_NMI_CONTEXT_MASK 0x80000000 22#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000
23#define PRINTK_NMI_CONTEXT_MASK 0x80000000
23 24
24extern raw_spinlock_t logbuf_lock; 25extern raw_spinlock_t logbuf_lock;
25 26
26__printf(1, 0) int vprintk_default(const char *fmt, va_list args); 27__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
28__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
27__printf(1, 0) int vprintk_func(const char *fmt, va_list args); 29__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
28void __printk_safe_enter(void); 30void __printk_safe_enter(void);
29void __printk_safe_exit(void); 31void __printk_safe_exit(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index bd53ea579dc8..fc47863f629c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2720,16 +2720,13 @@ void wake_up_klogd(void)
2720 preempt_enable(); 2720 preempt_enable();
2721} 2721}
2722 2722
2723int printk_deferred(const char *fmt, ...) 2723int vprintk_deferred(const char *fmt, va_list args)
2724{ 2724{
2725 va_list args;
2726 int r; 2725 int r;
2727 2726
2728 preempt_disable();
2729 va_start(args, fmt);
2730 r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); 2727 r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
2731 va_end(args);
2732 2728
2729 preempt_disable();
2733 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); 2730 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
2734 irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); 2731 irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
2735 preempt_enable(); 2732 preempt_enable();
@@ -2737,6 +2734,18 @@ int printk_deferred(const char *fmt, ...)
2737 return r; 2734 return r;
2738} 2735}
2739 2736
2737int printk_deferred(const char *fmt, ...)
2738{
2739 va_list args;
2740 int r;
2741
2742 va_start(args, fmt);
2743 r = vprintk_deferred(fmt, args);
2744 va_end(args);
2745
2746 return r;
2747}
2748
2740/* 2749/*
2741 * printk rate limiting, lifted from the networking subsystem. 2750 * printk rate limiting, lifted from the networking subsystem.
2742 * 2751 *
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 033e50a7d706..3cdaeaef9ce1 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -80,8 +80,8 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
80 * happen, printk_safe_log_store() will notice the buffer->len mismatch 80 * happen, printk_safe_log_store() will notice the buffer->len mismatch
81 * and repeat the write. 81 * and repeat the write.
82 */ 82 */
83static int printk_safe_log_store(struct printk_safe_seq_buf *s, 83static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
84 const char *fmt, va_list args) 84 const char *fmt, va_list args)
85{ 85{
86 int add; 86 int add;
87 size_t len; 87 size_t len;
@@ -299,7 +299,7 @@ void printk_safe_flush_on_panic(void)
299 * one writer running. But the buffer might get flushed from another 299 * one writer running. But the buffer might get flushed from another
300 * CPU, so we need to be careful. 300 * CPU, so we need to be careful.
301 */ 301 */
302static int vprintk_nmi(const char *fmt, va_list args) 302static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
303{ 303{
304 struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); 304 struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
305 305
@@ -308,17 +308,29 @@ static int vprintk_nmi(const char *fmt, va_list args)
308 308
309void printk_nmi_enter(void) 309void printk_nmi_enter(void)
310{ 310{
311 this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); 311 /*
312 * The size of the extra per-CPU buffer is limited. Use it only when
313 * the main one is locked. If this CPU is not in the safe context,
314 * the lock must be taken on another CPU and we could wait for it.
315 */
316 if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) &&
317 raw_spin_is_locked(&logbuf_lock)) {
318 this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
319 } else {
320 this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK);
321 }
312} 322}
313 323
314void printk_nmi_exit(void) 324void printk_nmi_exit(void)
315{ 325{
316 this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); 326 this_cpu_and(printk_context,
327 ~(PRINTK_NMI_CONTEXT_MASK |
328 PRINTK_NMI_DEFERRED_CONTEXT_MASK));
317} 329}
318 330
319#else 331#else
320 332
321static int vprintk_nmi(const char *fmt, va_list args) 333static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
322{ 334{
323 return 0; 335 return 0;
324} 336}
@@ -330,7 +342,7 @@ static int vprintk_nmi(const char *fmt, va_list args)
330 * into itself. It uses a per-CPU buffer to store the message, just like 342 * into itself. It uses a per-CPU buffer to store the message, just like
331 * NMI. 343 * NMI.
332 */ 344 */
333static int vprintk_safe(const char *fmt, va_list args) 345static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
334{ 346{
335 struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); 347 struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
336 348
@@ -351,12 +363,22 @@ void __printk_safe_exit(void)
351 363
352__printf(1, 0) int vprintk_func(const char *fmt, va_list args) 364__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
353{ 365{
366 /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
354 if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) 367 if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
355 return vprintk_nmi(fmt, args); 368 return vprintk_nmi(fmt, args);
356 369
370 /* Use extra buffer to prevent a recursion deadlock in safe mode. */
357 if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) 371 if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
358 return vprintk_safe(fmt, args); 372 return vprintk_safe(fmt, args);
359 373
374 /*
375 * Use the main logbuf when logbuf_lock is available in NMI.
376 * But avoid calling console drivers that might have their own locks.
377 */
378 if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK)
379 return vprintk_deferred(fmt, args);
380
381 /* No obstacles. */
360 return vprintk_default(fmt, args); 382 return vprintk_default(fmt, args);
361} 383}
362 384
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 076a2e31951c..29a397067ffa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -610,6 +610,11 @@ static int sugov_start(struct cpufreq_policy *policy)
610 sg_cpu->sg_policy = sg_policy; 610 sg_cpu->sg_policy = sg_policy;
611 sg_cpu->flags = SCHED_CPUFREQ_RT; 611 sg_cpu->flags = SCHED_CPUFREQ_RT;
612 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; 612 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
613 }
614
615 for_each_cpu(cpu, policy->cpus) {
616 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
617
613 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, 618 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
614 policy_is_shared(policy) ? 619 policy_is_shared(policy) ?
615 sugov_update_shared : 620 sugov_update_shared :
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 67c70e287647..14d2dbf97c53 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr,
611 utime = curr->utime; 611 utime = curr->utime;
612 612
613 /* 613 /*
614 * If either stime or both stime and utime are 0, assume all runtime is 614 * If either stime or utime are 0, assume all runtime is userspace.
615 * userspace. Once a task gets some ticks, the monotonicy code at 615 * Once a task gets some ticks, the monotonicy code at 'update:'
616 * 'update' will ensure things converge to the observed ratio. 616 * will ensure things converge to the observed ratio.
617 */ 617 */
618 if (stime != 0) { 618 if (stime == 0) {
619 if (utime == 0) 619 utime = rtime;
620 stime = rtime; 620 goto update;
621 else
622 stime = scale_stime(stime, rtime, stime + utime);
623 } 621 }
624 622
623 if (utime == 0) {
624 stime = rtime;
625 goto update;
626 }
627
628 stime = scale_stime(stime, rtime, stime + utime);
629
630update:
625 /* 631 /*
626 * Make sure stime doesn't go backwards; this preserves monotonicity 632 * Make sure stime doesn't go backwards; this preserves monotonicity
627 * for utime because rtime is monotonic. 633 * for utime because rtime is monotonic.
@@ -673,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
673#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 679#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
674 680
675#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 681#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
676static u64 vtime_delta(struct task_struct *tsk) 682static u64 vtime_delta(struct vtime *vtime)
677{ 683{
678 unsigned long now = READ_ONCE(jiffies); 684 unsigned long long clock;
679 685
680 if (time_before(now, (unsigned long)tsk->vtime_snap)) 686 clock = sched_clock();
687 if (clock < vtime->starttime)
681 return 0; 688 return 0;
682 689
683 return jiffies_to_nsecs(now - tsk->vtime_snap); 690 return clock - vtime->starttime;
684} 691}
685 692
686static u64 get_vtime_delta(struct task_struct *tsk) 693static u64 get_vtime_delta(struct vtime *vtime)
687{ 694{
688 unsigned long now = READ_ONCE(jiffies); 695 u64 delta = vtime_delta(vtime);
689 u64 delta, other; 696 u64 other;
690 697
691 /* 698 /*
692 * Unlike tick based timing, vtime based timing never has lost 699 * Unlike tick based timing, vtime based timing never has lost
@@ -695,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk)
695 * elapsed time. Limit account_other_time to prevent rounding 702 * elapsed time. Limit account_other_time to prevent rounding
696 * errors from causing elapsed vtime to go negative. 703 * errors from causing elapsed vtime to go negative.
697 */ 704 */
698 delta = jiffies_to_nsecs(now - tsk->vtime_snap);
699 other = account_other_time(delta); 705 other = account_other_time(delta);
700 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); 706 WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
701 tsk->vtime_snap = now; 707 vtime->starttime += delta;
702 708
703 return delta - other; 709 return delta - other;
704} 710}
705 711
706static void __vtime_account_system(struct task_struct *tsk) 712static void __vtime_account_system(struct task_struct *tsk,
713 struct vtime *vtime)
714{
715 vtime->stime += get_vtime_delta(vtime);
716 if (vtime->stime >= TICK_NSEC) {
717 account_system_time(tsk, irq_count(), vtime->stime);
718 vtime->stime = 0;
719 }
720}
721
722static void vtime_account_guest(struct task_struct *tsk,
723 struct vtime *vtime)
707{ 724{
708 account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); 725 vtime->gtime += get_vtime_delta(vtime);
726 if (vtime->gtime >= TICK_NSEC) {
727 account_guest_time(tsk, vtime->gtime);
728 vtime->gtime = 0;
729 }
709} 730}
710 731
711void vtime_account_system(struct task_struct *tsk) 732void vtime_account_system(struct task_struct *tsk)
712{ 733{
713 if (!vtime_delta(tsk)) 734 struct vtime *vtime = &tsk->vtime;
735
736 if (!vtime_delta(vtime))
714 return; 737 return;
715 738
716 write_seqcount_begin(&tsk->vtime_seqcount); 739 write_seqcount_begin(&vtime->seqcount);
717 __vtime_account_system(tsk); 740 /* We might have scheduled out from guest path */
718 write_seqcount_end(&tsk->vtime_seqcount); 741 if (current->flags & PF_VCPU)
742 vtime_account_guest(tsk, vtime);
743 else
744 __vtime_account_system(tsk, vtime);
745 write_seqcount_end(&vtime->seqcount);
719} 746}
720 747
721void vtime_account_user(struct task_struct *tsk) 748void vtime_user_enter(struct task_struct *tsk)
722{ 749{
723 write_seqcount_begin(&tsk->vtime_seqcount); 750 struct vtime *vtime = &tsk->vtime;
724 tsk->vtime_snap_whence = VTIME_SYS; 751
725 if (vtime_delta(tsk)) 752 write_seqcount_begin(&vtime->seqcount);
726 account_user_time(tsk, get_vtime_delta(tsk)); 753 __vtime_account_system(tsk, vtime);
727 write_seqcount_end(&tsk->vtime_seqcount); 754 vtime->state = VTIME_USER;
755 write_seqcount_end(&vtime->seqcount);
728} 756}
729 757
730void vtime_user_enter(struct task_struct *tsk) 758void vtime_user_exit(struct task_struct *tsk)
731{ 759{
732 write_seqcount_begin(&tsk->vtime_seqcount); 760 struct vtime *vtime = &tsk->vtime;
733 if (vtime_delta(tsk)) 761
734 __vtime_account_system(tsk); 762 write_seqcount_begin(&vtime->seqcount);
735 tsk->vtime_snap_whence = VTIME_USER; 763 vtime->utime += get_vtime_delta(vtime);
736 write_seqcount_end(&tsk->vtime_seqcount); 764 if (vtime->utime >= TICK_NSEC) {
765 account_user_time(tsk, vtime->utime);
766 vtime->utime = 0;
767 }
768 vtime->state = VTIME_SYS;
769 write_seqcount_end(&vtime->seqcount);
737} 770}
738 771
739void vtime_guest_enter(struct task_struct *tsk) 772void vtime_guest_enter(struct task_struct *tsk)
740{ 773{
774 struct vtime *vtime = &tsk->vtime;
741 /* 775 /*
742 * The flags must be updated under the lock with 776 * The flags must be updated under the lock with
743 * the vtime_snap flush and update. 777 * the vtime_starttime flush and update.
744 * That enforces a right ordering and update sequence 778 * That enforces a right ordering and update sequence
745 * synchronization against the reader (task_gtime()) 779 * synchronization against the reader (task_gtime())
746 * that can thus safely catch up with a tickless delta. 780 * that can thus safely catch up with a tickless delta.
747 */ 781 */
748 write_seqcount_begin(&tsk->vtime_seqcount); 782 write_seqcount_begin(&vtime->seqcount);
749 if (vtime_delta(tsk)) 783 __vtime_account_system(tsk, vtime);
750 __vtime_account_system(tsk);
751 current->flags |= PF_VCPU; 784 current->flags |= PF_VCPU;
752 write_seqcount_end(&tsk->vtime_seqcount); 785 write_seqcount_end(&vtime->seqcount);
753} 786}
754EXPORT_SYMBOL_GPL(vtime_guest_enter); 787EXPORT_SYMBOL_GPL(vtime_guest_enter);
755 788
756void vtime_guest_exit(struct task_struct *tsk) 789void vtime_guest_exit(struct task_struct *tsk)
757{ 790{
758 write_seqcount_begin(&tsk->vtime_seqcount); 791 struct vtime *vtime = &tsk->vtime;
759 __vtime_account_system(tsk); 792
793 write_seqcount_begin(&vtime->seqcount);
794 vtime_account_guest(tsk, vtime);
760 current->flags &= ~PF_VCPU; 795 current->flags &= ~PF_VCPU;
761 write_seqcount_end(&tsk->vtime_seqcount); 796 write_seqcount_end(&vtime->seqcount);
762} 797}
763EXPORT_SYMBOL_GPL(vtime_guest_exit); 798EXPORT_SYMBOL_GPL(vtime_guest_exit);
764 799
765void vtime_account_idle(struct task_struct *tsk) 800void vtime_account_idle(struct task_struct *tsk)
766{ 801{
767 account_idle_time(get_vtime_delta(tsk)); 802 account_idle_time(get_vtime_delta(&tsk->vtime));
768} 803}
769 804
770void arch_vtime_task_switch(struct task_struct *prev) 805void arch_vtime_task_switch(struct task_struct *prev)
771{ 806{
772 write_seqcount_begin(&prev->vtime_seqcount); 807 struct vtime *vtime = &prev->vtime;
773 prev->vtime_snap_whence = VTIME_INACTIVE; 808
774 write_seqcount_end(&prev->vtime_seqcount); 809 write_seqcount_begin(&vtime->seqcount);
810 vtime->state = VTIME_INACTIVE;
811 write_seqcount_end(&vtime->seqcount);
812
813 vtime = &current->vtime;
775 814
776 write_seqcount_begin(&current->vtime_seqcount); 815 write_seqcount_begin(&vtime->seqcount);
777 current->vtime_snap_whence = VTIME_SYS; 816 vtime->state = VTIME_SYS;
778 current->vtime_snap = jiffies; 817 vtime->starttime = sched_clock();
779 write_seqcount_end(&current->vtime_seqcount); 818 write_seqcount_end(&vtime->seqcount);
780} 819}
781 820
782void vtime_init_idle(struct task_struct *t, int cpu) 821void vtime_init_idle(struct task_struct *t, int cpu)
783{ 822{
823 struct vtime *vtime = &t->vtime;
784 unsigned long flags; 824 unsigned long flags;
785 825
786 local_irq_save(flags); 826 local_irq_save(flags);
787 write_seqcount_begin(&t->vtime_seqcount); 827 write_seqcount_begin(&vtime->seqcount);
788 t->vtime_snap_whence = VTIME_SYS; 828 vtime->state = VTIME_SYS;
789 t->vtime_snap = jiffies; 829 vtime->starttime = sched_clock();
790 write_seqcount_end(&t->vtime_seqcount); 830 write_seqcount_end(&vtime->seqcount);
791 local_irq_restore(flags); 831 local_irq_restore(flags);
792} 832}
793 833
794u64 task_gtime(struct task_struct *t) 834u64 task_gtime(struct task_struct *t)
795{ 835{
836 struct vtime *vtime = &t->vtime;
796 unsigned int seq; 837 unsigned int seq;
797 u64 gtime; 838 u64 gtime;
798 839
@@ -800,13 +841,13 @@ u64 task_gtime(struct task_struct *t)
800 return t->gtime; 841 return t->gtime;
801 842
802 do { 843 do {
803 seq = read_seqcount_begin(&t->vtime_seqcount); 844 seq = read_seqcount_begin(&vtime->seqcount);
804 845
805 gtime = t->gtime; 846 gtime = t->gtime;
806 if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) 847 if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
807 gtime += vtime_delta(t); 848 gtime += vtime->gtime + vtime_delta(vtime);
808 849
809 } while (read_seqcount_retry(&t->vtime_seqcount, seq)); 850 } while (read_seqcount_retry(&vtime->seqcount, seq));
810 851
811 return gtime; 852 return gtime;
812} 853}
@@ -818,8 +859,9 @@ u64 task_gtime(struct task_struct *t)
818 */ 859 */
819void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) 860void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
820{ 861{
821 u64 delta; 862 struct vtime *vtime = &t->vtime;
822 unsigned int seq; 863 unsigned int seq;
864 u64 delta;
823 865
824 if (!vtime_accounting_enabled()) { 866 if (!vtime_accounting_enabled()) {
825 *utime = t->utime; 867 *utime = t->utime;
@@ -828,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
828 } 870 }
829 871
830 do { 872 do {
831 seq = read_seqcount_begin(&t->vtime_seqcount); 873 seq = read_seqcount_begin(&vtime->seqcount);
832 874
833 *utime = t->utime; 875 *utime = t->utime;
834 *stime = t->stime; 876 *stime = t->stime;
835 877
836 /* Task is sleeping, nothing to add */ 878 /* Task is sleeping, nothing to add */
837 if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) 879 if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
838 continue; 880 continue;
839 881
840 delta = vtime_delta(t); 882 delta = vtime_delta(vtime);
841 883
842 /* 884 /*
843 * Task runs either in user or kernel space, add pending nohz time to 885 * Task runs either in user or kernel space, add pending nohz time to
844 * the right place. 886 * the right place.
845 */ 887 */
846 if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) 888 if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
847 *utime += delta; 889 *utime += vtime->utime + delta;
848 else if (t->vtime_snap_whence == VTIME_SYS) 890 else if (vtime->state == VTIME_SYS)
849 *stime += delta; 891 *stime += vtime->stime + delta;
850 } while (read_seqcount_retry(&t->vtime_seqcount, seq)); 892 } while (read_seqcount_retry(&vtime->seqcount, seq));
851} 893}
852#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 894#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a84299f44b5d..755bd3f1a1a9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
1392 struct sched_dl_entity *pi_se = &p->dl; 1392 struct sched_dl_entity *pi_se = &p->dl;
1393 1393
1394 /* 1394 /*
1395 * Use the scheduling parameters of the top pi-waiter 1395 * Use the scheduling parameters of the top pi-waiter task if:
1396 * task if we have one and its (absolute) deadline is 1396 * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
1397 * smaller than our one... OTW we keep our runtime and 1397 * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
1398 * deadline. 1398 * smaller than our deadline OR we are a !SCHED_DEADLINE task getting
1399 * boosted due to a SCHED_DEADLINE pi-waiter).
1400 * Otherwise we keep our runtime and deadline.
1399 */ 1401 */
1400 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { 1402 if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
1401 pi_se = &pi_task->dl; 1403 pi_se = &pi_task->dl;
1402 } else if (!dl_prio(p->normal_prio)) { 1404 } else if (!dl_prio(p->normal_prio)) {
1403 /* 1405 /*
1404 * Special case in which we have a !SCHED_DEADLINE task 1406 * Special case in which we have a !SCHED_DEADLINE task
1405 * that is going to be deboosted, but exceedes its 1407 * that is going to be deboosted, but exceeds its
1406 * runtime while doing so. No point in replenishing 1408 * runtime while doing so. No point in replenishing
1407 * it, as it's going to return back to its original 1409 * it, as it's going to return back to its original
1408 * scheduling class after this. 1410 * scheduling class after this.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 008c514dc241..c95880e216f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6646 * our sched_group. We may want to revisit it if we couldn't 6646 * our sched_group. We may want to revisit it if we couldn't
6647 * meet load balance goals by pulling other tasks on src_cpu. 6647 * meet load balance goals by pulling other tasks on src_cpu.
6648 * 6648 *
6649 * Also avoid computing new_dst_cpu if we have already computed 6649 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
6650 * one in current iteration. 6650 * already computed one in current iteration.
6651 */ 6651 */
6652 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) 6652 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
6653 return 0; 6653 return 0;
6654 6654
6655 /* Prevent to re-select dst_cpu via env's cpus */ 6655 /* Prevent to re-select dst_cpu via env's cpus */
@@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
8022 .tasks = LIST_HEAD_INIT(env.tasks), 8022 .tasks = LIST_HEAD_INIT(env.tasks),
8023 }; 8023 };
8024 8024
8025 /* 8025 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
8026 * For NEWLY_IDLE load_balancing, we don't need to consider
8027 * other cpus in our group
8028 */
8029 if (idle == CPU_NEWLY_IDLE)
8030 env.dst_grpmask = NULL;
8031
8032 cpumask_copy(cpus, cpu_active_mask);
8033 8026
8034 schedstat_inc(sd->lb_count[idle]); 8027 schedstat_inc(sd->lb_count[idle]);
8035 8028
@@ -8151,7 +8144,15 @@ more_balance:
8151 /* All tasks on this runqueue were pinned by CPU affinity */ 8144 /* All tasks on this runqueue were pinned by CPU affinity */
8152 if (unlikely(env.flags & LBF_ALL_PINNED)) { 8145 if (unlikely(env.flags & LBF_ALL_PINNED)) {
8153 cpumask_clear_cpu(cpu_of(busiest), cpus); 8146 cpumask_clear_cpu(cpu_of(busiest), cpus);
8154 if (!cpumask_empty(cpus)) { 8147 /*
8148 * Attempting to continue load balancing at the current
8149 * sched_domain level only makes sense if there are
8150 * active CPUs remaining as possible busiest CPUs to
8151 * pull load from which are not contained within the
8152 * destination group that is receiving any migrated
8153 * load.
8154 */
8155 if (!cpumask_subset(cpus, env.dst_grpmask)) {
8155 env.loop = 0; 8156 env.loop = 0;
8156 env.loop_break = sched_nr_migrate_break; 8157 env.loop_break = sched_nr_migrate_break;
8157 goto redo; 8158 goto redo;
@@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data)
8447 .src_cpu = busiest_rq->cpu, 8448 .src_cpu = busiest_rq->cpu,
8448 .src_rq = busiest_rq, 8449 .src_rq = busiest_rq,
8449 .idle = CPU_IDLE, 8450 .idle = CPU_IDLE,
8451 /*
8452 * can_migrate_task() doesn't need to compute new_dst_cpu
8453 * for active balancing. Since we have CPU_IDLE, but no
8454 * @dst_grpmask we need to make that test go away with lying
8455 * about DST_PINNED.
8456 */
8457 .flags = LBF_DST_PINNED,
8450 }; 8458 };
8451 8459
8452 schedstat_inc(sd->alb_count); 8460 schedstat_inc(sd->alb_count);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 65f61077ad50..98b59b5db90b 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -13,7 +13,7 @@
13 * of Berkeley Packet Filters/Linux Socket Filters. 13 * of Berkeley Packet Filters/Linux Socket Filters.
14 */ 14 */
15 15
16#include <linux/atomic.h> 16#include <linux/refcount.h>
17#include <linux/audit.h> 17#include <linux/audit.h>
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/coredump.h> 19#include <linux/coredump.h>
@@ -56,7 +56,7 @@
56 * to a task_struct (other than @usage). 56 * to a task_struct (other than @usage).
57 */ 57 */
58struct seccomp_filter { 58struct seccomp_filter {
59 atomic_t usage; 59 refcount_t usage;
60 struct seccomp_filter *prev; 60 struct seccomp_filter *prev;
61 struct bpf_prog *prog; 61 struct bpf_prog *prog;
62}; 62};
@@ -378,7 +378,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
378 return ERR_PTR(ret); 378 return ERR_PTR(ret);
379 } 379 }
380 380
381 atomic_set(&sfilter->usage, 1); 381 refcount_set(&sfilter->usage, 1);
382 382
383 return sfilter; 383 return sfilter;
384} 384}
@@ -465,7 +465,7 @@ void get_seccomp_filter(struct task_struct *tsk)
465 if (!orig) 465 if (!orig)
466 return; 466 return;
467 /* Reference count is bounded by the number of total processes. */ 467 /* Reference count is bounded by the number of total processes. */
468 atomic_inc(&orig->usage); 468 refcount_inc(&orig->usage);
469} 469}
470 470
471static inline void seccomp_filter_free(struct seccomp_filter *filter) 471static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -481,7 +481,7 @@ void put_seccomp_filter(struct task_struct *tsk)
481{ 481{
482 struct seccomp_filter *orig = tsk->seccomp.filter; 482 struct seccomp_filter *orig = tsk->seccomp.filter;
483 /* Clean up single-reference branches iteratively. */ 483 /* Clean up single-reference branches iteratively. */
484 while (orig && atomic_dec_and_test(&orig->usage)) { 484 while (orig && refcount_dec_and_test(&orig->usage)) {
485 struct seccomp_filter *freeme = orig; 485 struct seccomp_filter *freeme = orig;
486 orig = orig->prev; 486 orig = orig->prev;
487 seccomp_filter_free(freeme); 487 seccomp_filter_free(freeme);
@@ -641,11 +641,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
641 return 0; 641 return 0;
642 642
643 case SECCOMP_RET_KILL: 643 case SECCOMP_RET_KILL:
644 default: { 644 default:
645 siginfo_t info;
646 audit_seccomp(this_syscall, SIGSYS, action); 645 audit_seccomp(this_syscall, SIGSYS, action);
647 /* Dump core only if this is the last remaining thread. */ 646 /* Dump core only if this is the last remaining thread. */
648 if (get_nr_threads(current) == 1) { 647 if (get_nr_threads(current) == 1) {
648 siginfo_t info;
649
649 /* Show the original registers in the dump. */ 650 /* Show the original registers in the dump. */
650 syscall_rollback(current, task_pt_regs(current)); 651 syscall_rollback(current, task_pt_regs(current));
651 /* Trigger a manual coredump since do_exit skips it. */ 652 /* Trigger a manual coredump since do_exit skips it. */
@@ -654,7 +655,6 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
654 } 655 }
655 do_exit(SIGSYS); 656 do_exit(SIGSYS);
656 } 657 }
657 }
658 658
659 unreachable(); 659 unreachable();
660 660
diff --git a/kernel/signal.c b/kernel/signal.c
index 35a570f71f07..caed9133ae52 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1402,6 +1402,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1402 return ret; 1402 return ret;
1403 } 1403 }
1404 1404
1405 /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */
1406 if (pid == INT_MIN)
1407 return -ESRCH;
1408
1405 read_lock(&tasklist_lock); 1409 read_lock(&tasklist_lock);
1406 if (pid != -1) { 1410 if (pid != -1) {
1407 ret = __kill_pgrp_info(sig, info, 1411 ret = __kill_pgrp_info(sig, info,
@@ -2776,7 +2780,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2776 * @info: if non-null, the signal's siginfo is returned here 2780 * @info: if non-null, the signal's siginfo is returned here
2777 * @ts: upper bound on process time suspension 2781 * @ts: upper bound on process time suspension
2778 */ 2782 */
2779int do_sigtimedwait(const sigset_t *which, siginfo_t *info, 2783static int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2780 const struct timespec *ts) 2784 const struct timespec *ts)
2781{ 2785{
2782 ktime_t *to = NULL, timeout = KTIME_MAX; 2786 ktime_t *to = NULL, timeout = KTIME_MAX;
@@ -2865,6 +2869,40 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2865 return ret; 2869 return ret;
2866} 2870}
2867 2871
2872#ifdef CONFIG_COMPAT
2873COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
2874 struct compat_siginfo __user *, uinfo,
2875 struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
2876{
2877 compat_sigset_t s32;
2878 sigset_t s;
2879 struct timespec t;
2880 siginfo_t info;
2881 long ret;
2882
2883 if (sigsetsize != sizeof(sigset_t))
2884 return -EINVAL;
2885
2886 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
2887 return -EFAULT;
2888 sigset_from_compat(&s, &s32);
2889
2890 if (uts) {
2891 if (compat_get_timespec(&t, uts))
2892 return -EFAULT;
2893 }
2894
2895 ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
2896
2897 if (ret > 0 && uinfo) {
2898 if (copy_siginfo_to_user32(uinfo, &info))
2899 ret = -EFAULT;
2900 }
2901
2902 return ret;
2903}
2904#endif
2905
2868/** 2906/**
2869 * sys_kill - send a signal to a process 2907 * sys_kill - send a signal to a process
2870 * @pid: the PID of the process 2908 * @pid: the PID of the process
@@ -3121,78 +3159,68 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3121} 3159}
3122 3160
3123static int 3161static int
3124do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) 3162do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp)
3125{ 3163{
3126 stack_t oss; 3164 struct task_struct *t = current;
3127 int error;
3128 3165
3129 oss.ss_sp = (void __user *) current->sas_ss_sp; 3166 if (oss) {
3130 oss.ss_size = current->sas_ss_size; 3167 memset(oss, 0, sizeof(stack_t));
3131 oss.ss_flags = sas_ss_flags(sp) | 3168 oss->ss_sp = (void __user *) t->sas_ss_sp;
3132 (current->sas_ss_flags & SS_FLAG_BITS); 3169 oss->ss_size = t->sas_ss_size;
3170 oss->ss_flags = sas_ss_flags(sp) |
3171 (current->sas_ss_flags & SS_FLAG_BITS);
3172 }
3133 3173
3134 if (uss) { 3174 if (ss) {
3135 void __user *ss_sp; 3175 void __user *ss_sp = ss->ss_sp;
3136 size_t ss_size; 3176 size_t ss_size = ss->ss_size;
3137 unsigned ss_flags; 3177 unsigned ss_flags = ss->ss_flags;
3138 int ss_mode; 3178 int ss_mode;
3139 3179
3140 error = -EFAULT; 3180 if (unlikely(on_sig_stack(sp)))
3141 if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) 3181 return -EPERM;
3142 goto out;
3143 error = __get_user(ss_sp, &uss->ss_sp) |
3144 __get_user(ss_flags, &uss->ss_flags) |
3145 __get_user(ss_size, &uss->ss_size);
3146 if (error)
3147 goto out;
3148
3149 error = -EPERM;
3150 if (on_sig_stack(sp))
3151 goto out;
3152 3182
3153 ss_mode = ss_flags & ~SS_FLAG_BITS; 3183 ss_mode = ss_flags & ~SS_FLAG_BITS;
3154 error = -EINVAL; 3184 if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
3155 if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && 3185 ss_mode != 0))
3156 ss_mode != 0) 3186 return -EINVAL;
3157 goto out;
3158 3187
3159 if (ss_mode == SS_DISABLE) { 3188 if (ss_mode == SS_DISABLE) {
3160 ss_size = 0; 3189 ss_size = 0;
3161 ss_sp = NULL; 3190 ss_sp = NULL;
3162 } else { 3191 } else {
3163 error = -ENOMEM; 3192 if (unlikely(ss_size < MINSIGSTKSZ))
3164 if (ss_size < MINSIGSTKSZ) 3193 return -ENOMEM;
3165 goto out;
3166 } 3194 }
3167 3195
3168 current->sas_ss_sp = (unsigned long) ss_sp; 3196 t->sas_ss_sp = (unsigned long) ss_sp;
3169 current->sas_ss_size = ss_size; 3197 t->sas_ss_size = ss_size;
3170 current->sas_ss_flags = ss_flags; 3198 t->sas_ss_flags = ss_flags;
3171 } 3199 }
3172 3200 return 0;
3173 error = 0;
3174 if (uoss) {
3175 error = -EFAULT;
3176 if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
3177 goto out;
3178 error = __put_user(oss.ss_sp, &uoss->ss_sp) |
3179 __put_user(oss.ss_size, &uoss->ss_size) |
3180 __put_user(oss.ss_flags, &uoss->ss_flags);
3181 }
3182
3183out:
3184 return error;
3185} 3201}
3202
3186SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) 3203SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
3187{ 3204{
3188 return do_sigaltstack(uss, uoss, current_user_stack_pointer()); 3205 stack_t new, old;
3206 int err;
3207 if (uss && copy_from_user(&new, uss, sizeof(stack_t)))
3208 return -EFAULT;
3209 err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL,
3210 current_user_stack_pointer());
3211 if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t)))
3212 err = -EFAULT;
3213 return err;
3189} 3214}
3190 3215
3191int restore_altstack(const stack_t __user *uss) 3216int restore_altstack(const stack_t __user *uss)
3192{ 3217{
3193 int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); 3218 stack_t new;
3219 if (copy_from_user(&new, uss, sizeof(stack_t)))
3220 return -EFAULT;
3221 (void)do_sigaltstack(&new, NULL, current_user_stack_pointer());
3194 /* squash all but EFAULT for now */ 3222 /* squash all but EFAULT for now */
3195 return err == -EFAULT ? err : 0; 3223 return 0;
3196} 3224}
3197 3225
3198int __save_altstack(stack_t __user *uss, unsigned long sp) 3226int __save_altstack(stack_t __user *uss, unsigned long sp)
@@ -3215,29 +3243,24 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack,
3215{ 3243{
3216 stack_t uss, uoss; 3244 stack_t uss, uoss;
3217 int ret; 3245 int ret;
3218 mm_segment_t seg;
3219 3246
3220 if (uss_ptr) { 3247 if (uss_ptr) {
3221 compat_stack_t uss32; 3248 compat_stack_t uss32;
3222
3223 memset(&uss, 0, sizeof(stack_t));
3224 if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) 3249 if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
3225 return -EFAULT; 3250 return -EFAULT;
3226 uss.ss_sp = compat_ptr(uss32.ss_sp); 3251 uss.ss_sp = compat_ptr(uss32.ss_sp);
3227 uss.ss_flags = uss32.ss_flags; 3252 uss.ss_flags = uss32.ss_flags;
3228 uss.ss_size = uss32.ss_size; 3253 uss.ss_size = uss32.ss_size;
3229 } 3254 }
3230 seg = get_fs(); 3255 ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss,
3231 set_fs(KERNEL_DS);
3232 ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
3233 (stack_t __force __user *) &uoss,
3234 compat_user_stack_pointer()); 3256 compat_user_stack_pointer());
3235 set_fs(seg);
3236 if (ret >= 0 && uoss_ptr) { 3257 if (ret >= 0 && uoss_ptr) {
3237 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || 3258 compat_stack_t old;
3238 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || 3259 memset(&old, 0, sizeof(old));
3239 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || 3260 old.ss_sp = ptr_to_compat(uoss.ss_sp);
3240 __put_user(uoss.ss_size, &uoss_ptr->ss_size)) 3261 old.ss_flags = uoss.ss_flags;
3262 old.ss_size = uoss.ss_size;
3263 if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t)))
3241 ret = -EFAULT; 3264 ret = -EFAULT;
3242 } 3265 }
3243 return ret; 3266 return ret;
@@ -3277,6 +3300,18 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
3277 return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); 3300 return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
3278} 3301}
3279 3302
3303#ifdef CONFIG_COMPAT
3304COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
3305{
3306 sigset_t set;
3307 int err = do_sigpending(&set, sizeof(old_sigset_t));
3308 if (err == 0)
3309 if (copy_to_user(set32, &set, sizeof(old_sigset_t)))
3310 err = -EFAULT;
3311 return err;
3312}
3313#endif
3314
3280#endif 3315#endif
3281 3316
3282#ifdef __ARCH_WANT_SYS_SIGPROCMASK 3317#ifdef __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/kernel/sys.c b/kernel/sys.c
index 8a94b4eabcaa..2855ee73acd0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -886,7 +886,7 @@ SYSCALL_DEFINE0(getegid)
886 return from_kgid_munged(current_user_ns(), current_egid()); 886 return from_kgid_munged(current_user_ns(), current_egid());
887} 887}
888 888
889void do_sys_times(struct tms *tms) 889static void do_sys_times(struct tms *tms)
890{ 890{
891 u64 tgutime, tgstime, cutime, cstime; 891 u64 tgutime, tgstime, cutime, cstime;
892 892
@@ -912,6 +912,32 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
912 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 912 return (long) jiffies_64_to_clock_t(get_jiffies_64());
913} 913}
914 914
915#ifdef CONFIG_COMPAT
916static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
917{
918 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
919}
920
921COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
922{
923 if (tbuf) {
924 struct tms tms;
925 struct compat_tms tmp;
926
927 do_sys_times(&tms);
928 /* Convert our struct tms to the compat version. */
929 tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
930 tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
931 tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
932 tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
933 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
934 return -EFAULT;
935 }
936 force_successful_syscall_return();
937 return compat_jiffies_to_clock_t(jiffies);
938}
939#endif
940
915/* 941/*
916 * This needs some heavy checking ... 942 * This needs some heavy checking ...
917 * I just haven't the stomach for it. I also don't fully 943 * I just haven't the stomach for it. I also don't fully
@@ -1306,6 +1332,54 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1306 return ret; 1332 return ret;
1307} 1333}
1308 1334
1335#ifdef CONFIG_COMPAT
1336
1337COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
1338 struct compat_rlimit __user *, rlim)
1339{
1340 struct rlimit r;
1341 struct compat_rlimit r32;
1342
1343 if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit)))
1344 return -EFAULT;
1345
1346 if (r32.rlim_cur == COMPAT_RLIM_INFINITY)
1347 r.rlim_cur = RLIM_INFINITY;
1348 else
1349 r.rlim_cur = r32.rlim_cur;
1350 if (r32.rlim_max == COMPAT_RLIM_INFINITY)
1351 r.rlim_max = RLIM_INFINITY;
1352 else
1353 r.rlim_max = r32.rlim_max;
1354 return do_prlimit(current, resource, &r, NULL);
1355}
1356
1357COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
1358 struct compat_rlimit __user *, rlim)
1359{
1360 struct rlimit r;
1361 int ret;
1362
1363 ret = do_prlimit(current, resource, NULL, &r);
1364 if (!ret) {
1365 struct compat_rlimit r32;
1366 if (r.rlim_cur > COMPAT_RLIM_INFINITY)
1367 r32.rlim_cur = COMPAT_RLIM_INFINITY;
1368 else
1369 r32.rlim_cur = r.rlim_cur;
1370 if (r.rlim_max > COMPAT_RLIM_INFINITY)
1371 r32.rlim_max = COMPAT_RLIM_INFINITY;
1372 else
1373 r32.rlim_max = r.rlim_max;
1374
1375 if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit)))
1376 return -EFAULT;
1377 }
1378 return ret;
1379}
1380
1381#endif
1382
1309#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1383#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
1310 1384
1311/* 1385/*
@@ -1328,6 +1402,30 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1328 return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; 1402 return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
1329} 1403}
1330 1404
1405#ifdef CONFIG_COMPAT
1406COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1407 struct compat_rlimit __user *, rlim)
1408{
1409 struct rlimit r;
1410
1411 if (resource >= RLIM_NLIMITS)
1412 return -EINVAL;
1413
1414 task_lock(current->group_leader);
1415 r = current->signal->rlim[resource];
1416 task_unlock(current->group_leader);
1417 if (r.rlim_cur > 0x7FFFFFFF)
1418 r.rlim_cur = 0x7FFFFFFF;
1419 if (r.rlim_max > 0x7FFFFFFF)
1420 r.rlim_max = 0x7FFFFFFF;
1421
1422 if (put_user(r.rlim_cur, &rlim->rlim_cur) ||
1423 put_user(r.rlim_max, &rlim->rlim_max))
1424 return -EFAULT;
1425 return 0;
1426}
1427#endif
1428
1331#endif 1429#endif
1332 1430
1333static inline bool rlim64_is_infinity(__u64 rlim64) 1431static inline bool rlim64_is_infinity(__u64 rlim64)
@@ -1552,7 +1650,7 @@ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
1552 r->ru_oublock += task_io_get_oublock(t); 1650 r->ru_oublock += task_io_get_oublock(t);
1553} 1651}
1554 1652
1555static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1653void getrusage(struct task_struct *p, int who, struct rusage *r)
1556{ 1654{
1557 struct task_struct *t; 1655 struct task_struct *t;
1558 unsigned long flags; 1656 unsigned long flags;
@@ -1626,20 +1724,16 @@ out:
1626 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ 1724 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
1627} 1725}
1628 1726
1629int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1727SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1630{ 1728{
1631 struct rusage r; 1729 struct rusage r;
1632 1730
1633 k_getrusage(p, who, &r);
1634 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1635}
1636
1637SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1638{
1639 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && 1731 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1640 who != RUSAGE_THREAD) 1732 who != RUSAGE_THREAD)
1641 return -EINVAL; 1733 return -EINVAL;
1642 return getrusage(current, who, ru); 1734
1735 getrusage(current, who, &r);
1736 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1643} 1737}
1644 1738
1645#ifdef CONFIG_COMPAT 1739#ifdef CONFIG_COMPAT
@@ -1651,7 +1745,7 @@ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
1651 who != RUSAGE_THREAD) 1745 who != RUSAGE_THREAD)
1652 return -EINVAL; 1746 return -EINVAL;
1653 1747
1654 k_getrusage(current, who, &r); 1748 getrusage(current, who, &r);
1655 return put_compat_rusage(&r, ru); 1749 return put_compat_rusage(&r, ru);
1656} 1750}
1657#endif 1751#endif
@@ -2266,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2266 case PR_GET_THP_DISABLE: 2360 case PR_GET_THP_DISABLE:
2267 if (arg2 || arg3 || arg4 || arg5) 2361 if (arg2 || arg3 || arg4 || arg5)
2268 return -EINVAL; 2362 return -EINVAL;
2269 error = !!(me->mm->def_flags & VM_NOHUGEPAGE); 2363 error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
2270 break; 2364 break;
2271 case PR_SET_THP_DISABLE: 2365 case PR_SET_THP_DISABLE:
2272 if (arg3 || arg4 || arg5) 2366 if (arg3 || arg4 || arg5)
@@ -2274,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2274 if (down_write_killable(&me->mm->mmap_sem)) 2368 if (down_write_killable(&me->mm->mmap_sem))
2275 return -EINTR; 2369 return -EINTR;
2276 if (arg2) 2370 if (arg2)
2277 me->mm->def_flags |= VM_NOHUGEPAGE; 2371 set_bit(MMF_DISABLE_THP, &me->mm->flags);
2278 else 2372 else
2279 me->mm->def_flags &= ~VM_NOHUGEPAGE; 2373 clear_bit(MMF_DISABLE_THP, &me->mm->flags);
2280 up_write(&me->mm->mmap_sem); 2374 up_write(&me->mm->mmap_sem);
2281 break; 2375 break;
2282 case PR_MPX_ENABLE_MANAGEMENT: 2376 case PR_MPX_ENABLE_MANAGEMENT:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a76cc3..6648fbbb8157 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -174,11 +174,32 @@ extern int no_unaligned_warning;
174 174
175#ifdef CONFIG_PROC_SYSCTL 175#ifdef CONFIG_PROC_SYSCTL
176 176
177#define SYSCTL_WRITES_LEGACY -1 177/**
178#define SYSCTL_WRITES_WARN 0 178 * enum sysctl_writes_mode - supported sysctl write modes
179#define SYSCTL_WRITES_STRICT 1 179 *
180 * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value
181 * to be written, and multiple writes on the same sysctl file descriptor
182 * will rewrite the sysctl value, regardless of file position. No warning
183 * is issued when the initial position is not 0.
184 * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is
185 * not 0.
186 * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at
187 * file position 0 and the value must be fully contained in the buffer
188 * sent to the write syscall. If dealing with strings respect the file
189 * position, but restrict this to the max length of the buffer, anything
190 * passed the max lenght will be ignored. Multiple writes will append
191 * to the buffer.
192 *
193 * These write modes control how current file position affects the behavior of
194 * updating sysctl values through the proc interface on each write.
195 */
196enum sysctl_writes_mode {
197 SYSCTL_WRITES_LEGACY = -1,
198 SYSCTL_WRITES_WARN = 0,
199 SYSCTL_WRITES_STRICT = 1,
200};
180 201
181static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; 202static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
182 203
183static int proc_do_cad_pid(struct ctl_table *table, int write, 204static int proc_do_cad_pid(struct ctl_table *table, int write,
184 void __user *buffer, size_t *lenp, loff_t *ppos); 205 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -880,6 +901,14 @@ static struct ctl_table kern_table[] = {
880#endif 901#endif
881 }, 902 },
882 { 903 {
904 .procname = "watchdog_cpumask",
905 .data = &watchdog_cpumask_bits,
906 .maxlen = NR_CPUS,
907 .mode = 0644,
908 .proc_handler = proc_watchdog_cpumask,
909 },
910#ifdef CONFIG_SOFTLOCKUP_DETECTOR
911 {
883 .procname = "soft_watchdog", 912 .procname = "soft_watchdog",
884 .data = &soft_watchdog_enabled, 913 .data = &soft_watchdog_enabled,
885 .maxlen = sizeof (int), 914 .maxlen = sizeof (int),
@@ -889,13 +918,6 @@ static struct ctl_table kern_table[] = {
889 .extra2 = &one, 918 .extra2 = &one,
890 }, 919 },
891 { 920 {
892 .procname = "watchdog_cpumask",
893 .data = &watchdog_cpumask_bits,
894 .maxlen = NR_CPUS,
895 .mode = 0644,
896 .proc_handler = proc_watchdog_cpumask,
897 },
898 {
899 .procname = "softlockup_panic", 921 .procname = "softlockup_panic",
900 .data = &softlockup_panic, 922 .data = &softlockup_panic,
901 .maxlen = sizeof(int), 923 .maxlen = sizeof(int),
@@ -904,27 +926,29 @@ static struct ctl_table kern_table[] = {
904 .extra1 = &zero, 926 .extra1 = &zero,
905 .extra2 = &one, 927 .extra2 = &one,
906 }, 928 },
907#ifdef CONFIG_HARDLOCKUP_DETECTOR 929#ifdef CONFIG_SMP
908 { 930 {
909 .procname = "hardlockup_panic", 931 .procname = "softlockup_all_cpu_backtrace",
910 .data = &hardlockup_panic, 932 .data = &sysctl_softlockup_all_cpu_backtrace,
911 .maxlen = sizeof(int), 933 .maxlen = sizeof(int),
912 .mode = 0644, 934 .mode = 0644,
913 .proc_handler = proc_dointvec_minmax, 935 .proc_handler = proc_dointvec_minmax,
914 .extra1 = &zero, 936 .extra1 = &zero,
915 .extra2 = &one, 937 .extra2 = &one,
916 }, 938 },
939#endif /* CONFIG_SMP */
917#endif 940#endif
918#ifdef CONFIG_SMP 941#ifdef CONFIG_HARDLOCKUP_DETECTOR
919 { 942 {
920 .procname = "softlockup_all_cpu_backtrace", 943 .procname = "hardlockup_panic",
921 .data = &sysctl_softlockup_all_cpu_backtrace, 944 .data = &hardlockup_panic,
922 .maxlen = sizeof(int), 945 .maxlen = sizeof(int),
923 .mode = 0644, 946 .mode = 0644,
924 .proc_handler = proc_dointvec_minmax, 947 .proc_handler = proc_dointvec_minmax,
925 .extra1 = &zero, 948 .extra1 = &zero,
926 .extra2 = &one, 949 .extra2 = &one,
927 }, 950 },
951#ifdef CONFIG_SMP
928 { 952 {
929 .procname = "hardlockup_all_cpu_backtrace", 953 .procname = "hardlockup_all_cpu_backtrace",
930 .data = &sysctl_hardlockup_all_cpu_backtrace, 954 .data = &sysctl_hardlockup_all_cpu_backtrace,
@@ -936,6 +960,8 @@ static struct ctl_table kern_table[] = {
936 }, 960 },
937#endif /* CONFIG_SMP */ 961#endif /* CONFIG_SMP */
938#endif 962#endif
963#endif
964
939#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 965#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
940 { 966 {
941 .procname = "unknown_nmi_panic", 967 .procname = "unknown_nmi_panic",
@@ -1950,6 +1976,32 @@ static void warn_sysctl_write(struct ctl_table *table)
1950} 1976}
1951 1977
1952/** 1978/**
1979 * proc_first_pos_non_zero_ignore - check if firs position is allowed
1980 * @ppos: file position
1981 * @table: the sysctl table
1982 *
1983 * Returns true if the first position is non-zero and the sysctl_writes_strict
1984 * mode indicates this is not allowed for numeric input types. String proc
1985 * hadlers can ignore the return value.
1986 */
1987static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
1988 struct ctl_table *table)
1989{
1990 if (!*ppos)
1991 return false;
1992
1993 switch (sysctl_writes_strict) {
1994 case SYSCTL_WRITES_STRICT:
1995 return true;
1996 case SYSCTL_WRITES_WARN:
1997 warn_sysctl_write(table);
1998 return false;
1999 default:
2000 return false;
2001 }
2002}
2003
2004/**
1953 * proc_dostring - read a string sysctl 2005 * proc_dostring - read a string sysctl
1954 * @table: the sysctl table 2006 * @table: the sysctl table
1955 * @write: %TRUE if this is a write to the sysctl file 2007 * @write: %TRUE if this is a write to the sysctl file
@@ -1969,8 +2021,8 @@ static void warn_sysctl_write(struct ctl_table *table)
1969int proc_dostring(struct ctl_table *table, int write, 2021int proc_dostring(struct ctl_table *table, int write,
1970 void __user *buffer, size_t *lenp, loff_t *ppos) 2022 void __user *buffer, size_t *lenp, loff_t *ppos)
1971{ 2023{
1972 if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) 2024 if (write)
1973 warn_sysctl_write(table); 2025 proc_first_pos_non_zero_ignore(ppos, table);
1974 2026
1975 return _proc_do_string((char *)(table->data), table->maxlen, write, 2027 return _proc_do_string((char *)(table->data), table->maxlen, write,
1976 (char __user *)buffer, lenp, ppos); 2028 (char __user *)buffer, lenp, ppos);
@@ -2128,19 +2180,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
2128 return 0; 2180 return 0;
2129} 2181}
2130 2182
2131static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, 2183static int do_proc_douintvec_conv(unsigned long *lvalp,
2132 int *valp, 2184 unsigned int *valp,
2133 int write, void *data) 2185 int write, void *data)
2134{ 2186{
2135 if (write) { 2187 if (write) {
2136 if (*negp) 2188 if (*lvalp > UINT_MAX)
2137 return -EINVAL; 2189 return -EINVAL;
2138 if (*lvalp > UINT_MAX) 2190 if (*lvalp > UINT_MAX)
2139 return -EINVAL; 2191 return -EINVAL;
2140 *valp = *lvalp; 2192 *valp = *lvalp;
2141 } else { 2193 } else {
2142 unsigned int val = *valp; 2194 unsigned int val = *valp;
2143 *negp = false;
2144 *lvalp = (unsigned long)val; 2195 *lvalp = (unsigned long)val;
2145 } 2196 }
2146 return 0; 2197 return 0;
@@ -2172,17 +2223,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2172 conv = do_proc_dointvec_conv; 2223 conv = do_proc_dointvec_conv;
2173 2224
2174 if (write) { 2225 if (write) {
2175 if (*ppos) { 2226 if (proc_first_pos_non_zero_ignore(ppos, table))
2176 switch (sysctl_writes_strict) { 2227 goto out;
2177 case SYSCTL_WRITES_STRICT:
2178 goto out;
2179 case SYSCTL_WRITES_WARN:
2180 warn_sysctl_write(table);
2181 break;
2182 default:
2183 break;
2184 }
2185 }
2186 2228
2187 if (left > PAGE_SIZE - 1) 2229 if (left > PAGE_SIZE - 1)
2188 left = PAGE_SIZE - 1; 2230 left = PAGE_SIZE - 1;
@@ -2249,6 +2291,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
2249 buffer, lenp, ppos, conv, data); 2291 buffer, lenp, ppos, conv, data);
2250} 2292}
2251 2293
2294static int do_proc_douintvec_w(unsigned int *tbl_data,
2295 struct ctl_table *table,
2296 void __user *buffer,
2297 size_t *lenp, loff_t *ppos,
2298 int (*conv)(unsigned long *lvalp,
2299 unsigned int *valp,
2300 int write, void *data),
2301 void *data)
2302{
2303 unsigned long lval;
2304 int err = 0;
2305 size_t left;
2306 bool neg;
2307 char *kbuf = NULL, *p;
2308
2309 left = *lenp;
2310
2311 if (proc_first_pos_non_zero_ignore(ppos, table))
2312 goto bail_early;
2313
2314 if (left > PAGE_SIZE - 1)
2315 left = PAGE_SIZE - 1;
2316
2317 p = kbuf = memdup_user_nul(buffer, left);
2318 if (IS_ERR(kbuf))
2319 return -EINVAL;
2320
2321 left -= proc_skip_spaces(&p);
2322 if (!left) {
2323 err = -EINVAL;
2324 goto out_free;
2325 }
2326
2327 err = proc_get_long(&p, &left, &lval, &neg,
2328 proc_wspace_sep,
2329 sizeof(proc_wspace_sep), NULL);
2330 if (err || neg) {
2331 err = -EINVAL;
2332 goto out_free;
2333 }
2334
2335 if (conv(&lval, tbl_data, 1, data)) {
2336 err = -EINVAL;
2337 goto out_free;
2338 }
2339
2340 if (!err && left)
2341 left -= proc_skip_spaces(&p);
2342
2343out_free:
2344 kfree(kbuf);
2345 if (err)
2346 return -EINVAL;
2347
2348 return 0;
2349
2350 /* This is in keeping with old __do_proc_dointvec() */
2351bail_early:
2352 *ppos += *lenp;
2353 return err;
2354}
2355
2356static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer,
2357 size_t *lenp, loff_t *ppos,
2358 int (*conv)(unsigned long *lvalp,
2359 unsigned int *valp,
2360 int write, void *data),
2361 void *data)
2362{
2363 unsigned long lval;
2364 int err = 0;
2365 size_t left;
2366
2367 left = *lenp;
2368
2369 if (conv(&lval, tbl_data, 0, data)) {
2370 err = -EINVAL;
2371 goto out;
2372 }
2373
2374 err = proc_put_long(&buffer, &left, lval, false);
2375 if (err || !left)
2376 goto out;
2377
2378 err = proc_put_char(&buffer, &left, '\n');
2379
2380out:
2381 *lenp -= left;
2382 *ppos += *lenp;
2383
2384 return err;
2385}
2386
2387static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
2388 int write, void __user *buffer,
2389 size_t *lenp, loff_t *ppos,
2390 int (*conv)(unsigned long *lvalp,
2391 unsigned int *valp,
2392 int write, void *data),
2393 void *data)
2394{
2395 unsigned int *i, vleft;
2396
2397 if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
2398 *lenp = 0;
2399 return 0;
2400 }
2401
2402 i = (unsigned int *) tbl_data;
2403 vleft = table->maxlen / sizeof(*i);
2404
2405 /*
2406 * Arrays are not supported, keep this simple. *Do not* add
2407 * support for them.
2408 */
2409 if (vleft != 1) {
2410 *lenp = 0;
2411 return -EINVAL;
2412 }
2413
2414 if (!conv)
2415 conv = do_proc_douintvec_conv;
2416
2417 if (write)
2418 return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
2419 conv, data);
2420 return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
2421}
2422
2423static int do_proc_douintvec(struct ctl_table *table, int write,
2424 void __user *buffer, size_t *lenp, loff_t *ppos,
2425 int (*conv)(unsigned long *lvalp,
2426 unsigned int *valp,
2427 int write, void *data),
2428 void *data)
2429{
2430 return __do_proc_douintvec(table->data, table, write,
2431 buffer, lenp, ppos, conv, data);
2432}
2433
2252/** 2434/**
2253 * proc_dointvec - read a vector of integers 2435 * proc_dointvec - read a vector of integers
2254 * @table: the sysctl table 2436 * @table: the sysctl table
@@ -2284,8 +2466,8 @@ int proc_dointvec(struct ctl_table *table, int write,
2284int proc_douintvec(struct ctl_table *table, int write, 2466int proc_douintvec(struct ctl_table *table, int write,
2285 void __user *buffer, size_t *lenp, loff_t *ppos) 2467 void __user *buffer, size_t *lenp, loff_t *ppos)
2286{ 2468{
2287 return do_proc_dointvec(table, write, buffer, lenp, ppos, 2469 return do_proc_douintvec(table, write, buffer, lenp, ppos,
2288 do_proc_douintvec_conv, NULL); 2470 do_proc_douintvec_conv, NULL);
2289} 2471}
2290 2472
2291/* 2473/*
@@ -2390,6 +2572,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2390 do_proc_dointvec_minmax_conv, &param); 2572 do_proc_dointvec_minmax_conv, &param);
2391} 2573}
2392 2574
2575struct do_proc_douintvec_minmax_conv_param {
2576 unsigned int *min;
2577 unsigned int *max;
2578};
2579
2580static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
2581 unsigned int *valp,
2582 int write, void *data)
2583{
2584 struct do_proc_douintvec_minmax_conv_param *param = data;
2585
2586 if (write) {
2587 unsigned int val = *lvalp;
2588
2589 if ((param->min && *param->min > val) ||
2590 (param->max && *param->max < val))
2591 return -ERANGE;
2592
2593 if (*lvalp > UINT_MAX)
2594 return -EINVAL;
2595 *valp = val;
2596 } else {
2597 unsigned int val = *valp;
2598 *lvalp = (unsigned long) val;
2599 }
2600
2601 return 0;
2602}
2603
2604/**
2605 * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
2606 * @table: the sysctl table
2607 * @write: %TRUE if this is a write to the sysctl file
2608 * @buffer: the user buffer
2609 * @lenp: the size of the user buffer
2610 * @ppos: file position
2611 *
2612 * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
2613 * values from/to the user buffer, treated as an ASCII string. Negative
2614 * strings are not allowed.
2615 *
2616 * This routine will ensure the values are within the range specified by
2617 * table->extra1 (min) and table->extra2 (max). There is a final sanity
2618 * check for UINT_MAX to avoid having to support wrap around uses from
2619 * userspace.
2620 *
2621 * Returns 0 on success.
2622 */
2623int proc_douintvec_minmax(struct ctl_table *table, int write,
2624 void __user *buffer, size_t *lenp, loff_t *ppos)
2625{
2626 struct do_proc_douintvec_minmax_conv_param param = {
2627 .min = (unsigned int *) table->extra1,
2628 .max = (unsigned int *) table->extra2,
2629 };
2630 return do_proc_douintvec(table, write, buffer, lenp, ppos,
2631 do_proc_douintvec_minmax_conv, &param);
2632}
2633
2393static void validate_coredump_safety(void) 2634static void validate_coredump_safety(void)
2394{ 2635{
2395#ifdef CONFIG_COREDUMP 2636#ifdef CONFIG_COREDUMP
@@ -2447,17 +2688,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2447 left = *lenp; 2688 left = *lenp;
2448 2689
2449 if (write) { 2690 if (write) {
2450 if (*ppos) { 2691 if (proc_first_pos_non_zero_ignore(ppos, table))
2451 switch (sysctl_writes_strict) { 2692 goto out;
2452 case SYSCTL_WRITES_STRICT:
2453 goto out;
2454 case SYSCTL_WRITES_WARN:
2455 warn_sysctl_write(table);
2456 break;
2457 default:
2458 break;
2459 }
2460 }
2461 2693
2462 if (left > PAGE_SIZE - 1) 2694 if (left > PAGE_SIZE - 1)
2463 left = PAGE_SIZE - 1; 2695 left = PAGE_SIZE - 1;
@@ -2898,6 +3130,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2898 return -ENOSYS; 3130 return -ENOSYS;
2899} 3131}
2900 3132
3133int proc_douintvec_minmax(struct ctl_table *table, int write,
3134 void __user *buffer, size_t *lenp, loff_t *ppos)
3135{
3136 return -ENOSYS;
3137}
3138
2901int proc_dointvec_jiffies(struct ctl_table *table, int write, 3139int proc_dointvec_jiffies(struct ctl_table *table, int write,
2902 void __user *buffer, size_t *lenp, loff_t *ppos) 3140 void __user *buffer, size_t *lenp, loff_t *ppos)
2903{ 3141{
@@ -2940,6 +3178,7 @@ EXPORT_SYMBOL(proc_dointvec);
2940EXPORT_SYMBOL(proc_douintvec); 3178EXPORT_SYMBOL(proc_douintvec);
2941EXPORT_SYMBOL(proc_dointvec_jiffies); 3179EXPORT_SYMBOL(proc_dointvec_jiffies);
2942EXPORT_SYMBOL(proc_dointvec_minmax); 3180EXPORT_SYMBOL(proc_dointvec_minmax);
3181EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
2943EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); 3182EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
2944EXPORT_SYMBOL(proc_dointvec_ms_jiffies); 3183EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
2945EXPORT_SYMBOL(proc_dostring); 3184EXPORT_SYMBOL(proc_dostring);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 939a158eab11..02e1859f2ca8 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1346 * CTL_KERN/KERN_VERSION is used by older glibc and cannot 1346 * CTL_KERN/KERN_VERSION is used by older glibc and cannot
1347 * ever go away. 1347 * ever go away.
1348 */ 1348 */
1349 if (name[0] == CTL_KERN && name[1] == KERN_VERSION) 1349 if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION)
1350 return; 1350 return;
1351 1351
1352 if (printk_ratelimit()) { 1352 if (printk_ratelimit()) {
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c991cf212c6d..0b8ff7d257ea 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -712,14 +712,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
712 alarmtimer_freezerset(absexp, type); 712 alarmtimer_freezerset(absexp, type);
713 restart = &current->restart_block; 713 restart = &current->restart_block;
714 if (restart->nanosleep.type != TT_NONE) { 714 if (restart->nanosleep.type != TT_NONE) {
715 struct timespec rmt; 715 struct timespec64 rmt;
716 ktime_t rem; 716 ktime_t rem;
717 717
718 rem = ktime_sub(absexp, alarm_bases[type].gettime()); 718 rem = ktime_sub(absexp, alarm_bases[type].gettime());
719 719
720 if (rem <= 0) 720 if (rem <= 0)
721 return 0; 721 return 0;
722 rmt = ktime_to_timespec(rem); 722 rmt = ktime_to_timespec64(rem);
723 723
724 return nanosleep_copyout(restart, &rmt); 724 return nanosleep_copyout(restart, &rmt);
725 } 725 }
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 81da124f1115..88f75f92ef36 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1440,17 +1440,17 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1440} 1440}
1441EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); 1441EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1442 1442
1443int nanosleep_copyout(struct restart_block *restart, struct timespec *ts) 1443int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
1444{ 1444{
1445 switch(restart->nanosleep.type) { 1445 switch(restart->nanosleep.type) {
1446#ifdef CONFIG_COMPAT 1446#ifdef CONFIG_COMPAT
1447 case TT_COMPAT: 1447 case TT_COMPAT:
1448 if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp)) 1448 if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp))
1449 return -EFAULT; 1449 return -EFAULT;
1450 break; 1450 break;
1451#endif 1451#endif
1452 case TT_NATIVE: 1452 case TT_NATIVE:
1453 if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec))) 1453 if (put_timespec64(ts, restart->nanosleep.rmtp))
1454 return -EFAULT; 1454 return -EFAULT;
1455 break; 1455 break;
1456 default: 1456 default:
@@ -1485,11 +1485,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1485 restart = &current->restart_block; 1485 restart = &current->restart_block;
1486 if (restart->nanosleep.type != TT_NONE) { 1486 if (restart->nanosleep.type != TT_NONE) {
1487 ktime_t rem = hrtimer_expires_remaining(&t->timer); 1487 ktime_t rem = hrtimer_expires_remaining(&t->timer);
1488 struct timespec rmt; 1488 struct timespec64 rmt;
1489 1489
1490 if (rem <= 0) 1490 if (rem <= 0)
1491 return 0; 1491 return 0;
1492 rmt = ktime_to_timespec(rem); 1492 rmt = ktime_to_timespec64(rem);
1493 1493
1494 return nanosleep_copyout(restart, &rmt); 1494 return nanosleep_copyout(restart, &rmt);
1495 } 1495 }
@@ -1546,19 +1546,17 @@ out:
1546SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, 1546SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1547 struct timespec __user *, rmtp) 1547 struct timespec __user *, rmtp)
1548{ 1548{
1549 struct timespec64 tu64; 1549 struct timespec64 tu;
1550 struct timespec tu;
1551 1550
1552 if (copy_from_user(&tu, rqtp, sizeof(tu))) 1551 if (get_timespec64(&tu, rqtp))
1553 return -EFAULT; 1552 return -EFAULT;
1554 1553
1555 tu64 = timespec_to_timespec64(tu); 1554 if (!timespec64_valid(&tu))
1556 if (!timespec64_valid(&tu64))
1557 return -EINVAL; 1555 return -EINVAL;
1558 1556
1559 current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; 1557 current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
1560 current->restart_block.nanosleep.rmtp = rmtp; 1558 current->restart_block.nanosleep.rmtp = rmtp;
1561 return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); 1559 return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1562} 1560}
1563 1561
1564#ifdef CONFIG_COMPAT 1562#ifdef CONFIG_COMPAT
@@ -1566,19 +1564,17 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1566COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, 1564COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
1567 struct compat_timespec __user *, rmtp) 1565 struct compat_timespec __user *, rmtp)
1568{ 1566{
1569 struct timespec64 tu64; 1567 struct timespec64 tu;
1570 struct timespec tu;
1571 1568
1572 if (compat_get_timespec(&tu, rqtp)) 1569 if (compat_get_timespec64(&tu, rqtp))
1573 return -EFAULT; 1570 return -EFAULT;
1574 1571
1575 tu64 = timespec_to_timespec64(tu); 1572 if (!timespec64_valid(&tu))
1576 if (!timespec64_valid(&tu64))
1577 return -EINVAL; 1573 return -EINVAL;
1578 1574
1579 current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; 1575 current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
1580 current->restart_block.nanosleep.compat_rmtp = rmtp; 1576 current->restart_block.nanosleep.compat_rmtp = rmtp;
1581 return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); 1577 return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
1582} 1578}
1583#endif 1579#endif
1584 1580
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 60cb24ac9ebc..a3bd5dbe0dc4 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1318,12 +1318,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1318 */ 1318 */
1319 restart = &current->restart_block; 1319 restart = &current->restart_block;
1320 restart->nanosleep.expires = expires; 1320 restart->nanosleep.expires = expires;
1321 if (restart->nanosleep.type != TT_NONE) { 1321 if (restart->nanosleep.type != TT_NONE)
1322 struct timespec ts; 1322 error = nanosleep_copyout(restart, &it.it_value);
1323
1324 ts = timespec64_to_timespec(it.it_value);
1325 error = nanosleep_copyout(restart, &ts);
1326 }
1327 } 1323 }
1328 1324
1329 return error; 1325 return error;
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 38f3b20efa29..06f34feb635e 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -41,12 +41,6 @@ SYS_NI(setitimer);
41#ifdef __ARCH_WANT_SYS_ALARM 41#ifdef __ARCH_WANT_SYS_ALARM
42SYS_NI(alarm); 42SYS_NI(alarm);
43#endif 43#endif
44COMPAT_SYS_NI(timer_create);
45COMPAT_SYS_NI(clock_adjtime);
46COMPAT_SYS_NI(timer_settime);
47COMPAT_SYS_NI(timer_gettime);
48COMPAT_SYS_NI(getitimer);
49COMPAT_SYS_NI(setitimer);
50 44
51/* 45/*
52 * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC 46 * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
@@ -57,40 +51,52 @@ COMPAT_SYS_NI(setitimer);
57SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 51SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
58 const struct timespec __user *, tp) 52 const struct timespec __user *, tp)
59{ 53{
60 struct timespec64 new_tp64; 54 struct timespec64 new_tp;
61 struct timespec new_tp;
62 55
63 if (which_clock != CLOCK_REALTIME) 56 if (which_clock != CLOCK_REALTIME)
64 return -EINVAL; 57 return -EINVAL;
65 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 58 if (get_timespec64(&new_tp, tp))
66 return -EFAULT; 59 return -EFAULT;
67 60
68 new_tp64 = timespec_to_timespec64(new_tp); 61 return do_sys_settimeofday64(&new_tp, NULL);
69 return do_sys_settimeofday64(&new_tp64, NULL);
70} 62}
71 63
72SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 64int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
73 struct timespec __user *,tp)
74{ 65{
75 struct timespec64 kernel_tp64;
76 struct timespec kernel_tp;
77
78 switch (which_clock) { 66 switch (which_clock) {
79 case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; 67 case CLOCK_REALTIME:
80 case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; 68 ktime_get_real_ts64(tp);
81 case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; 69 break;
82 default: return -EINVAL; 70 case CLOCK_MONOTONIC:
71 ktime_get_ts64(tp);
72 break;
73 case CLOCK_BOOTTIME:
74 get_monotonic_boottime64(tp);
75 break;
76 default:
77 return -EINVAL;
83 } 78 }
84 79
85 kernel_tp = timespec64_to_timespec(kernel_tp64); 80 return 0;
86 if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 81}
82SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
83 struct timespec __user *, tp)
84{
85 int ret;
86 struct timespec64 kernel_tp;
87
88 ret = do_clock_gettime(which_clock, &kernel_tp);
89 if (ret)
90 return ret;
91
92 if (put_timespec64(&kernel_tp, tp))
87 return -EFAULT; 93 return -EFAULT;
88 return 0; 94 return 0;
89} 95}
90 96
91SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) 97SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp)
92{ 98{
93 struct timespec rtn_tp = { 99 struct timespec64 rtn_tp = {
94 .tv_sec = 0, 100 .tv_sec = 0,
95 .tv_nsec = hrtimer_resolution, 101 .tv_nsec = hrtimer_resolution,
96 }; 102 };
@@ -99,7 +105,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us
99 case CLOCK_REALTIME: 105 case CLOCK_REALTIME:
100 case CLOCK_MONOTONIC: 106 case CLOCK_MONOTONIC:
101 case CLOCK_BOOTTIME: 107 case CLOCK_BOOTTIME:
102 if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) 108 if (put_timespec64(&rtn_tp, tp))
103 return -EFAULT; 109 return -EFAULT;
104 return 0; 110 return 0;
105 default: 111 default:
@@ -138,44 +144,45 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
138} 144}
139 145
140#ifdef CONFIG_COMPAT 146#ifdef CONFIG_COMPAT
147COMPAT_SYS_NI(timer_create);
148COMPAT_SYS_NI(clock_adjtime);
149COMPAT_SYS_NI(timer_settime);
150COMPAT_SYS_NI(timer_gettime);
151COMPAT_SYS_NI(getitimer);
152COMPAT_SYS_NI(setitimer);
153
141COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 154COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
142 struct compat_timespec __user *, tp) 155 struct compat_timespec __user *, tp)
143{ 156{
144 struct timespec64 new_tp64; 157 struct timespec64 new_tp;
145 struct timespec new_tp;
146 158
147 if (which_clock != CLOCK_REALTIME) 159 if (which_clock != CLOCK_REALTIME)
148 return -EINVAL; 160 return -EINVAL;
149 if (compat_get_timespec(&new_tp, tp)) 161 if (compat_get_timespec64(&new_tp, tp))
150 return -EFAULT; 162 return -EFAULT;
151 163
152 new_tp64 = timespec_to_timespec64(new_tp); 164 return do_sys_settimeofday64(&new_tp, NULL);
153 return do_sys_settimeofday64(&new_tp64, NULL);
154} 165}
155 166
156COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 167COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
157 struct compat_timespec __user *,tp) 168 struct compat_timespec __user *, tp)
158{ 169{
159 struct timespec64 kernel_tp64; 170 int ret;
160 struct timespec kernel_tp; 171 struct timespec64 kernel_tp;
161 172
162 switch (which_clock) { 173 ret = do_clock_gettime(which_clock, &kernel_tp);
163 case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; 174 if (ret)
164 case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; 175 return ret;
165 case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
166 default: return -EINVAL;
167 }
168 176
169 kernel_tp = timespec64_to_timespec(kernel_tp64); 177 if (compat_put_timespec64(&kernel_tp, tp))
170 if (compat_put_timespec(&kernel_tp, tp))
171 return -EFAULT; 178 return -EFAULT;
172 return 0; 179 return 0;
173} 180}
174 181
175COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, 182COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
176 struct compat_timespec __user *, tp) 183 struct compat_timespec __user *, tp)
177{ 184{
178 struct timespec rtn_tp = { 185 struct timespec64 rtn_tp = {
179 .tv_sec = 0, 186 .tv_sec = 0,
180 .tv_nsec = hrtimer_resolution, 187 .tv_nsec = hrtimer_resolution,
181 }; 188 };
@@ -184,13 +191,14 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
184 case CLOCK_REALTIME: 191 case CLOCK_REALTIME:
185 case CLOCK_MONOTONIC: 192 case CLOCK_MONOTONIC:
186 case CLOCK_BOOTTIME: 193 case CLOCK_BOOTTIME:
187 if (compat_put_timespec(&rtn_tp, tp)) 194 if (compat_put_timespec64(&rtn_tp, tp))
188 return -EFAULT; 195 return -EFAULT;
189 return 0; 196 return 0;
190 default: 197 default:
191 return -EINVAL; 198 return -EINVAL;
192 } 199 }
193} 200}
201
194COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, 202COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
195 struct compat_timespec __user *, rqtp, 203 struct compat_timespec __user *, rqtp,
196 struct compat_timespec __user *, rmtp) 204 struct compat_timespec __user *, rmtp)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 82d67be7d9d1..13d6881f908b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -739,13 +739,11 @@ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
739SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 739SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
740 struct itimerspec __user *, setting) 740 struct itimerspec __user *, setting)
741{ 741{
742 struct itimerspec64 cur_setting64; 742 struct itimerspec64 cur_setting;
743 743
744 int ret = do_timer_gettime(timer_id, &cur_setting64); 744 int ret = do_timer_gettime(timer_id, &cur_setting);
745 if (!ret) { 745 if (!ret) {
746 struct itimerspec cur_setting; 746 if (put_itimerspec64(&cur_setting, setting))
747 cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
748 if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
749 ret = -EFAULT; 747 ret = -EFAULT;
750 } 748 }
751 return ret; 749 return ret;
@@ -755,13 +753,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
755COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 753COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
756 struct compat_itimerspec __user *, setting) 754 struct compat_itimerspec __user *, setting)
757{ 755{
758 struct itimerspec64 cur_setting64; 756 struct itimerspec64 cur_setting;
759 757
760 int ret = do_timer_gettime(timer_id, &cur_setting64); 758 int ret = do_timer_gettime(timer_id, &cur_setting);
761 if (!ret) { 759 if (!ret) {
762 struct itimerspec cur_setting; 760 if (put_compat_itimerspec64(&cur_setting, setting))
763 cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
764 if (put_compat_itimerspec(setting, &cur_setting))
765 ret = -EFAULT; 761 ret = -EFAULT;
766 } 762 }
767 return ret; 763 return ret;
@@ -907,23 +903,19 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
907 const struct itimerspec __user *, new_setting, 903 const struct itimerspec __user *, new_setting,
908 struct itimerspec __user *, old_setting) 904 struct itimerspec __user *, old_setting)
909{ 905{
910 struct itimerspec64 new_spec64, old_spec64; 906 struct itimerspec64 new_spec, old_spec;
911 struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; 907 struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
912 struct itimerspec new_spec;
913 int error = 0; 908 int error = 0;
914 909
915 if (!new_setting) 910 if (!new_setting)
916 return -EINVAL; 911 return -EINVAL;
917 912
918 if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) 913 if (get_itimerspec64(&new_spec, new_setting))
919 return -EFAULT; 914 return -EFAULT;
920 new_spec64 = itimerspec_to_itimerspec64(&new_spec);
921 915
922 error = do_timer_settime(timer_id, flags, &new_spec64, rtn); 916 error = do_timer_settime(timer_id, flags, &new_spec, rtn);
923 if (!error && old_setting) { 917 if (!error && old_setting) {
924 struct itimerspec old_spec; 918 if (put_itimerspec64(&old_spec, old_setting))
925 old_spec = itimerspec64_to_itimerspec(&old_spec64);
926 if (copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
927 error = -EFAULT; 919 error = -EFAULT;
928 } 920 }
929 return error; 921 return error;
@@ -934,22 +926,18 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
934 struct compat_itimerspec __user *, new, 926 struct compat_itimerspec __user *, new,
935 struct compat_itimerspec __user *, old) 927 struct compat_itimerspec __user *, old)
936{ 928{
937 struct itimerspec64 new_spec64, old_spec64; 929 struct itimerspec64 new_spec, old_spec;
938 struct itimerspec64 *rtn = old ? &old_spec64 : NULL; 930 struct itimerspec64 *rtn = old ? &old_spec : NULL;
939 struct itimerspec new_spec;
940 int error = 0; 931 int error = 0;
941 932
942 if (!new) 933 if (!new)
943 return -EINVAL; 934 return -EINVAL;
944 if (get_compat_itimerspec(&new_spec, new)) 935 if (get_compat_itimerspec64(&new_spec, new))
945 return -EFAULT; 936 return -EFAULT;
946 937
947 new_spec64 = itimerspec_to_itimerspec64(&new_spec); 938 error = do_timer_settime(timer_id, flags, &new_spec, rtn);
948 error = do_timer_settime(timer_id, flags, &new_spec64, rtn);
949 if (!error && old) { 939 if (!error && old) {
950 struct itimerspec old_spec; 940 if (put_compat_itimerspec64(&old_spec, old))
951 old_spec = itimerspec64_to_itimerspec(&old_spec64);
952 if (put_compat_itimerspec(old, &old_spec))
953 error = -EFAULT; 941 error = -EFAULT;
954 } 942 }
955 return error; 943 return error;
@@ -1049,34 +1037,30 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
1049 const struct timespec __user *, tp) 1037 const struct timespec __user *, tp)
1050{ 1038{
1051 const struct k_clock *kc = clockid_to_kclock(which_clock); 1039 const struct k_clock *kc = clockid_to_kclock(which_clock);
1052 struct timespec64 new_tp64; 1040 struct timespec64 new_tp;
1053 struct timespec new_tp;
1054 1041
1055 if (!kc || !kc->clock_set) 1042 if (!kc || !kc->clock_set)
1056 return -EINVAL; 1043 return -EINVAL;
1057 1044
1058 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 1045 if (get_timespec64(&new_tp, tp))
1059 return -EFAULT; 1046 return -EFAULT;
1060 new_tp64 = timespec_to_timespec64(new_tp);
1061 1047
1062 return kc->clock_set(which_clock, &new_tp64); 1048 return kc->clock_set(which_clock, &new_tp);
1063} 1049}
1064 1050
1065SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 1051SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
1066 struct timespec __user *,tp) 1052 struct timespec __user *,tp)
1067{ 1053{
1068 const struct k_clock *kc = clockid_to_kclock(which_clock); 1054 const struct k_clock *kc = clockid_to_kclock(which_clock);
1069 struct timespec64 kernel_tp64; 1055 struct timespec64 kernel_tp;
1070 struct timespec kernel_tp;
1071 int error; 1056 int error;
1072 1057
1073 if (!kc) 1058 if (!kc)
1074 return -EINVAL; 1059 return -EINVAL;
1075 1060
1076 error = kc->clock_get(which_clock, &kernel_tp64); 1061 error = kc->clock_get(which_clock, &kernel_tp);
1077 kernel_tp = timespec64_to_timespec(kernel_tp64);
1078 1062
1079 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 1063 if (!error && put_timespec64(&kernel_tp, tp))
1080 error = -EFAULT; 1064 error = -EFAULT;
1081 1065
1082 return error; 1066 return error;
@@ -1109,17 +1093,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
1109 struct timespec __user *, tp) 1093 struct timespec __user *, tp)
1110{ 1094{
1111 const struct k_clock *kc = clockid_to_kclock(which_clock); 1095 const struct k_clock *kc = clockid_to_kclock(which_clock);
1112 struct timespec64 rtn_tp64; 1096 struct timespec64 rtn_tp;
1113 struct timespec rtn_tp;
1114 int error; 1097 int error;
1115 1098
1116 if (!kc) 1099 if (!kc)
1117 return -EINVAL; 1100 return -EINVAL;
1118 1101
1119 error = kc->clock_getres(which_clock, &rtn_tp64); 1102 error = kc->clock_getres(which_clock, &rtn_tp);
1120 rtn_tp = timespec64_to_timespec(rtn_tp64);
1121 1103
1122 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) 1104 if (!error && tp && put_timespec64(&rtn_tp, tp))
1123 error = -EFAULT; 1105 error = -EFAULT;
1124 1106
1125 return error; 1107 return error;
@@ -1131,38 +1113,33 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
1131 struct compat_timespec __user *, tp) 1113 struct compat_timespec __user *, tp)
1132{ 1114{
1133 const struct k_clock *kc = clockid_to_kclock(which_clock); 1115 const struct k_clock *kc = clockid_to_kclock(which_clock);
1134 struct timespec64 new_tp64; 1116 struct timespec64 ts;
1135 struct timespec new_tp;
1136 1117
1137 if (!kc || !kc->clock_set) 1118 if (!kc || !kc->clock_set)
1138 return -EINVAL; 1119 return -EINVAL;
1139 1120
1140 if (compat_get_timespec(&new_tp, tp)) 1121 if (compat_get_timespec64(&ts, tp))
1141 return -EFAULT; 1122 return -EFAULT;
1142 1123
1143 new_tp64 = timespec_to_timespec64(new_tp); 1124 return kc->clock_set(which_clock, &ts);
1144
1145 return kc->clock_set(which_clock, &new_tp64);
1146} 1125}
1147 1126
1148COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, 1127COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
1149 struct compat_timespec __user *, tp) 1128 struct compat_timespec __user *, tp)
1150{ 1129{
1151 const struct k_clock *kc = clockid_to_kclock(which_clock); 1130 const struct k_clock *kc = clockid_to_kclock(which_clock);
1152 struct timespec64 kernel_tp64; 1131 struct timespec64 ts;
1153 struct timespec kernel_tp; 1132 int err;
1154 int error;
1155 1133
1156 if (!kc) 1134 if (!kc)
1157 return -EINVAL; 1135 return -EINVAL;
1158 1136
1159 error = kc->clock_get(which_clock, &kernel_tp64); 1137 err = kc->clock_get(which_clock, &ts);
1160 kernel_tp = timespec64_to_timespec(kernel_tp64);
1161 1138
1162 if (!error && compat_put_timespec(&kernel_tp, tp)) 1139 if (!err && compat_put_timespec64(&ts, tp))
1163 error = -EFAULT; 1140 err = -EFAULT;
1164 1141
1165 return error; 1142 return err;
1166} 1143}
1167 1144
1168COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, 1145COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
@@ -1193,21 +1170,19 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
1193 struct compat_timespec __user *, tp) 1170 struct compat_timespec __user *, tp)
1194{ 1171{
1195 const struct k_clock *kc = clockid_to_kclock(which_clock); 1172 const struct k_clock *kc = clockid_to_kclock(which_clock);
1196 struct timespec64 rtn_tp64; 1173 struct timespec64 ts;
1197 struct timespec rtn_tp; 1174 int err;
1198 int error;
1199 1175
1200 if (!kc) 1176 if (!kc)
1201 return -EINVAL; 1177 return -EINVAL;
1202 1178
1203 error = kc->clock_getres(which_clock, &rtn_tp64); 1179 err = kc->clock_getres(which_clock, &ts);
1204 rtn_tp = timespec64_to_timespec(rtn_tp64); 1180 if (!err && tp && compat_put_timespec64(&ts, tp))
1205 1181 return -EFAULT;
1206 if (!error && tp && compat_put_timespec(&rtn_tp, tp))
1207 error = -EFAULT;
1208 1182
1209 return error; 1183 return err;
1210} 1184}
1185
1211#endif 1186#endif
1212 1187
1213/* 1188/*
@@ -1226,26 +1201,24 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1226 struct timespec __user *, rmtp) 1201 struct timespec __user *, rmtp)
1227{ 1202{
1228 const struct k_clock *kc = clockid_to_kclock(which_clock); 1203 const struct k_clock *kc = clockid_to_kclock(which_clock);
1229 struct timespec64 t64; 1204 struct timespec64 t;
1230 struct timespec t;
1231 1205
1232 if (!kc) 1206 if (!kc)
1233 return -EINVAL; 1207 return -EINVAL;
1234 if (!kc->nsleep) 1208 if (!kc->nsleep)
1235 return -ENANOSLEEP_NOTSUP; 1209 return -ENANOSLEEP_NOTSUP;
1236 1210
1237 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1211 if (get_timespec64(&t, rqtp))
1238 return -EFAULT; 1212 return -EFAULT;
1239 1213
1240 t64 = timespec_to_timespec64(t); 1214 if (!timespec64_valid(&t))
1241 if (!timespec64_valid(&t64))
1242 return -EINVAL; 1215 return -EINVAL;
1243 if (flags & TIMER_ABSTIME) 1216 if (flags & TIMER_ABSTIME)
1244 rmtp = NULL; 1217 rmtp = NULL;
1245 current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; 1218 current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
1246 current->restart_block.nanosleep.rmtp = rmtp; 1219 current->restart_block.nanosleep.rmtp = rmtp;
1247 1220
1248 return kc->nsleep(which_clock, flags, &t64); 1221 return kc->nsleep(which_clock, flags, &t);
1249} 1222}
1250 1223
1251#ifdef CONFIG_COMPAT 1224#ifdef CONFIG_COMPAT
@@ -1254,26 +1227,24 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
1254 struct compat_timespec __user *, rmtp) 1227 struct compat_timespec __user *, rmtp)
1255{ 1228{
1256 const struct k_clock *kc = clockid_to_kclock(which_clock); 1229 const struct k_clock *kc = clockid_to_kclock(which_clock);
1257 struct timespec64 t64; 1230 struct timespec64 t;
1258 struct timespec t;
1259 1231
1260 if (!kc) 1232 if (!kc)
1261 return -EINVAL; 1233 return -EINVAL;
1262 if (!kc->nsleep) 1234 if (!kc->nsleep)
1263 return -ENANOSLEEP_NOTSUP; 1235 return -ENANOSLEEP_NOTSUP;
1264 1236
1265 if (compat_get_timespec(&t, rqtp)) 1237 if (compat_get_timespec64(&t, rqtp))
1266 return -EFAULT; 1238 return -EFAULT;
1267 1239
1268 t64 = timespec_to_timespec64(t); 1240 if (!timespec64_valid(&t))
1269 if (!timespec64_valid(&t64))
1270 return -EINVAL; 1241 return -EINVAL;
1271 if (flags & TIMER_ABSTIME) 1242 if (flags & TIMER_ABSTIME)
1272 rmtp = NULL; 1243 rmtp = NULL;
1273 current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; 1244 current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
1274 current->restart_block.nanosleep.compat_rmtp = rmtp; 1245 current->restart_block.nanosleep.compat_rmtp = rmtp;
1275 1246
1276 return kc->nsleep(which_clock, flags, &t64); 1247 return kc->nsleep(which_clock, flags, &t);
1277} 1248}
1278#endif 1249#endif
1279 1250
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 7c89e437c4d7..44a8c1402133 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -890,3 +890,61 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
890 890
891 return res; 891 return res;
892} 892}
893
894int get_timespec64(struct timespec64 *ts,
895 const struct timespec __user *uts)
896{
897 struct timespec kts;
898 int ret;
899
900 ret = copy_from_user(&kts, uts, sizeof(kts));
901 if (ret)
902 return -EFAULT;
903
904 ts->tv_sec = kts.tv_sec;
905 ts->tv_nsec = kts.tv_nsec;
906
907 return 0;
908}
909EXPORT_SYMBOL_GPL(get_timespec64);
910
911int put_timespec64(const struct timespec64 *ts,
912 struct timespec __user *uts)
913{
914 struct timespec kts = {
915 .tv_sec = ts->tv_sec,
916 .tv_nsec = ts->tv_nsec
917 };
918 return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0;
919}
920EXPORT_SYMBOL_GPL(put_timespec64);
921
922int get_itimerspec64(struct itimerspec64 *it,
923 const struct itimerspec __user *uit)
924{
925 int ret;
926
927 ret = get_timespec64(&it->it_interval, &uit->it_interval);
928 if (ret)
929 return ret;
930
931 ret = get_timespec64(&it->it_value, &uit->it_value);
932
933 return ret;
934}
935EXPORT_SYMBOL_GPL(get_itimerspec64);
936
937int put_itimerspec64(const struct itimerspec64 *it,
938 struct itimerspec __user *uit)
939{
940 int ret;
941
942 ret = put_timespec64(&it->it_interval, &uit->it_interval);
943 if (ret)
944 return ret;
945
946 ret = put_timespec64(&it->it_value, &uit->it_value);
947
948 return ret;
949}
950EXPORT_SYMBOL_GPL(put_itimerspec64);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 7e06f04e98fe..434c840e2d82 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -667,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST
667 667
668 If unsure, say N 668 If unsure, say N
669 669
670config TRACE_ENUM_MAP_FILE 670config TRACE_EVAL_MAP_FILE
671 bool "Show enum mappings for trace events" 671 bool "Show eval mappings for trace events"
672 depends on TRACING 672 depends on TRACING
673 help 673 help
674 The "print fmt" of the trace events will show the enum names instead 674 The "print fmt" of the trace events will show the enum/sizeof names
675 of their values. This can cause problems for user space tools that 675 instead of their values. This can cause problems for user space tools
676 use this string to parse the raw data as user space does not know 676 that use this string to parse the raw data as user space does not know
677 how to convert the string to its value. 677 how to convert the string to its value.
678 678
679 To fix this, there's a special macro in the kernel that can be used 679 To fix this, there's a special macro in the kernel that can be used
680 to convert the enum into its value. If this macro is used, then the 680 to convert an enum/sizeof into its value. If this macro is used, then
681 print fmt strings will have the enums converted to their values. 681 the print fmt strings will be converted to their values.
682 682
683 If something does not get converted properly, this option can be 683 If something does not get converted properly, this option can be
684 used to show what enums the kernel tried to convert. 684 used to show what enums/sizeof the kernel tried to convert.
685 685
686 This option is for debugging the enum conversions. A file is created 686 This option is for debugging the conversions. A file is created
687 in the tracing directory called "enum_map" that will show the enum 687 in the tracing directory called "eval_map" that will show the
688 names matched with their values and what trace event system they 688 names matched with their values and what trace event system they
689 belong too. 689 belong too.
690 690
691 Normally, the mapping of the strings to values will be freed after 691 Normally, the mapping of the strings to values will be freed after
692 boot up or module load. With this option, they will not be freed, as 692 boot up or module load. With this option, they will not be freed, as
693 they are needed for the "enum_map" file. Enabling this option will 693 they are needed for the "eval_map" file. Enabling this option will
694 increase the memory footprint of the running kernel. 694 increase the memory footprint of the running kernel.
695 695
696 If unsure, say N 696 If unsure, say N
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 460a031c77e5..37385193a608 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
122} 122}
123 123
124/* 124/*
125 * limited trace_printk() 125 * Only limited trace_printk() conversion specifiers allowed:
126 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed 126 * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
127 */ 127 */
128BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, 128BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
129 u64, arg2, u64, arg3) 129 u64, arg2, u64, arg3)
@@ -198,7 +198,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
198 i++; 198 i++;
199 } 199 }
200 200
201 if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') 201 if (fmt[i] != 'i' && fmt[i] != 'd' &&
202 fmt[i] != 'u' && fmt[i] != 'x')
202 return -EINVAL; 203 return -EINVAL;
203 fmt_cnt++; 204 fmt_cnt++;
204 } 205 }
@@ -234,7 +235,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
234 unsigned int cpu = smp_processor_id(); 235 unsigned int cpu = smp_processor_id();
235 u64 index = flags & BPF_F_INDEX_MASK; 236 u64 index = flags & BPF_F_INDEX_MASK;
236 struct bpf_event_entry *ee; 237 struct bpf_event_entry *ee;
237 struct perf_event *event; 238 u64 value = 0;
239 int err;
238 240
239 if (unlikely(flags & ~(BPF_F_INDEX_MASK))) 241 if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
240 return -EINVAL; 242 return -EINVAL;
@@ -247,21 +249,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
247 if (!ee) 249 if (!ee)
248 return -ENOENT; 250 return -ENOENT;
249 251
250 event = ee->event; 252 err = perf_event_read_local(ee->event, &value);
251 if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
252 event->attr.type != PERF_TYPE_RAW))
253 return -EINVAL;
254
255 /* make sure event is local and doesn't have pmu::count */
256 if (unlikely(event->oncpu != cpu || event->pmu->count))
257 return -EINVAL;
258
259 /* 253 /*
260 * we don't know if the function is run successfully by the 254 * this api is ugly since we miss [-22..-2] range of valid
261 * return value. It can be judged in other places, such as 255 * counter values, but that's uapi
262 * eBPF programs.
263 */ 256 */
264 return perf_event_read_local(event); 257 if (err)
258 return err;
259 return value;
265} 260}
266 261
267static const struct bpf_func_proto bpf_perf_event_read_proto = { 262static const struct bpf_func_proto bpf_perf_event_read_proto = {
@@ -272,14 +267,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
272 .arg2_type = ARG_ANYTHING, 267 .arg2_type = ARG_ANYTHING,
273}; 268};
274 269
270static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
271
275static __always_inline u64 272static __always_inline u64
276__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, 273__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
277 u64 flags, struct perf_raw_record *raw) 274 u64 flags, struct perf_raw_record *raw)
278{ 275{
279 struct bpf_array *array = container_of(map, struct bpf_array, map); 276 struct bpf_array *array = container_of(map, struct bpf_array, map);
277 struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
280 unsigned int cpu = smp_processor_id(); 278 unsigned int cpu = smp_processor_id();
281 u64 index = flags & BPF_F_INDEX_MASK; 279 u64 index = flags & BPF_F_INDEX_MASK;
282 struct perf_sample_data sample_data;
283 struct bpf_event_entry *ee; 280 struct bpf_event_entry *ee;
284 struct perf_event *event; 281 struct perf_event *event;
285 282
@@ -300,9 +297,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
300 if (unlikely(event->oncpu != cpu)) 297 if (unlikely(event->oncpu != cpu))
301 return -EOPNOTSUPP; 298 return -EOPNOTSUPP;
302 299
303 perf_sample_data_init(&sample_data, 0, 0); 300 perf_sample_data_init(sd, 0, 0);
304 sample_data.raw = raw; 301 sd->raw = raw;
305 perf_event_output(event, &sample_data, regs); 302 perf_event_output(event, sd, regs);
306 return 0; 303 return 0;
307} 304}
308 305
@@ -483,7 +480,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
483 480
484/* bpf+kprobe programs can access fields of 'struct pt_regs' */ 481/* bpf+kprobe programs can access fields of 'struct pt_regs' */
485static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, 482static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
486 enum bpf_reg_type *reg_type) 483 struct bpf_insn_access_aux *info)
487{ 484{
488 if (off < 0 || off >= sizeof(struct pt_regs)) 485 if (off < 0 || off >= sizeof(struct pt_regs))
489 return false; 486 return false;
@@ -566,7 +563,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
566} 563}
567 564
568static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, 565static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
569 enum bpf_reg_type *reg_type) 566 struct bpf_insn_access_aux *info)
570{ 567{
571 if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) 568 if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
572 return false; 569 return false;
@@ -585,40 +582,47 @@ const struct bpf_verifier_ops tracepoint_prog_ops = {
585}; 582};
586 583
587static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, 584static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
588 enum bpf_reg_type *reg_type) 585 struct bpf_insn_access_aux *info)
589{ 586{
587 const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data,
588 sample_period);
589
590 if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) 590 if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
591 return false; 591 return false;
592 if (type != BPF_READ) 592 if (type != BPF_READ)
593 return false; 593 return false;
594 if (off % size != 0) 594 if (off % size != 0)
595 return false; 595 return false;
596 if (off == offsetof(struct bpf_perf_event_data, sample_period)) { 596
597 if (size != sizeof(u64)) 597 switch (off) {
598 case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
599 bpf_ctx_record_field_size(info, size_sp);
600 if (!bpf_ctx_narrow_access_ok(off, size, size_sp))
598 return false; 601 return false;
599 } else { 602 break;
603 default:
600 if (size != sizeof(long)) 604 if (size != sizeof(long))
601 return false; 605 return false;
602 } 606 }
607
603 return true; 608 return true;
604} 609}
605 610
606static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, 611static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
607 const struct bpf_insn *si, 612 const struct bpf_insn *si,
608 struct bpf_insn *insn_buf, 613 struct bpf_insn *insn_buf,
609 struct bpf_prog *prog) 614 struct bpf_prog *prog, u32 *target_size)
610{ 615{
611 struct bpf_insn *insn = insn_buf; 616 struct bpf_insn *insn = insn_buf;
612 617
613 switch (si->off) { 618 switch (si->off) {
614 case offsetof(struct bpf_perf_event_data, sample_period): 619 case offsetof(struct bpf_perf_event_data, sample_period):
615 BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
616
617 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, 620 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
618 data), si->dst_reg, si->src_reg, 621 data), si->dst_reg, si->src_reg,
619 offsetof(struct bpf_perf_event_data_kern, data)); 622 offsetof(struct bpf_perf_event_data_kern, data));
620 *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, 623 *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
621 offsetof(struct perf_sample_data, period)); 624 bpf_target_off(struct perf_sample_data, period, 8,
625 target_size));
622 break; 626 break;
623 default: 627 default:
624 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, 628 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b308be30dfb9..02004ae91860 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly;
113 113
114static DEFINE_MUTEX(ftrace_lock); 114static DEFINE_MUTEX(ftrace_lock);
115 115
116static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 116static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
117ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 117ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
118static struct ftrace_ops global_ops; 118static struct ftrace_ops global_ops;
119 119
@@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void)
169 169
170 mutex_lock(&ftrace_lock); 170 mutex_lock(&ftrace_lock);
171 171
172 for (ops = ftrace_ops_list; 172 for (ops = rcu_dereference_protected(ftrace_ops_list,
173 ops != &ftrace_list_end; ops = ops->next) 173 lockdep_is_held(&ftrace_lock));
174 ops != &ftrace_list_end;
175 ops = rcu_dereference_protected(ops->next,
176 lockdep_is_held(&ftrace_lock)))
174 cnt++; 177 cnt++;
175 178
176 mutex_unlock(&ftrace_lock); 179 mutex_unlock(&ftrace_lock);
@@ -275,10 +278,11 @@ static void update_ftrace_function(void)
275 * If there's only one ftrace_ops registered, the ftrace_ops_list 278 * If there's only one ftrace_ops registered, the ftrace_ops_list
276 * will point to the ops we want. 279 * will point to the ops we want.
277 */ 280 */
278 set_function_trace_op = ftrace_ops_list; 281 set_function_trace_op = rcu_dereference_protected(ftrace_ops_list,
282 lockdep_is_held(&ftrace_lock));
279 283
280 /* If there's no ftrace_ops registered, just call the stub function */ 284 /* If there's no ftrace_ops registered, just call the stub function */
281 if (ftrace_ops_list == &ftrace_list_end) { 285 if (set_function_trace_op == &ftrace_list_end) {
282 func = ftrace_stub; 286 func = ftrace_stub;
283 287
284 /* 288 /*
@@ -286,7 +290,8 @@ static void update_ftrace_function(void)
286 * recursion safe and not dynamic and the arch supports passing ops, 290 * recursion safe and not dynamic and the arch supports passing ops,
287 * then have the mcount trampoline call the function directly. 291 * then have the mcount trampoline call the function directly.
288 */ 292 */
289 } else if (ftrace_ops_list->next == &ftrace_list_end) { 293 } else if (rcu_dereference_protected(ftrace_ops_list->next,
294 lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
290 func = ftrace_ops_get_list_func(ftrace_ops_list); 295 func = ftrace_ops_get_list_func(ftrace_ops_list);
291 296
292 } else { 297 } else {
@@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void)
348 return ftrace_trace_function == ftrace_ops_list_func; 353 return ftrace_trace_function == ftrace_ops_list_func;
349} 354}
350 355
351static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 356static void add_ftrace_ops(struct ftrace_ops __rcu **list,
357 struct ftrace_ops *ops)
352{ 358{
353 ops->next = *list; 359 rcu_assign_pointer(ops->next, *list);
360
354 /* 361 /*
355 * We are entering ops into the list but another 362 * We are entering ops into the list but another
356 * CPU might be walking that list. We need to make sure 363 * CPU might be walking that list. We need to make sure
@@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
360 rcu_assign_pointer(*list, ops); 367 rcu_assign_pointer(*list, ops);
361} 368}
362 369
363static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 370static int remove_ftrace_ops(struct ftrace_ops __rcu **list,
371 struct ftrace_ops *ops)
364{ 372{
365 struct ftrace_ops **p; 373 struct ftrace_ops **p;
366 374
@@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
368 * If we are removing the last function, then simply point 376 * If we are removing the last function, then simply point
369 * to the ftrace_stub. 377 * to the ftrace_stub.
370 */ 378 */
371 if (*list == ops && ops->next == &ftrace_list_end) { 379 if (rcu_dereference_protected(*list,
380 lockdep_is_held(&ftrace_lock)) == ops &&
381 rcu_dereference_protected(ops->next,
382 lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
372 *list = &ftrace_list_end; 383 *list = &ftrace_list_end;
373 return 0; 384 return 0;
374 } 385 }
@@ -1293,6 +1304,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
1293 FTRACE_WARN_ON(hash->count); 1304 FTRACE_WARN_ON(hash->count);
1294} 1305}
1295 1306
1307static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod)
1308{
1309 list_del(&ftrace_mod->list);
1310 kfree(ftrace_mod->module);
1311 kfree(ftrace_mod->func);
1312 kfree(ftrace_mod);
1313}
1314
1315static void clear_ftrace_mod_list(struct list_head *head)
1316{
1317 struct ftrace_mod_load *p, *n;
1318
1319 /* stack tracer isn't supported yet */
1320 if (!head)
1321 return;
1322
1323 mutex_lock(&ftrace_lock);
1324 list_for_each_entry_safe(p, n, head, list)
1325 free_ftrace_mod(p);
1326 mutex_unlock(&ftrace_lock);
1327}
1328
1296static void free_ftrace_hash(struct ftrace_hash *hash) 1329static void free_ftrace_hash(struct ftrace_hash *hash)
1297{ 1330{
1298 if (!hash || hash == EMPTY_HASH) 1331 if (!hash || hash == EMPTY_HASH)
@@ -1346,6 +1379,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1346 return hash; 1379 return hash;
1347} 1380}
1348 1381
1382
1383static int ftrace_add_mod(struct trace_array *tr,
1384 const char *func, const char *module,
1385 int enable)
1386{
1387 struct ftrace_mod_load *ftrace_mod;
1388 struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace;
1389
1390 ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL);
1391 if (!ftrace_mod)
1392 return -ENOMEM;
1393
1394 ftrace_mod->func = kstrdup(func, GFP_KERNEL);
1395 ftrace_mod->module = kstrdup(module, GFP_KERNEL);
1396 ftrace_mod->enable = enable;
1397
1398 if (!ftrace_mod->func || !ftrace_mod->module)
1399 goto out_free;
1400
1401 list_add(&ftrace_mod->list, mod_head);
1402
1403 return 0;
1404
1405 out_free:
1406 free_ftrace_mod(ftrace_mod);
1407
1408 return -ENOMEM;
1409}
1410
1349static struct ftrace_hash * 1411static struct ftrace_hash *
1350alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) 1412alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1351{ 1413{
@@ -1359,6 +1421,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1359 if (!new_hash) 1421 if (!new_hash)
1360 return NULL; 1422 return NULL;
1361 1423
1424 if (hash)
1425 new_hash->flags = hash->flags;
1426
1362 /* Empty hash? */ 1427 /* Empty hash? */
1363 if (ftrace_hash_empty(hash)) 1428 if (ftrace_hash_empty(hash))
1364 return new_hash; 1429 return new_hash;
@@ -1403,7 +1468,7 @@ __ftrace_hash_move(struct ftrace_hash *src)
1403 /* 1468 /*
1404 * If the new source is empty, just return the empty_hash. 1469 * If the new source is empty, just return the empty_hash.
1405 */ 1470 */
1406 if (!src->count) 1471 if (ftrace_hash_empty(src))
1407 return EMPTY_HASH; 1472 return EMPTY_HASH;
1408 1473
1409 /* 1474 /*
@@ -1420,6 +1485,8 @@ __ftrace_hash_move(struct ftrace_hash *src)
1420 if (!new_hash) 1485 if (!new_hash)
1421 return NULL; 1486 return NULL;
1422 1487
1488 new_hash->flags = src->flags;
1489
1423 size = 1 << src->size_bits; 1490 size = 1 << src->size_bits;
1424 for (i = 0; i < size; i++) { 1491 for (i = 0; i < size; i++) {
1425 hhd = &src->buckets[i]; 1492 hhd = &src->buckets[i];
@@ -1513,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
1513 return 0; 1580 return 0;
1514#endif 1581#endif
1515 1582
1516 hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); 1583 rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash);
1517 hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); 1584 rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash);
1518 1585
1519 if (hash_contains_ip(ip, &hash)) 1586 if (hash_contains_ip(ip, &hash))
1520 ret = 1; 1587 ret = 1;
@@ -1650,7 +1717,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
1650 struct dyn_ftrace *rec; 1717 struct dyn_ftrace *rec;
1651 bool update = false; 1718 bool update = false;
1652 int count = 0; 1719 int count = 0;
1653 int all = 0; 1720 int all = false;
1654 1721
1655 /* Only update if the ops has been registered */ 1722 /* Only update if the ops has been registered */
1656 if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) 1723 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
@@ -1671,7 +1738,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
1671 hash = ops->func_hash->filter_hash; 1738 hash = ops->func_hash->filter_hash;
1672 other_hash = ops->func_hash->notrace_hash; 1739 other_hash = ops->func_hash->notrace_hash;
1673 if (ftrace_hash_empty(hash)) 1740 if (ftrace_hash_empty(hash))
1674 all = 1; 1741 all = true;
1675 } else { 1742 } else {
1676 inc = !inc; 1743 inc = !inc;
1677 hash = ops->func_hash->notrace_hash; 1744 hash = ops->func_hash->notrace_hash;
@@ -2784,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2784 * If there's no more ops registered with ftrace, run a 2851 * If there's no more ops registered with ftrace, run a
2785 * sanity check to make sure all rec flags are cleared. 2852 * sanity check to make sure all rec flags are cleared.
2786 */ 2853 */
2787 if (ftrace_ops_list == &ftrace_list_end) { 2854 if (rcu_dereference_protected(ftrace_ops_list,
2855 lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
2788 struct ftrace_page *pg; 2856 struct ftrace_page *pg;
2789 struct dyn_ftrace *rec; 2857 struct dyn_ftrace *rec;
2790 2858
@@ -3061,6 +3129,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
3061struct ftrace_iterator { 3129struct ftrace_iterator {
3062 loff_t pos; 3130 loff_t pos;
3063 loff_t func_pos; 3131 loff_t func_pos;
3132 loff_t mod_pos;
3064 struct ftrace_page *pg; 3133 struct ftrace_page *pg;
3065 struct dyn_ftrace *func; 3134 struct dyn_ftrace *func;
3066 struct ftrace_func_probe *probe; 3135 struct ftrace_func_probe *probe;
@@ -3068,6 +3137,8 @@ struct ftrace_iterator {
3068 struct trace_parser parser; 3137 struct trace_parser parser;
3069 struct ftrace_hash *hash; 3138 struct ftrace_hash *hash;
3070 struct ftrace_ops *ops; 3139 struct ftrace_ops *ops;
3140 struct trace_array *tr;
3141 struct list_head *mod_list;
3071 int pidx; 3142 int pidx;
3072 int idx; 3143 int idx;
3073 unsigned flags; 3144 unsigned flags;
@@ -3152,13 +3223,13 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos)
3152 if (!(iter->flags & FTRACE_ITER_DO_PROBES)) 3223 if (!(iter->flags & FTRACE_ITER_DO_PROBES))
3153 return NULL; 3224 return NULL;
3154 3225
3155 if (iter->func_pos > *pos) 3226 if (iter->mod_pos > *pos)
3156 return NULL; 3227 return NULL;
3157 3228
3158 iter->probe = NULL; 3229 iter->probe = NULL;
3159 iter->probe_entry = NULL; 3230 iter->probe_entry = NULL;
3160 iter->pidx = 0; 3231 iter->pidx = 0;
3161 for (l = 0; l <= (*pos - iter->func_pos); ) { 3232 for (l = 0; l <= (*pos - iter->mod_pos); ) {
3162 p = t_probe_next(m, &l); 3233 p = t_probe_next(m, &l);
3163 if (!p) 3234 if (!p)
3164 break; 3235 break;
@@ -3197,6 +3268,82 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter)
3197} 3268}
3198 3269
3199static void * 3270static void *
3271t_mod_next(struct seq_file *m, loff_t *pos)
3272{
3273 struct ftrace_iterator *iter = m->private;
3274 struct trace_array *tr = iter->tr;
3275
3276 (*pos)++;
3277 iter->pos = *pos;
3278
3279 iter->mod_list = iter->mod_list->next;
3280
3281 if (iter->mod_list == &tr->mod_trace ||
3282 iter->mod_list == &tr->mod_notrace) {
3283 iter->flags &= ~FTRACE_ITER_MOD;
3284 return NULL;
3285 }
3286
3287 iter->mod_pos = *pos;
3288
3289 return iter;
3290}
3291
3292static void *t_mod_start(struct seq_file *m, loff_t *pos)
3293{
3294 struct ftrace_iterator *iter = m->private;
3295 void *p = NULL;
3296 loff_t l;
3297
3298 if (iter->func_pos > *pos)
3299 return NULL;
3300
3301 iter->mod_pos = iter->func_pos;
3302
3303 /* probes are only available if tr is set */
3304 if (!iter->tr)
3305 return NULL;
3306
3307 for (l = 0; l <= (*pos - iter->func_pos); ) {
3308 p = t_mod_next(m, &l);
3309 if (!p)
3310 break;
3311 }
3312 if (!p) {
3313 iter->flags &= ~FTRACE_ITER_MOD;
3314 return t_probe_start(m, pos);
3315 }
3316
3317 /* Only set this if we have an item */
3318 iter->flags |= FTRACE_ITER_MOD;
3319
3320 return iter;
3321}
3322
3323static int
3324t_mod_show(struct seq_file *m, struct ftrace_iterator *iter)
3325{
3326 struct ftrace_mod_load *ftrace_mod;
3327 struct trace_array *tr = iter->tr;
3328
3329 if (WARN_ON_ONCE(!iter->mod_list) ||
3330 iter->mod_list == &tr->mod_trace ||
3331 iter->mod_list == &tr->mod_notrace)
3332 return -EIO;
3333
3334 ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list);
3335
3336 if (ftrace_mod->func)
3337 seq_printf(m, "%s", ftrace_mod->func);
3338 else
3339 seq_putc(m, '*');
3340
3341 seq_printf(m, ":mod:%s\n", ftrace_mod->module);
3342
3343 return 0;
3344}
3345
3346static void *
3200t_func_next(struct seq_file *m, loff_t *pos) 3347t_func_next(struct seq_file *m, loff_t *pos)
3201{ 3348{
3202 struct ftrace_iterator *iter = m->private; 3349 struct ftrace_iterator *iter = m->private;
@@ -3237,7 +3384,7 @@ static void *
3237t_next(struct seq_file *m, void *v, loff_t *pos) 3384t_next(struct seq_file *m, void *v, loff_t *pos)
3238{ 3385{
3239 struct ftrace_iterator *iter = m->private; 3386 struct ftrace_iterator *iter = m->private;
3240 loff_t l = *pos; /* t_hash_start() must use original pos */ 3387 loff_t l = *pos; /* t_probe_start() must use original pos */
3241 void *ret; 3388 void *ret;
3242 3389
3243 if (unlikely(ftrace_disabled)) 3390 if (unlikely(ftrace_disabled))
@@ -3246,16 +3393,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
3246 if (iter->flags & FTRACE_ITER_PROBE) 3393 if (iter->flags & FTRACE_ITER_PROBE)
3247 return t_probe_next(m, pos); 3394 return t_probe_next(m, pos);
3248 3395
3396 if (iter->flags & FTRACE_ITER_MOD)
3397 return t_mod_next(m, pos);
3398
3249 if (iter->flags & FTRACE_ITER_PRINTALL) { 3399 if (iter->flags & FTRACE_ITER_PRINTALL) {
3250 /* next must increment pos, and t_probe_start does not */ 3400 /* next must increment pos, and t_probe_start does not */
3251 (*pos)++; 3401 (*pos)++;
3252 return t_probe_start(m, &l); 3402 return t_mod_start(m, &l);
3253 } 3403 }
3254 3404
3255 ret = t_func_next(m, pos); 3405 ret = t_func_next(m, pos);
3256 3406
3257 if (!ret) 3407 if (!ret)
3258 return t_probe_start(m, &l); 3408 return t_mod_start(m, &l);
3259 3409
3260 return ret; 3410 return ret;
3261} 3411}
@@ -3264,7 +3414,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
3264{ 3414{
3265 iter->pos = 0; 3415 iter->pos = 0;
3266 iter->func_pos = 0; 3416 iter->func_pos = 0;
3267 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); 3417 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD);
3268} 3418}
3269 3419
3270static void *t_start(struct seq_file *m, loff_t *pos) 3420static void *t_start(struct seq_file *m, loff_t *pos)
@@ -3293,15 +3443,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
3293 ftrace_hash_empty(iter->hash)) { 3443 ftrace_hash_empty(iter->hash)) {
3294 iter->func_pos = 1; /* Account for the message */ 3444 iter->func_pos = 1; /* Account for the message */
3295 if (*pos > 0) 3445 if (*pos > 0)
3296 return t_probe_start(m, pos); 3446 return t_mod_start(m, pos);
3297 iter->flags |= FTRACE_ITER_PRINTALL; 3447 iter->flags |= FTRACE_ITER_PRINTALL;
3298 /* reset in case of seek/pread */ 3448 /* reset in case of seek/pread */
3299 iter->flags &= ~FTRACE_ITER_PROBE; 3449 iter->flags &= ~FTRACE_ITER_PROBE;
3300 return iter; 3450 return iter;
3301 } 3451 }
3302 3452
3303 if (iter->flags & FTRACE_ITER_PROBE) 3453 if (iter->flags & FTRACE_ITER_MOD)
3304 return t_probe_start(m, pos); 3454 return t_mod_start(m, pos);
3305 3455
3306 /* 3456 /*
3307 * Unfortunately, we need to restart at ftrace_pages_start 3457 * Unfortunately, we need to restart at ftrace_pages_start
@@ -3317,7 +3467,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
3317 } 3467 }
3318 3468
3319 if (!p) 3469 if (!p)
3320 return t_probe_start(m, pos); 3470 return t_mod_start(m, pos);
3321 3471
3322 return iter; 3472 return iter;
3323} 3473}
@@ -3351,6 +3501,9 @@ static int t_show(struct seq_file *m, void *v)
3351 if (iter->flags & FTRACE_ITER_PROBE) 3501 if (iter->flags & FTRACE_ITER_PROBE)
3352 return t_probe_show(m, iter); 3502 return t_probe_show(m, iter);
3353 3503
3504 if (iter->flags & FTRACE_ITER_MOD)
3505 return t_mod_show(m, iter);
3506
3354 if (iter->flags & FTRACE_ITER_PRINTALL) { 3507 if (iter->flags & FTRACE_ITER_PRINTALL) {
3355 if (iter->flags & FTRACE_ITER_NOTRACE) 3508 if (iter->flags & FTRACE_ITER_NOTRACE)
3356 seq_puts(m, "#### no functions disabled ####\n"); 3509 seq_puts(m, "#### no functions disabled ####\n");
@@ -3457,6 +3610,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
3457{ 3610{
3458 struct ftrace_iterator *iter; 3611 struct ftrace_iterator *iter;
3459 struct ftrace_hash *hash; 3612 struct ftrace_hash *hash;
3613 struct list_head *mod_head;
3614 struct trace_array *tr = ops->private;
3460 int ret = 0; 3615 int ret = 0;
3461 3616
3462 ftrace_ops_init(ops); 3617 ftrace_ops_init(ops);
@@ -3475,21 +3630,29 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
3475 3630
3476 iter->ops = ops; 3631 iter->ops = ops;
3477 iter->flags = flag; 3632 iter->flags = flag;
3633 iter->tr = tr;
3478 3634
3479 mutex_lock(&ops->func_hash->regex_lock); 3635 mutex_lock(&ops->func_hash->regex_lock);
3480 3636
3481 if (flag & FTRACE_ITER_NOTRACE) 3637 if (flag & FTRACE_ITER_NOTRACE) {
3482 hash = ops->func_hash->notrace_hash; 3638 hash = ops->func_hash->notrace_hash;
3483 else 3639 mod_head = tr ? &tr->mod_notrace : NULL;
3640 } else {
3484 hash = ops->func_hash->filter_hash; 3641 hash = ops->func_hash->filter_hash;
3642 mod_head = tr ? &tr->mod_trace : NULL;
3643 }
3644
3645 iter->mod_list = mod_head;
3485 3646
3486 if (file->f_mode & FMODE_WRITE) { 3647 if (file->f_mode & FMODE_WRITE) {
3487 const int size_bits = FTRACE_HASH_DEFAULT_BITS; 3648 const int size_bits = FTRACE_HASH_DEFAULT_BITS;
3488 3649
3489 if (file->f_flags & O_TRUNC) 3650 if (file->f_flags & O_TRUNC) {
3490 iter->hash = alloc_ftrace_hash(size_bits); 3651 iter->hash = alloc_ftrace_hash(size_bits);
3491 else 3652 clear_ftrace_mod_list(mod_head);
3653 } else {
3492 iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); 3654 iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
3655 }
3493 3656
3494 if (!iter->hash) { 3657 if (!iter->hash) {
3495 trace_parser_put(&iter->parser); 3658 trace_parser_put(&iter->parser);
@@ -3665,7 +3828,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
3665 int exclude_mod = 0; 3828 int exclude_mod = 0;
3666 int found = 0; 3829 int found = 0;
3667 int ret; 3830 int ret;
3668 int clear_filter; 3831 int clear_filter = 0;
3669 3832
3670 if (func) { 3833 if (func) {
3671 func_g.type = filter_parse_regex(func, len, &func_g.search, 3834 func_g.type = filter_parse_regex(func, len, &func_g.search,
@@ -3761,6 +3924,165 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
3761 return ret; 3924 return ret;
3762} 3925}
3763 3926
3927static bool module_exists(const char *module)
3928{
3929 /* All modules have the symbol __this_module */
3930 const char this_mod[] = "__this_module";
3931 const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1;
3932 char modname[modname_size + 1];
3933 unsigned long val;
3934 int n;
3935
3936 n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod);
3937
3938 if (n > modname_size)
3939 return false;
3940
3941 val = module_kallsyms_lookup_name(modname);
3942 return val != 0;
3943}
3944
3945static int cache_mod(struct trace_array *tr,
3946 const char *func, char *module, int enable)
3947{
3948 struct ftrace_mod_load *ftrace_mod, *n;
3949 struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace;
3950 int ret;
3951
3952 mutex_lock(&ftrace_lock);
3953
3954 /* We do not cache inverse filters */
3955 if (func[0] == '!') {
3956 func++;
3957 ret = -EINVAL;
3958
3959 /* Look to remove this hash */
3960 list_for_each_entry_safe(ftrace_mod, n, head, list) {
3961 if (strcmp(ftrace_mod->module, module) != 0)
3962 continue;
3963
3964 /* no func matches all */
3965 if (strcmp(func, "*") == 0 ||
3966 (ftrace_mod->func &&
3967 strcmp(ftrace_mod->func, func) == 0)) {
3968 ret = 0;
3969 free_ftrace_mod(ftrace_mod);
3970 continue;
3971 }
3972 }
3973 goto out;
3974 }
3975
3976 ret = -EINVAL;
3977 /* We only care about modules that have not been loaded yet */
3978 if (module_exists(module))
3979 goto out;
3980
3981 /* Save this string off, and execute it when the module is loaded */
3982 ret = ftrace_add_mod(tr, func, module, enable);
3983 out:
3984 mutex_unlock(&ftrace_lock);
3985
3986 return ret;
3987}
3988
3989static int
3990ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3991 int reset, int enable);
3992
3993#ifdef CONFIG_MODULES
3994static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
3995 char *mod, bool enable)
3996{
3997 struct ftrace_mod_load *ftrace_mod, *n;
3998 struct ftrace_hash **orig_hash, *new_hash;
3999 LIST_HEAD(process_mods);
4000 char *func;
4001 int ret;
4002
4003 mutex_lock(&ops->func_hash->regex_lock);
4004
4005 if (enable)
4006 orig_hash = &ops->func_hash->filter_hash;
4007 else
4008 orig_hash = &ops->func_hash->notrace_hash;
4009
4010 new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
4011 *orig_hash);
4012 if (!new_hash)
4013 goto out; /* warn? */
4014
4015 mutex_lock(&ftrace_lock);
4016
4017 list_for_each_entry_safe(ftrace_mod, n, head, list) {
4018
4019 if (strcmp(ftrace_mod->module, mod) != 0)
4020 continue;
4021
4022 if (ftrace_mod->func)
4023 func = kstrdup(ftrace_mod->func, GFP_KERNEL);
4024 else
4025 func = kstrdup("*", GFP_KERNEL);
4026
4027 if (!func) /* warn? */
4028 continue;
4029
4030 list_del(&ftrace_mod->list);
4031 list_add(&ftrace_mod->list, &process_mods);
4032
4033 /* Use the newly allocated func, as it may be "*" */
4034 kfree(ftrace_mod->func);
4035 ftrace_mod->func = func;
4036 }
4037
4038 mutex_unlock(&ftrace_lock);
4039
4040 list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) {
4041
4042 func = ftrace_mod->func;
4043
4044 /* Grabs ftrace_lock, which is why we have this extra step */
4045 match_records(new_hash, func, strlen(func), mod);
4046 free_ftrace_mod(ftrace_mod);
4047 }
4048
4049 if (enable && list_empty(head))
4050 new_hash->flags &= ~FTRACE_HASH_FL_MOD;
4051
4052 mutex_lock(&ftrace_lock);
4053
4054 ret = ftrace_hash_move_and_update_ops(ops, orig_hash,
4055 new_hash, enable);
4056 mutex_unlock(&ftrace_lock);
4057
4058 out:
4059 mutex_unlock(&ops->func_hash->regex_lock);
4060
4061 free_ftrace_hash(new_hash);
4062}
4063
4064static void process_cached_mods(const char *mod_name)
4065{
4066 struct trace_array *tr;
4067 char *mod;
4068
4069 mod = kstrdup(mod_name, GFP_KERNEL);
4070 if (!mod)
4071 return;
4072
4073 mutex_lock(&trace_types_lock);
4074 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
4075 if (!list_empty(&tr->mod_trace))
4076 process_mod_list(&tr->mod_trace, tr->ops, mod, true);
4077 if (!list_empty(&tr->mod_notrace))
4078 process_mod_list(&tr->mod_notrace, tr->ops, mod, false);
4079 }
4080 mutex_unlock(&trace_types_lock);
4081
4082 kfree(mod);
4083}
4084#endif
4085
3764/* 4086/*
3765 * We register the module command as a template to show others how 4087 * We register the module command as a template to show others how
3766 * to register the a command as well. 4088 * to register the a command as well.
@@ -3768,10 +4090,16 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
3768 4090
3769static int 4091static int
3770ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, 4092ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
3771 char *func, char *cmd, char *module, int enable) 4093 char *func_orig, char *cmd, char *module, int enable)
3772{ 4094{
4095 char *func;
3773 int ret; 4096 int ret;
3774 4097
4098 /* match_records() modifies func, and we need the original */
4099 func = kstrdup(func_orig, GFP_KERNEL);
4100 if (!func)
4101 return -ENOMEM;
4102
3775 /* 4103 /*
3776 * cmd == 'mod' because we only registered this func 4104 * cmd == 'mod' because we only registered this func
3777 * for the 'mod' ftrace_func_command. 4105 * for the 'mod' ftrace_func_command.
@@ -3780,8 +4108,10 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
3780 * parameter. 4108 * parameter.
3781 */ 4109 */
3782 ret = match_records(hash, func, strlen(func), module); 4110 ret = match_records(hash, func, strlen(func), module);
4111 kfree(func);
4112
3783 if (!ret) 4113 if (!ret)
3784 return -EINVAL; 4114 return cache_mod(tr, func_orig, module, enable);
3785 if (ret < 0) 4115 if (ret < 0)
3786 return ret; 4116 return ret;
3787 return 0; 4117 return 0;
@@ -4725,9 +5055,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4725 if (file->f_mode & FMODE_WRITE) { 5055 if (file->f_mode & FMODE_WRITE) {
4726 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); 5056 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
4727 5057
4728 if (filter_hash) 5058 if (filter_hash) {
4729 orig_hash = &iter->ops->func_hash->filter_hash; 5059 orig_hash = &iter->ops->func_hash->filter_hash;
4730 else 5060 if (iter->tr && !list_empty(&iter->tr->mod_trace))
5061 iter->hash->flags |= FTRACE_HASH_FL_MOD;
5062 } else
4731 orig_hash = &iter->ops->func_hash->notrace_hash; 5063 orig_hash = &iter->ops->func_hash->notrace_hash;
4732 5064
4733 mutex_lock(&ftrace_lock); 5065 mutex_lock(&ftrace_lock);
@@ -5385,6 +5717,7 @@ void ftrace_release_mod(struct module *mod)
5385 if (pg == ftrace_pages) 5717 if (pg == ftrace_pages)
5386 ftrace_pages = next_to_ftrace_page(last_pg); 5718 ftrace_pages = next_to_ftrace_page(last_pg);
5387 5719
5720 ftrace_update_tot_cnt -= pg->index;
5388 *last_pg = pg->next; 5721 *last_pg = pg->next;
5389 order = get_count_order(pg->size / ENTRIES_PER_PAGE); 5722 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
5390 free_pages((unsigned long)pg->records, order); 5723 free_pages((unsigned long)pg->records, order);
@@ -5463,6 +5796,8 @@ void ftrace_module_enable(struct module *mod)
5463 5796
5464 out_unlock: 5797 out_unlock:
5465 mutex_unlock(&ftrace_lock); 5798 mutex_unlock(&ftrace_lock);
5799
5800 process_cached_mods(mod->name);
5466} 5801}
5467 5802
5468void ftrace_module_init(struct module *mod) 5803void ftrace_module_init(struct module *mod)
@@ -5501,6 +5836,7 @@ void __init ftrace_free_init_mem(void)
5501 if (!rec) 5836 if (!rec)
5502 continue; 5837 continue;
5503 pg->index--; 5838 pg->index--;
5839 ftrace_update_tot_cnt--;
5504 if (!pg->index) { 5840 if (!pg->index) {
5505 *last_pg = pg->next; 5841 *last_pg = pg->next;
5506 order = get_count_order(pg->size / ENTRIES_PER_PAGE); 5842 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
@@ -5567,6 +5903,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
5567void ftrace_init_trace_array(struct trace_array *tr) 5903void ftrace_init_trace_array(struct trace_array *tr)
5568{ 5904{
5569 INIT_LIST_HEAD(&tr->func_probes); 5905 INIT_LIST_HEAD(&tr->func_probes);
5906 INIT_LIST_HEAD(&tr->mod_trace);
5907 INIT_LIST_HEAD(&tr->mod_notrace);
5570} 5908}
5571#else 5909#else
5572 5910
@@ -6127,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
6127 if (ftrace_enabled) { 6465 if (ftrace_enabled) {
6128 6466
6129 /* we are starting ftrace again */ 6467 /* we are starting ftrace again */
6130 if (ftrace_ops_list != &ftrace_list_end) 6468 if (rcu_dereference_protected(ftrace_ops_list,
6469 lockdep_is_held(&ftrace_lock)) != &ftrace_list_end)
6131 update_ftrace_function(); 6470 update_ftrace_function();
6132 6471
6133 ftrace_startup_sysctl(); 6472 ftrace_startup_sysctl();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4ae268e687fe..529cc50d7243 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
1136 for (i = 0; i < nr_pages; i++) { 1136 for (i = 0; i < nr_pages; i++) {
1137 struct page *page; 1137 struct page *page;
1138 /* 1138 /*
1139 * __GFP_NORETRY flag makes sure that the allocation fails 1139 * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
1140 * gracefully without invoking oom-killer and the system is 1140 * gracefully without invoking oom-killer and the system is not
1141 * not destabilized. 1141 * destabilized.
1142 */ 1142 */
1143 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1143 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1144 GFP_KERNEL | __GFP_NORETRY, 1144 GFP_KERNEL | __GFP_RETRY_MAYFAIL,
1145 cpu_to_node(cpu)); 1145 cpu_to_node(cpu));
1146 if (!bpage) 1146 if (!bpage)
1147 goto free_pages; 1147 goto free_pages;
@@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
1149 list_add(&bpage->list, pages); 1149 list_add(&bpage->list, pages);
1150 1150
1151 page = alloc_pages_node(cpu_to_node(cpu), 1151 page = alloc_pages_node(cpu_to_node(cpu),
1152 GFP_KERNEL | __GFP_NORETRY, 0); 1152 GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0);
1153 if (!page) 1153 if (!page)
1154 goto free_pages; 1154 goto free_pages;
1155 bpage->page = page_address(page); 1155 bpage->page = page_address(page);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 091e801145c9..42b9355033d4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
87 * tracing is active, only save the comm when a trace event 87 * tracing is active, only save the comm when a trace event
88 * occurred. 88 * occurred.
89 */ 89 */
90static DEFINE_PER_CPU(bool, trace_cmdline_save); 90static DEFINE_PER_CPU(bool, trace_taskinfo_save);
91 91
92/* 92/*
93 * Kill all tracing for good (never come back). 93 * Kill all tracing for good (never come back).
@@ -120,41 +120,41 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
120/* When set, tracing will stop when a WARN*() is hit */ 120/* When set, tracing will stop when a WARN*() is hit */
121int __disable_trace_on_warning; 121int __disable_trace_on_warning;
122 122
123#ifdef CONFIG_TRACE_ENUM_MAP_FILE 123#ifdef CONFIG_TRACE_EVAL_MAP_FILE
124/* Map of enums to their values, for "enum_map" file */ 124/* Map of enums to their values, for "eval_map" file */
125struct trace_enum_map_head { 125struct trace_eval_map_head {
126 struct module *mod; 126 struct module *mod;
127 unsigned long length; 127 unsigned long length;
128}; 128};
129 129
130union trace_enum_map_item; 130union trace_eval_map_item;
131 131
132struct trace_enum_map_tail { 132struct trace_eval_map_tail {
133 /* 133 /*
134 * "end" is first and points to NULL as it must be different 134 * "end" is first and points to NULL as it must be different
135 * than "mod" or "enum_string" 135 * than "mod" or "eval_string"
136 */ 136 */
137 union trace_enum_map_item *next; 137 union trace_eval_map_item *next;
138 const char *end; /* points to NULL */ 138 const char *end; /* points to NULL */
139}; 139};
140 140
141static DEFINE_MUTEX(trace_enum_mutex); 141static DEFINE_MUTEX(trace_eval_mutex);
142 142
143/* 143/*
144 * The trace_enum_maps are saved in an array with two extra elements, 144 * The trace_eval_maps are saved in an array with two extra elements,
145 * one at the beginning, and one at the end. The beginning item contains 145 * one at the beginning, and one at the end. The beginning item contains
146 * the count of the saved maps (head.length), and the module they 146 * the count of the saved maps (head.length), and the module they
147 * belong to if not built in (head.mod). The ending item contains a 147 * belong to if not built in (head.mod). The ending item contains a
148 * pointer to the next array of saved enum_map items. 148 * pointer to the next array of saved eval_map items.
149 */ 149 */
150union trace_enum_map_item { 150union trace_eval_map_item {
151 struct trace_enum_map map; 151 struct trace_eval_map map;
152 struct trace_enum_map_head head; 152 struct trace_eval_map_head head;
153 struct trace_enum_map_tail tail; 153 struct trace_eval_map_tail tail;
154}; 154};
155 155
156static union trace_enum_map_item *trace_enum_maps; 156static union trace_eval_map_item *trace_eval_maps;
157#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ 157#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
158 158
159static int tracing_set_tracer(struct trace_array *tr, const char *buf); 159static int tracing_set_tracer(struct trace_array *tr, const char *buf);
160 160
@@ -790,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
790static __always_inline void 790static __always_inline void
791__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) 791__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
792{ 792{
793 __this_cpu_write(trace_cmdline_save, true); 793 __this_cpu_write(trace_taskinfo_save, true);
794 794
795 /* If this is the temp buffer, we need to commit fully */ 795 /* If this is the temp buffer, we need to commit fully */
796 if (this_cpu_read(trace_buffered_event) == event) { 796 if (this_cpu_read(trace_buffered_event) == event) {
@@ -1141,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
1141 1141
1142/* 1142/*
1143 * TRACE_FLAGS is defined as a tuple matching bit masks with strings. 1143 * TRACE_FLAGS is defined as a tuple matching bit masks with strings.
1144 * It uses C(a, b) where 'a' is the enum name and 'b' is the string that 1144 * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that
1145 * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list 1145 * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list
1146 * of strings in the order that the enums were defined. 1146 * of strings in the order that the evals (enum) were defined.
1147 */ 1147 */
1148#undef C 1148#undef C
1149#define C(a, b) b 1149#define C(a, b) b
@@ -1709,6 +1709,8 @@ void tracing_reset_all_online_cpus(void)
1709 } 1709 }
1710} 1710}
1711 1711
1712static int *tgid_map;
1713
1712#define SAVED_CMDLINES_DEFAULT 128 1714#define SAVED_CMDLINES_DEFAULT 128
1713#define NO_CMDLINE_MAP UINT_MAX 1715#define NO_CMDLINE_MAP UINT_MAX
1714static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; 1716static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -1722,7 +1724,7 @@ struct saved_cmdlines_buffer {
1722static struct saved_cmdlines_buffer *savedcmd; 1724static struct saved_cmdlines_buffer *savedcmd;
1723 1725
1724/* temporary disable recording */ 1726/* temporary disable recording */
1725static atomic_t trace_record_cmdline_disabled __read_mostly; 1727static atomic_t trace_record_taskinfo_disabled __read_mostly;
1726 1728
1727static inline char *get_saved_cmdlines(int idx) 1729static inline char *get_saved_cmdlines(int idx)
1728{ 1730{
@@ -1910,13 +1912,15 @@ static void tracing_stop_tr(struct trace_array *tr)
1910 raw_spin_unlock_irqrestore(&tr->start_lock, flags); 1912 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1911} 1913}
1912 1914
1913void trace_stop_cmdline_recording(void);
1914
1915static int trace_save_cmdline(struct task_struct *tsk) 1915static int trace_save_cmdline(struct task_struct *tsk)
1916{ 1916{
1917 unsigned pid, idx; 1917 unsigned pid, idx;
1918 1918
1919 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) 1919 /* treat recording of idle task as a success */
1920 if (!tsk->pid)
1921 return 1;
1922
1923 if (unlikely(tsk->pid > PID_MAX_DEFAULT))
1920 return 0; 1924 return 0;
1921 1925
1922 /* 1926 /*
@@ -1992,16 +1996,107 @@ void trace_find_cmdline(int pid, char comm[])
1992 preempt_enable(); 1996 preempt_enable();
1993} 1997}
1994 1998
1995void tracing_record_cmdline(struct task_struct *tsk) 1999int trace_find_tgid(int pid)
1996{ 2000{
1997 if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) 2001 if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT))
2002 return 0;
2003
2004 return tgid_map[pid];
2005}
2006
2007static int trace_save_tgid(struct task_struct *tsk)
2008{
2009 /* treat recording of idle task as a success */
2010 if (!tsk->pid)
2011 return 1;
2012
2013 if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT))
2014 return 0;
2015
2016 tgid_map[tsk->pid] = tsk->tgid;
2017 return 1;
2018}
2019
2020static bool tracing_record_taskinfo_skip(int flags)
2021{
2022 if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
2023 return true;
2024 if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on())
2025 return true;
2026 if (!__this_cpu_read(trace_taskinfo_save))
2027 return true;
2028 return false;
2029}
2030
2031/**
2032 * tracing_record_taskinfo - record the task info of a task
2033 *
2034 * @task - task to record
2035 * @flags - TRACE_RECORD_CMDLINE for recording comm
2036 * - TRACE_RECORD_TGID for recording tgid
2037 */
2038void tracing_record_taskinfo(struct task_struct *task, int flags)
2039{
2040 bool done;
2041
2042 if (tracing_record_taskinfo_skip(flags))
1998 return; 2043 return;
1999 2044
2000 if (!__this_cpu_read(trace_cmdline_save)) 2045 /*
2046 * Record as much task information as possible. If some fail, continue
2047 * to try to record the others.
2048 */
2049 done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
2050 done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
2051
2052 /* If recording any information failed, retry again soon. */
2053 if (!done)
2001 return; 2054 return;
2002 2055
2003 if (trace_save_cmdline(tsk)) 2056 __this_cpu_write(trace_taskinfo_save, false);
2004 __this_cpu_write(trace_cmdline_save, false); 2057}
2058
2059/**
2060 * tracing_record_taskinfo_sched_switch - record task info for sched_switch
2061 *
2062 * @prev - previous task during sched_switch
2063 * @next - next task during sched_switch
2064 * @flags - TRACE_RECORD_CMDLINE for recording comm
2065 * TRACE_RECORD_TGID for recording tgid
2066 */
2067void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
2068 struct task_struct *next, int flags)
2069{
2070 bool done;
2071
2072 if (tracing_record_taskinfo_skip(flags))
2073 return;
2074
2075 /*
2076 * Record as much task information as possible. If some fail, continue
2077 * to try to record the others.
2078 */
2079 done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
2080 done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
2081 done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
2082 done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
2083
2084 /* If recording any information failed, retry again soon. */
2085 if (!done)
2086 return;
2087
2088 __this_cpu_write(trace_taskinfo_save, false);
2089}
2090
2091/* Helpers to record a specific task information */
2092void tracing_record_cmdline(struct task_struct *task)
2093{
2094 tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
2095}
2096
2097void tracing_record_tgid(struct task_struct *task)
2098{
2099 tracing_record_taskinfo(task, TRACE_RECORD_TGID);
2005} 2100}
2006 2101
2007/* 2102/*
@@ -3146,7 +3241,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
3146#endif 3241#endif
3147 3242
3148 if (!iter->snapshot) 3243 if (!iter->snapshot)
3149 atomic_inc(&trace_record_cmdline_disabled); 3244 atomic_inc(&trace_record_taskinfo_disabled);
3150 3245
3151 if (*pos != iter->pos) { 3246 if (*pos != iter->pos) {
3152 iter->ent = NULL; 3247 iter->ent = NULL;
@@ -3191,7 +3286,7 @@ static void s_stop(struct seq_file *m, void *p)
3191#endif 3286#endif
3192 3287
3193 if (!iter->snapshot) 3288 if (!iter->snapshot)
3194 atomic_dec(&trace_record_cmdline_disabled); 3289 atomic_dec(&trace_record_taskinfo_disabled);
3195 3290
3196 trace_access_unlock(iter->cpu_file); 3291 trace_access_unlock(iter->cpu_file);
3197 trace_event_read_unlock(); 3292 trace_event_read_unlock();
@@ -3248,23 +3343,38 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
3248 seq_puts(m, "#\n"); 3343 seq_puts(m, "#\n");
3249} 3344}
3250 3345
3251static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) 3346static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
3347 unsigned int flags)
3252{ 3348{
3349 bool tgid = flags & TRACE_ITER_RECORD_TGID;
3350
3253 print_event_info(buf, m); 3351 print_event_info(buf, m);
3254 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" 3352
3255 "# | | | | |\n"); 3353 seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : "");
3354 seq_printf(m, "# | | | %s | |\n", tgid ? " | " : "");
3256} 3355}
3257 3356
3258static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) 3357static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m,
3358 unsigned int flags)
3259{ 3359{
3260 print_event_info(buf, m); 3360 bool tgid = flags & TRACE_ITER_RECORD_TGID;
3261 seq_puts(m, "# _-----=> irqs-off\n" 3361 const char tgid_space[] = " ";
3262 "# / _----=> need-resched\n" 3362 const char space[] = " ";
3263 "# | / _---=> hardirq/softirq\n" 3363
3264 "# || / _--=> preempt-depth\n" 3364 seq_printf(m, "# %s _-----=> irqs-off\n",
3265 "# ||| / delay\n" 3365 tgid ? tgid_space : space);
3266 "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" 3366 seq_printf(m, "# %s / _----=> need-resched\n",
3267 "# | | | |||| | |\n"); 3367 tgid ? tgid_space : space);
3368 seq_printf(m, "# %s| / _---=> hardirq/softirq\n",
3369 tgid ? tgid_space : space);
3370 seq_printf(m, "# %s|| / _--=> preempt-depth\n",
3371 tgid ? tgid_space : space);
3372 seq_printf(m, "# %s||| / delay\n",
3373 tgid ? tgid_space : space);
3374 seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n",
3375 tgid ? " TGID " : space);
3376 seq_printf(m, "# | | | %s|||| | |\n",
3377 tgid ? " | " : space);
3268} 3378}
3269 3379
3270void 3380void
@@ -3580,9 +3690,11 @@ void trace_default_header(struct seq_file *m)
3580 } else { 3690 } else {
3581 if (!(trace_flags & TRACE_ITER_VERBOSE)) { 3691 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
3582 if (trace_flags & TRACE_ITER_IRQ_INFO) 3692 if (trace_flags & TRACE_ITER_IRQ_INFO)
3583 print_func_help_header_irq(iter->trace_buffer, m); 3693 print_func_help_header_irq(iter->trace_buffer,
3694 m, trace_flags);
3584 else 3695 else
3585 print_func_help_header(iter->trace_buffer, m); 3696 print_func_help_header(iter->trace_buffer, m,
3697 trace_flags);
3586 } 3698 }
3587 } 3699 }
3588} 3700}
@@ -4238,6 +4350,18 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
4238 if (mask == TRACE_ITER_RECORD_CMD) 4350 if (mask == TRACE_ITER_RECORD_CMD)
4239 trace_event_enable_cmd_record(enabled); 4351 trace_event_enable_cmd_record(enabled);
4240 4352
4353 if (mask == TRACE_ITER_RECORD_TGID) {
4354 if (!tgid_map)
4355 tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map),
4356 GFP_KERNEL);
4357 if (!tgid_map) {
4358 tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
4359 return -ENOMEM;
4360 }
4361
4362 trace_event_enable_tgid_record(enabled);
4363 }
4364
4241 if (mask == TRACE_ITER_EVENT_FORK) 4365 if (mask == TRACE_ITER_EVENT_FORK)
4242 trace_event_follow_fork(tr, enabled); 4366 trace_event_follow_fork(tr, enabled);
4243 4367
@@ -4473,7 +4597,8 @@ static const char readme_msg[] =
4473#endif 4597#endif
4474#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) 4598#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
4475 "\t accepts: event-definitions (one definition per line)\n" 4599 "\t accepts: event-definitions (one definition per line)\n"
4476 "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n" 4600 "\t Format: p[:[<group>/]<event>] <place> [<args>]\n"
4601 "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n"
4477 "\t -:[<group>/]<event>\n" 4602 "\t -:[<group>/]<event>\n"
4478#ifdef CONFIG_KPROBE_EVENTS 4603#ifdef CONFIG_KPROBE_EVENTS
4479 "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" 4604 "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
@@ -4597,6 +4722,76 @@ static const struct file_operations tracing_readme_fops = {
4597 .llseek = generic_file_llseek, 4722 .llseek = generic_file_llseek,
4598}; 4723};
4599 4724
4725static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
4726{
4727 int *ptr = v;
4728
4729 if (*pos || m->count)
4730 ptr++;
4731
4732 (*pos)++;
4733
4734 for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) {
4735 if (trace_find_tgid(*ptr))
4736 return ptr;
4737 }
4738
4739 return NULL;
4740}
4741
4742static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
4743{
4744 void *v;
4745 loff_t l = 0;
4746
4747 if (!tgid_map)
4748 return NULL;
4749
4750 v = &tgid_map[0];
4751 while (l <= *pos) {
4752 v = saved_tgids_next(m, v, &l);
4753 if (!v)
4754 return NULL;
4755 }
4756
4757 return v;
4758}
4759
4760static void saved_tgids_stop(struct seq_file *m, void *v)
4761{
4762}
4763
4764static int saved_tgids_show(struct seq_file *m, void *v)
4765{
4766 int pid = (int *)v - tgid_map;
4767
4768 seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid));
4769 return 0;
4770}
4771
4772static const struct seq_operations tracing_saved_tgids_seq_ops = {
4773 .start = saved_tgids_start,
4774 .stop = saved_tgids_stop,
4775 .next = saved_tgids_next,
4776 .show = saved_tgids_show,
4777};
4778
4779static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
4780{
4781 if (tracing_disabled)
4782 return -ENODEV;
4783
4784 return seq_open(filp, &tracing_saved_tgids_seq_ops);
4785}
4786
4787
4788static const struct file_operations tracing_saved_tgids_fops = {
4789 .open = tracing_saved_tgids_open,
4790 .read = seq_read,
4791 .llseek = seq_lseek,
4792 .release = seq_release,
4793};
4794
4600static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) 4795static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
4601{ 4796{
4602 unsigned int *ptr = v; 4797 unsigned int *ptr = v;
@@ -4746,11 +4941,11 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
4746 .write = tracing_saved_cmdlines_size_write, 4941 .write = tracing_saved_cmdlines_size_write,
4747}; 4942};
4748 4943
4749#ifdef CONFIG_TRACE_ENUM_MAP_FILE 4944#ifdef CONFIG_TRACE_EVAL_MAP_FILE
4750static union trace_enum_map_item * 4945static union trace_eval_map_item *
4751update_enum_map(union trace_enum_map_item *ptr) 4946update_eval_map(union trace_eval_map_item *ptr)
4752{ 4947{
4753 if (!ptr->map.enum_string) { 4948 if (!ptr->map.eval_string) {
4754 if (ptr->tail.next) { 4949 if (ptr->tail.next) {
4755 ptr = ptr->tail.next; 4950 ptr = ptr->tail.next;
4756 /* Set ptr to the next real item (skip head) */ 4951 /* Set ptr to the next real item (skip head) */
@@ -4761,15 +4956,15 @@ update_enum_map(union trace_enum_map_item *ptr)
4761 return ptr; 4956 return ptr;
4762} 4957}
4763 4958
4764static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) 4959static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos)
4765{ 4960{
4766 union trace_enum_map_item *ptr = v; 4961 union trace_eval_map_item *ptr = v;
4767 4962
4768 /* 4963 /*
4769 * Paranoid! If ptr points to end, we don't want to increment past it. 4964 * Paranoid! If ptr points to end, we don't want to increment past it.
4770 * This really should never happen. 4965 * This really should never happen.
4771 */ 4966 */
4772 ptr = update_enum_map(ptr); 4967 ptr = update_eval_map(ptr);
4773 if (WARN_ON_ONCE(!ptr)) 4968 if (WARN_ON_ONCE(!ptr))
4774 return NULL; 4969 return NULL;
4775 4970
@@ -4777,104 +4972,104 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
4777 4972
4778 (*pos)++; 4973 (*pos)++;
4779 4974
4780 ptr = update_enum_map(ptr); 4975 ptr = update_eval_map(ptr);
4781 4976
4782 return ptr; 4977 return ptr;
4783} 4978}
4784 4979
4785static void *enum_map_start(struct seq_file *m, loff_t *pos) 4980static void *eval_map_start(struct seq_file *m, loff_t *pos)
4786{ 4981{
4787 union trace_enum_map_item *v; 4982 union trace_eval_map_item *v;
4788 loff_t l = 0; 4983 loff_t l = 0;
4789 4984
4790 mutex_lock(&trace_enum_mutex); 4985 mutex_lock(&trace_eval_mutex);
4791 4986
4792 v = trace_enum_maps; 4987 v = trace_eval_maps;
4793 if (v) 4988 if (v)
4794 v++; 4989 v++;
4795 4990
4796 while (v && l < *pos) { 4991 while (v && l < *pos) {
4797 v = enum_map_next(m, v, &l); 4992 v = eval_map_next(m, v, &l);
4798 } 4993 }
4799 4994
4800 return v; 4995 return v;
4801} 4996}
4802 4997
4803static void enum_map_stop(struct seq_file *m, void *v) 4998static void eval_map_stop(struct seq_file *m, void *v)
4804{ 4999{
4805 mutex_unlock(&trace_enum_mutex); 5000 mutex_unlock(&trace_eval_mutex);
4806} 5001}
4807 5002
4808static int enum_map_show(struct seq_file *m, void *v) 5003static int eval_map_show(struct seq_file *m, void *v)
4809{ 5004{
4810 union trace_enum_map_item *ptr = v; 5005 union trace_eval_map_item *ptr = v;
4811 5006
4812 seq_printf(m, "%s %ld (%s)\n", 5007 seq_printf(m, "%s %ld (%s)\n",
4813 ptr->map.enum_string, ptr->map.enum_value, 5008 ptr->map.eval_string, ptr->map.eval_value,
4814 ptr->map.system); 5009 ptr->map.system);
4815 5010
4816 return 0; 5011 return 0;
4817} 5012}
4818 5013
4819static const struct seq_operations tracing_enum_map_seq_ops = { 5014static const struct seq_operations tracing_eval_map_seq_ops = {
4820 .start = enum_map_start, 5015 .start = eval_map_start,
4821 .next = enum_map_next, 5016 .next = eval_map_next,
4822 .stop = enum_map_stop, 5017 .stop = eval_map_stop,
4823 .show = enum_map_show, 5018 .show = eval_map_show,
4824}; 5019};
4825 5020
4826static int tracing_enum_map_open(struct inode *inode, struct file *filp) 5021static int tracing_eval_map_open(struct inode *inode, struct file *filp)
4827{ 5022{
4828 if (tracing_disabled) 5023 if (tracing_disabled)
4829 return -ENODEV; 5024 return -ENODEV;
4830 5025
4831 return seq_open(filp, &tracing_enum_map_seq_ops); 5026 return seq_open(filp, &tracing_eval_map_seq_ops);
4832} 5027}
4833 5028
4834static const struct file_operations tracing_enum_map_fops = { 5029static const struct file_operations tracing_eval_map_fops = {
4835 .open = tracing_enum_map_open, 5030 .open = tracing_eval_map_open,
4836 .read = seq_read, 5031 .read = seq_read,
4837 .llseek = seq_lseek, 5032 .llseek = seq_lseek,
4838 .release = seq_release, 5033 .release = seq_release,
4839}; 5034};
4840 5035
4841static inline union trace_enum_map_item * 5036static inline union trace_eval_map_item *
4842trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) 5037trace_eval_jmp_to_tail(union trace_eval_map_item *ptr)
4843{ 5038{
4844 /* Return tail of array given the head */ 5039 /* Return tail of array given the head */
4845 return ptr + ptr->head.length + 1; 5040 return ptr + ptr->head.length + 1;
4846} 5041}
4847 5042
4848static void 5043static void
4849trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, 5044trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
4850 int len) 5045 int len)
4851{ 5046{
4852 struct trace_enum_map **stop; 5047 struct trace_eval_map **stop;
4853 struct trace_enum_map **map; 5048 struct trace_eval_map **map;
4854 union trace_enum_map_item *map_array; 5049 union trace_eval_map_item *map_array;
4855 union trace_enum_map_item *ptr; 5050 union trace_eval_map_item *ptr;
4856 5051
4857 stop = start + len; 5052 stop = start + len;
4858 5053
4859 /* 5054 /*
4860 * The trace_enum_maps contains the map plus a head and tail item, 5055 * The trace_eval_maps contains the map plus a head and tail item,
4861 * where the head holds the module and length of array, and the 5056 * where the head holds the module and length of array, and the
4862 * tail holds a pointer to the next list. 5057 * tail holds a pointer to the next list.
4863 */ 5058 */
4864 map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); 5059 map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
4865 if (!map_array) { 5060 if (!map_array) {
4866 pr_warn("Unable to allocate trace enum mapping\n"); 5061 pr_warn("Unable to allocate trace eval mapping\n");
4867 return; 5062 return;
4868 } 5063 }
4869 5064
4870 mutex_lock(&trace_enum_mutex); 5065 mutex_lock(&trace_eval_mutex);
4871 5066
4872 if (!trace_enum_maps) 5067 if (!trace_eval_maps)
4873 trace_enum_maps = map_array; 5068 trace_eval_maps = map_array;
4874 else { 5069 else {
4875 ptr = trace_enum_maps; 5070 ptr = trace_eval_maps;
4876 for (;;) { 5071 for (;;) {
4877 ptr = trace_enum_jmp_to_tail(ptr); 5072 ptr = trace_eval_jmp_to_tail(ptr);
4878 if (!ptr->tail.next) 5073 if (!ptr->tail.next)
4879 break; 5074 break;
4880 ptr = ptr->tail.next; 5075 ptr = ptr->tail.next;
@@ -4892,34 +5087,34 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
4892 } 5087 }
4893 memset(map_array, 0, sizeof(*map_array)); 5088 memset(map_array, 0, sizeof(*map_array));
4894 5089
4895 mutex_unlock(&trace_enum_mutex); 5090 mutex_unlock(&trace_eval_mutex);
4896} 5091}
4897 5092
4898static void trace_create_enum_file(struct dentry *d_tracer) 5093static void trace_create_eval_file(struct dentry *d_tracer)
4899{ 5094{
4900 trace_create_file("enum_map", 0444, d_tracer, 5095 trace_create_file("eval_map", 0444, d_tracer,
4901 NULL, &tracing_enum_map_fops); 5096 NULL, &tracing_eval_map_fops);
4902} 5097}
4903 5098
4904#else /* CONFIG_TRACE_ENUM_MAP_FILE */ 5099#else /* CONFIG_TRACE_EVAL_MAP_FILE */
4905static inline void trace_create_enum_file(struct dentry *d_tracer) { } 5100static inline void trace_create_eval_file(struct dentry *d_tracer) { }
4906static inline void trace_insert_enum_map_file(struct module *mod, 5101static inline void trace_insert_eval_map_file(struct module *mod,
4907 struct trace_enum_map **start, int len) { } 5102 struct trace_eval_map **start, int len) { }
4908#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ 5103#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */
4909 5104
4910static void trace_insert_enum_map(struct module *mod, 5105static void trace_insert_eval_map(struct module *mod,
4911 struct trace_enum_map **start, int len) 5106 struct trace_eval_map **start, int len)
4912{ 5107{
4913 struct trace_enum_map **map; 5108 struct trace_eval_map **map;
4914 5109
4915 if (len <= 0) 5110 if (len <= 0)
4916 return; 5111 return;
4917 5112
4918 map = start; 5113 map = start;
4919 5114
4920 trace_event_enum_update(map, len); 5115 trace_event_eval_update(map, len);
4921 5116
4922 trace_insert_enum_map_file(mod, start, len); 5117 trace_insert_eval_map_file(mod, start, len);
4923} 5118}
4924 5119
4925static ssize_t 5120static ssize_t
@@ -6739,33 +6934,18 @@ static const struct file_operations tracing_stats_fops = {
6739 6934
6740#ifdef CONFIG_DYNAMIC_FTRACE 6935#ifdef CONFIG_DYNAMIC_FTRACE
6741 6936
6742int __weak ftrace_arch_read_dyn_info(char *buf, int size)
6743{
6744 return 0;
6745}
6746
6747static ssize_t 6937static ssize_t
6748tracing_read_dyn_info(struct file *filp, char __user *ubuf, 6938tracing_read_dyn_info(struct file *filp, char __user *ubuf,
6749 size_t cnt, loff_t *ppos) 6939 size_t cnt, loff_t *ppos)
6750{ 6940{
6751 static char ftrace_dyn_info_buffer[1024];
6752 static DEFINE_MUTEX(dyn_info_mutex);
6753 unsigned long *p = filp->private_data; 6941 unsigned long *p = filp->private_data;
6754 char *buf = ftrace_dyn_info_buffer; 6942 char buf[64]; /* Not too big for a shallow stack */
6755 int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
6756 int r; 6943 int r;
6757 6944
6758 mutex_lock(&dyn_info_mutex); 6945 r = scnprintf(buf, 63, "%ld", *p);
6759 r = sprintf(buf, "%ld ", *p);
6760
6761 r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
6762 buf[r++] = '\n'; 6946 buf[r++] = '\n';
6763 6947
6764 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 6948 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
6765
6766 mutex_unlock(&dyn_info_mutex);
6767
6768 return r;
6769} 6949}
6770 6950
6771static const struct file_operations tracing_dyn_info_fops = { 6951static const struct file_operations tracing_dyn_info_fops = {
@@ -7594,6 +7774,7 @@ static int instance_rmdir(const char *name)
7594 } 7774 }
7595 kfree(tr->topts); 7775 kfree(tr->topts);
7596 7776
7777 free_cpumask_var(tr->tracing_cpumask);
7597 kfree(tr->name); 7778 kfree(tr->name);
7598 kfree(tr); 7779 kfree(tr);
7599 7780
@@ -7737,21 +7918,21 @@ struct dentry *tracing_init_dentry(void)
7737 return NULL; 7918 return NULL;
7738} 7919}
7739 7920
7740extern struct trace_enum_map *__start_ftrace_enum_maps[]; 7921extern struct trace_eval_map *__start_ftrace_eval_maps[];
7741extern struct trace_enum_map *__stop_ftrace_enum_maps[]; 7922extern struct trace_eval_map *__stop_ftrace_eval_maps[];
7742 7923
7743static void __init trace_enum_init(void) 7924static void __init trace_eval_init(void)
7744{ 7925{
7745 int len; 7926 int len;
7746 7927
7747 len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; 7928 len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
7748 trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); 7929 trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
7749} 7930}
7750 7931
7751#ifdef CONFIG_MODULES 7932#ifdef CONFIG_MODULES
7752static void trace_module_add_enums(struct module *mod) 7933static void trace_module_add_evals(struct module *mod)
7753{ 7934{
7754 if (!mod->num_trace_enums) 7935 if (!mod->num_trace_evals)
7755 return; 7936 return;
7756 7937
7757 /* 7938 /*
@@ -7761,40 +7942,40 @@ static void trace_module_add_enums(struct module *mod)
7761 if (trace_module_has_bad_taint(mod)) 7942 if (trace_module_has_bad_taint(mod))
7762 return; 7943 return;
7763 7944
7764 trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); 7945 trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
7765} 7946}
7766 7947
7767#ifdef CONFIG_TRACE_ENUM_MAP_FILE 7948#ifdef CONFIG_TRACE_EVAL_MAP_FILE
7768static void trace_module_remove_enums(struct module *mod) 7949static void trace_module_remove_evals(struct module *mod)
7769{ 7950{
7770 union trace_enum_map_item *map; 7951 union trace_eval_map_item *map;
7771 union trace_enum_map_item **last = &trace_enum_maps; 7952 union trace_eval_map_item **last = &trace_eval_maps;
7772 7953
7773 if (!mod->num_trace_enums) 7954 if (!mod->num_trace_evals)
7774 return; 7955 return;
7775 7956
7776 mutex_lock(&trace_enum_mutex); 7957 mutex_lock(&trace_eval_mutex);
7777 7958
7778 map = trace_enum_maps; 7959 map = trace_eval_maps;
7779 7960
7780 while (map) { 7961 while (map) {
7781 if (map->head.mod == mod) 7962 if (map->head.mod == mod)
7782 break; 7963 break;
7783 map = trace_enum_jmp_to_tail(map); 7964 map = trace_eval_jmp_to_tail(map);
7784 last = &map->tail.next; 7965 last = &map->tail.next;
7785 map = map->tail.next; 7966 map = map->tail.next;
7786 } 7967 }
7787 if (!map) 7968 if (!map)
7788 goto out; 7969 goto out;
7789 7970
7790 *last = trace_enum_jmp_to_tail(map)->tail.next; 7971 *last = trace_eval_jmp_to_tail(map)->tail.next;
7791 kfree(map); 7972 kfree(map);
7792 out: 7973 out:
7793 mutex_unlock(&trace_enum_mutex); 7974 mutex_unlock(&trace_eval_mutex);
7794} 7975}
7795#else 7976#else
7796static inline void trace_module_remove_enums(struct module *mod) { } 7977static inline void trace_module_remove_evals(struct module *mod) { }
7797#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ 7978#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
7798 7979
7799static int trace_module_notify(struct notifier_block *self, 7980static int trace_module_notify(struct notifier_block *self,
7800 unsigned long val, void *data) 7981 unsigned long val, void *data)
@@ -7803,10 +7984,10 @@ static int trace_module_notify(struct notifier_block *self,
7803 7984
7804 switch (val) { 7985 switch (val) {
7805 case MODULE_STATE_COMING: 7986 case MODULE_STATE_COMING:
7806 trace_module_add_enums(mod); 7987 trace_module_add_evals(mod);
7807 break; 7988 break;
7808 case MODULE_STATE_GOING: 7989 case MODULE_STATE_GOING:
7809 trace_module_remove_enums(mod); 7990 trace_module_remove_evals(mod);
7810 break; 7991 break;
7811 } 7992 }
7812 7993
@@ -7844,9 +8025,12 @@ static __init int tracer_init_tracefs(void)
7844 trace_create_file("saved_cmdlines_size", 0644, d_tracer, 8025 trace_create_file("saved_cmdlines_size", 0644, d_tracer,
7845 NULL, &tracing_saved_cmdlines_size_fops); 8026 NULL, &tracing_saved_cmdlines_size_fops);
7846 8027
7847 trace_enum_init(); 8028 trace_create_file("saved_tgids", 0444, d_tracer,
8029 NULL, &tracing_saved_tgids_fops);
8030
8031 trace_eval_init();
7848 8032
7849 trace_create_enum_file(d_tracer); 8033 trace_create_eval_file(d_tracer);
7850 8034
7851#ifdef CONFIG_MODULES 8035#ifdef CONFIG_MODULES
7852 register_module_notifier(&trace_module_nb); 8036 register_module_notifier(&trace_module_nb);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 39fd77330aab..490ba229931d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -263,7 +263,10 @@ struct trace_array {
263 struct ftrace_ops *ops; 263 struct ftrace_ops *ops;
264 struct trace_pid_list __rcu *function_pids; 264 struct trace_pid_list __rcu *function_pids;
265#ifdef CONFIG_DYNAMIC_FTRACE 265#ifdef CONFIG_DYNAMIC_FTRACE
266 /* All of these are protected by the ftrace_lock */
266 struct list_head func_probes; 267 struct list_head func_probes;
268 struct list_head mod_trace;
269 struct list_head mod_notrace;
267#endif 270#endif
268 /* function tracing enabled */ 271 /* function tracing enabled */
269 int function_enabled; 272 int function_enabled;
@@ -637,6 +640,9 @@ void set_graph_array(struct trace_array *tr);
637 640
638void tracing_start_cmdline_record(void); 641void tracing_start_cmdline_record(void);
639void tracing_stop_cmdline_record(void); 642void tracing_stop_cmdline_record(void);
643void tracing_start_tgid_record(void);
644void tracing_stop_tgid_record(void);
645
640int register_tracer(struct tracer *type); 646int register_tracer(struct tracer *type);
641int is_tracing_stopped(void); 647int is_tracing_stopped(void);
642 648
@@ -697,6 +703,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
697extern u64 ftrace_now(int cpu); 703extern u64 ftrace_now(int cpu);
698 704
699extern void trace_find_cmdline(int pid, char comm[]); 705extern void trace_find_cmdline(int pid, char comm[]);
706extern int trace_find_tgid(int pid);
700extern void trace_event_follow_fork(struct trace_array *tr, bool enable); 707extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
701 708
702#ifdef CONFIG_DYNAMIC_FTRACE 709#ifdef CONFIG_DYNAMIC_FTRACE
@@ -761,10 +768,24 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
761 768
762extern char trace_find_mark(unsigned long long duration); 769extern char trace_find_mark(unsigned long long duration);
763 770
771struct ftrace_hash;
772
773struct ftrace_mod_load {
774 struct list_head list;
775 char *func;
776 char *module;
777 int enable;
778};
779
780enum {
781 FTRACE_HASH_FL_MOD = (1 << 0),
782};
783
764struct ftrace_hash { 784struct ftrace_hash {
765 unsigned long size_bits; 785 unsigned long size_bits;
766 struct hlist_head *buckets; 786 struct hlist_head *buckets;
767 unsigned long count; 787 unsigned long count;
788 unsigned long flags;
768 struct rcu_head rcu; 789 struct rcu_head rcu;
769}; 790};
770 791
@@ -773,7 +794,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);
773 794
774static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) 795static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
775{ 796{
776 return !hash || !hash->count; 797 return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD));
777} 798}
778 799
779/* Standard output formatting function used for function return traces */ 800/* Standard output formatting function used for function return traces */
@@ -1107,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
1107 C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ 1128 C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \
1108 C(LATENCY_FMT, "latency-format"), \ 1129 C(LATENCY_FMT, "latency-format"), \
1109 C(RECORD_CMD, "record-cmd"), \ 1130 C(RECORD_CMD, "record-cmd"), \
1131 C(RECORD_TGID, "record-tgid"), \
1110 C(OVERWRITE, "overwrite"), \ 1132 C(OVERWRITE, "overwrite"), \
1111 C(STOP_ON_FREE, "disable_on_free"), \ 1133 C(STOP_ON_FREE, "disable_on_free"), \
1112 C(IRQ_INFO, "irq-info"), \ 1134 C(IRQ_INFO, "irq-info"), \
@@ -1188,9 +1210,9 @@ struct ftrace_event_field {
1188struct event_filter { 1210struct event_filter {
1189 int n_preds; /* Number assigned */ 1211 int n_preds; /* Number assigned */
1190 int a_preds; /* allocated */ 1212 int a_preds; /* allocated */
1191 struct filter_pred *preds; 1213 struct filter_pred __rcu *preds;
1192 struct filter_pred *root; 1214 struct filter_pred __rcu *root;
1193 char *filter_string; 1215 char *filter_string;
1194}; 1216};
1195 1217
1196struct event_subsystem { 1218struct event_subsystem {
@@ -1423,6 +1445,8 @@ struct ftrace_event_field *
1423trace_find_event_field(struct trace_event_call *call, char *name); 1445trace_find_event_field(struct trace_event_call *call, char *name);
1424 1446
1425extern void trace_event_enable_cmd_record(bool enable); 1447extern void trace_event_enable_cmd_record(bool enable);
1448extern void trace_event_enable_tgid_record(bool enable);
1449
1426extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); 1450extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1427extern int event_trace_del_tracer(struct trace_array *tr); 1451extern int event_trace_del_tracer(struct trace_array *tr);
1428 1452
@@ -1773,10 +1797,10 @@ static inline const char *get_syscall_name(int syscall)
1773 1797
1774#ifdef CONFIG_EVENT_TRACING 1798#ifdef CONFIG_EVENT_TRACING
1775void trace_event_init(void); 1799void trace_event_init(void);
1776void trace_event_enum_update(struct trace_enum_map **map, int len); 1800void trace_event_eval_update(struct trace_eval_map **map, int len);
1777#else 1801#else
1778static inline void __init trace_event_init(void) { } 1802static inline void __init trace_event_init(void) { }
1779static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } 1803static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
1780#endif 1804#endif
1781 1805
1782extern struct trace_iterator *tracepoint_print_iter; 1806extern struct trace_iterator *tracepoint_print_iter;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e7973e10398c..36132f9280e6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable)
343 mutex_unlock(&event_mutex); 343 mutex_unlock(&event_mutex);
344} 344}
345 345
346void trace_event_enable_tgid_record(bool enable)
347{
348 struct trace_event_file *file;
349 struct trace_array *tr;
350
351 mutex_lock(&event_mutex);
352 do_for_each_event_file(tr, file) {
353 if (!(file->flags & EVENT_FILE_FL_ENABLED))
354 continue;
355
356 if (enable) {
357 tracing_start_tgid_record();
358 set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
359 } else {
360 tracing_stop_tgid_record();
361 clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT,
362 &file->flags);
363 }
364 } while_for_each_event_file();
365 mutex_unlock(&event_mutex);
366}
367
346static int __ftrace_event_enable_disable(struct trace_event_file *file, 368static int __ftrace_event_enable_disable(struct trace_event_file *file,
347 int enable, int soft_disable) 369 int enable, int soft_disable)
348{ 370{
@@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
381 tracing_stop_cmdline_record(); 403 tracing_stop_cmdline_record();
382 clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); 404 clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
383 } 405 }
406
407 if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
408 tracing_stop_tgid_record();
409 clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
410 }
411
384 call->class->reg(call, TRACE_REG_UNREGISTER, file); 412 call->class->reg(call, TRACE_REG_UNREGISTER, file);
385 } 413 }
386 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ 414 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
@@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
407 } 435 }
408 436
409 if (!(file->flags & EVENT_FILE_FL_ENABLED)) { 437 if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
438 bool cmd = false, tgid = false;
410 439
411 /* Keep the event disabled, when going to SOFT_MODE. */ 440 /* Keep the event disabled, when going to SOFT_MODE. */
412 if (soft_disable) 441 if (soft_disable)
413 set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); 442 set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
414 443
415 if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { 444 if (tr->trace_flags & TRACE_ITER_RECORD_CMD) {
445 cmd = true;
416 tracing_start_cmdline_record(); 446 tracing_start_cmdline_record();
417 set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); 447 set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
418 } 448 }
449
450 if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
451 tgid = true;
452 tracing_start_tgid_record();
453 set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
454 }
455
419 ret = call->class->reg(call, TRACE_REG_REGISTER, file); 456 ret = call->class->reg(call, TRACE_REG_REGISTER, file);
420 if (ret) { 457 if (ret) {
421 tracing_stop_cmdline_record(); 458 if (cmd)
459 tracing_stop_cmdline_record();
460 if (tgid)
461 tracing_stop_tgid_record();
422 pr_info("event trace: Could not enable event " 462 pr_info("event trace: Could not enable event "
423 "%s\n", trace_event_name(call)); 463 "%s\n", trace_event_name(call));
424 break; 464 break;
@@ -2067,18 +2107,18 @@ __register_event(struct trace_event_call *call, struct module *mod)
2067 return 0; 2107 return 0;
2068} 2108}
2069 2109
2070static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) 2110static char *eval_replace(char *ptr, struct trace_eval_map *map, int len)
2071{ 2111{
2072 int rlen; 2112 int rlen;
2073 int elen; 2113 int elen;
2074 2114
2075 /* Find the length of the enum value as a string */ 2115 /* Find the length of the eval value as a string */
2076 elen = snprintf(ptr, 0, "%ld", map->enum_value); 2116 elen = snprintf(ptr, 0, "%ld", map->eval_value);
2077 /* Make sure there's enough room to replace the string with the value */ 2117 /* Make sure there's enough room to replace the string with the value */
2078 if (len < elen) 2118 if (len < elen)
2079 return NULL; 2119 return NULL;
2080 2120
2081 snprintf(ptr, elen + 1, "%ld", map->enum_value); 2121 snprintf(ptr, elen + 1, "%ld", map->eval_value);
2082 2122
2083 /* Get the rest of the string of ptr */ 2123 /* Get the rest of the string of ptr */
2084 rlen = strlen(ptr + len); 2124 rlen = strlen(ptr + len);
@@ -2090,11 +2130,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
2090} 2130}
2091 2131
2092static void update_event_printk(struct trace_event_call *call, 2132static void update_event_printk(struct trace_event_call *call,
2093 struct trace_enum_map *map) 2133 struct trace_eval_map *map)
2094{ 2134{
2095 char *ptr; 2135 char *ptr;
2096 int quote = 0; 2136 int quote = 0;
2097 int len = strlen(map->enum_string); 2137 int len = strlen(map->eval_string);
2098 2138
2099 for (ptr = call->print_fmt; *ptr; ptr++) { 2139 for (ptr = call->print_fmt; *ptr; ptr++) {
2100 if (*ptr == '\\') { 2140 if (*ptr == '\\') {
@@ -2125,16 +2165,16 @@ static void update_event_printk(struct trace_event_call *call,
2125 continue; 2165 continue;
2126 } 2166 }
2127 if (isalpha(*ptr) || *ptr == '_') { 2167 if (isalpha(*ptr) || *ptr == '_') {
2128 if (strncmp(map->enum_string, ptr, len) == 0 && 2168 if (strncmp(map->eval_string, ptr, len) == 0 &&
2129 !isalnum(ptr[len]) && ptr[len] != '_') { 2169 !isalnum(ptr[len]) && ptr[len] != '_') {
2130 ptr = enum_replace(ptr, map, len); 2170 ptr = eval_replace(ptr, map, len);
2131 /* Hmm, enum string smaller than value */ 2171 /* enum/sizeof string smaller than value */
2132 if (WARN_ON_ONCE(!ptr)) 2172 if (WARN_ON_ONCE(!ptr))
2133 return; 2173 return;
2134 /* 2174 /*
2135 * No need to decrement here, as enum_replace() 2175 * No need to decrement here, as eval_replace()
2136 * returns the pointer to the character passed 2176 * returns the pointer to the character passed
2137 * the enum, and two enums can not be placed 2177 * the eval, and two evals can not be placed
2138 * back to back without something in between. 2178 * back to back without something in between.
2139 * We can skip that something in between. 2179 * We can skip that something in between.
2140 */ 2180 */
@@ -2165,7 +2205,7 @@ static void update_event_printk(struct trace_event_call *call,
2165 } 2205 }
2166} 2206}
2167 2207
2168void trace_event_enum_update(struct trace_enum_map **map, int len) 2208void trace_event_eval_update(struct trace_eval_map **map, int len)
2169{ 2209{
2170 struct trace_event_call *call, *p; 2210 struct trace_event_call *call, *p;
2171 const char *last_system = NULL; 2211 const char *last_system = NULL;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2c5221819be5..c9b5aa10fbf9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = {
598 .priority = 1 /* Invoked after kprobe module callback */ 598 .priority = 1 /* Invoked after kprobe module callback */
599}; 599};
600 600
601/* Convert certain expected symbols into '_' when generating event names */
602static inline void sanitize_event_name(char *name)
603{
604 while (*name++ != '\0')
605 if (*name == ':' || *name == '.')
606 *name = '_';
607}
608
601static int create_trace_kprobe(int argc, char **argv) 609static int create_trace_kprobe(int argc, char **argv)
602{ 610{
603 /* 611 /*
@@ -736,6 +744,7 @@ static int create_trace_kprobe(int argc, char **argv)
736 else 744 else
737 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", 745 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
738 is_return ? 'r' : 'p', addr); 746 is_return ? 'r' : 'p', addr);
747 sanitize_event_name(buf);
739 event = buf; 748 event = buf;
740 } 749 }
741 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, 750 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 08f9bab8089e..bac629af2285 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -340,31 +340,41 @@ static inline const char *kretprobed(const char *name)
340static void 340static void
341seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) 341seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
342{ 342{
343#ifdef CONFIG_KALLSYMS
344 char str[KSYM_SYMBOL_LEN]; 343 char str[KSYM_SYMBOL_LEN];
344#ifdef CONFIG_KALLSYMS
345 const char *name; 345 const char *name;
346 346
347 kallsyms_lookup(address, NULL, NULL, NULL, str); 347 kallsyms_lookup(address, NULL, NULL, NULL, str);
348 348
349 name = kretprobed(str); 349 name = kretprobed(str);
350 350
351 trace_seq_printf(s, fmt, name); 351 if (name && strlen(name)) {
352 trace_seq_printf(s, fmt, name);
353 return;
354 }
352#endif 355#endif
356 snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
357 trace_seq_printf(s, fmt, str);
353} 358}
354 359
355static void 360static void
356seq_print_sym_offset(struct trace_seq *s, const char *fmt, 361seq_print_sym_offset(struct trace_seq *s, const char *fmt,
357 unsigned long address) 362 unsigned long address)
358{ 363{
359#ifdef CONFIG_KALLSYMS
360 char str[KSYM_SYMBOL_LEN]; 364 char str[KSYM_SYMBOL_LEN];
365#ifdef CONFIG_KALLSYMS
361 const char *name; 366 const char *name;
362 367
363 sprint_symbol(str, address); 368 sprint_symbol(str, address);
364 name = kretprobed(str); 369 name = kretprobed(str);
365 370
366 trace_seq_printf(s, fmt, name); 371 if (name && strlen(name)) {
372 trace_seq_printf(s, fmt, name);
373 return;
374 }
367#endif 375#endif
376 snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
377 trace_seq_printf(s, fmt, str);
368} 378}
369 379
370#ifndef CONFIG_64BIT 380#ifndef CONFIG_64BIT
@@ -587,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter)
587 trace_seq_printf(s, "%16s-%-5d [%03d] ", 597 trace_seq_printf(s, "%16s-%-5d [%03d] ",
588 comm, entry->pid, iter->cpu); 598 comm, entry->pid, iter->cpu);
589 599
600 if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
601 unsigned int tgid = trace_find_tgid(entry->pid);
602
603 if (!tgid)
604 trace_seq_printf(s, "(-----) ");
605 else
606 trace_seq_printf(s, "(%5d) ", tgid);
607 }
608
590 if (tr->trace_flags & TRACE_ITER_IRQ_INFO) 609 if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
591 trace_print_lat_fmt(s, entry); 610 trace_print_lat_fmt(s, entry);
592 611
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 4c896a0101bd..b341c02730be 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -12,27 +12,38 @@
12 12
13#include "trace.h" 13#include "trace.h"
14 14
15static int sched_ref; 15#define RECORD_CMDLINE 1
16#define RECORD_TGID 2
17
18static int sched_cmdline_ref;
19static int sched_tgid_ref;
16static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
17 21
18static void 22static void
19probe_sched_switch(void *ignore, bool preempt, 23probe_sched_switch(void *ignore, bool preempt,
20 struct task_struct *prev, struct task_struct *next) 24 struct task_struct *prev, struct task_struct *next)
21{ 25{
22 if (unlikely(!sched_ref)) 26 int flags;
23 return; 27
28 flags = (RECORD_TGID * !!sched_tgid_ref) +
29 (RECORD_CMDLINE * !!sched_cmdline_ref);
24 30
25 tracing_record_cmdline(prev); 31 if (!flags)
26 tracing_record_cmdline(next); 32 return;
33 tracing_record_taskinfo_sched_switch(prev, next, flags);
27} 34}
28 35
29static void 36static void
30probe_sched_wakeup(void *ignore, struct task_struct *wakee) 37probe_sched_wakeup(void *ignore, struct task_struct *wakee)
31{ 38{
32 if (unlikely(!sched_ref)) 39 int flags;
33 return; 40
41 flags = (RECORD_TGID * !!sched_tgid_ref) +
42 (RECORD_CMDLINE * !!sched_cmdline_ref);
34 43
35 tracing_record_cmdline(current); 44 if (!flags)
45 return;
46 tracing_record_taskinfo(current, flags);
36} 47}
37 48
38static int tracing_sched_register(void) 49static int tracing_sched_register(void)
@@ -75,28 +86,61 @@ static void tracing_sched_unregister(void)
75 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); 86 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
76} 87}
77 88
78static void tracing_start_sched_switch(void) 89static void tracing_start_sched_switch(int ops)
79{ 90{
91 bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
80 mutex_lock(&sched_register_mutex); 92 mutex_lock(&sched_register_mutex);
81 if (!(sched_ref++)) 93
94 switch (ops) {
95 case RECORD_CMDLINE:
96 sched_cmdline_ref++;
97 break;
98
99 case RECORD_TGID:
100 sched_tgid_ref++;
101 break;
102 }
103
104 if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
82 tracing_sched_register(); 105 tracing_sched_register();
83 mutex_unlock(&sched_register_mutex); 106 mutex_unlock(&sched_register_mutex);
84} 107}
85 108
86static void tracing_stop_sched_switch(void) 109static void tracing_stop_sched_switch(int ops)
87{ 110{
88 mutex_lock(&sched_register_mutex); 111 mutex_lock(&sched_register_mutex);
89 if (!(--sched_ref)) 112
113 switch (ops) {
114 case RECORD_CMDLINE:
115 sched_cmdline_ref--;
116 break;
117
118 case RECORD_TGID:
119 sched_tgid_ref--;
120 break;
121 }
122
123 if (!sched_cmdline_ref && !sched_tgid_ref)
90 tracing_sched_unregister(); 124 tracing_sched_unregister();
91 mutex_unlock(&sched_register_mutex); 125 mutex_unlock(&sched_register_mutex);
92} 126}
93 127
94void tracing_start_cmdline_record(void) 128void tracing_start_cmdline_record(void)
95{ 129{
96 tracing_start_sched_switch(); 130 tracing_start_sched_switch(RECORD_CMDLINE);
97} 131}
98 132
99void tracing_stop_cmdline_record(void) 133void tracing_stop_cmdline_record(void)
100{ 134{
101 tracing_stop_sched_switch(); 135 tracing_stop_sched_switch(RECORD_CMDLINE);
136}
137
138void tracing_start_tgid_record(void)
139{
140 tracing_start_sched_switch(RECORD_TGID);
141}
142
143void tracing_stop_tgid_record(void)
144{
145 tracing_stop_sched_switch(RECORD_TGID);
102} 146}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b4a751e8f9d6..a4df67cbc711 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -406,6 +406,8 @@ static const struct file_operations stack_trace_fops = {
406 .release = seq_release, 406 .release = seq_release,
407}; 407};
408 408
409#ifdef CONFIG_DYNAMIC_FTRACE
410
409static int 411static int
410stack_trace_filter_open(struct inode *inode, struct file *file) 412stack_trace_filter_open(struct inode *inode, struct file *file)
411{ 413{
@@ -423,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = {
423 .release = ftrace_regex_release, 425 .release = ftrace_regex_release,
424}; 426};
425 427
428#endif /* CONFIG_DYNAMIC_FTRACE */
429
426int 430int
427stack_trace_sysctl(struct ctl_table *table, int write, 431stack_trace_sysctl(struct ctl_table *table, int write,
428 void __user *buffer, size_t *lenp, 432 void __user *buffer, size_t *lenp,
@@ -477,8 +481,10 @@ static __init int stack_trace_init(void)
477 trace_create_file("stack_trace", 0444, d_tracer, 481 trace_create_file("stack_trace", 0444, d_tracer,
478 NULL, &stack_trace_fops); 482 NULL, &stack_trace_fops);
479 483
484#ifdef CONFIG_DYNAMIC_FTRACE
480 trace_create_file("stack_trace_filter", 0444, d_tracer, 485 trace_create_file("stack_trace_filter", 0444, d_tracer,
481 &trace_ops, &stack_trace_filter_fops); 486 &trace_ops, &stack_trace_filter_fops);
487#endif
482 488
483 if (stack_trace_filter_buf[0]) 489 if (stack_trace_filter_buf[0])
484 ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); 490 ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 03e0b69bb5bf..06d3389bca0d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -9,7 +9,7 @@
9 * to those contributors as well. 9 * to those contributors as well.
10 */ 10 */
11 11
12#define pr_fmt(fmt) "NMI watchdog: " fmt 12#define pr_fmt(fmt) "watchdog: " fmt
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
@@ -29,15 +29,58 @@
29#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
30#include <linux/kthread.h> 30#include <linux/kthread.h>
31 31
32/* Watchdog configuration */
32static DEFINE_MUTEX(watchdog_proc_mutex); 33static DEFINE_MUTEX(watchdog_proc_mutex);
33 34
34#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) 35int __read_mostly nmi_watchdog_enabled;
35unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; 36
37#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
38unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
39 NMI_WATCHDOG_ENABLED;
36#else 40#else
37unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; 41unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
38#endif 42#endif
39int __read_mostly nmi_watchdog_enabled; 43
44#ifdef CONFIG_HARDLOCKUP_DETECTOR
45/* boot commands */
46/*
47 * Should we panic when a soft-lockup or hard-lockup occurs:
48 */
49unsigned int __read_mostly hardlockup_panic =
50 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
51/*
52 * We may not want to enable hard lockup detection by default in all cases,
53 * for example when running the kernel as a guest on a hypervisor. In these
54 * cases this function can be called to disable hard lockup detection. This
55 * function should only be executed once by the boot processor before the
56 * kernel command line parameters are parsed, because otherwise it is not
57 * possible to override this in hardlockup_panic_setup().
58 */
59void hardlockup_detector_disable(void)
60{
61 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
62}
63
64static int __init hardlockup_panic_setup(char *str)
65{
66 if (!strncmp(str, "panic", 5))
67 hardlockup_panic = 1;
68 else if (!strncmp(str, "nopanic", 7))
69 hardlockup_panic = 0;
70 else if (!strncmp(str, "0", 1))
71 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
72 else if (!strncmp(str, "1", 1))
73 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
74 return 1;
75}
76__setup("nmi_watchdog=", hardlockup_panic_setup);
77
78#endif
79
80#ifdef CONFIG_SOFTLOCKUP_DETECTOR
40int __read_mostly soft_watchdog_enabled; 81int __read_mostly soft_watchdog_enabled;
82#endif
83
41int __read_mostly watchdog_user_enabled; 84int __read_mostly watchdog_user_enabled;
42int __read_mostly watchdog_thresh = 10; 85int __read_mostly watchdog_thresh = 10;
43 86
@@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10;
45int __read_mostly sysctl_softlockup_all_cpu_backtrace; 88int __read_mostly sysctl_softlockup_all_cpu_backtrace;
46int __read_mostly sysctl_hardlockup_all_cpu_backtrace; 89int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
47#endif 90#endif
48static struct cpumask watchdog_cpumask __read_mostly; 91struct cpumask watchdog_cpumask __read_mostly;
49unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); 92unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
50 93
51/* Helper for online, unparked cpus. */
52#define for_each_watchdog_cpu(cpu) \
53 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
54
55atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
56
57/* 94/*
58 * The 'watchdog_running' variable is set to 1 when the watchdog threads 95 * The 'watchdog_running' variable is set to 1 when the watchdog threads
59 * are registered/started and is set to 0 when the watchdog threads are 96 * are registered/started and is set to 0 when the watchdog threads are
@@ -72,7 +109,47 @@ static int __read_mostly watchdog_running;
72 * of 'watchdog_running' cannot change while the watchdog is deactivated 109 * of 'watchdog_running' cannot change while the watchdog is deactivated
73 * temporarily (see related code in 'proc' handlers). 110 * temporarily (see related code in 'proc' handlers).
74 */ 111 */
75static int __read_mostly watchdog_suspended; 112int __read_mostly watchdog_suspended;
113
114/*
115 * These functions can be overridden if an architecture implements its
116 * own hardlockup detector.
117 *
118 * watchdog_nmi_enable/disable can be implemented to start and stop when
119 * softlockup watchdog threads start and stop. The arch must select the
120 * SOFTLOCKUP_DETECTOR Kconfig.
121 */
122int __weak watchdog_nmi_enable(unsigned int cpu)
123{
124 return 0;
125}
126void __weak watchdog_nmi_disable(unsigned int cpu)
127{
128}
129
130/*
131 * watchdog_nmi_reconfigure can be implemented to be notified after any
132 * watchdog configuration change. The arch hardlockup watchdog should
133 * respond to the following variables:
134 * - nmi_watchdog_enabled
135 * - watchdog_thresh
136 * - watchdog_cpumask
137 * - sysctl_hardlockup_all_cpu_backtrace
138 * - hardlockup_panic
139 * - watchdog_suspended
140 */
141void __weak watchdog_nmi_reconfigure(void)
142{
143}
144
145
146#ifdef CONFIG_SOFTLOCKUP_DETECTOR
147
148/* Helper for online, unparked cpus. */
149#define for_each_watchdog_cpu(cpu) \
150 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
151
152atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
76 153
77static u64 __read_mostly sample_period; 154static u64 __read_mostly sample_period;
78 155
@@ -120,6 +197,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
120 return 1; 197 return 1;
121} 198}
122__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); 199__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
200#ifdef CONFIG_HARDLOCKUP_DETECTOR
123static int __init hardlockup_all_cpu_backtrace_setup(char *str) 201static int __init hardlockup_all_cpu_backtrace_setup(char *str)
124{ 202{
125 sysctl_hardlockup_all_cpu_backtrace = 203 sysctl_hardlockup_all_cpu_backtrace =
@@ -128,6 +206,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str)
128} 206}
129__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); 207__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
130#endif 208#endif
209#endif
131 210
132/* 211/*
133 * Hard-lockup warnings should be triggered after just a few seconds. Soft- 212 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -213,18 +292,6 @@ void touch_softlockup_watchdog_sync(void)
213 __this_cpu_write(watchdog_touch_ts, 0); 292 __this_cpu_write(watchdog_touch_ts, 0);
214} 293}
215 294
216/* watchdog detector functions */
217bool is_hardlockup(void)
218{
219 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
220
221 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
222 return true;
223
224 __this_cpu_write(hrtimer_interrupts_saved, hrint);
225 return false;
226}
227
228static int is_softlockup(unsigned long touch_ts) 295static int is_softlockup(unsigned long touch_ts)
229{ 296{
230 unsigned long now = get_timestamp(); 297 unsigned long now = get_timestamp();
@@ -237,21 +304,21 @@ static int is_softlockup(unsigned long touch_ts)
237 return 0; 304 return 0;
238} 305}
239 306
240static void watchdog_interrupt_count(void) 307/* watchdog detector functions */
308bool is_hardlockup(void)
241{ 309{
242 __this_cpu_inc(hrtimer_interrupts); 310 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
243}
244 311
245/* 312 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
246 * These two functions are mostly architecture specific 313 return true;
247 * defining them as weak here. 314
248 */ 315 __this_cpu_write(hrtimer_interrupts_saved, hrint);
249int __weak watchdog_nmi_enable(unsigned int cpu) 316 return false;
250{
251 return 0;
252} 317}
253void __weak watchdog_nmi_disable(unsigned int cpu) 318
319static void watchdog_interrupt_count(void)
254{ 320{
321 __this_cpu_inc(hrtimer_interrupts);
255} 322}
256 323
257static int watchdog_enable_all_cpus(void); 324static int watchdog_enable_all_cpus(void);
@@ -502,57 +569,6 @@ static void watchdog_unpark_threads(void)
502 kthread_unpark(per_cpu(softlockup_watchdog, cpu)); 569 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
503} 570}
504 571
505/*
506 * Suspend the hard and soft lockup detector by parking the watchdog threads.
507 */
508int lockup_detector_suspend(void)
509{
510 int ret = 0;
511
512 get_online_cpus();
513 mutex_lock(&watchdog_proc_mutex);
514 /*
515 * Multiple suspend requests can be active in parallel (counted by
516 * the 'watchdog_suspended' variable). If the watchdog threads are
517 * running, the first caller takes care that they will be parked.
518 * The state of 'watchdog_running' cannot change while a suspend
519 * request is active (see related code in 'proc' handlers).
520 */
521 if (watchdog_running && !watchdog_suspended)
522 ret = watchdog_park_threads();
523
524 if (ret == 0)
525 watchdog_suspended++;
526 else {
527 watchdog_disable_all_cpus();
528 pr_err("Failed to suspend lockup detectors, disabled\n");
529 watchdog_enabled = 0;
530 }
531
532 mutex_unlock(&watchdog_proc_mutex);
533
534 return ret;
535}
536
537/*
538 * Resume the hard and soft lockup detector by unparking the watchdog threads.
539 */
540void lockup_detector_resume(void)
541{
542 mutex_lock(&watchdog_proc_mutex);
543
544 watchdog_suspended--;
545 /*
546 * The watchdog threads are unparked if they were previously running
547 * and if there is no more active suspend request.
548 */
549 if (watchdog_running && !watchdog_suspended)
550 watchdog_unpark_threads();
551
552 mutex_unlock(&watchdog_proc_mutex);
553 put_online_cpus();
554}
555
556static int update_watchdog_all_cpus(void) 572static int update_watchdog_all_cpus(void)
557{ 573{
558 int ret; 574 int ret;
@@ -605,6 +621,100 @@ static void watchdog_disable_all_cpus(void)
605} 621}
606 622
607#ifdef CONFIG_SYSCTL 623#ifdef CONFIG_SYSCTL
624static int watchdog_update_cpus(void)
625{
626 return smpboot_update_cpumask_percpu_thread(
627 &watchdog_threads, &watchdog_cpumask);
628}
629#endif
630
631#else /* SOFTLOCKUP */
632static int watchdog_park_threads(void)
633{
634 return 0;
635}
636
637static void watchdog_unpark_threads(void)
638{
639}
640
641static int watchdog_enable_all_cpus(void)
642{
643 return 0;
644}
645
646static void watchdog_disable_all_cpus(void)
647{
648}
649
650#ifdef CONFIG_SYSCTL
651static int watchdog_update_cpus(void)
652{
653 return 0;
654}
655#endif
656
657static void set_sample_period(void)
658{
659}
660#endif /* SOFTLOCKUP */
661
662/*
663 * Suspend the hard and soft lockup detector by parking the watchdog threads.
664 */
665int lockup_detector_suspend(void)
666{
667 int ret = 0;
668
669 get_online_cpus();
670 mutex_lock(&watchdog_proc_mutex);
671 /*
672 * Multiple suspend requests can be active in parallel (counted by
673 * the 'watchdog_suspended' variable). If the watchdog threads are
674 * running, the first caller takes care that they will be parked.
675 * The state of 'watchdog_running' cannot change while a suspend
676 * request is active (see related code in 'proc' handlers).
677 */
678 if (watchdog_running && !watchdog_suspended)
679 ret = watchdog_park_threads();
680
681 if (ret == 0)
682 watchdog_suspended++;
683 else {
684 watchdog_disable_all_cpus();
685 pr_err("Failed to suspend lockup detectors, disabled\n");
686 watchdog_enabled = 0;
687 }
688
689 watchdog_nmi_reconfigure();
690
691 mutex_unlock(&watchdog_proc_mutex);
692
693 return ret;
694}
695
696/*
697 * Resume the hard and soft lockup detector by unparking the watchdog threads.
698 */
699void lockup_detector_resume(void)
700{
701 mutex_lock(&watchdog_proc_mutex);
702
703 watchdog_suspended--;
704 /*
705 * The watchdog threads are unparked if they were previously running
706 * and if there is no more active suspend request.
707 */
708 if (watchdog_running && !watchdog_suspended)
709 watchdog_unpark_threads();
710
711 watchdog_nmi_reconfigure();
712
713 mutex_unlock(&watchdog_proc_mutex);
714 put_online_cpus();
715}
716
717#ifdef CONFIG_SYSCTL
608 718
609/* 719/*
610 * Update the run state of the lockup detectors. 720 * Update the run state of the lockup detectors.
@@ -625,6 +735,8 @@ static int proc_watchdog_update(void)
625 else 735 else
626 watchdog_disable_all_cpus(); 736 watchdog_disable_all_cpus();
627 737
738 watchdog_nmi_reconfigure();
739
628 return err; 740 return err;
629 741
630} 742}
@@ -810,10 +922,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
810 * a temporary cpumask, so we are likely not in a 922 * a temporary cpumask, so we are likely not in a
811 * position to do much else to make things better. 923 * position to do much else to make things better.
812 */ 924 */
813 if (smpboot_update_cpumask_percpu_thread( 925 if (watchdog_update_cpus() != 0)
814 &watchdog_threads, &watchdog_cpumask) != 0)
815 pr_err("cpumask update failed\n"); 926 pr_err("cpumask update failed\n");
816 } 927 }
928
929 watchdog_nmi_reconfigure();
817 } 930 }
818out: 931out:
819 mutex_unlock(&watchdog_proc_mutex); 932 mutex_unlock(&watchdog_proc_mutex);
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 54a427d1f344..295a0d84934c 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -22,41 +22,9 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
22static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 22static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
23static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 23static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
24 24
25/* boot commands */
26/*
27 * Should we panic when a soft-lockup or hard-lockup occurs:
28 */
29unsigned int __read_mostly hardlockup_panic =
30 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
31static unsigned long hardlockup_allcpu_dumped; 25static unsigned long hardlockup_allcpu_dumped;
32/*
33 * We may not want to enable hard lockup detection by default in all cases,
34 * for example when running the kernel as a guest on a hypervisor. In these
35 * cases this function can be called to disable hard lockup detection. This
36 * function should only be executed once by the boot processor before the
37 * kernel command line parameters are parsed, because otherwise it is not
38 * possible to override this in hardlockup_panic_setup().
39 */
40void hardlockup_detector_disable(void)
41{
42 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
43}
44
45static int __init hardlockup_panic_setup(char *str)
46{
47 if (!strncmp(str, "panic", 5))
48 hardlockup_panic = 1;
49 else if (!strncmp(str, "nopanic", 7))
50 hardlockup_panic = 0;
51 else if (!strncmp(str, "0", 1))
52 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
53 else if (!strncmp(str, "1", 1))
54 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
55 return 1;
56}
57__setup("nmi_watchdog=", hardlockup_panic_setup);
58 26
59void touch_nmi_watchdog(void) 27void arch_touch_nmi_watchdog(void)
60{ 28{
61 /* 29 /*
62 * Using __raw here because some code paths have 30 * Using __raw here because some code paths have
@@ -66,9 +34,8 @@ void touch_nmi_watchdog(void)
66 * going off. 34 * going off.
67 */ 35 */
68 raw_cpu_write(watchdog_nmi_touch, true); 36 raw_cpu_write(watchdog_nmi_touch, true);
69 touch_softlockup_watchdog();
70} 37}
71EXPORT_SYMBOL(touch_nmi_watchdog); 38EXPORT_SYMBOL(arch_touch_nmi_watchdog);
72 39
73static struct perf_event_attr wd_hw_attr = { 40static struct perf_event_attr wd_hw_attr = {
74 .type = PERF_TYPE_HARDWARE, 41 .type = PERF_TYPE_HARDWARE,