Diffstat (limited to 'kernel')
134 files changed, 8915 insertions, 3800 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index b302b4731d16..72aa080f91f0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_CRASH_CORE) += crash_core.o
 obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
diff --git a/kernel/audit.c b/kernel/audit.c
index a871bf80fde1..4b7d49868ce1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -58,6 +58,8 @@
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
 #include <linux/gfp.h>
+#include <linux/pid.h>
+#include <linux/slab.h>
 
 #include <linux/audit.h>
 
@@ -110,18 +112,19 @@ struct audit_net {
  * @pid: auditd PID
  * @portid: netlink portid
  * @net: the associated network namespace
- * @lock: spinlock to protect write access
+ * @rcu: RCU head
  *
  * Description:
  * This struct is RCU protected; you must either hold the RCU lock for reading
- * or the included spinlock for writing.
+ * or the associated spinlock for writing.
  */
 static struct auditd_connection {
-	int pid;
+	struct pid *pid;
 	u32 portid;
 	struct net *net;
-	spinlock_t lock;
-} auditd_conn;
+	struct rcu_head rcu;
+} *auditd_conn = NULL;
+static DEFINE_SPINLOCK(auditd_conn_lock);
 
 /* If audit_rate_limit is non-zero, limit the rate of sending audit records
  * to that number per second. This prevents DoS attacks, but results in
@@ -151,12 +154,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
 /* Hash for inode-based rules */
 struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
 
-/* The audit_freelist is a list of pre-allocated audit buffers (if more
- * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
- * being placed on the freelist). */
-static DEFINE_SPINLOCK(audit_freelist_lock);
-static int audit_freelist_count;
-static LIST_HEAD(audit_freelist);
+static struct kmem_cache *audit_buffer_cache;
 
 /* queue msgs to send via kauditd_task */
 static struct sk_buff_head audit_queue;
@@ -191,17 +189,12 @@ DEFINE_MUTEX(audit_cmd_mutex);
  * should be at least that large. */
 #define AUDIT_BUFSIZ 1024
 
-/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
- * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
-#define AUDIT_MAXFREE (2*NR_CPUS)
-
 /* The audit_buffer is used when formatting an audit record. The caller
  * locks briefly to get the record off the freelist or to allocate the
  * buffer, and locks briefly to send the buffer to the netlink layer or
  * to place it on a transmit queue. Multiple audit_buffers can be in
  * use simultaneously. */
 struct audit_buffer {
-	struct list_head list;
 	struct sk_buff *skb;	/* formatted skb ready to send */
 	struct audit_context *ctx;	/* NULL or associated context */
 	gfp_t gfp_mask;
@@ -220,18 +213,42 @@ struct audit_reply {
  * Description:
  * Return 1 if the task is a registered audit daemon, 0 otherwise.
  */
-int auditd_test_task(const struct task_struct *task)
+int auditd_test_task(struct task_struct *task)
 {
 	int rc;
+	struct auditd_connection *ac;
 
 	rcu_read_lock();
-	rc = (auditd_conn.pid && task->tgid == auditd_conn.pid ? 1 : 0);
+	ac = rcu_dereference(auditd_conn);
+	rc = (ac && ac->pid == task_tgid(task) ? 1 : 0);
 	rcu_read_unlock();
 
 	return rc;
 }
 
 /**
+ * auditd_pid_vnr - Return the auditd PID relative to the namespace
+ *
+ * Description:
+ * Returns the PID in relation to the namespace, 0 on failure.
+ */
+static pid_t auditd_pid_vnr(void)
+{
+	pid_t pid;
+	const struct auditd_connection *ac;
+
+	rcu_read_lock();
+	ac = rcu_dereference(auditd_conn);
+	if (!ac || !ac->pid)
+		pid = 0;
+	else
+		pid = pid_vnr(ac->pid);
+	rcu_read_unlock();
+
+	return pid;
+}
+
+/**
  * audit_get_sk - Return the audit socket for the given network namespace
  * @net: the destination network namespace
  *
@@ -250,14 +267,6 @@ static struct sock *audit_get_sk(const struct net *net)
 	return aunet->sk;
 }
 
-static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
-{
-	if (ab) {
-		struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
-		nlh->nlmsg_pid = portid;
-	}
-}
-
 void audit_panic(const char *message)
 {
 	switch (audit_failure) {
@@ -427,6 +436,24 @@ static int audit_set_failure(u32 state)
 }
 
 /**
+ * auditd_conn_free - RCU helper to release an auditd connection struct
+ * @rcu: RCU head
+ *
+ * Description:
+ * Drop any references inside the auditd connection tracking struct and free
+ * the memory.
+ */
+static void auditd_conn_free(struct rcu_head *rcu)
+{
+	struct auditd_connection *ac;
+
+	ac = container_of(rcu, struct auditd_connection, rcu);
+	put_pid(ac->pid);
+	put_net(ac->net);
+	kfree(ac);
+}
+
+/**
  * auditd_set - Set/Reset the auditd connection state
  * @pid: auditd PID
  * @portid: auditd netlink portid
@@ -434,22 +461,33 @@ static int audit_set_failure(u32 state)
  *
  * Description:
  * This function will obtain and drop network namespace references as
- * necessary.
+ * necessary. Returns zero on success, negative values on failure.
  */
-static void auditd_set(int pid, u32 portid, struct net *net)
+static int auditd_set(struct pid *pid, u32 portid, struct net *net)
 {
 	unsigned long flags;
+	struct auditd_connection *ac_old, *ac_new;
 
-	spin_lock_irqsave(&auditd_conn.lock, flags);
-	auditd_conn.pid = pid;
-	auditd_conn.portid = portid;
-	if (auditd_conn.net)
-		put_net(auditd_conn.net);
-	if (net)
-		auditd_conn.net = get_net(net);
-	else
-		auditd_conn.net = NULL;
-	spin_unlock_irqrestore(&auditd_conn.lock, flags);
+	if (!pid || !net)
+		return -EINVAL;
+
+	ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL);
+	if (!ac_new)
+		return -ENOMEM;
+	ac_new->pid = get_pid(pid);
+	ac_new->portid = portid;
+	ac_new->net = get_net(net);
+
+	spin_lock_irqsave(&auditd_conn_lock, flags);
+	ac_old = rcu_dereference_protected(auditd_conn,
+					   lockdep_is_held(&auditd_conn_lock));
+	rcu_assign_pointer(auditd_conn, ac_new);
+	spin_unlock_irqrestore(&auditd_conn_lock, flags);
+
+	if (ac_old)
+		call_rcu(&ac_old->rcu, auditd_conn_free);
+
+	return 0;
 }
 
 /**
@@ -544,13 +582,19 @@ static void kauditd_retry_skb(struct sk_buff *skb)
  */
 static void auditd_reset(void)
 {
+	unsigned long flags;
 	struct sk_buff *skb;
+	struct auditd_connection *ac_old;
 
 	/* if it isn't already broken, break the connection */
-	rcu_read_lock();
-	if (auditd_conn.pid)
-		auditd_set(0, 0, NULL);
-	rcu_read_unlock();
+	spin_lock_irqsave(&auditd_conn_lock, flags);
+	ac_old = rcu_dereference_protected(auditd_conn,
+					   lockdep_is_held(&auditd_conn_lock));
+	rcu_assign_pointer(auditd_conn, NULL);
+	spin_unlock_irqrestore(&auditd_conn_lock, flags);
+
+	if (ac_old)
+		call_rcu(&ac_old->rcu, auditd_conn_free);
 
 	/* flush all of the main and retry queues to the hold queue */
 	while ((skb = skb_dequeue(&audit_retry_queue)))
@@ -576,6 +620,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
 	u32 portid;
 	struct net *net;
 	struct sock *sk;
+	struct auditd_connection *ac;
 
 	/* NOTE: we can't call netlink_unicast while in the RCU section so
 	 * take a reference to the network namespace and grab local
@@ -585,15 +630,15 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
 	 * section netlink_unicast() should safely return an error */
 
 	rcu_read_lock();
-	if (!auditd_conn.pid) {
+	ac = rcu_dereference(auditd_conn);
+	if (!ac) {
 		rcu_read_unlock();
 		rc = -ECONNREFUSED;
 		goto err;
 	}
-	net = auditd_conn.net;
-	get_net(net);
+	net = get_net(ac->net);
 	sk = audit_get_sk(net);
-	portid = auditd_conn.portid;
+	portid = ac->portid;
 	rcu_read_unlock();
 
 	rc = netlink_unicast(sk, skb, portid, 0);
@@ -728,6 +773,7 @@ static int kauditd_thread(void *dummy)
 	u32 portid = 0;
 	struct net *net = NULL;
 	struct sock *sk = NULL;
+	struct auditd_connection *ac;
 
 #define UNICAST_RETRIES 5
 
@@ -735,14 +781,14 @@ static int kauditd_thread(void *dummy)
 	while (!kthread_should_stop()) {
 		/* NOTE: see the lock comments in auditd_send_unicast_skb() */
 		rcu_read_lock();
-		if (!auditd_conn.pid) {
+		ac = rcu_dereference(auditd_conn);
+		if (!ac) {
 			rcu_read_unlock();
 			goto main_queue;
 		}
-		net = auditd_conn.net;
-		get_net(net);
+		net = get_net(ac->net);
 		sk = audit_get_sk(net);
-		portid = auditd_conn.portid;
+		portid = ac->portid;
 		rcu_read_unlock();
 
 		/* attempt to flush the hold queue */
@@ -816,7 +862,7 @@ int audit_send_list(void *_dest)
 	return 0;
 }
 
-struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
+struct sk_buff *audit_make_reply(int seq, int type, int done,
 				 int multi, const void *payload, int size)
 {
 	struct sk_buff *skb;
@@ -829,7 +875,7 @@ struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
 	if (!skb)
 		return NULL;
 
-	nlh = nlmsg_put(skb, portid, seq, t, size, flags);
+	nlh = nlmsg_put(skb, 0, seq, t, size, flags);
 	if (!nlh)
 		goto out_kfree_skb;
 	data = nlmsg_data(nlh);
@@ -873,7 +919,6 @@ static int audit_send_reply_thread(void *arg)
 static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
 			     int multi, const void *payload, int size)
 {
-	u32 portid = NETLINK_CB(request_skb).portid;
 	struct net *net = sock_net(NETLINK_CB(request_skb).sk);
 	struct sk_buff *skb;
 	struct task_struct *tsk;
@@ -883,12 +928,12 @@ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int
 	if (!reply)
 		return;
 
-	skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
+	skb = audit_make_reply(seq, type, done, multi, payload, size);
 	if (!skb)
 		goto out;
 
 	reply->net = get_net(net);
-	reply->portid = portid;
+	reply->portid = NETLINK_CB(request_skb).portid;
 	reply->skb = skb;
 
 	tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -1068,11 +1113,13 @@ static int audit_set_feature(struct sk_buff *skb)
 	return 0;
 }
 
-static int audit_replace(pid_t pid)
+static int audit_replace(struct pid *pid)
 {
+	pid_t pvnr;
 	struct sk_buff *skb;
 
-	skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, &pid, sizeof(pid));
+	pvnr = pid_vnr(pid);
+	skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr));
 	if (!skb)
 		return -ENOMEM;
 	return auditd_send_unicast_skb(skb);
@@ -1102,9 +1149,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		memset(&s, 0, sizeof(s));
 		s.enabled = audit_enabled;
 		s.failure = audit_failure;
-		rcu_read_lock();
-		s.pid = auditd_conn.pid;
-		rcu_read_unlock();
+		/* NOTE: use pid_vnr() so the PID is relative to the current
+		 * namespace */
+		s.pid = auditd_pid_vnr();
 		s.rate_limit = audit_rate_limit;
 		s.backlog_limit = audit_backlog_limit;
 		s.lost = atomic_read(&audit_lost);
@@ -1130,51 +1177,61 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return err;
 	}
 	if (s.mask & AUDIT_STATUS_PID) {
-		/* NOTE: we are using task_tgid_vnr() below because
-		 * the s.pid value is relative to the namespace
-		 * of the caller; at present this doesn't matter
-		 * much since you can really only run auditd
-		 * from the initial pid namespace, but something
-		 * to keep in mind if this changes */
-		int new_pid = s.pid;
+		/* NOTE: we are using the vnr PID functions below
+		 * because the s.pid value is relative to the
+		 * namespace of the caller; at present this
+		 * doesn't matter much since you can really only
+		 * run auditd from the initial pid namespace, but
+		 * something to keep in mind if this changes */
+		pid_t new_pid = s.pid;
 		pid_t auditd_pid;
-		pid_t requesting_pid = task_tgid_vnr(current);
+		struct pid *req_pid = task_tgid(current);
+
+		/* sanity check - PID values must match */
+		if (new_pid != pid_vnr(req_pid))
+			return -EINVAL;
 
 		/* test the auditd connection */
-		audit_replace(requesting_pid);
+		audit_replace(req_pid);
 
-		rcu_read_lock();
-		auditd_pid = auditd_conn.pid;
+		auditd_pid = auditd_pid_vnr();
 		/* only the current auditd can unregister itself */
-		if ((!new_pid) && (requesting_pid != auditd_pid)) {
-			rcu_read_unlock();
+		if ((!new_pid) && (new_pid != auditd_pid)) {
 			audit_log_config_change("audit_pid", new_pid,
 						auditd_pid, 0);
 			return -EACCES;
 		}
 		/* replacing a healthy auditd is not allowed */
 		if (auditd_pid && new_pid) {
-			rcu_read_unlock();
 			audit_log_config_change("audit_pid", new_pid,
 						auditd_pid, 0);
 			return -EEXIST;
 		}
-		rcu_read_unlock();
-
-		if (audit_enabled != AUDIT_OFF)
-			audit_log_config_change("audit_pid", new_pid,
-						auditd_pid, 1);
 
 		if (new_pid) {
 			/* register a new auditd connection */
-			auditd_set(new_pid,
-				   NETLINK_CB(skb).portid,
-				   sock_net(NETLINK_CB(skb).sk));
+			err = auditd_set(req_pid,
+					 NETLINK_CB(skb).portid,
+					 sock_net(NETLINK_CB(skb).sk));
+			if (audit_enabled != AUDIT_OFF)
+				audit_log_config_change("audit_pid",
+							new_pid,
+							auditd_pid,
+							err ? 0 : 1);
+			if (err)
+				return err;
+
 			/* try to process any backlog */
 			wake_up_interruptible(&kauditd_wait);
-		} else
+		} else {
+			if (audit_enabled != AUDIT_OFF)
+				audit_log_config_change("audit_pid",
+							new_pid,
+							auditd_pid, 1);
+
 			/* unregister the auditd connection */
 			auditd_reset();
+		}
 	}
 	if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
 		err = audit_set_rate_limit(s.rate_limit);
@@ -1242,7 +1299,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				size--;
 			audit_log_n_untrustedstring(ab, data, size);
 		}
-		audit_set_portid(ab, NETLINK_CB(skb).portid);
 		audit_log_end(ab);
 	}
 	break;
@@ -1256,8 +1312,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			audit_log_end(ab);
 			return -EPERM;
 		}
-		err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
-					seq, data, nlmsg_len(nlh));
+		err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh));
 		break;
 	case AUDIT_LIST_RULES:
 		err = audit_list_rules_send(skb, seq);
@@ -1378,11 +1433,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	return err < 0 ? err : 0;
 }
 
-/*
- * Get message from skb. Each message is processed by audit_receive_msg.
- * Malformed skbs with wrong length are discarded silently.
+/**
+ * audit_receive - receive messages from a netlink control socket
+ * @skb: the message buffer
+ *
+ * Parse the provided skb and deal with any messages that may be present,
+ * malformed skbs are discarded.
  */
-static void audit_receive_skb(struct sk_buff *skb)
+static void audit_receive(struct sk_buff *skb)
 {
 	struct nlmsghdr *nlh;
 	/*
@@ -1395,21 +1453,15 @@ static void audit_receive_skb(struct sk_buff *skb)
 	nlh = nlmsg_hdr(skb);
 	len = skb->len;
 
+	mutex_lock(&audit_cmd_mutex);
 	while (nlmsg_ok(nlh, len)) {
 		err = audit_receive_msg(skb, nlh);
 		/* if err or if this message says it wants a response */
 		if (err || (nlh->nlmsg_flags & NLM_F_ACK))
-			netlink_ack(skb, nlh, err);
+			netlink_ack(skb, nlh, err, NULL);
 
 		nlh = nlmsg_next(nlh, &len);
 	}
-}
-
-/* Receive messages from netlink socket. */
-static void audit_receive(struct sk_buff *skb)
-{
-	mutex_lock(&audit_cmd_mutex);
-	audit_receive_skb(skb);
 	mutex_unlock(&audit_cmd_mutex);
 }
 
@@ -1447,10 +1499,11 @@ static void __net_exit audit_net_exit(struct net *net)
 {
 	struct audit_net *aunet = net_generic(net, audit_net_id);
 
-	rcu_read_lock();
-	if (net == auditd_conn.net)
-		auditd_reset();
-	rcu_read_unlock();
+	/* NOTE: you would think that we would want to check the auditd
+	 * connection and potentially reset it here if it lives in this
+	 * namespace, but since the auditd connection tracking struct holds a
+	 * reference to this namespace (see auditd_set()) we are only ever
+	 * going to get here after that connection has been released */
 
 	netlink_kernel_release(aunet->sk);
 }
@@ -1470,8 +1523,9 @@ static int __init audit_init(void)
 	if (audit_initialized == AUDIT_DISABLED)
 		return 0;
 
-	memset(&auditd_conn, 0, sizeof(auditd_conn));
-	spin_lock_init(&auditd_conn.lock);
+	audit_buffer_cache = kmem_cache_create("audit_buffer",
+					       sizeof(struct audit_buffer),
+					       0, SLAB_PANIC, NULL);
 
 	skb_queue_head_init(&audit_queue);
 	skb_queue_head_init(&audit_retry_queue);
@@ -1538,60 +1592,33 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set);
 
 static void audit_buffer_free(struct audit_buffer *ab)
 {
-	unsigned long flags;
-
 	if (!ab)
 		return;
 
 	kfree_skb(ab->skb);
-	spin_lock_irqsave(&audit_freelist_lock, flags);
-	if (audit_freelist_count > AUDIT_MAXFREE)
-		kfree(ab);
-	else {
-		audit_freelist_count++;
-		list_add(&ab->list, &audit_freelist);
-	}
-	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+	kmem_cache_free(audit_buffer_cache, ab);
 }
 
-static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
+static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
 						gfp_t gfp_mask, int type)
 {
-	unsigned long flags;
-	struct audit_buffer *ab = NULL;
-	struct nlmsghdr *nlh;
-
-	spin_lock_irqsave(&audit_freelist_lock, flags);
-	if (!list_empty(&audit_freelist)) {
-		ab = list_entry(audit_freelist.next,
-				struct audit_buffer, list);
-		list_del(&ab->list);
-		--audit_freelist_count;
-	}
-	spin_unlock_irqrestore(&audit_freelist_lock, flags);
-
-	if (!ab) {
-		ab = kmalloc(sizeof(*ab), gfp_mask);
-		if (!ab)
-			goto err;
-	}
+	struct audit_buffer *ab;
 
-	ab->ctx = ctx;
-	ab->gfp_mask = gfp_mask;
+	ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask);
+	if (!ab)
+		return NULL;
 
 	ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
 	if (!ab->skb)
 		goto err;
+	if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
+		goto err;
 
-	nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
-	if (!nlh)
-		goto out_kfree_skb;
+	ab->ctx = ctx;
+	ab->gfp_mask = gfp_mask;
 
 	return ab;
 
-out_kfree_skb:
-	kfree_skb(ab->skb);
-	ab->skb = NULL;
 err:
 	audit_buffer_free(ab);
 	return NULL;
@@ -1622,10 +1649,10 @@ unsigned int audit_serial(void)
 }
 
 static inline void audit_get_stamp(struct audit_context *ctx,
-				   struct timespec *t, unsigned int *serial)
+				   struct timespec64 *t, unsigned int *serial)
 {
 	if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
-		*t = CURRENT_TIME;
+		ktime_get_real_ts64(t);
 		*serial = audit_serial();
 	}
 }
@@ -1649,7 +1676,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 				     int type)
 {
 	struct audit_buffer *ab;
-	struct timespec t;
+	struct timespec64 t;
 	unsigned int uninitialized_var(serial);
 
 	if (audit_initialized != AUDIT_INITIALIZED)
@@ -1702,8 +1729,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 	}
 
 	audit_get_stamp(ab->ctx, &t, &serial);
-	audit_log_format(ab, "audit(%lu.%03lu:%u): ",
-			 t.tv_sec, t.tv_nsec/1000000, serial);
+	audit_log_format(ab, "audit(%llu.%03lu:%u): ",
+			 (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
 
 	return ab;
 }
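
The audit.c hunks above replace the spinlock-guarded static auditd_conn structure with an RCU-managed pointer: readers use rcu_dereference() under rcu_read_lock(), the writer publishes a freshly allocated structure under auditd_conn_lock with rcu_assign_pointer(), and the old structure is released after a grace period. A minimal standalone sketch of that pattern follows; the conn_state/active_conn names are illustrative only, and it frees with kfree_rcu() where the real code needs call_rcu() plus auditd_conn_free() because it must also drop the pid and net references.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct conn_state {
	u32 portid;
	struct rcu_head rcu;
};

static struct conn_state __rcu *active_conn;
static DEFINE_SPINLOCK(active_conn_lock);

/* reader: cheap and lockless, must not block inside the RCU section */
static u32 conn_portid(void)
{
	struct conn_state *c;
	u32 portid = 0;

	rcu_read_lock();
	c = rcu_dereference(active_conn);
	if (c)
		portid = c->portid;
	rcu_read_unlock();

	return portid;
}

/* writer: publish a new state, free the old one after a grace period */
static int conn_set(u32 portid)
{
	struct conn_state *old, *new;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->portid = portid;

	spin_lock(&active_conn_lock);
	old = rcu_dereference_protected(active_conn,
					lockdep_is_held(&active_conn_lock));
	rcu_assign_pointer(active_conn, new);
	spin_unlock(&active_conn_lock);

	if (old)
		kfree_rcu(old, rcu);

	return 0;
}
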
diff --git a/kernel/audit.h b/kernel/audit.h
index 0d87f8ab8778..ddfce2ea4891 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -112,7 +112,7 @@ struct audit_context {
 	enum audit_state state, current_state;
 	unsigned int serial;	/* serial number for record */
 	int major;	/* syscall number */
-	struct timespec ctime;	/* time of syscall entry */
+	struct timespec64 ctime;	/* time of syscall entry */
 	unsigned long argv[4];	/* syscall arguments */
 	long return_code;/* syscall return code */
 	u64 prio;
@@ -218,7 +218,7 @@ extern void audit_log_name(struct audit_context *context,
 			   struct audit_names *n, const struct path *path,
 			   int record_num, int *call_panic);
 
-extern int auditd_test_task(const struct task_struct *task);
+extern int auditd_test_task(struct task_struct *task);
 
 #define AUDIT_INODE_BUCKETS 32
 extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -237,8 +237,7 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
 extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
 extern int parent_len(const char *path);
 extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
-extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
-					int done, int multi,
+extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi,
 					const void *payload, int size);
 extern void audit_panic(const char *message);
 
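
The audit_context ctime field above switches from timespec to timespec64 as part of the y2038-safe timekeeping conversion. A small sketch of the replacement pattern, mirroring the __audit_syscall_entry() and audit_log_start() hunks earlier in this diff (the function name here is made up):

#include <linux/printk.h>
#include <linux/timekeeping.h>

static void log_stamp_example(void)
{
	struct timespec64 t;

	ktime_get_real_ts64(&t);	/* replaces the old CURRENT_TIME read */
	/* tv_sec is 64-bit even on 32-bit builds, hence the explicit cast */
	pr_info("audit(%llu.%03lu): example record\n",
		(unsigned long long)t.tv_sec, t.tv_nsec / 1000000);
}
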
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 7ea57e516029..52f368b6561e 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -103,15 +103,15 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
 		goto out;
 	}
 
-	fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark);
+	fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group);
 	audit_mark->mark.mask = AUDIT_FS_EVENTS;
 	audit_mark->path = pathname;
 	audit_update_mark(audit_mark, dentry->d_inode);
 	audit_mark->rule = krule;
 
-	ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true);
+	ret = fsnotify_add_mark(&audit_mark->mark, inode, NULL, true);
 	if (ret < 0) {
-		audit_fsnotify_mark_free(audit_mark);
+		fsnotify_put_mark(&audit_mark->mark);
 		audit_mark = ERR_PTR(ret);
 	}
 out:
@@ -168,7 +168,8 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
 				   struct fsnotify_mark *inode_mark,
 				   struct fsnotify_mark *vfsmount_mark,
 				   u32 mask, const void *data, int data_type,
-				   const unsigned char *dname, u32 cookie)
+				   const unsigned char *dname, u32 cookie,
+				   struct fsnotify_iter_info *iter_info)
 {
 	struct audit_fsnotify_mark *audit_mark;
 	const struct inode *inode = NULL;
@@ -187,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
 	default:
 		BUG();
 		return 0;
-	};
+	}
 
 	if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
 		if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
@@ -201,6 +202,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group,
 
 static const struct fsnotify_ops audit_mark_fsnotify_ops = {
 	.handle_event = audit_mark_handle_event,
+	.free_mark = audit_fsnotify_free_mark,
 };
 
 static int __init audit_fsnotify_init(void)
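
The audit_fsnotify.c changes track an fsnotify API rework: the group is now handed to fsnotify_init_mark(), fsnotify_add_mark() loses its group argument, and the mark destructor moves into fsnotify_ops as .free_mark, so error paths simply drop the reference with fsnotify_put_mark(). A rough sketch of the resulting flow, based only on the call signatures visible in this diff (the my_* names and the kfree()-based destructor are hypothetical):

#include <linux/fsnotify_backend.h>
#include <linux/slab.h>

static void my_free_mark(struct fsnotify_mark *mark)
{
	kfree(mark);	/* assumes the mark itself was kmalloc()ed */
}

static const struct fsnotify_ops my_ops = {
	.free_mark = my_free_mark,	/* used to be a fsnotify_init_mark() argument */
};

static struct fsnotify_group *my_group;	/* e.g. from fsnotify_alloc_group(&my_ops) */

static int my_attach(struct fsnotify_mark *mark, struct inode *inode)
{
	int ret;

	fsnotify_init_mark(mark, my_group);	/* takes the group now */
	ret = fsnotify_add_mark(mark, inode, NULL, true);	/* no group argument */
	if (ret < 0)
		fsnotify_put_mark(mark);	/* destructor runs via ops->free_mark */

	return ret;
}
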
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7b44195da81b..011d46e5f73f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,13 +3,14 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/kthread.h>
+#include <linux/refcount.h>
 #include <linux/slab.h>
 
 struct audit_tree;
 struct audit_chunk;
 
 struct audit_tree {
-	atomic_t count;
+	refcount_t count;
 	int goner;
 	struct audit_chunk *root;
 	struct list_head chunks;
@@ -77,7 +78,7 @@ static struct audit_tree *alloc_tree(const char *s)
 
 	tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
 	if (tree) {
-		atomic_set(&tree->count, 1);
+		refcount_set(&tree->count, 1);
 		tree->goner = 0;
 		INIT_LIST_HEAD(&tree->chunks);
 		INIT_LIST_HEAD(&tree->rules);
@@ -91,12 +92,12 @@ static struct audit_tree *alloc_tree(const char *s)
 
 static inline void get_tree(struct audit_tree *tree)
 {
-	atomic_inc(&tree->count);
+	refcount_inc(&tree->count);
 }
 
 static inline void put_tree(struct audit_tree *tree)
 {
-	if (atomic_dec_and_test(&tree->count))
+	if (refcount_dec_and_test(&tree->count))
 		kfree_rcu(tree, head);
 }
 
@@ -154,7 +155,7 @@ static struct audit_chunk *alloc_chunk(int count)
 		INIT_LIST_HEAD(&chunk->owners[i].list);
 		chunk->owners[i].index = i;
 	}
-	fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+	fsnotify_init_mark(&chunk->mark, audit_tree_group);
 	chunk->mark.mask = FS_IN_IGNORED;
 	return chunk;
 }
@@ -163,33 +164,54 @@ enum {HASH_SIZE = 128};
 static struct list_head chunk_hash_heads[HASH_SIZE];
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
 
-static inline struct list_head *chunk_hash(const struct inode *inode)
+/* Function to return search key in our hash from inode. */
+static unsigned long inode_to_key(const struct inode *inode)
 {
-	unsigned long n = (unsigned long)inode / L1_CACHE_BYTES;
+	return (unsigned long)inode;
+}
+
+/*
+ * Function to return search key in our hash from chunk. Key 0 is special and
+ * should never be present in the hash.
+ */
+static unsigned long chunk_to_key(struct audit_chunk *chunk)
+{
+	/*
+	 * We have a reference to the mark so it should be attached to a
+	 * connector.
+	 */
+	if (WARN_ON_ONCE(!chunk->mark.connector))
+		return 0;
+	return (unsigned long)chunk->mark.connector->inode;
+}
+
+static inline struct list_head *chunk_hash(unsigned long key)
+{
+	unsigned long n = key / L1_CACHE_BYTES;
 	return chunk_hash_heads + n % HASH_SIZE;
 }
 
 /* hash_lock & entry->lock is held by caller */
 static void insert_hash(struct audit_chunk *chunk)
 {
-	struct fsnotify_mark *entry = &chunk->mark;
+	unsigned long key = chunk_to_key(chunk);
 	struct list_head *list;
 
-	if (!entry->inode)
+	if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED))
 		return;
-	list = chunk_hash(entry->inode);
+	list = chunk_hash(key);
 	list_add_rcu(&chunk->hash, list);
 }
 
 /* called under rcu_read_lock */
 struct audit_chunk *audit_tree_lookup(const struct inode *inode)
 {
-	struct list_head *list = chunk_hash(inode);
+	unsigned long key = inode_to_key(inode);
+	struct list_head *list = chunk_hash(key);
 	struct audit_chunk *p;
 
 	list_for_each_entry_rcu(p, list, hash) {
-		/* mark.inode may have gone NULL, but who cares? */
-		if (p->mark.inode == inode) {
+		if (chunk_to_key(p) == key) {
 			atomic_long_inc(&p->refs);
 			return p;
 		}
@@ -233,11 +255,15 @@ static void untag_chunk(struct node *p)
 
 	mutex_lock(&entry->group->mark_mutex);
 	spin_lock(&entry->lock);
-	if (chunk->dead || !entry->inode) {
+	/*
+	 * mark_mutex protects mark from getting detached and thus also from
+	 * mark->connector->inode getting NULL.
+	 */
+	if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
 		spin_unlock(&entry->lock);
 		mutex_unlock(&entry->group->mark_mutex);
 		if (new)
-			free_chunk(new);
+			fsnotify_put_mark(&new->mark);
 		goto out;
 	}
 
@@ -261,7 +287,7 @@ static void untag_chunk(struct node *p)
 	if (!new)
 		goto Fallback;
 
-	if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode,
+	if (fsnotify_add_mark_locked(&new->mark, entry->connector->inode,
 				     NULL, 1)) {
 		fsnotify_put_mark(&new->mark);
 		goto Fallback;
@@ -327,7 +353,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 		return -ENOMEM;
 
 	entry = &chunk->mark;
-	if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
+	if (fsnotify_add_mark(entry, inode, NULL, 0)) {
 		fsnotify_put_mark(entry);
 		return -ENOSPC;
 	}
@@ -366,7 +392,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	struct node *p;
 	int n;
 
-	old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
+	old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks,
+				       audit_tree_group);
 	if (!old_entry)
 		return create_chunk(inode, tree);
 
@@ -393,17 +420,21 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 
 	mutex_lock(&old_entry->group->mark_mutex);
 	spin_lock(&old_entry->lock);
-	if (!old_entry->inode) {
+	/*
+	 * mark_mutex protects mark from getting detached and thus also from
+	 * mark->connector->inode getting NULL.
+	 */
+	if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
 		/* old_entry is being shot, lets just lie */
 		spin_unlock(&old_entry->lock);
 		mutex_unlock(&old_entry->group->mark_mutex);
 		fsnotify_put_mark(old_entry);
-		free_chunk(chunk);
+		fsnotify_put_mark(&chunk->mark);
 		return -ENOENT;
 	}
 
-	if (fsnotify_add_mark_locked(chunk_entry, old_entry->group,
-				     old_entry->inode, NULL, 1)) {
+	if (fsnotify_add_mark_locked(chunk_entry,
+				     old_entry->connector->inode, NULL, 1)) {
 		spin_unlock(&old_entry->lock);
 		mutex_unlock(&old_entry->group->mark_mutex);
 		fsnotify_put_mark(chunk_entry);
@@ -588,7 +619,8 @@ int audit_remove_tree_rule(struct audit_krule *rule)
 
 static int compare_root(struct vfsmount *mnt, void *arg)
 {
-	return d_backing_inode(mnt->mnt_root) == arg;
+	return inode_to_key(d_backing_inode(mnt->mnt_root)) ==
+	       (unsigned long)arg;
 }
 
 void audit_trim_trees(void)
@@ -623,9 +655,10 @@ void audit_trim_trees(void)
 		list_for_each_entry(node, &tree->chunks, list) {
 			struct audit_chunk *chunk = find_chunk(node);
 			/* this could be NULL if the watch is dying else where... */
-			struct inode *inode = chunk->mark.inode;
 			node->index |= 1U<<31;
-			if (iterate_mounts(compare_root, inode, root_mnt))
+			if (iterate_mounts(compare_root,
+					   (void *)chunk_to_key(chunk),
+					   root_mnt))
 				node->index &= ~(1U<<31);
 		}
 		spin_unlock(&hash_lock);
@@ -958,7 +991,8 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
 				   struct fsnotify_mark *inode_mark,
 				   struct fsnotify_mark *vfsmount_mark,
 				   u32 mask, const void *data, int data_type,
-				   const unsigned char *file_name, u32 cookie)
+				   const unsigned char *file_name, u32 cookie,
+				   struct fsnotify_iter_info *iter_info)
 {
 	return 0;
 }
@@ -979,6 +1013,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 static const struct fsnotify_ops audit_tree_ops = {
 	.handle_event = audit_tree_handle_event,
 	.freeing_mark = audit_tree_freeing_mark,
+	.free_mark = audit_tree_destroy_watch,
 };
 
 static int __init audit_tree_init(void)
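
audit_tree.c stops dereferencing mark->inode when hashing chunks and instead treats the inode address as an opaque search key via inode_to_key()/chunk_to_key(), taking the key for an attached chunk from mark.connector->inode. A stripped-down sketch of the bucketing scheme itself (the ex_* names are illustrative; the HASH_SIZE and L1_CACHE_BYTES scaling mirror the chunk_hash() hunk above):

#include <linux/cache.h>
#include <linux/fs.h>
#include <linux/list.h>

#define EX_HASH_SIZE 128

static struct list_head ex_hash_heads[EX_HASH_SIZE];

/* the inode address is only ever used as an opaque key, never dereferenced */
static unsigned long ex_inode_to_key(const struct inode *inode)
{
	return (unsigned long)inode;
}

static struct list_head *ex_hash_bucket(unsigned long key)
{
	return ex_hash_heads + (key / L1_CACHE_BYTES) % EX_HASH_SIZE;
}
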
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f79e4658433d..62d686d96581 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -28,6 +28,7 @@
 #include <linux/fsnotify_backend.h>
 #include <linux/namei.h>
 #include <linux/netlink.h>
+#include <linux/refcount.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/security.h>
@@ -46,7 +47,7 @@
  */
 
 struct audit_watch {
-	atomic_t count;	/* reference count */
+	refcount_t count;	/* reference count */
 	dev_t dev;	/* associated superblock device */
 	char *path;	/* insertion path */
 	unsigned long ino;	/* associated inode number */
@@ -102,7 +103,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
 	struct audit_parent *parent = NULL;
 	struct fsnotify_mark *entry;
 
-	entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+	entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group);
 	if (entry)
 		parent = container_of(entry, struct audit_parent, mark);
 
@@ -111,12 +112,12 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
 
 void audit_get_watch(struct audit_watch *watch)
 {
-	atomic_inc(&watch->count);
+	refcount_inc(&watch->count);
 }
 
 void audit_put_watch(struct audit_watch *watch)
 {
-	if (atomic_dec_and_test(&watch->count)) {
+	if (refcount_dec_and_test(&watch->count)) {
 		WARN_ON(watch->parent);
 		WARN_ON(!list_empty(&watch->rules));
 		kfree(watch->path);
@@ -157,9 +158,9 @@ static struct audit_parent *audit_init_parent(struct path *path)
 
 	INIT_LIST_HEAD(&parent->watches);
 
-	fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+	fsnotify_init_mark(&parent->mark, audit_watch_group);
 	parent->mark.mask = AUDIT_FS_WATCH;
-	ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+	ret = fsnotify_add_mark(&parent->mark, inode, NULL, 0);
 	if (ret < 0) {
 		audit_free_parent(parent);
 		return ERR_PTR(ret);
@@ -178,7 +179,7 @@ static struct audit_watch *audit_init_watch(char *path)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&watch->rules);
-	atomic_set(&watch->count, 1);
+	refcount_set(&watch->count, 1);
 	watch->path = path;
 	watch->dev = AUDIT_DEV_UNSET;
 	watch->ino = AUDIT_INO_UNSET;
@@ -472,7 +473,8 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
 				    struct fsnotify_mark *inode_mark,
 				    struct fsnotify_mark *vfsmount_mark,
 				    u32 mask, const void *data, int data_type,
-				    const unsigned char *dname, u32 cookie)
+				    const unsigned char *dname, u32 cookie,
+				    struct fsnotify_iter_info *iter_info)
 {
 	const struct inode *inode;
 	struct audit_parent *parent;
@@ -492,7 +494,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
 		BUG();
 		inode = NULL;
 		break;
-	};
+	}
 
 	if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
 		audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
@@ -506,6 +508,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
 
 static const struct fsnotify_ops audit_watch_fsnotify_ops = {
 	.handle_event = audit_watch_handle_event,
+	.free_mark = audit_watch_free_mark,
 };
 
 static int __init audit_watch_init(void)
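
Both audit_tree.c and audit_watch.c above convert their reference counters from atomic_t to refcount_t, which saturates and warns on overflow or underflow instead of silently wrapping. A self-contained sketch of the same get/put pattern on a made-up structure:

#include <linux/refcount.h>
#include <linux/slab.h>

struct watch_like {
	refcount_t count;
	char *path;
};

static struct watch_like *watch_like_alloc(char *path)
{
	struct watch_like *w = kzalloc(sizeof(*w), GFP_KERNEL);

	if (!w)
		return NULL;
	refcount_set(&w->count, 1);	/* refcount_t must start from a non-zero value */
	w->path = path;
	return w;
}

static void watch_like_get(struct watch_like *w)
{
	refcount_inc(&w->count);	/* saturates rather than overflowing */
}

static void watch_like_put(struct watch_like *w)
{
	if (refcount_dec_and_test(&w->count)) {	/* true only for the final reference */
		kfree(w->path);
		kfree(w);
	}
}
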
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 880519d6cf2a..0b0aa5854dac 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -338,7 +338,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 		    entry->rule.listnr != AUDIT_FILTER_USER)
 			return -EINVAL;
 		break;
-	};
+	}
 
 	switch(f->type) {
 	default:
@@ -412,7 +412,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 		if (entry->rule.listnr != AUDIT_FILTER_EXIT)
 			return -EINVAL;
 		break;
-	};
+	}
 	return 0;
 }
 
@@ -1033,7 +1033,7 @@ out:
 }
 
 /* List rules using struct audit_rule_data. */
-static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
+static void audit_list_rules(int seq, struct sk_buff_head *q)
 {
 	struct sk_buff *skb;
 	struct audit_krule *r;
@@ -1048,15 +1048,15 @@ static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
 			data = audit_krule_to_data(r);
 			if (unlikely(!data))
 				break;
-			skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
-					       0, 1, data,
+			skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
+					       data,
 					       sizeof(*data) + data->buflen);
 			if (skb)
 				skb_queue_tail(q, skb);
 			kfree(data);
 		}
 	}
-	skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+	skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
 	if (skb)
 		skb_queue_tail(q, skb);
 }
@@ -1085,13 +1085,11 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
 /**
  * audit_rule_change - apply all rules to the specified message type
  * @type: audit message type
- * @portid: target port id for netlink audit messages
  * @seq: netlink audit message sequence (serial) number
  * @data: payload data
  * @datasz: size of payload data
  */
-int audit_rule_change(int type, __u32 portid, int seq, void *data,
-		      size_t datasz)
+int audit_rule_change(int type, int seq, void *data, size_t datasz)
 {
 	int err = 0;
 	struct audit_entry *entry;
@@ -1150,7 +1148,7 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq)
 	skb_queue_head_init(&dest->q);
 
 	mutex_lock(&audit_filter_mutex);
-	audit_list_rules(portid, seq, &dest->q);
+	audit_list_rules(seq, &dest->q);
 	mutex_unlock(&audit_filter_mutex);
 
 	tsk = kthread_run(audit_send_list, dest, "audit_send_list");
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c2333155893..bb724baa7ac9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -73,6 +73,7 @@ | |||
73 | #include <linux/ctype.h> | 73 | #include <linux/ctype.h> |
74 | #include <linux/string.h> | 74 | #include <linux/string.h> |
75 | #include <linux/uaccess.h> | 75 | #include <linux/uaccess.h> |
76 | #include <linux/fsnotify_backend.h> | ||
76 | #include <uapi/linux/limits.h> | 77 | #include <uapi/linux/limits.h> |
77 | 78 | ||
78 | #include "audit.h" | 79 | #include "audit.h" |
@@ -1532,7 +1533,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, | |||
1532 | return; | 1533 | return; |
1533 | 1534 | ||
1534 | context->serial = 0; | 1535 | context->serial = 0; |
1535 | context->ctime = CURRENT_TIME; | 1536 | ktime_get_real_ts64(&context->ctime); |
1536 | context->in_syscall = 1; | 1537 | context->in_syscall = 1; |
1537 | context->current_state = state; | 1538 | context->current_state = state; |
1538 | context->ppid = 0; | 1539 | context->ppid = 0; |
@@ -1596,7 +1597,7 @@ static inline void handle_one(const struct inode *inode) | |||
1596 | struct audit_tree_refs *p; | 1597 | struct audit_tree_refs *p; |
1597 | struct audit_chunk *chunk; | 1598 | struct audit_chunk *chunk; |
1598 | int count; | 1599 | int count; |
1599 | if (likely(hlist_empty(&inode->i_fsnotify_marks))) | 1600 | if (likely(!inode->i_fsnotify_marks)) |
1600 | return; | 1601 | return; |
1601 | context = current->audit_context; | 1602 | context = current->audit_context; |
1602 | p = context->trees; | 1603 | p = context->trees; |
@@ -1639,7 +1640,7 @@ retry: | |||
1639 | seq = read_seqbegin(&rename_lock); | 1640 | seq = read_seqbegin(&rename_lock); |
1640 | for(;;) { | 1641 | for(;;) { |
1641 | struct inode *inode = d_backing_inode(d); | 1642 | struct inode *inode = d_backing_inode(d); |
1642 | if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { | 1643 | if (inode && unlikely(inode->i_fsnotify_marks)) { |
1643 | struct audit_chunk *chunk; | 1644 | struct audit_chunk *chunk; |
1644 | chunk = audit_tree_lookup(inode); | 1645 | chunk = audit_tree_lookup(inode); |
1645 | if (chunk) { | 1646 | if (chunk) { |
@@ -1941,13 +1942,13 @@ EXPORT_SYMBOL_GPL(__audit_inode_child); | |||
1941 | /** | 1942 | /** |
1942 | * auditsc_get_stamp - get local copies of audit_context values | 1943 | * auditsc_get_stamp - get local copies of audit_context values |
1943 | * @ctx: audit_context for the task | 1944 | * @ctx: audit_context for the task |
1944 | * @t: timespec to store time recorded in the audit_context | 1945 | * @t: timespec64 to store time recorded in the audit_context |
1945 | * @serial: serial value that is recorded in the audit_context | 1946 | * @serial: serial value that is recorded in the audit_context |
1946 | * | 1947 | * |
1947 | * Also sets the context as auditable. | 1948 | * Also sets the context as auditable. |
1948 | */ | 1949 | */ |
1949 | int auditsc_get_stamp(struct audit_context *ctx, | 1950 | int auditsc_get_stamp(struct audit_context *ctx, |
1950 | struct timespec *t, unsigned int *serial) | 1951 | struct timespec64 *t, unsigned int *serial) |
1951 | { | 1952 | { |
1952 | if (!ctx->in_syscall) | 1953 | if (!ctx->in_syscall) |
1953 | return 0; | 1954 | return 0; |
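
The auditsc.c hunks above move the syscall timestamp from the old CURRENT_TIME macro and struct timespec to ktime_get_real_ts64() filling a struct timespec64, which keeps the recorded time 64-bit safe on 32-bit builds. A minimal userspace sketch of the same idea, using clock_gettime() as a stand-in for the kernel-only ktime_get_real_ts64(); the struct and field names below are illustrative, not the real audit_context:

#include <stdio.h>
#include <time.h>

/* illustrative stand-in for the timestamp + serial pair kept in audit_context */
struct audit_stamp {
        struct timespec ctime;   /* plays the role of the kernel's timespec64 */
        unsigned int serial;
};

int main(void)
{
        struct audit_stamp stamp = { .serial = 1 };

        /* the kernel now does ktime_get_real_ts64(&context->ctime);
         * clock_gettime(CLOCK_REALTIME, ...) is the closest userspace call */
        clock_gettime(CLOCK_REALTIME, &stamp.ctime);

        /* audit records print timestamps as audit(sec.msec:serial) */
        printf("audit(%lld.%03ld:%u)\n",
               (long long)stamp.ctime.tv_sec,
               stamp.ctime.tv_nsec / 1000000,
               stamp.serial);
        return 0;
}
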
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1ce4f4fd7fd..e1e5e658f2db 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile | |||
@@ -1,7 +1,7 @@ | |||
1 | obj-y := core.o | 1 | obj-y := core.o |
2 | 2 | ||
3 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o | 3 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o |
4 | obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o | 4 | obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o |
5 | ifeq ($(CONFIG_PERF_EVENTS),y) | 5 | ifeq ($(CONFIG_PERF_EVENTS),y) |
6 | obj-$(CONFIG_BPF_SYSCALL) += stackmap.o | 6 | obj-$(CONFIG_BPF_SYSCALL) += stackmap.o |
7 | endif | 7 | endif |
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 6b6f41f0b211..5e00b2333c26 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c | |||
@@ -1,4 +1,5 @@ | |||
1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
2 | * Copyright (c) 2016,2017 Facebook | ||
2 | * | 3 | * |
3 | * This program is free software; you can redistribute it and/or | 4 | * This program is free software; you can redistribute it and/or |
4 | * modify it under the terms of version 2 of the GNU General Public | 5 | * modify it under the terms of version 2 of the GNU General Public |
@@ -16,6 +17,8 @@ | |||
16 | #include <linux/filter.h> | 17 | #include <linux/filter.h> |
17 | #include <linux/perf_event.h> | 18 | #include <linux/perf_event.h> |
18 | 19 | ||
20 | #include "map_in_map.h" | ||
21 | |||
19 | static void bpf_array_free_percpu(struct bpf_array *array) | 22 | static void bpf_array_free_percpu(struct bpf_array *array) |
20 | { | 23 | { |
21 | int i; | 24 | int i; |
@@ -113,6 +116,30 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) | |||
113 | return array->value + array->elem_size * index; | 116 | return array->value + array->elem_size * index; |
114 | } | 117 | } |
115 | 118 | ||
119 | /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ | ||
120 | static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) | ||
121 | { | ||
122 | struct bpf_insn *insn = insn_buf; | ||
123 | u32 elem_size = round_up(map->value_size, 8); | ||
124 | const int ret = BPF_REG_0; | ||
125 | const int map_ptr = BPF_REG_1; | ||
126 | const int index = BPF_REG_2; | ||
127 | |||
128 | *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); | ||
129 | *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); | ||
130 | *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); | ||
131 | |||
132 | if (is_power_of_2(elem_size)) { | ||
133 | *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); | ||
134 | } else { | ||
135 | *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); | ||
136 | } | ||
137 | *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); | ||
138 | *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); | ||
139 | *insn++ = BPF_MOV64_IMM(ret, 0); | ||
140 | return insn - insn_buf; | ||
141 | } | ||
142 | |||
116 | /* Called from eBPF program */ | 143 | /* Called from eBPF program */ |
117 | static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) | 144 | static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) |
118 | { | 145 | { |
@@ -155,7 +182,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) | |||
155 | static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | 182 | static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) |
156 | { | 183 | { |
157 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 184 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
158 | u32 index = *(u32 *)key; | 185 | u32 index = key ? *(u32 *)key : U32_MAX; |
159 | u32 *next = (u32 *)next_key; | 186 | u32 *next = (u32 *)next_key; |
160 | 187 | ||
161 | if (index >= array->map.max_entries) { | 188 | if (index >= array->map.max_entries) { |
@@ -260,21 +287,17 @@ static void array_map_free(struct bpf_map *map) | |||
260 | bpf_map_area_free(array); | 287 | bpf_map_area_free(array); |
261 | } | 288 | } |
262 | 289 | ||
263 | static const struct bpf_map_ops array_ops = { | 290 | const struct bpf_map_ops array_map_ops = { |
264 | .map_alloc = array_map_alloc, | 291 | .map_alloc = array_map_alloc, |
265 | .map_free = array_map_free, | 292 | .map_free = array_map_free, |
266 | .map_get_next_key = array_map_get_next_key, | 293 | .map_get_next_key = array_map_get_next_key, |
267 | .map_lookup_elem = array_map_lookup_elem, | 294 | .map_lookup_elem = array_map_lookup_elem, |
268 | .map_update_elem = array_map_update_elem, | 295 | .map_update_elem = array_map_update_elem, |
269 | .map_delete_elem = array_map_delete_elem, | 296 | .map_delete_elem = array_map_delete_elem, |
297 | .map_gen_lookup = array_map_gen_lookup, | ||
270 | }; | 298 | }; |
271 | 299 | ||
272 | static struct bpf_map_type_list array_type __ro_after_init = { | 300 | const struct bpf_map_ops percpu_array_map_ops = { |
273 | .ops = &array_ops, | ||
274 | .type = BPF_MAP_TYPE_ARRAY, | ||
275 | }; | ||
276 | |||
277 | static const struct bpf_map_ops percpu_array_ops = { | ||
278 | .map_alloc = array_map_alloc, | 301 | .map_alloc = array_map_alloc, |
279 | .map_free = array_map_free, | 302 | .map_free = array_map_free, |
280 | .map_get_next_key = array_map_get_next_key, | 303 | .map_get_next_key = array_map_get_next_key, |
@@ -283,19 +306,6 @@ static const struct bpf_map_ops percpu_array_ops = { | |||
283 | .map_delete_elem = array_map_delete_elem, | 306 | .map_delete_elem = array_map_delete_elem, |
284 | }; | 307 | }; |
285 | 308 | ||
286 | static struct bpf_map_type_list percpu_array_type __ro_after_init = { | ||
287 | .ops = &percpu_array_ops, | ||
288 | .type = BPF_MAP_TYPE_PERCPU_ARRAY, | ||
289 | }; | ||
290 | |||
291 | static int __init register_array_map(void) | ||
292 | { | ||
293 | bpf_register_map_type(&array_type); | ||
294 | bpf_register_map_type(&percpu_array_type); | ||
295 | return 0; | ||
296 | } | ||
297 | late_initcall(register_array_map); | ||
298 | |||
299 | static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) | 309 | static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) |
300 | { | 310 | { |
301 | /* only file descriptors can be stored in this type of map */ | 311 | /* only file descriptors can be stored in this type of map */ |
@@ -399,7 +409,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map) | |||
399 | fd_array_map_delete_elem(map, &i); | 409 | fd_array_map_delete_elem(map, &i); |
400 | } | 410 | } |
401 | 411 | ||
402 | static const struct bpf_map_ops prog_array_ops = { | 412 | const struct bpf_map_ops prog_array_map_ops = { |
403 | .map_alloc = fd_array_map_alloc, | 413 | .map_alloc = fd_array_map_alloc, |
404 | .map_free = fd_array_map_free, | 414 | .map_free = fd_array_map_free, |
405 | .map_get_next_key = array_map_get_next_key, | 415 | .map_get_next_key = array_map_get_next_key, |
@@ -409,18 +419,6 @@ static const struct bpf_map_ops prog_array_ops = { | |||
409 | .map_fd_put_ptr = prog_fd_array_put_ptr, | 419 | .map_fd_put_ptr = prog_fd_array_put_ptr, |
410 | }; | 420 | }; |
411 | 421 | ||
412 | static struct bpf_map_type_list prog_array_type __ro_after_init = { | ||
413 | .ops = &prog_array_ops, | ||
414 | .type = BPF_MAP_TYPE_PROG_ARRAY, | ||
415 | }; | ||
416 | |||
417 | static int __init register_prog_array_map(void) | ||
418 | { | ||
419 | bpf_register_map_type(&prog_array_type); | ||
420 | return 0; | ||
421 | } | ||
422 | late_initcall(register_prog_array_map); | ||
423 | |||
424 | static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, | 422 | static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, |
425 | struct file *map_file) | 423 | struct file *map_file) |
426 | { | 424 | { |
@@ -511,7 +509,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, | |||
511 | rcu_read_unlock(); | 509 | rcu_read_unlock(); |
512 | } | 510 | } |
513 | 511 | ||
514 | static const struct bpf_map_ops perf_event_array_ops = { | 512 | const struct bpf_map_ops perf_event_array_map_ops = { |
515 | .map_alloc = fd_array_map_alloc, | 513 | .map_alloc = fd_array_map_alloc, |
516 | .map_free = fd_array_map_free, | 514 | .map_free = fd_array_map_free, |
517 | .map_get_next_key = array_map_get_next_key, | 515 | .map_get_next_key = array_map_get_next_key, |
@@ -522,18 +520,6 @@ static const struct bpf_map_ops perf_event_array_ops = { | |||
522 | .map_release = perf_event_fd_array_release, | 520 | .map_release = perf_event_fd_array_release, |
523 | }; | 521 | }; |
524 | 522 | ||
525 | static struct bpf_map_type_list perf_event_array_type __ro_after_init = { | ||
526 | .ops = &perf_event_array_ops, | ||
527 | .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, | ||
528 | }; | ||
529 | |||
530 | static int __init register_perf_event_array_map(void) | ||
531 | { | ||
532 | bpf_register_map_type(&perf_event_array_type); | ||
533 | return 0; | ||
534 | } | ||
535 | late_initcall(register_perf_event_array_map); | ||
536 | |||
537 | #ifdef CONFIG_CGROUPS | 523 | #ifdef CONFIG_CGROUPS |
538 | static void *cgroup_fd_array_get_ptr(struct bpf_map *map, | 524 | static void *cgroup_fd_array_get_ptr(struct bpf_map *map, |
539 | struct file *map_file /* not used */, | 525 | struct file *map_file /* not used */, |
@@ -554,7 +540,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) | |||
554 | fd_array_map_free(map); | 540 | fd_array_map_free(map); |
555 | } | 541 | } |
556 | 542 | ||
557 | static const struct bpf_map_ops cgroup_array_ops = { | 543 | const struct bpf_map_ops cgroup_array_map_ops = { |
558 | .map_alloc = fd_array_map_alloc, | 544 | .map_alloc = fd_array_map_alloc, |
559 | .map_free = cgroup_fd_array_free, | 545 | .map_free = cgroup_fd_array_free, |
560 | .map_get_next_key = array_map_get_next_key, | 546 | .map_get_next_key = array_map_get_next_key, |
@@ -563,16 +549,53 @@ static const struct bpf_map_ops cgroup_array_ops = { | |||
563 | .map_fd_get_ptr = cgroup_fd_array_get_ptr, | 549 | .map_fd_get_ptr = cgroup_fd_array_get_ptr, |
564 | .map_fd_put_ptr = cgroup_fd_array_put_ptr, | 550 | .map_fd_put_ptr = cgroup_fd_array_put_ptr, |
565 | }; | 551 | }; |
552 | #endif | ||
566 | 553 | ||
567 | static struct bpf_map_type_list cgroup_array_type __ro_after_init = { | 554 | static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) |
568 | .ops = &cgroup_array_ops, | 555 | { |
569 | .type = BPF_MAP_TYPE_CGROUP_ARRAY, | 556 | struct bpf_map *map, *inner_map_meta; |
570 | }; | 557 | |
558 | inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); | ||
559 | if (IS_ERR(inner_map_meta)) | ||
560 | return inner_map_meta; | ||
571 | 561 | ||
572 | static int __init register_cgroup_array_map(void) | 562 | map = fd_array_map_alloc(attr); |
563 | if (IS_ERR(map)) { | ||
564 | bpf_map_meta_free(inner_map_meta); | ||
565 | return map; | ||
566 | } | ||
567 | |||
568 | map->inner_map_meta = inner_map_meta; | ||
569 | |||
570 | return map; | ||
571 | } | ||
572 | |||
573 | static void array_of_map_free(struct bpf_map *map) | ||
573 | { | 574 | { |
574 | bpf_register_map_type(&cgroup_array_type); | 575 | /* map->inner_map_meta is only accessed by syscall which |
575 | return 0; | 576 | * is protected by fdget/fdput. |
577 | */ | ||
578 | bpf_map_meta_free(map->inner_map_meta); | ||
579 | bpf_fd_array_map_clear(map); | ||
580 | fd_array_map_free(map); | ||
576 | } | 581 | } |
577 | late_initcall(register_cgroup_array_map); | 582 | |
578 | #endif | 583 | static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) |
584 | { | ||
585 | struct bpf_map **inner_map = array_map_lookup_elem(map, key); | ||
586 | |||
587 | if (!inner_map) | ||
588 | return NULL; | ||
589 | |||
590 | return READ_ONCE(*inner_map); | ||
591 | } | ||
592 | |||
593 | const struct bpf_map_ops array_of_maps_map_ops = { | ||
594 | .map_alloc = array_of_map_alloc, | ||
595 | .map_free = array_of_map_free, | ||
596 | .map_get_next_key = array_map_get_next_key, | ||
597 | .map_lookup_elem = array_of_map_lookup_elem, | ||
598 | .map_delete_elem = fd_array_map_delete_elem, | ||
599 | .map_fd_get_ptr = bpf_map_fd_get_ptr, | ||
600 | .map_fd_put_ptr = bpf_map_fd_put_ptr, | ||
601 | }; | ||
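
array_map_gen_lookup() above lets the verifier replace the bpf_map_lookup_elem() helper call with a short inlined BPF instruction sequence. As a rough guide to what that sequence computes, here is a plain C sketch of the same bounds-check-then-index logic; the toy_* names and layout are illustrative, loosely modelled on struct bpf_array with each value slot rounded up to 8 bytes:

#include <stddef.h>
#include <stdint.h>

/* illustrative layout only: values live inline after the header and each
 * slot is round_up(value_size, 8) bytes, as in struct bpf_array */
struct toy_array_map {
        uint32_t max_entries;
        uint32_t elem_size;
        char value[];
};

/* the emitted instructions boil down to:
 *   if (index >= max_entries) return NULL;
 *   return &value[index * elem_size];
 * with the multiply strength-reduced to a shift when elem_size is a
 * power of two, mirroring the is_power_of_2() branch above */
static void *toy_array_lookup(struct toy_array_map *map, uint32_t index)
{
        if (index >= map->max_entries)
                return NULL;
        return map->value + (size_t)index * map->elem_size;
}

On a miss the program still sees NULL, exactly as it did with the out-of-line helper; only the call overhead disappears.
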
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index f62d1d56f41d..e6ef4401a138 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c | |||
@@ -13,7 +13,7 @@ | |||
13 | #define LOCAL_FREE_TARGET (128) | 13 | #define LOCAL_FREE_TARGET (128) |
14 | #define LOCAL_NR_SCANS LOCAL_FREE_TARGET | 14 | #define LOCAL_NR_SCANS LOCAL_FREE_TARGET |
15 | 15 | ||
16 | #define PERCPU_FREE_TARGET (16) | 16 | #define PERCPU_FREE_TARGET (4) |
17 | #define PERCPU_NR_SCANS PERCPU_FREE_TARGET | 17 | #define PERCPU_NR_SCANS PERCPU_FREE_TARGET |
18 | 18 | ||
19 | /* Helpers to get the local list index */ | 19 | /* Helpers to get the local list index */ |
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index da0f53690295..ea6033cba947 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c | |||
@@ -154,7 +154,7 @@ int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, | |||
154 | 154 | ||
155 | /** | 155 | /** |
156 | * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering | 156 | * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering |
157 | * @sk: The socken sending or receiving traffic | 157 | * @sk: The socket sending or receiving traffic |
158 | * @skb: The skb that is being sent or received | 158 | * @skb: The skb that is being sent or received |
159 | * @type: The type of program to be exectuted | 159 | * @type: The type of program to be exectuted |
160 | * | 160 | * |
@@ -189,10 +189,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, | |||
189 | prog = rcu_dereference(cgrp->bpf.effective[type]); | 189 | prog = rcu_dereference(cgrp->bpf.effective[type]); |
190 | if (prog) { | 190 | if (prog) { |
191 | unsigned int offset = skb->data - skb_network_header(skb); | 191 | unsigned int offset = skb->data - skb_network_header(skb); |
192 | struct sock *save_sk = skb->sk; | ||
192 | 193 | ||
194 | skb->sk = sk; | ||
193 | __skb_push(skb, offset); | 195 | __skb_push(skb, offset); |
194 | ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; | 196 | ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; |
195 | __skb_pull(skb, offset); | 197 | __skb_pull(skb, offset); |
198 | skb->sk = save_sk; | ||
196 | } | 199 | } |
197 | 200 | ||
198 | rcu_read_unlock(); | 201 | rcu_read_unlock(); |
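
The cgroup.c hunk above makes __cgroup_bpf_run_filter_skb() temporarily point skb->sk at the socket it was handed, so the program sees the endpoint socket on ingress, and then restores the original pointer before the skb travels on. A small sketch of that save/override/restore pattern with placeholder types; run_prog() stands in for bpf_prog_run_save_cb() and the structs are stubs, not the real ones:

struct sock { int dummy; };
struct sk_buff { struct sock *sk; };

/* stand-in for bpf_prog_run_save_cb(); returns 1 to allow the packet */
static int run_prog(struct sk_buff *skb)
{
        return skb->sk != 0;    /* toy policy: allow if a socket is attached */
}

static int run_filter_with_sk(struct sock *sk, struct sk_buff *skb)
{
        struct sock *save_sk = skb->sk; /* may legitimately differ on ingress */
        int ret;

        skb->sk = sk;                   /* expose the endpoint socket to the program */
        ret = run_prog(skb) == 1 ? 0 : -1;
        skb->sk = save_sk;              /* restore before returning the skb */
        return ret;
}
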
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b4f1cb0c5ac7..dedf367f59bb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c | |||
@@ -76,8 +76,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns | |||
76 | 76 | ||
77 | struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) | 77 | struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) |
78 | { | 78 | { |
79 | gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | | 79 | gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; |
80 | gfp_extra_flags; | ||
81 | struct bpf_prog_aux *aux; | 80 | struct bpf_prog_aux *aux; |
82 | struct bpf_prog *fp; | 81 | struct bpf_prog *fp; |
83 | 82 | ||
@@ -107,8 +106,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_alloc); | |||
107 | struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, | 106 | struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, |
108 | gfp_t gfp_extra_flags) | 107 | gfp_t gfp_extra_flags) |
109 | { | 108 | { |
110 | gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | | 109 | gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; |
111 | gfp_extra_flags; | ||
112 | struct bpf_prog *fp; | 110 | struct bpf_prog *fp; |
113 | u32 pages, delta; | 111 | u32 pages, delta; |
114 | int ret; | 112 | int ret; |
@@ -394,27 +392,23 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) | |||
394 | 392 | ||
395 | void bpf_prog_kallsyms_add(struct bpf_prog *fp) | 393 | void bpf_prog_kallsyms_add(struct bpf_prog *fp) |
396 | { | 394 | { |
397 | unsigned long flags; | ||
398 | |||
399 | if (!bpf_prog_kallsyms_candidate(fp) || | 395 | if (!bpf_prog_kallsyms_candidate(fp) || |
400 | !capable(CAP_SYS_ADMIN)) | 396 | !capable(CAP_SYS_ADMIN)) |
401 | return; | 397 | return; |
402 | 398 | ||
403 | spin_lock_irqsave(&bpf_lock, flags); | 399 | spin_lock_bh(&bpf_lock); |
404 | bpf_prog_ksym_node_add(fp->aux); | 400 | bpf_prog_ksym_node_add(fp->aux); |
405 | spin_unlock_irqrestore(&bpf_lock, flags); | 401 | spin_unlock_bh(&bpf_lock); |
406 | } | 402 | } |
407 | 403 | ||
408 | void bpf_prog_kallsyms_del(struct bpf_prog *fp) | 404 | void bpf_prog_kallsyms_del(struct bpf_prog *fp) |
409 | { | 405 | { |
410 | unsigned long flags; | ||
411 | |||
412 | if (!bpf_prog_kallsyms_candidate(fp)) | 406 | if (!bpf_prog_kallsyms_candidate(fp)) |
413 | return; | 407 | return; |
414 | 408 | ||
415 | spin_lock_irqsave(&bpf_lock, flags); | 409 | spin_lock_bh(&bpf_lock); |
416 | bpf_prog_ksym_node_del(fp->aux); | 410 | bpf_prog_ksym_node_del(fp->aux); |
417 | spin_unlock_irqrestore(&bpf_lock, flags); | 411 | spin_unlock_bh(&bpf_lock); |
418 | } | 412 | } |
419 | 413 | ||
420 | static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) | 414 | static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) |
@@ -659,8 +653,7 @@ out: | |||
659 | static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, | 653 | static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, |
660 | gfp_t gfp_extra_flags) | 654 | gfp_t gfp_extra_flags) |
661 | { | 655 | { |
662 | gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | | 656 | gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; |
663 | gfp_extra_flags; | ||
664 | struct bpf_prog *fp; | 657 | struct bpf_prog *fp; |
665 | 658 | ||
666 | fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); | 659 | fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); |
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 361a69dfe543..004334ea13ba 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/rculist_nulls.h> | 16 | #include <linux/rculist_nulls.h> |
17 | #include "percpu_freelist.h" | 17 | #include "percpu_freelist.h" |
18 | #include "bpf_lru_list.h" | 18 | #include "bpf_lru_list.h" |
19 | #include "map_in_map.h" | ||
19 | 20 | ||
20 | struct bucket { | 21 | struct bucket { |
21 | struct hlist_nulls_head head; | 22 | struct hlist_nulls_head head; |
@@ -86,6 +87,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size | |||
86 | return *(void __percpu **)(l->key + key_size); | 87 | return *(void __percpu **)(l->key + key_size); |
87 | } | 88 | } |
88 | 89 | ||
90 | static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) | ||
91 | { | ||
92 | return *(void **)(l->key + roundup(map->key_size, 8)); | ||
93 | } | ||
94 | |||
89 | static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) | 95 | static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) |
90 | { | 96 | { |
91 | return (struct htab_elem *) (htab->elems + i * htab->elem_size); | 97 | return (struct htab_elem *) (htab->elems + i * htab->elem_size); |
@@ -426,7 +432,11 @@ again: | |||
426 | return NULL; | 432 | return NULL; |
427 | } | 433 | } |
428 | 434 | ||
429 | /* Called from syscall or from eBPF program */ | 435 | /* Called from syscall or from eBPF program directly, so |
436 | * arguments have to match bpf_map_lookup_elem() exactly. | ||
437 | * The return value is adjusted by BPF instructions | ||
438 | * in htab_map_gen_lookup(). | ||
439 | */ | ||
430 | static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) | 440 | static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) |
431 | { | 441 | { |
432 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | 442 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
@@ -458,6 +468,30 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) | |||
458 | return NULL; | 468 | return NULL; |
459 | } | 469 | } |
460 | 470 | ||
471 | /* inline bpf_map_lookup_elem() call. | ||
472 | * Instead of: | ||
473 | * bpf_prog | ||
474 | * bpf_map_lookup_elem | ||
475 | * map->ops->map_lookup_elem | ||
476 | * htab_map_lookup_elem | ||
477 | * __htab_map_lookup_elem | ||
478 | * do: | ||
479 | * bpf_prog | ||
480 | * __htab_map_lookup_elem | ||
481 | */ | ||
482 | static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) | ||
483 | { | ||
484 | struct bpf_insn *insn = insn_buf; | ||
485 | const int ret = BPF_REG_0; | ||
486 | |||
487 | *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); | ||
488 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); | ||
489 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, | ||
490 | offsetof(struct htab_elem, key) + | ||
491 | round_up(map->key_size, 8)); | ||
492 | return insn - insn_buf; | ||
493 | } | ||
494 | |||
461 | static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) | 495 | static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) |
462 | { | 496 | { |
463 | struct htab_elem *l = __htab_map_lookup_elem(map, key); | 497 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
@@ -506,12 +540,15 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | |||
506 | struct hlist_nulls_head *head; | 540 | struct hlist_nulls_head *head; |
507 | struct htab_elem *l, *next_l; | 541 | struct htab_elem *l, *next_l; |
508 | u32 hash, key_size; | 542 | u32 hash, key_size; |
509 | int i; | 543 | int i = 0; |
510 | 544 | ||
511 | WARN_ON_ONCE(!rcu_read_lock_held()); | 545 | WARN_ON_ONCE(!rcu_read_lock_held()); |
512 | 546 | ||
513 | key_size = map->key_size; | 547 | key_size = map->key_size; |
514 | 548 | ||
549 | if (!key) | ||
550 | goto find_first_elem; | ||
551 | |||
515 | hash = htab_map_hash(key, key_size); | 552 | hash = htab_map_hash(key, key_size); |
516 | 553 | ||
517 | head = select_bucket(htab, hash); | 554 | head = select_bucket(htab, hash); |
@@ -519,10 +556,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | |||
519 | /* lookup the key */ | 556 | /* lookup the key */ |
520 | l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); | 557 | l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); |
521 | 558 | ||
522 | if (!l) { | 559 | if (!l) |
523 | i = 0; | ||
524 | goto find_first_elem; | 560 | goto find_first_elem; |
525 | } | ||
526 | 561 | ||
527 | /* key was found, get next key in the same bucket */ | 562 | /* key was found, get next key in the same bucket */ |
528 | next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), | 563 | next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), |
@@ -582,6 +617,14 @@ static void htab_elem_free_rcu(struct rcu_head *head) | |||
582 | 617 | ||
583 | static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) | 618 | static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) |
584 | { | 619 | { |
620 | struct bpf_map *map = &htab->map; | ||
621 | |||
622 | if (map->ops->map_fd_put_ptr) { | ||
623 | void *ptr = fd_htab_map_get_ptr(map, l); | ||
624 | |||
625 | map->ops->map_fd_put_ptr(ptr); | ||
626 | } | ||
627 | |||
585 | if (htab_is_prealloc(htab)) { | 628 | if (htab_is_prealloc(htab)) { |
586 | pcpu_freelist_push(&htab->freelist, &l->fnode); | 629 | pcpu_freelist_push(&htab->freelist, &l->fnode); |
587 | } else { | 630 | } else { |
@@ -1027,6 +1070,7 @@ static void delete_all_elements(struct bpf_htab *htab) | |||
1027 | } | 1070 | } |
1028 | } | 1071 | } |
1029 | } | 1072 | } |
1073 | |||
1030 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ | 1074 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ |
1031 | static void htab_map_free(struct bpf_map *map) | 1075 | static void htab_map_free(struct bpf_map *map) |
1032 | { | 1076 | { |
@@ -1053,21 +1097,17 @@ static void htab_map_free(struct bpf_map *map) | |||
1053 | kfree(htab); | 1097 | kfree(htab); |
1054 | } | 1098 | } |
1055 | 1099 | ||
1056 | static const struct bpf_map_ops htab_ops = { | 1100 | const struct bpf_map_ops htab_map_ops = { |
1057 | .map_alloc = htab_map_alloc, | 1101 | .map_alloc = htab_map_alloc, |
1058 | .map_free = htab_map_free, | 1102 | .map_free = htab_map_free, |
1059 | .map_get_next_key = htab_map_get_next_key, | 1103 | .map_get_next_key = htab_map_get_next_key, |
1060 | .map_lookup_elem = htab_map_lookup_elem, | 1104 | .map_lookup_elem = htab_map_lookup_elem, |
1061 | .map_update_elem = htab_map_update_elem, | 1105 | .map_update_elem = htab_map_update_elem, |
1062 | .map_delete_elem = htab_map_delete_elem, | 1106 | .map_delete_elem = htab_map_delete_elem, |
1107 | .map_gen_lookup = htab_map_gen_lookup, | ||
1063 | }; | 1108 | }; |
1064 | 1109 | ||
1065 | static struct bpf_map_type_list htab_type __ro_after_init = { | 1110 | const struct bpf_map_ops htab_lru_map_ops = { |
1066 | .ops = &htab_ops, | ||
1067 | .type = BPF_MAP_TYPE_HASH, | ||
1068 | }; | ||
1069 | |||
1070 | static const struct bpf_map_ops htab_lru_ops = { | ||
1071 | .map_alloc = htab_map_alloc, | 1111 | .map_alloc = htab_map_alloc, |
1072 | .map_free = htab_map_free, | 1112 | .map_free = htab_map_free, |
1073 | .map_get_next_key = htab_map_get_next_key, | 1113 | .map_get_next_key = htab_map_get_next_key, |
@@ -1076,11 +1116,6 @@ static const struct bpf_map_ops htab_lru_ops = { | |||
1076 | .map_delete_elem = htab_lru_map_delete_elem, | 1116 | .map_delete_elem = htab_lru_map_delete_elem, |
1077 | }; | 1117 | }; |
1078 | 1118 | ||
1079 | static struct bpf_map_type_list htab_lru_type __ro_after_init = { | ||
1080 | .ops = &htab_lru_ops, | ||
1081 | .type = BPF_MAP_TYPE_LRU_HASH, | ||
1082 | }; | ||
1083 | |||
1084 | /* Called from eBPF program */ | 1119 | /* Called from eBPF program */ |
1085 | static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) | 1120 | static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
1086 | { | 1121 | { |
@@ -1154,7 +1189,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, | |||
1154 | return ret; | 1189 | return ret; |
1155 | } | 1190 | } |
1156 | 1191 | ||
1157 | static const struct bpf_map_ops htab_percpu_ops = { | 1192 | const struct bpf_map_ops htab_percpu_map_ops = { |
1158 | .map_alloc = htab_map_alloc, | 1193 | .map_alloc = htab_map_alloc, |
1159 | .map_free = htab_map_free, | 1194 | .map_free = htab_map_free, |
1160 | .map_get_next_key = htab_map_get_next_key, | 1195 | .map_get_next_key = htab_map_get_next_key, |
@@ -1163,12 +1198,7 @@ static const struct bpf_map_ops htab_percpu_ops = { | |||
1163 | .map_delete_elem = htab_map_delete_elem, | 1198 | .map_delete_elem = htab_map_delete_elem, |
1164 | }; | 1199 | }; |
1165 | 1200 | ||
1166 | static struct bpf_map_type_list htab_percpu_type __ro_after_init = { | 1201 | const struct bpf_map_ops htab_lru_percpu_map_ops = { |
1167 | .ops = &htab_percpu_ops, | ||
1168 | .type = BPF_MAP_TYPE_PERCPU_HASH, | ||
1169 | }; | ||
1170 | |||
1171 | static const struct bpf_map_ops htab_lru_percpu_ops = { | ||
1172 | .map_alloc = htab_map_alloc, | 1202 | .map_alloc = htab_map_alloc, |
1173 | .map_free = htab_map_free, | 1203 | .map_free = htab_map_free, |
1174 | .map_get_next_key = htab_map_get_next_key, | 1204 | .map_get_next_key = htab_map_get_next_key, |
@@ -1177,17 +1207,102 @@ static const struct bpf_map_ops htab_lru_percpu_ops = { | |||
1177 | .map_delete_elem = htab_lru_map_delete_elem, | 1207 | .map_delete_elem = htab_lru_map_delete_elem, |
1178 | }; | 1208 | }; |
1179 | 1209 | ||
1180 | static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { | 1210 | static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) |
1181 | .ops = &htab_lru_percpu_ops, | 1211 | { |
1182 | .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, | 1212 | struct bpf_map *map; |
1183 | }; | 1213 | |
1214 | if (attr->value_size != sizeof(u32)) | ||
1215 | return ERR_PTR(-EINVAL); | ||
1216 | |||
1217 | /* pointer is stored internally */ | ||
1218 | attr->value_size = sizeof(void *); | ||
1219 | map = htab_map_alloc(attr); | ||
1220 | attr->value_size = sizeof(u32); | ||
1184 | 1221 | ||
1185 | static int __init register_htab_map(void) | 1222 | return map; |
1223 | } | ||
1224 | |||
1225 | static void fd_htab_map_free(struct bpf_map *map) | ||
1186 | { | 1226 | { |
1187 | bpf_register_map_type(&htab_type); | 1227 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1188 | bpf_register_map_type(&htab_percpu_type); | 1228 | struct hlist_nulls_node *n; |
1189 | bpf_register_map_type(&htab_lru_type); | 1229 | struct hlist_nulls_head *head; |
1190 | bpf_register_map_type(&htab_lru_percpu_type); | 1230 | struct htab_elem *l; |
1191 | return 0; | 1231 | int i; |
1232 | |||
1233 | for (i = 0; i < htab->n_buckets; i++) { | ||
1234 | head = select_bucket(htab, i); | ||
1235 | |||
1236 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { | ||
1237 | void *ptr = fd_htab_map_get_ptr(map, l); | ||
1238 | |||
1239 | map->ops->map_fd_put_ptr(ptr); | ||
1240 | } | ||
1241 | } | ||
1242 | |||
1243 | htab_map_free(map); | ||
1244 | } | ||
1245 | |||
1246 | /* only called from syscall */ | ||
1247 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, | ||
1248 | void *key, void *value, u64 map_flags) | ||
1249 | { | ||
1250 | void *ptr; | ||
1251 | int ret; | ||
1252 | u32 ufd = *(u32 *)value; | ||
1253 | |||
1254 | ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); | ||
1255 | if (IS_ERR(ptr)) | ||
1256 | return PTR_ERR(ptr); | ||
1257 | |||
1258 | ret = htab_map_update_elem(map, key, &ptr, map_flags); | ||
1259 | if (ret) | ||
1260 | map->ops->map_fd_put_ptr(ptr); | ||
1261 | |||
1262 | return ret; | ||
1263 | } | ||
1264 | |||
1265 | static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) | ||
1266 | { | ||
1267 | struct bpf_map *map, *inner_map_meta; | ||
1268 | |||
1269 | inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); | ||
1270 | if (IS_ERR(inner_map_meta)) | ||
1271 | return inner_map_meta; | ||
1272 | |||
1273 | map = fd_htab_map_alloc(attr); | ||
1274 | if (IS_ERR(map)) { | ||
1275 | bpf_map_meta_free(inner_map_meta); | ||
1276 | return map; | ||
1277 | } | ||
1278 | |||
1279 | map->inner_map_meta = inner_map_meta; | ||
1280 | |||
1281 | return map; | ||
1192 | } | 1282 | } |
1193 | late_initcall(register_htab_map); | 1283 | |
1284 | static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) | ||
1285 | { | ||
1286 | struct bpf_map **inner_map = htab_map_lookup_elem(map, key); | ||
1287 | |||
1288 | if (!inner_map) | ||
1289 | return NULL; | ||
1290 | |||
1291 | return READ_ONCE(*inner_map); | ||
1292 | } | ||
1293 | |||
1294 | static void htab_of_map_free(struct bpf_map *map) | ||
1295 | { | ||
1296 | bpf_map_meta_free(map->inner_map_meta); | ||
1297 | fd_htab_map_free(map); | ||
1298 | } | ||
1299 | |||
1300 | const struct bpf_map_ops htab_of_maps_map_ops = { | ||
1301 | .map_alloc = htab_of_map_alloc, | ||
1302 | .map_free = htab_of_map_free, | ||
1303 | .map_get_next_key = htab_map_get_next_key, | ||
1304 | .map_lookup_elem = htab_of_map_lookup_elem, | ||
1305 | .map_delete_elem = htab_map_delete_elem, | ||
1306 | .map_fd_get_ptr = bpf_map_fd_get_ptr, | ||
1307 | .map_fd_put_ptr = bpf_map_fd_put_ptr, | ||
1308 | }; | ||
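
htab_map_gen_lookup() above collapses the bpf_prog -> bpf_map_lookup_elem -> htab_map_lookup_elem chain into a direct call to __htab_map_lookup_elem() plus two instructions that turn the returned element into a value pointer. A rough C rendering of what those emitted instructions compute; the toy_* names and the stubbed lookup are illustrative, not the kernel structures:

#include <stddef.h>
#include <stdint.h>

/* illustrative element layout, loosely mirroring struct htab_elem:
 * the key starts at 'key' and the value follows it, with the key
 * padded out to a multiple of 8 bytes */
struct toy_htab_elem {
        uint32_t hash;
        char key[];
};

#define TOY_ROUND_UP(x, a)  (((x) + (a) - 1) & ~((uint32_t)(a) - 1))

/* stand-in for __htab_map_lookup_elem(); the real code hashes the key
 * and walks the bucket, stubbed out here */
static struct toy_htab_elem *toy_lookup_elem(const void *key)
{
        (void)key;
        return 0;
}

/* the emitted sequence is effectively:
 *   R0 = __htab_map_lookup_elem(map, key);
 *   if (R0)
 *           R0 += offsetof(struct htab_elem, key) + round_up(key_size, 8);
 * so the program ends up with a pointer to the value, or NULL */
static void *toy_map_lookup(const void *key, uint32_t key_size)
{
        struct toy_htab_elem *l = toy_lookup_elem(key);

        if (!l)
                return 0;
        return (char *)l + offsetof(struct toy_htab_elem, key) +
               TOY_ROUND_UP(key_size, 8);
}

The same key/value layout is what fd_htab_map_get_ptr() above relies on when a hash-of-maps slot stores an inner map pointer instead of plain data.
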
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index fddcae801724..9bbd33497d3d 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c | |||
@@ -429,7 +429,7 @@ static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) | |||
429 | 429 | ||
430 | static int bpf_fill_super(struct super_block *sb, void *data, int silent) | 430 | static int bpf_fill_super(struct super_block *sb, void *data, int silent) |
431 | { | 431 | { |
432 | static struct tree_descr bpf_rfiles[] = { { "" } }; | 432 | static const struct tree_descr bpf_rfiles[] = { { "" } }; |
433 | struct bpf_mount_opts opts; | 433 | struct bpf_mount_opts opts; |
434 | struct inode *inode; | 434 | struct inode *inode; |
435 | int ret; | 435 | int ret; |
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b37bd9ab7f57..39cfafd895b8 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c | |||
@@ -505,7 +505,7 @@ static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) | |||
505 | return -ENOTSUPP; | 505 | return -ENOTSUPP; |
506 | } | 506 | } |
507 | 507 | ||
508 | static const struct bpf_map_ops trie_ops = { | 508 | const struct bpf_map_ops trie_map_ops = { |
509 | .map_alloc = trie_alloc, | 509 | .map_alloc = trie_alloc, |
510 | .map_free = trie_free, | 510 | .map_free = trie_free, |
511 | .map_get_next_key = trie_get_next_key, | 511 | .map_get_next_key = trie_get_next_key, |
@@ -513,15 +513,3 @@ static const struct bpf_map_ops trie_ops = { | |||
513 | .map_update_elem = trie_update_elem, | 513 | .map_update_elem = trie_update_elem, |
514 | .map_delete_elem = trie_delete_elem, | 514 | .map_delete_elem = trie_delete_elem, |
515 | }; | 515 | }; |
516 | |||
517 | static struct bpf_map_type_list trie_type __ro_after_init = { | ||
518 | .ops = &trie_ops, | ||
519 | .type = BPF_MAP_TYPE_LPM_TRIE, | ||
520 | }; | ||
521 | |||
522 | static int __init register_trie_map(void) | ||
523 | { | ||
524 | bpf_register_map_type(&trie_type); | ||
525 | return 0; | ||
526 | } | ||
527 | late_initcall(register_trie_map); | ||
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c new file mode 100644 index 000000000000..59bcdf821ae4 --- /dev/null +++ b/kernel/bpf/map_in_map.c | |||
@@ -0,0 +1,97 @@ | |||
1 | /* Copyright (c) 2017 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/slab.h> | ||
8 | #include <linux/bpf.h> | ||
9 | |||
10 | #include "map_in_map.h" | ||
11 | |||
12 | struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) | ||
13 | { | ||
14 | struct bpf_map *inner_map, *inner_map_meta; | ||
15 | struct fd f; | ||
16 | |||
17 | f = fdget(inner_map_ufd); | ||
18 | inner_map = __bpf_map_get(f); | ||
19 | if (IS_ERR(inner_map)) | ||
20 | return inner_map; | ||
21 | |||
22 | /* prog_array->owner_prog_type and owner_jited | ||
23 | * is a runtime binding. Doing static check alone | ||
24 | * in the verifier is not enough. | ||
25 | */ | ||
26 | if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { | ||
27 | fdput(f); | ||
28 | return ERR_PTR(-ENOTSUPP); | ||
29 | } | ||
30 | |||
31 | /* Does not support >1 level map-in-map */ | ||
32 | if (inner_map->inner_map_meta) { | ||
33 | fdput(f); | ||
34 | return ERR_PTR(-EINVAL); | ||
35 | } | ||
36 | |||
37 | inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); | ||
38 | if (!inner_map_meta) { | ||
39 | fdput(f); | ||
40 | return ERR_PTR(-ENOMEM); | ||
41 | } | ||
42 | |||
43 | inner_map_meta->map_type = inner_map->map_type; | ||
44 | inner_map_meta->key_size = inner_map->key_size; | ||
45 | inner_map_meta->value_size = inner_map->value_size; | ||
46 | inner_map_meta->map_flags = inner_map->map_flags; | ||
47 | inner_map_meta->ops = inner_map->ops; | ||
48 | inner_map_meta->max_entries = inner_map->max_entries; | ||
49 | |||
50 | fdput(f); | ||
51 | return inner_map_meta; | ||
52 | } | ||
53 | |||
54 | void bpf_map_meta_free(struct bpf_map *map_meta) | ||
55 | { | ||
56 | kfree(map_meta); | ||
57 | } | ||
58 | |||
59 | bool bpf_map_meta_equal(const struct bpf_map *meta0, | ||
60 | const struct bpf_map *meta1) | ||
61 | { | ||
62 | /* No need to compare ops because it is covered by map_type */ | ||
63 | return meta0->map_type == meta1->map_type && | ||
64 | meta0->key_size == meta1->key_size && | ||
65 | meta0->value_size == meta1->value_size && | ||
66 | meta0->map_flags == meta1->map_flags && | ||
67 | meta0->max_entries == meta1->max_entries; | ||
68 | } | ||
69 | |||
70 | void *bpf_map_fd_get_ptr(struct bpf_map *map, | ||
71 | struct file *map_file /* not used */, | ||
72 | int ufd) | ||
73 | { | ||
74 | struct bpf_map *inner_map; | ||
75 | struct fd f; | ||
76 | |||
77 | f = fdget(ufd); | ||
78 | inner_map = __bpf_map_get(f); | ||
79 | if (IS_ERR(inner_map)) | ||
80 | return inner_map; | ||
81 | |||
82 | if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) | ||
83 | inner_map = bpf_map_inc(inner_map, false); | ||
84 | else | ||
85 | inner_map = ERR_PTR(-EINVAL); | ||
86 | |||
87 | fdput(f); | ||
88 | return inner_map; | ||
89 | } | ||
90 | |||
91 | void bpf_map_fd_put_ptr(void *ptr) | ||
92 | { | ||
93 | /* ptr->ops->map_free() has to go through one | ||
94 | * rcu grace period by itself. | ||
95 | */ | ||
96 | bpf_map_put(ptr); | ||
97 | } | ||
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h new file mode 100644 index 000000000000..177fadb689dc --- /dev/null +++ b/kernel/bpf/map_in_map.h | |||
@@ -0,0 +1,23 @@ | |||
1 | /* Copyright (c) 2017 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #ifndef __MAP_IN_MAP_H__ | ||
8 | #define __MAP_IN_MAP_H__ | ||
9 | |||
10 | #include <linux/types.h> | ||
11 | |||
12 | struct file; | ||
13 | struct bpf_map; | ||
14 | |||
15 | struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); | ||
16 | void bpf_map_meta_free(struct bpf_map *map_meta); | ||
17 | bool bpf_map_meta_equal(const struct bpf_map *meta0, | ||
18 | const struct bpf_map *meta1); | ||
19 | void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, | ||
20 | int ufd); | ||
21 | void bpf_map_fd_put_ptr(void *ptr); | ||
22 | |||
23 | #endif | ||
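
The new map_in_map.c/.h pair implements the kernel side of map-in-map: the outer map is created against a "meta" copy of an inner map's type, key size, value size and flags, and every fd later stored into it must match that meta exactly. For orientation, a hedged userspace sketch of how such a map would be set up through the raw bpf(2) syscall; it assumes a uapi <linux/bpf.h> that already carries the inner_map_fd attribute and the *_OF_MAPS map types added in this series, and error handling is omitted:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* thin wrapper over the bpf(2) syscall; no libbpf assumed */
static int sys_bpf(int cmd, union bpf_attr *attr)
{
        return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

/* sketch: build an ARRAY_OF_MAPS whose slots hold small hash maps */
int create_map_in_map(void)
{
        union bpf_attr attr;
        int inner_fd, outer_fd;
        int slot = 0;

        /* inner map: acts as the template the outer map is typed against */
        memset(&attr, 0, sizeof(attr));
        attr.map_type = BPF_MAP_TYPE_HASH;
        attr.key_size = 4;
        attr.value_size = 8;
        attr.max_entries = 16;
        inner_fd = sys_bpf(BPF_MAP_CREATE, &attr);

        /* outer map: value_size must be 4 (an fd goes in), and
         * inner_map_fd tells the kernel which inner layout to expect */
        memset(&attr, 0, sizeof(attr));
        attr.map_type = BPF_MAP_TYPE_ARRAY_OF_MAPS;
        attr.key_size = 4;
        attr.value_size = 4;
        attr.max_entries = 8;
        attr.inner_map_fd = inner_fd;
        outer_fd = sys_bpf(BPF_MAP_CREATE, &attr);

        /* plug the inner map into slot 0 of the outer map */
        memset(&attr, 0, sizeof(attr));
        attr.map_fd = outer_fd;
        attr.key = (unsigned long)&slot;
        attr.value = (unsigned long)&inner_fd;
        sys_bpf(BPF_MAP_UPDATE_ELEM, &attr);

        return outer_fd;
}

On the kernel side, bpf_map_fd_get_ptr() above is what resolves the fd written into slot 0, checks it with bpf_map_meta_equal() and only then takes a reference on the inner map.
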
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 22aa45cd0324..4dfd6f2ec2f9 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c | |||
@@ -264,7 +264,7 @@ static void stack_map_free(struct bpf_map *map) | |||
264 | put_callchain_buffers(); | 264 | put_callchain_buffers(); |
265 | } | 265 | } |
266 | 266 | ||
267 | static const struct bpf_map_ops stack_map_ops = { | 267 | const struct bpf_map_ops stack_map_ops = { |
268 | .map_alloc = stack_map_alloc, | 268 | .map_alloc = stack_map_alloc, |
269 | .map_free = stack_map_free, | 269 | .map_free = stack_map_free, |
270 | .map_get_next_key = stack_map_get_next_key, | 270 | .map_get_next_key = stack_map_get_next_key, |
@@ -272,15 +272,3 @@ static const struct bpf_map_ops stack_map_ops = { | |||
272 | .map_update_elem = stack_map_update_elem, | 272 | .map_update_elem = stack_map_update_elem, |
273 | .map_delete_elem = stack_map_delete_elem, | 273 | .map_delete_elem = stack_map_delete_elem, |
274 | }; | 274 | }; |
275 | |||
276 | static struct bpf_map_type_list stack_map_type __ro_after_init = { | ||
277 | .ops = &stack_map_ops, | ||
278 | .type = BPF_MAP_TYPE_STACK_TRACE, | ||
279 | }; | ||
280 | |||
281 | static int __init register_stack_map(void) | ||
282 | { | ||
283 | bpf_register_map_type(&stack_map_type); | ||
284 | return 0; | ||
285 | } | ||
286 | late_initcall(register_stack_map); | ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 821f9e807de5..265a0d854e33 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c | |||
@@ -27,30 +27,29 @@ DEFINE_PER_CPU(int, bpf_prog_active); | |||
27 | 27 | ||
28 | int sysctl_unprivileged_bpf_disabled __read_mostly; | 28 | int sysctl_unprivileged_bpf_disabled __read_mostly; |
29 | 29 | ||
30 | static LIST_HEAD(bpf_map_types); | 30 | static const struct bpf_map_ops * const bpf_map_types[] = { |
31 | #define BPF_PROG_TYPE(_id, _ops) | ||
32 | #define BPF_MAP_TYPE(_id, _ops) \ | ||
33 | [_id] = &_ops, | ||
34 | #include <linux/bpf_types.h> | ||
35 | #undef BPF_PROG_TYPE | ||
36 | #undef BPF_MAP_TYPE | ||
37 | }; | ||
31 | 38 | ||
32 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) | 39 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) |
33 | { | 40 | { |
34 | struct bpf_map_type_list *tl; | ||
35 | struct bpf_map *map; | 41 | struct bpf_map *map; |
36 | 42 | ||
37 | list_for_each_entry(tl, &bpf_map_types, list_node) { | 43 | if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || |
38 | if (tl->type == attr->map_type) { | 44 | !bpf_map_types[attr->map_type]) |
39 | map = tl->ops->map_alloc(attr); | 45 | return ERR_PTR(-EINVAL); |
40 | if (IS_ERR(map)) | ||
41 | return map; | ||
42 | map->ops = tl->ops; | ||
43 | map->map_type = attr->map_type; | ||
44 | return map; | ||
45 | } | ||
46 | } | ||
47 | return ERR_PTR(-EINVAL); | ||
48 | } | ||
49 | 46 | ||
50 | /* boot time registration of different map implementations */ | 47 | map = bpf_map_types[attr->map_type]->map_alloc(attr); |
51 | void bpf_register_map_type(struct bpf_map_type_list *tl) | 48 | if (IS_ERR(map)) |
52 | { | 49 | return map; |
53 | list_add(&tl->list_node, &bpf_map_types); | 50 | map->ops = bpf_map_types[attr->map_type]; |
51 | map->map_type = attr->map_type; | ||
52 | return map; | ||
54 | } | 53 | } |
55 | 54 | ||
56 | void *bpf_map_area_alloc(size_t size) | 55 | void *bpf_map_area_alloc(size_t size) |
@@ -68,8 +67,7 @@ void *bpf_map_area_alloc(size_t size) | |||
68 | return area; | 67 | return area; |
69 | } | 68 | } |
70 | 69 | ||
71 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags, | 70 | return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL); |
72 | PAGE_KERNEL); | ||
73 | } | 71 | } |
74 | 72 | ||
75 | void bpf_map_area_free(void *area) | 73 | void bpf_map_area_free(void *area) |
@@ -215,7 +213,7 @@ int bpf_map_new_fd(struct bpf_map *map) | |||
215 | offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ | 213 | offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ |
216 | sizeof(attr->CMD##_LAST_FIELD)) != NULL | 214 | sizeof(attr->CMD##_LAST_FIELD)) != NULL |
217 | 215 | ||
218 | #define BPF_MAP_CREATE_LAST_FIELD map_flags | 216 | #define BPF_MAP_CREATE_LAST_FIELD inner_map_fd |
219 | /* called via syscall */ | 217 | /* called via syscall */ |
220 | static int map_create(union bpf_attr *attr) | 218 | static int map_create(union bpf_attr *attr) |
221 | { | 219 | { |
@@ -352,6 +350,9 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
352 | err = bpf_percpu_array_copy(map, key, value); | 350 | err = bpf_percpu_array_copy(map, key, value); |
353 | } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { | 351 | } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { |
354 | err = bpf_stackmap_copy(map, key, value); | 352 | err = bpf_stackmap_copy(map, key, value); |
353 | } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || | ||
354 | map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { | ||
355 | err = -ENOTSUPP; | ||
355 | } else { | 356 | } else { |
356 | rcu_read_lock(); | 357 | rcu_read_lock(); |
357 | ptr = map->ops->map_lookup_elem(map, key); | 358 | ptr = map->ops->map_lookup_elem(map, key); |
@@ -438,11 +439,17 @@ static int map_update_elem(union bpf_attr *attr) | |||
438 | err = bpf_percpu_array_update(map, key, value, attr->flags); | 439 | err = bpf_percpu_array_update(map, key, value, attr->flags); |
439 | } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || | 440 | } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || |
440 | map->map_type == BPF_MAP_TYPE_PROG_ARRAY || | 441 | map->map_type == BPF_MAP_TYPE_PROG_ARRAY || |
441 | map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { | 442 | map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || |
443 | map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { | ||
442 | rcu_read_lock(); | 444 | rcu_read_lock(); |
443 | err = bpf_fd_array_map_update_elem(map, f.file, key, value, | 445 | err = bpf_fd_array_map_update_elem(map, f.file, key, value, |
444 | attr->flags); | 446 | attr->flags); |
445 | rcu_read_unlock(); | 447 | rcu_read_unlock(); |
448 | } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { | ||
449 | rcu_read_lock(); | ||
450 | err = bpf_fd_htab_map_update_elem(map, f.file, key, value, | ||
451 | attr->flags); | ||
452 | rcu_read_unlock(); | ||
446 | } else { | 453 | } else { |
447 | rcu_read_lock(); | 454 | rcu_read_lock(); |
448 | err = map->ops->map_update_elem(map, key, value, attr->flags); | 455 | err = map->ops->map_update_elem(map, key, value, attr->flags); |
@@ -528,14 +535,18 @@ static int map_get_next_key(union bpf_attr *attr) | |||
528 | if (IS_ERR(map)) | 535 | if (IS_ERR(map)) |
529 | return PTR_ERR(map); | 536 | return PTR_ERR(map); |
530 | 537 | ||
531 | err = -ENOMEM; | 538 | if (ukey) { |
532 | key = kmalloc(map->key_size, GFP_USER); | 539 | err = -ENOMEM; |
533 | if (!key) | 540 | key = kmalloc(map->key_size, GFP_USER); |
534 | goto err_put; | 541 | if (!key) |
542 | goto err_put; | ||
535 | 543 | ||
536 | err = -EFAULT; | 544 | err = -EFAULT; |
537 | if (copy_from_user(key, ukey, map->key_size) != 0) | 545 | if (copy_from_user(key, ukey, map->key_size) != 0) |
538 | goto free_key; | 546 | goto free_key; |
547 | } else { | ||
548 | key = NULL; | ||
549 | } | ||
539 | 550 | ||
540 | err = -ENOMEM; | 551 | err = -ENOMEM; |
541 | next_key = kmalloc(map->key_size, GFP_USER); | 552 | next_key = kmalloc(map->key_size, GFP_USER); |
@@ -564,87 +575,23 @@ err_put: | |||
564 | return err; | 575 | return err; |
565 | } | 576 | } |
566 | 577 | ||
567 | static LIST_HEAD(bpf_prog_types); | 578 | static const struct bpf_verifier_ops * const bpf_prog_types[] = { |
579 | #define BPF_PROG_TYPE(_id, _ops) \ | ||
580 | [_id] = &_ops, | ||
581 | #define BPF_MAP_TYPE(_id, _ops) | ||
582 | #include <linux/bpf_types.h> | ||
583 | #undef BPF_PROG_TYPE | ||
584 | #undef BPF_MAP_TYPE | ||
585 | }; | ||
568 | 586 | ||
569 | static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) | 587 | static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) |
570 | { | 588 | { |
571 | struct bpf_prog_type_list *tl; | 589 | if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) |
572 | 590 | return -EINVAL; | |
573 | list_for_each_entry(tl, &bpf_prog_types, list_node) { | ||
574 | if (tl->type == type) { | ||
575 | prog->aux->ops = tl->ops; | ||
576 | prog->type = type; | ||
577 | return 0; | ||
578 | } | ||
579 | } | ||
580 | |||
581 | return -EINVAL; | ||
582 | } | ||
583 | |||
584 | void bpf_register_prog_type(struct bpf_prog_type_list *tl) | ||
585 | { | ||
586 | list_add(&tl->list_node, &bpf_prog_types); | ||
587 | } | ||
588 | |||
589 | /* fixup insn->imm field of bpf_call instructions: | ||
590 | * if (insn->imm == BPF_FUNC_map_lookup_elem) | ||
591 | * insn->imm = bpf_map_lookup_elem - __bpf_call_base; | ||
592 | * else if (insn->imm == BPF_FUNC_map_update_elem) | ||
593 | * insn->imm = bpf_map_update_elem - __bpf_call_base; | ||
594 | * else ... | ||
595 | * | ||
596 | * this function is called after eBPF program passed verification | ||
597 | */ | ||
598 | static void fixup_bpf_calls(struct bpf_prog *prog) | ||
599 | { | ||
600 | const struct bpf_func_proto *fn; | ||
601 | int i; | ||
602 | 591 | ||
603 | for (i = 0; i < prog->len; i++) { | 592 | prog->aux->ops = bpf_prog_types[type]; |
604 | struct bpf_insn *insn = &prog->insnsi[i]; | 593 | prog->type = type; |
605 | 594 | return 0; | |
606 | if (insn->code == (BPF_JMP | BPF_CALL)) { | ||
607 | /* we reach here when program has bpf_call instructions | ||
608 | * and it passed bpf_check(), means that | ||
609 | * ops->get_func_proto must have been supplied, check it | ||
610 | */ | ||
611 | BUG_ON(!prog->aux->ops->get_func_proto); | ||
612 | |||
613 | if (insn->imm == BPF_FUNC_get_route_realm) | ||
614 | prog->dst_needed = 1; | ||
615 | if (insn->imm == BPF_FUNC_get_prandom_u32) | ||
616 | bpf_user_rnd_init_once(); | ||
617 | if (insn->imm == BPF_FUNC_xdp_adjust_head) | ||
618 | prog->xdp_adjust_head = 1; | ||
619 | if (insn->imm == BPF_FUNC_tail_call) { | ||
620 | /* If we tail call into other programs, we | ||
621 | * cannot make any assumptions since they | ||
622 | * can be replaced dynamically during runtime | ||
623 | * in the program array. | ||
624 | */ | ||
625 | prog->cb_access = 1; | ||
626 | prog->xdp_adjust_head = 1; | ||
627 | |||
628 | /* mark bpf_tail_call as different opcode | ||
629 | * to avoid conditional branch in | ||
630 | * interpeter for every normal call | ||
631 | * and to prevent accidental JITing by | ||
632 | * JIT compiler that doesn't support | ||
633 | * bpf_tail_call yet | ||
634 | */ | ||
635 | insn->imm = 0; | ||
636 | insn->code |= BPF_X; | ||
637 | continue; | ||
638 | } | ||
639 | |||
640 | fn = prog->aux->ops->get_func_proto(insn->imm); | ||
641 | /* all functions that have prototype and verifier allowed | ||
642 | * programs to call them, must be real in-kernel functions | ||
643 | */ | ||
644 | BUG_ON(!fn->func); | ||
645 | insn->imm = fn->func - __bpf_call_base; | ||
646 | } | ||
647 | } | ||
648 | } | 595 | } |
649 | 596 | ||
650 | /* drop refcnt on maps used by eBPF program and free auxilary data */ | 597 | /* drop refcnt on maps used by eBPF program and free auxilary data */ |
@@ -836,7 +783,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) | |||
836 | EXPORT_SYMBOL_GPL(bpf_prog_get_type); | 783 | EXPORT_SYMBOL_GPL(bpf_prog_get_type); |
837 | 784 | ||
838 | /* last field in 'union bpf_attr' used by this command */ | 785 | /* last field in 'union bpf_attr' used by this command */ |
839 | #define BPF_PROG_LOAD_LAST_FIELD kern_version | 786 | #define BPF_PROG_LOAD_LAST_FIELD prog_flags |
840 | 787 | ||
841 | static int bpf_prog_load(union bpf_attr *attr) | 788 | static int bpf_prog_load(union bpf_attr *attr) |
842 | { | 789 | { |
@@ -849,6 +796,9 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
849 | if (CHECK_ATTR(BPF_PROG_LOAD)) | 796 | if (CHECK_ATTR(BPF_PROG_LOAD)) |
850 | return -EINVAL; | 797 | return -EINVAL; |
851 | 798 | ||
799 | if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) | ||
800 | return -EINVAL; | ||
801 | |||
852 | /* copy eBPF program license from user space */ | 802 | /* copy eBPF program license from user space */ |
853 | if (strncpy_from_user(license, u64_to_user_ptr(attr->license), | 803 | if (strncpy_from_user(license, u64_to_user_ptr(attr->license), |
854 | sizeof(license) - 1) < 0) | 804 | sizeof(license) - 1) < 0) |
@@ -900,9 +850,6 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
900 | if (err < 0) | 850 | if (err < 0) |
901 | goto free_used_maps; | 851 | goto free_used_maps; |
902 | 852 | ||
903 | /* fixup BPF_CALL->imm field */ | ||
904 | fixup_bpf_calls(prog); | ||
905 | |||
906 | /* eBPF program is ready to be JITed */ | 853 | /* eBPF program is ready to be JITed */ |
907 | prog = bpf_prog_select_runtime(prog, &err); | 854 | prog = bpf_prog_select_runtime(prog, &err); |
908 | if (err < 0) | 855 | if (err < 0) |
@@ -1028,6 +975,28 @@ static int bpf_prog_detach(const union bpf_attr *attr) | |||
1028 | } | 975 | } |
1029 | #endif /* CONFIG_CGROUP_BPF */ | 976 | #endif /* CONFIG_CGROUP_BPF */ |
1030 | 977 | ||
978 | #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration | ||
979 | |||
980 | static int bpf_prog_test_run(const union bpf_attr *attr, | ||
981 | union bpf_attr __user *uattr) | ||
982 | { | ||
983 | struct bpf_prog *prog; | ||
984 | int ret = -ENOTSUPP; | ||
985 | |||
986 | if (CHECK_ATTR(BPF_PROG_TEST_RUN)) | ||
987 | return -EINVAL; | ||
988 | |||
989 | prog = bpf_prog_get(attr->test.prog_fd); | ||
990 | if (IS_ERR(prog)) | ||
991 | return PTR_ERR(prog); | ||
992 | |||
993 | if (prog->aux->ops->test_run) | ||
994 | ret = prog->aux->ops->test_run(prog, attr, uattr); | ||
995 | |||
996 | bpf_prog_put(prog); | ||
997 | return ret; | ||
998 | } | ||
999 | |||
1031 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) | 1000 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) |
1032 | { | 1001 | { |
1033 | union bpf_attr attr = {}; | 1002 | union bpf_attr attr = {}; |
@@ -1094,7 +1063,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz | |||
1094 | case BPF_OBJ_GET: | 1063 | case BPF_OBJ_GET: |
1095 | err = bpf_obj_get(&attr); | 1064 | err = bpf_obj_get(&attr); |
1096 | break; | 1065 | break; |
1097 | |||
1098 | #ifdef CONFIG_CGROUP_BPF | 1066 | #ifdef CONFIG_CGROUP_BPF |
1099 | case BPF_PROG_ATTACH: | 1067 | case BPF_PROG_ATTACH: |
1100 | err = bpf_prog_attach(&attr); | 1068 | err = bpf_prog_attach(&attr); |
@@ -1103,7 +1071,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz | |||
1103 | err = bpf_prog_detach(&attr); | 1071 | err = bpf_prog_detach(&attr); |
1104 | break; | 1072 | break; |
1105 | #endif | 1073 | #endif |
1106 | 1074 | case BPF_PROG_TEST_RUN: | |
1075 | err = bpf_prog_test_run(&attr, uattr); | ||
1076 | break; | ||
1107 | default: | 1077 | default: |
1108 | err = -EINVAL; | 1078 | err = -EINVAL; |
1109 | break; | 1079 | break; |
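The two syscall.c additions above, the BPF_PROG_TEST_RUN command and the prog_flags field accepting BPF_F_STRICT_ALIGNMENT, are driven entirely from user space through bpf(2). Below is a minimal sketch of how a loader might exercise them, assuming uapi headers that match this series; sys_bpf() is a hypothetical thin wrapper and error handling is trimmed, so treat it as illustrative rather than a released API.

    /* minimal user-space sketch; assumes <linux/bpf.h> from this series */
    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
    {
            return syscall(__NR_bpf, cmd, attr, size);
    }

    /* run a loaded program once against a test buffer and read back its return code */
    static int prog_test_run(int prog_fd, void *data, __u32 size, __u32 *retval)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.test.prog_fd = prog_fd;
            attr.test.data_in = (__u64)(unsigned long)data;
            attr.test.data_size_in = size;
            attr.test.repeat = 1;

            if (sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)) < 0)
                    return -1;
            *retval = attr.test.retval;
            return 0;
    }

At load time, setting attr.prog_flags = BPF_F_STRICT_ALIGNMENT in the BPF_PROG_LOAD attributes asks the verifier to enforce alignment even on architectures with efficient unaligned access; any other bit in prog_flags is rejected with -EINVAL, as the check added above shows.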
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a834068a400e..1eddb713b815 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c | |||
@@ -140,9 +140,11 @@ struct bpf_verifier_stack_elem { | |||
140 | struct bpf_verifier_stack_elem *next; | 140 | struct bpf_verifier_stack_elem *next; |
141 | }; | 141 | }; |
142 | 142 | ||
143 | #define BPF_COMPLEXITY_LIMIT_INSNS 65536 | 143 | #define BPF_COMPLEXITY_LIMIT_INSNS 98304 |
144 | #define BPF_COMPLEXITY_LIMIT_STACK 1024 | 144 | #define BPF_COMPLEXITY_LIMIT_STACK 1024 |
145 | 145 | ||
146 | #define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) | ||
147 | |||
146 | struct bpf_call_arg_meta { | 148 | struct bpf_call_arg_meta { |
147 | struct bpf_map *map_ptr; | 149 | struct bpf_map *map_ptr; |
148 | bool raw_mode; | 150 | bool raw_mode; |
@@ -239,6 +241,12 @@ static void print_verifier_state(struct bpf_verifier_state *state) | |||
239 | if (reg->max_value != BPF_REGISTER_MAX_RANGE) | 241 | if (reg->max_value != BPF_REGISTER_MAX_RANGE) |
240 | verbose(",max_value=%llu", | 242 | verbose(",max_value=%llu", |
241 | (unsigned long long)reg->max_value); | 243 | (unsigned long long)reg->max_value); |
244 | if (reg->min_align) | ||
245 | verbose(",min_align=%u", reg->min_align); | ||
246 | if (reg->aux_off) | ||
247 | verbose(",aux_off=%u", reg->aux_off); | ||
248 | if (reg->aux_off_align) | ||
249 | verbose(",aux_off_align=%u", reg->aux_off_align); | ||
242 | } | 250 | } |
243 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { | 251 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { |
244 | if (state->stack_slot_type[i] == STACK_SPILL) | 252 | if (state->stack_slot_type[i] == STACK_SPILL) |
@@ -296,7 +304,8 @@ static const char *const bpf_jmp_string[16] = { | |||
296 | [BPF_EXIT >> 4] = "exit", | 304 | [BPF_EXIT >> 4] = "exit", |
297 | }; | 305 | }; |
298 | 306 | ||
299 | static void print_bpf_insn(struct bpf_insn *insn) | 307 | static void print_bpf_insn(const struct bpf_verifier_env *env, |
308 | const struct bpf_insn *insn) | ||
300 | { | 309 | { |
301 | u8 class = BPF_CLASS(insn->code); | 310 | u8 class = BPF_CLASS(insn->code); |
302 | 311 | ||
@@ -360,9 +369,19 @@ static void print_bpf_insn(struct bpf_insn *insn) | |||
360 | insn->code, | 369 | insn->code, |
361 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | 370 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], |
362 | insn->src_reg, insn->imm); | 371 | insn->src_reg, insn->imm); |
363 | } else if (BPF_MODE(insn->code) == BPF_IMM) { | 372 | } else if (BPF_MODE(insn->code) == BPF_IMM && |
364 | verbose("(%02x) r%d = 0x%x\n", | 373 | BPF_SIZE(insn->code) == BPF_DW) { |
365 | insn->code, insn->dst_reg, insn->imm); | 374 | /* At this point, we already made sure that the second |
375 | * part of the ldimm64 insn is accessible. | ||
376 | */ | ||
377 | u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; | ||
378 | bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; | ||
379 | |||
380 | if (map_ptr && !env->allow_ptr_leaks) | ||
381 | imm = 0; | ||
382 | |||
383 | verbose("(%02x) r%d = 0x%llx\n", insn->code, | ||
384 | insn->dst_reg, (unsigned long long)imm); | ||
366 | } else { | 385 | } else { |
367 | verbose("BUG_ld_%02x\n", insn->code); | 386 | verbose("BUG_ld_%02x\n", insn->code); |
368 | return; | 387 | return; |
@@ -453,6 +472,9 @@ static void init_reg_state(struct bpf_reg_state *regs) | |||
453 | regs[i].imm = 0; | 472 | regs[i].imm = 0; |
454 | regs[i].min_value = BPF_REGISTER_MIN_RANGE; | 473 | regs[i].min_value = BPF_REGISTER_MIN_RANGE; |
455 | regs[i].max_value = BPF_REGISTER_MAX_RANGE; | 474 | regs[i].max_value = BPF_REGISTER_MAX_RANGE; |
475 | regs[i].min_align = 0; | ||
476 | regs[i].aux_off = 0; | ||
477 | regs[i].aux_off_align = 0; | ||
456 | } | 478 | } |
457 | 479 | ||
458 | /* frame pointer */ | 480 | /* frame pointer */ |
@@ -479,6 +501,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) | |||
479 | { | 501 | { |
480 | regs[regno].min_value = BPF_REGISTER_MIN_RANGE; | 502 | regs[regno].min_value = BPF_REGISTER_MIN_RANGE; |
481 | regs[regno].max_value = BPF_REGISTER_MAX_RANGE; | 503 | regs[regno].max_value = BPF_REGISTER_MAX_RANGE; |
504 | regs[regno].min_align = 0; | ||
482 | } | 505 | } |
483 | 506 | ||
484 | static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, | 507 | static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, |
@@ -766,17 +789,33 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) | |||
766 | } | 789 | } |
767 | 790 | ||
768 | static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, | 791 | static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, |
769 | int off, int size) | 792 | int off, int size, bool strict) |
770 | { | 793 | { |
771 | if (reg->id && size != 1) { | 794 | int ip_align; |
772 | verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n"); | 795 | int reg_off; |
773 | return -EACCES; | 796 | |
797 | /* Byte size accesses are always allowed. */ | ||
798 | if (!strict || size == 1) | ||
799 | return 0; | ||
800 | |||
801 | reg_off = reg->off; | ||
802 | if (reg->id) { | ||
803 | if (reg->aux_off_align % size) { | ||
804 | verbose("Packet access is only %u byte aligned, %d byte access not allowed\n", | ||
805 | reg->aux_off_align, size); | ||
806 | return -EACCES; | ||
807 | } | ||
808 | reg_off += reg->aux_off; | ||
774 | } | 809 | } |
775 | 810 | ||
776 | /* skb->data is NET_IP_ALIGN-ed */ | 811 | /* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking |
777 | if ((NET_IP_ALIGN + reg->off + off) % size != 0) { | 812 | * we force this to 2 which is universally what architectures use |
813 | * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. | ||
814 | */ | ||
815 | ip_align = strict ? 2 : NET_IP_ALIGN; | ||
816 | if ((ip_align + reg_off + off) % size != 0) { | ||
778 | verbose("misaligned packet access off %d+%d+%d size %d\n", | 817 | verbose("misaligned packet access off %d+%d+%d size %d\n", |
779 | NET_IP_ALIGN, reg->off, off, size); | 818 | ip_align, reg_off, off, size); |
780 | return -EACCES; | 819 | return -EACCES; |
781 | } | 820 | } |
782 | 821 | ||
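Under the old rules a packet access through a pointer with a variable offset had to be byte-sized; with strict checking the constant parts must line up too, and the NET_IP_ALIGN term is pinned to 2 so the result does not depend on the architecture. A small stand-alone mirror of the constant-offset branch of the check, with worked values (illustrative only, not kernel code):

    #include <stdbool.h>

    /* mirrors the strict branch of check_pkt_ptr_alignment() above for a
     * pointer without a variable offset (reg->id == 0) */
    static bool pkt_access_ok_strict(int reg_off, int off, int size)
    {
            const int ip_align = 2;         /* forced value under strict checking */

            if (size == 1)
                    return true;            /* byte accesses are always allowed */
            return (ip_align + reg_off + off) % size == 0;
    }

    /* pkt_access_ok_strict(0, 14, 4) -> true,  (2 + 0 + 14) % 4 == 0
     * pkt_access_ok_strict(0, 12, 4) -> false, (2 + 0 + 12) % 4 == 2 */

For pointers with a variable offset (reg->id != 0), the same remainder test runs after folding aux_off into the offset, and the known alignment of the variable part (aux_off_align) must itself be a multiple of the access size.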
@@ -784,9 +823,9 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, | |||
784 | } | 823 | } |
785 | 824 | ||
786 | static int check_val_ptr_alignment(const struct bpf_reg_state *reg, | 825 | static int check_val_ptr_alignment(const struct bpf_reg_state *reg, |
787 | int size) | 826 | int size, bool strict) |
788 | { | 827 | { |
789 | if (size != 1) { | 828 | if (strict && size != 1) { |
790 | verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); | 829 | verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); |
791 | return -EACCES; | 830 | return -EACCES; |
792 | } | 831 | } |
@@ -794,16 +833,20 @@ static int check_val_ptr_alignment(const struct bpf_reg_state *reg, | |||
794 | return 0; | 833 | return 0; |
795 | } | 834 | } |
796 | 835 | ||
797 | static int check_ptr_alignment(const struct bpf_reg_state *reg, | 836 | static int check_ptr_alignment(struct bpf_verifier_env *env, |
837 | const struct bpf_reg_state *reg, | ||
798 | int off, int size) | 838 | int off, int size) |
799 | { | 839 | { |
840 | bool strict = env->strict_alignment; | ||
841 | |||
842 | if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) | ||
843 | strict = true; | ||
844 | |||
800 | switch (reg->type) { | 845 | switch (reg->type) { |
801 | case PTR_TO_PACKET: | 846 | case PTR_TO_PACKET: |
802 | return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : | 847 | return check_pkt_ptr_alignment(reg, off, size, strict); |
803 | check_pkt_ptr_alignment(reg, off, size); | ||
804 | case PTR_TO_MAP_VALUE_ADJ: | 848 | case PTR_TO_MAP_VALUE_ADJ: |
805 | return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : | 849 | return check_val_ptr_alignment(reg, size, strict); |
806 | check_val_ptr_alignment(reg, size); | ||
807 | default: | 850 | default: |
808 | if (off % size != 0) { | 851 | if (off % size != 0) { |
809 | verbose("misaligned access off %d size %d\n", | 852 | verbose("misaligned access off %d size %d\n", |
@@ -836,7 +879,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
836 | if (size < 0) | 879 | if (size < 0) |
837 | return size; | 880 | return size; |
838 | 881 | ||
839 | err = check_ptr_alignment(reg, off, size); | 882 | err = check_ptr_alignment(env, reg, off, size); |
840 | if (err) | 883 | if (err) |
841 | return err; | 884 | return err; |
842 | 885 | ||
@@ -870,6 +913,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
870 | value_regno); | 913 | value_regno); |
871 | /* note that reg.[id|off|range] == 0 */ | 914 | /* note that reg.[id|off|range] == 0 */ |
872 | state->regs[value_regno].type = reg_type; | 915 | state->regs[value_regno].type = reg_type; |
916 | state->regs[value_regno].aux_off = 0; | ||
917 | state->regs[value_regno].aux_off_align = 0; | ||
873 | } | 918 | } |
874 | 919 | ||
875 | } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { | 920 | } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { |
@@ -1215,6 +1260,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) | |||
1215 | func_id != BPF_FUNC_current_task_under_cgroup) | 1260 | func_id != BPF_FUNC_current_task_under_cgroup) |
1216 | goto error; | 1261 | goto error; |
1217 | break; | 1262 | break; |
1263 | case BPF_MAP_TYPE_ARRAY_OF_MAPS: | ||
1264 | case BPF_MAP_TYPE_HASH_OF_MAPS: | ||
1265 | if (func_id != BPF_FUNC_map_lookup_elem) | ||
1266 | goto error; | ||
1218 | default: | 1267 | default: |
1219 | break; | 1268 | break; |
1220 | } | 1269 | } |
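The two map-in-map types added above are deliberately restricted: the only helper a program may call on the outer map is bpf_map_lookup_elem(), and its result then acts as a map pointer for a second lookup (see the mark_map_reg() change further down, which rewrites the register to CONST_PTR_TO_MAP using inner_map_meta). A hedged sketch of the pattern as it would appear in a program; the map definitions and the helper declaration are loader-specific and omitted, and outer_map is a hypothetical name:

    /* BPF program fragment (restricted C); 'outer_map' is assumed to be a
     * BPF_MAP_TYPE_ARRAY_OF_MAPS or BPF_MAP_TYPE_HASH_OF_MAPS */
    int idx = 0, key = 1;
    void *inner_map;
    long *value;

    /* only bpf_map_lookup_elem() is allowed on the outer map */
    inner_map = bpf_map_lookup_elem(&outer_map, &idx);
    if (!inner_map)
            return 0;

    /* after the NULL check the verifier treats inner_map as a map pointer */
    value = bpf_map_lookup_elem(inner_map, &key);
    if (value)
            __sync_fetch_and_add(value, 1);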
@@ -1291,7 +1340,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) | |||
1291 | } | 1340 | } |
1292 | } | 1341 | } |
1293 | 1342 | ||
1294 | static int check_call(struct bpf_verifier_env *env, int func_id) | 1343 | static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) |
1295 | { | 1344 | { |
1296 | struct bpf_verifier_state *state = &env->cur_state; | 1345 | struct bpf_verifier_state *state = &env->cur_state; |
1297 | const struct bpf_func_proto *fn = NULL; | 1346 | const struct bpf_func_proto *fn = NULL; |
@@ -1375,6 +1424,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) | |||
1375 | } else if (fn->ret_type == RET_VOID) { | 1424 | } else if (fn->ret_type == RET_VOID) { |
1376 | regs[BPF_REG_0].type = NOT_INIT; | 1425 | regs[BPF_REG_0].type = NOT_INIT; |
1377 | } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { | 1426 | } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { |
1427 | struct bpf_insn_aux_data *insn_aux; | ||
1428 | |||
1378 | regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; | 1429 | regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; |
1379 | regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; | 1430 | regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; |
1380 | /* remember map_ptr, so that check_map_access() | 1431 | /* remember map_ptr, so that check_map_access() |
@@ -1387,6 +1438,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id) | |||
1387 | } | 1438 | } |
1388 | regs[BPF_REG_0].map_ptr = meta.map_ptr; | 1439 | regs[BPF_REG_0].map_ptr = meta.map_ptr; |
1389 | regs[BPF_REG_0].id = ++env->id_gen; | 1440 | regs[BPF_REG_0].id = ++env->id_gen; |
1441 | insn_aux = &env->insn_aux_data[insn_idx]; | ||
1442 | if (!insn_aux->map_ptr) | ||
1443 | insn_aux->map_ptr = meta.map_ptr; | ||
1444 | else if (insn_aux->map_ptr != meta.map_ptr) | ||
1445 | insn_aux->map_ptr = BPF_MAP_PTR_POISON; | ||
1390 | } else { | 1446 | } else { |
1391 | verbose("unknown return type %d of func %s#%d\n", | 1447 | verbose("unknown return type %d of func %s#%d\n", |
1392 | fn->ret_type, func_id_name(func_id), func_id); | 1448 | fn->ret_type, func_id_name(func_id), func_id); |
@@ -1431,6 +1487,8 @@ add_imm: | |||
1431 | */ | 1487 | */ |
1432 | dst_reg->off += imm; | 1488 | dst_reg->off += imm; |
1433 | } else { | 1489 | } else { |
1490 | bool had_id; | ||
1491 | |||
1434 | if (src_reg->type == PTR_TO_PACKET) { | 1492 | if (src_reg->type == PTR_TO_PACKET) { |
1435 | /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ | 1493 | /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ |
1436 | tmp_reg = *dst_reg; /* save r7 state */ | 1494 | tmp_reg = *dst_reg; /* save r7 state */ |
@@ -1464,14 +1522,23 @@ add_imm: | |||
1464 | src_reg->imm); | 1522 | src_reg->imm); |
1465 | return -EACCES; | 1523 | return -EACCES; |
1466 | } | 1524 | } |
1525 | |||
1526 | had_id = (dst_reg->id != 0); | ||
1527 | |||
1467 | /* dst_reg stays as pkt_ptr type and since some positive | 1528 | /* dst_reg stays as pkt_ptr type and since some positive |
1468 | * integer value was added to the pointer, increment its 'id' | 1529 | * integer value was added to the pointer, increment its 'id' |
1469 | */ | 1530 | */ |
1470 | dst_reg->id = ++env->id_gen; | 1531 | dst_reg->id = ++env->id_gen; |
1471 | 1532 | ||
1472 | /* something was added to pkt_ptr, set range and off to zero */ | 1533 | /* something was added to pkt_ptr, set range to zero */ |
1534 | dst_reg->aux_off += dst_reg->off; | ||
1473 | dst_reg->off = 0; | 1535 | dst_reg->off = 0; |
1474 | dst_reg->range = 0; | 1536 | dst_reg->range = 0; |
1537 | if (had_id) | ||
1538 | dst_reg->aux_off_align = min(dst_reg->aux_off_align, | ||
1539 | src_reg->min_align); | ||
1540 | else | ||
1541 | dst_reg->aux_off_align = src_reg->min_align; | ||
1475 | } | 1542 | } |
1476 | return 0; | 1543 | return 0; |
1477 | } | 1544 | } |
@@ -1645,6 +1712,13 @@ static void check_reg_overflow(struct bpf_reg_state *reg) | |||
1645 | reg->min_value = BPF_REGISTER_MIN_RANGE; | 1712 | reg->min_value = BPF_REGISTER_MIN_RANGE; |
1646 | } | 1713 | } |
1647 | 1714 | ||
1715 | static u32 calc_align(u32 imm) | ||
1716 | { | ||
1717 | if (!imm) | ||
1718 | return 1U << 31; | ||
1719 | return imm - ((imm - 1) & imm); | ||
1720 | } | ||
1721 | |||
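calc_align() above isolates the lowest set bit of the immediate, i.e. the largest power of two that is guaranteed to divide it, which is exactly the alignment the verifier may assume for a constant operand. A stand-alone copy with worked values:

    #include <stdint.h>

    static uint32_t calc_align(uint32_t imm)
    {
            if (!imm)
                    return 1U << 31;
            return imm - ((imm - 1) & imm); /* lowest set bit of imm */
    }

    /* calc_align(24) == 8        (24 = 0b11000, lowest set bit is 8)
     * calc_align(6)  == 2        (6  = 0b110)
     * calc_align(7)  == 1        (odd values give no usable alignment)
     * calc_align(0)  == 1U << 31 (zero is treated as maximally aligned) */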
1648 | static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, | 1722 | static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, |
1649 | struct bpf_insn *insn) | 1723 | struct bpf_insn *insn) |
1650 | { | 1724 | { |
@@ -1652,8 +1726,10 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
1652 | s64 min_val = BPF_REGISTER_MIN_RANGE; | 1726 | s64 min_val = BPF_REGISTER_MIN_RANGE; |
1653 | u64 max_val = BPF_REGISTER_MAX_RANGE; | 1727 | u64 max_val = BPF_REGISTER_MAX_RANGE; |
1654 | u8 opcode = BPF_OP(insn->code); | 1728 | u8 opcode = BPF_OP(insn->code); |
1729 | u32 dst_align, src_align; | ||
1655 | 1730 | ||
1656 | dst_reg = ®s[insn->dst_reg]; | 1731 | dst_reg = ®s[insn->dst_reg]; |
1732 | src_align = 0; | ||
1657 | if (BPF_SRC(insn->code) == BPF_X) { | 1733 | if (BPF_SRC(insn->code) == BPF_X) { |
1658 | check_reg_overflow(®s[insn->src_reg]); | 1734 | check_reg_overflow(®s[insn->src_reg]); |
1659 | min_val = regs[insn->src_reg].min_value; | 1735 | min_val = regs[insn->src_reg].min_value; |
@@ -1669,12 +1745,18 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
1669 | regs[insn->src_reg].type != UNKNOWN_VALUE) { | 1745 | regs[insn->src_reg].type != UNKNOWN_VALUE) { |
1670 | min_val = BPF_REGISTER_MIN_RANGE; | 1746 | min_val = BPF_REGISTER_MIN_RANGE; |
1671 | max_val = BPF_REGISTER_MAX_RANGE; | 1747 | max_val = BPF_REGISTER_MAX_RANGE; |
1748 | src_align = 0; | ||
1749 | } else { | ||
1750 | src_align = regs[insn->src_reg].min_align; | ||
1672 | } | 1751 | } |
1673 | } else if (insn->imm < BPF_REGISTER_MAX_RANGE && | 1752 | } else if (insn->imm < BPF_REGISTER_MAX_RANGE && |
1674 | (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { | 1753 | (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { |
1675 | min_val = max_val = insn->imm; | 1754 | min_val = max_val = insn->imm; |
1755 | src_align = calc_align(insn->imm); | ||
1676 | } | 1756 | } |
1677 | 1757 | ||
1758 | dst_align = dst_reg->min_align; | ||
1759 | |||
1678 | /* We don't know anything about what was done to this register, mark it | 1760 | /* We don't know anything about what was done to this register, mark it |
1679 | * as unknown. | 1761 | * as unknown. |
1680 | */ | 1762 | */ |
@@ -1699,18 +1781,21 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
1699 | dst_reg->min_value += min_val; | 1781 | dst_reg->min_value += min_val; |
1700 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) | 1782 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) |
1701 | dst_reg->max_value += max_val; | 1783 | dst_reg->max_value += max_val; |
1784 | dst_reg->min_align = min(src_align, dst_align); | ||
1702 | break; | 1785 | break; |
1703 | case BPF_SUB: | 1786 | case BPF_SUB: |
1704 | if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) | 1787 | if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) |
1705 | dst_reg->min_value -= min_val; | 1788 | dst_reg->min_value -= min_val; |
1706 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) | 1789 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) |
1707 | dst_reg->max_value -= max_val; | 1790 | dst_reg->max_value -= max_val; |
1791 | dst_reg->min_align = min(src_align, dst_align); | ||
1708 | break; | 1792 | break; |
1709 | case BPF_MUL: | 1793 | case BPF_MUL: |
1710 | if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) | 1794 | if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) |
1711 | dst_reg->min_value *= min_val; | 1795 | dst_reg->min_value *= min_val; |
1712 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) | 1796 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) |
1713 | dst_reg->max_value *= max_val; | 1797 | dst_reg->max_value *= max_val; |
1798 | dst_reg->min_align = max(src_align, dst_align); | ||
1714 | break; | 1799 | break; |
1715 | case BPF_AND: | 1800 | case BPF_AND: |
1716 | /* Disallow AND'ing of negative numbers, ain't nobody got time | 1801 | /* Disallow AND'ing of negative numbers, ain't nobody got time |
@@ -1722,17 +1807,23 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
1722 | else | 1807 | else |
1723 | dst_reg->min_value = 0; | 1808 | dst_reg->min_value = 0; |
1724 | dst_reg->max_value = max_val; | 1809 | dst_reg->max_value = max_val; |
1810 | dst_reg->min_align = max(src_align, dst_align); | ||
1725 | break; | 1811 | break; |
1726 | case BPF_LSH: | 1812 | case BPF_LSH: |
1727 | /* Gotta have special overflow logic here, if we're shifting | 1813 | /* Gotta have special overflow logic here, if we're shifting |
1728 | * more than MAX_RANGE then just assume we have an invalid | 1814 | * more than MAX_RANGE then just assume we have an invalid |
1729 | * range. | 1815 | * range. |
1730 | */ | 1816 | */ |
1731 | if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) | 1817 | if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) { |
1732 | dst_reg->min_value = BPF_REGISTER_MIN_RANGE; | 1818 | dst_reg->min_value = BPF_REGISTER_MIN_RANGE; |
1733 | else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) | 1819 | dst_reg->min_align = 1; |
1734 | dst_reg->min_value <<= min_val; | 1820 | } else { |
1735 | 1821 | if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) | |
1822 | dst_reg->min_value <<= min_val; | ||
1823 | if (!dst_reg->min_align) | ||
1824 | dst_reg->min_align = 1; | ||
1825 | dst_reg->min_align <<= min_val; | ||
1826 | } | ||
1736 | if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) | 1827 | if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) |
1737 | dst_reg->max_value = BPF_REGISTER_MAX_RANGE; | 1828 | dst_reg->max_value = BPF_REGISTER_MAX_RANGE; |
1738 | else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) | 1829 | else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) |
@@ -1742,11 +1833,19 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
1742 | /* RSH by a negative number is undefined, and the BPF_RSH is an | 1833 | /* RSH by a negative number is undefined, and the BPF_RSH is an |
1743 | * unsigned shift, so make the appropriate casts. | 1834 | * unsigned shift, so make the appropriate casts. |
1744 | */ | 1835 | */ |
1745 | if (min_val < 0 || dst_reg->min_value < 0) | 1836 | if (min_val < 0 || dst_reg->min_value < 0) { |
1746 | dst_reg->min_value = BPF_REGISTER_MIN_RANGE; | 1837 | dst_reg->min_value = BPF_REGISTER_MIN_RANGE; |
1747 | else | 1838 | } else { |
1748 | dst_reg->min_value = | 1839 | dst_reg->min_value = |
1749 | (u64)(dst_reg->min_value) >> min_val; | 1840 | (u64)(dst_reg->min_value) >> min_val; |
1841 | } | ||
1842 | if (min_val < 0) { | ||
1843 | dst_reg->min_align = 1; | ||
1844 | } else { | ||
1845 | dst_reg->min_align >>= (u64) min_val; | ||
1846 | if (!dst_reg->min_align) | ||
1847 | dst_reg->min_align = 1; | ||
1848 | } | ||
1750 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) | 1849 | if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) |
1751 | dst_reg->max_value >>= max_val; | 1850 | dst_reg->max_value >>= max_val; |
1752 | break; | 1851 | break; |
@@ -1848,6 +1947,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
1848 | regs[insn->dst_reg].imm = insn->imm; | 1947 | regs[insn->dst_reg].imm = insn->imm; |
1849 | regs[insn->dst_reg].max_value = insn->imm; | 1948 | regs[insn->dst_reg].max_value = insn->imm; |
1850 | regs[insn->dst_reg].min_value = insn->imm; | 1949 | regs[insn->dst_reg].min_value = insn->imm; |
1950 | regs[insn->dst_reg].min_align = calc_align(insn->imm); | ||
1851 | } | 1951 | } |
1852 | 1952 | ||
1853 | } else if (opcode > BPF_END) { | 1953 | } else if (opcode > BPF_END) { |
@@ -1911,6 +2011,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
1911 | return 0; | 2011 | return 0; |
1912 | } else if (opcode == BPF_ADD && | 2012 | } else if (opcode == BPF_ADD && |
1913 | BPF_CLASS(insn->code) == BPF_ALU64 && | 2013 | BPF_CLASS(insn->code) == BPF_ALU64 && |
2014 | dst_reg->type == PTR_TO_STACK && | ||
2015 | ((BPF_SRC(insn->code) == BPF_X && | ||
2016 | regs[insn->src_reg].type == CONST_IMM) || | ||
2017 | BPF_SRC(insn->code) == BPF_K)) { | ||
2018 | if (BPF_SRC(insn->code) == BPF_X) | ||
2019 | dst_reg->imm += regs[insn->src_reg].imm; | ||
2020 | else | ||
2021 | dst_reg->imm += insn->imm; | ||
2022 | return 0; | ||
2023 | } else if (opcode == BPF_ADD && | ||
2024 | BPF_CLASS(insn->code) == BPF_ALU64 && | ||
1914 | (dst_reg->type == PTR_TO_PACKET || | 2025 | (dst_reg->type == PTR_TO_PACKET || |
1915 | (BPF_SRC(insn->code) == BPF_X && | 2026 | (BPF_SRC(insn->code) == BPF_X && |
1916 | regs[insn->src_reg].type == PTR_TO_PACKET))) { | 2027 | regs[insn->src_reg].type == PTR_TO_PACKET))) { |
@@ -2112,14 +2223,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, | |||
2112 | struct bpf_reg_state *reg = ®s[regno]; | 2223 | struct bpf_reg_state *reg = ®s[regno]; |
2113 | 2224 | ||
2114 | if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { | 2225 | if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { |
2115 | reg->type = type; | 2226 | if (type == UNKNOWN_VALUE) { |
2227 | __mark_reg_unknown_value(regs, regno); | ||
2228 | } else if (reg->map_ptr->inner_map_meta) { | ||
2229 | reg->type = CONST_PTR_TO_MAP; | ||
2230 | reg->map_ptr = reg->map_ptr->inner_map_meta; | ||
2231 | } else { | ||
2232 | reg->type = type; | ||
2233 | } | ||
2116 | /* We don't need id from this point onwards anymore, thus we | 2234 | /* We don't need id from this point onwards anymore, thus we |
2117 | * should better reset it, so that state pruning has chances | 2235 | * should better reset it, so that state pruning has chances |
2118 | * to take effect. | 2236 | * to take effect. |
2119 | */ | 2237 | */ |
2120 | reg->id = 0; | 2238 | reg->id = 0; |
2121 | if (type == UNKNOWN_VALUE) | ||
2122 | __mark_reg_unknown_value(regs, regno); | ||
2123 | } | 2239 | } |
2124 | } | 2240 | } |
2125 | 2241 | ||
@@ -2524,6 +2640,7 @@ peek_stack: | |||
2524 | env->explored_states[t + 1] = STATE_LIST_MARK; | 2640 | env->explored_states[t + 1] = STATE_LIST_MARK; |
2525 | } else { | 2641 | } else { |
2526 | /* conditional jump with two edges */ | 2642 | /* conditional jump with two edges */ |
2643 | env->explored_states[t] = STATE_LIST_MARK; | ||
2527 | ret = push_insn(t, t + 1, FALLTHROUGH, env); | 2644 | ret = push_insn(t, t + 1, FALLTHROUGH, env); |
2528 | if (ret == 1) | 2645 | if (ret == 1) |
2529 | goto peek_stack; | 2646 | goto peek_stack; |
@@ -2682,6 +2799,12 @@ static bool states_equal(struct bpf_verifier_env *env, | |||
2682 | rcur->type != NOT_INIT)) | 2799 | rcur->type != NOT_INIT)) |
2683 | continue; | 2800 | continue; |
2684 | 2801 | ||
2802 | /* Don't care about the reg->id in this case. */ | ||
2803 | if (rold->type == PTR_TO_MAP_VALUE_OR_NULL && | ||
2804 | rcur->type == PTR_TO_MAP_VALUE_OR_NULL && | ||
2805 | rold->map_ptr == rcur->map_ptr) | ||
2806 | continue; | ||
2807 | |||
2685 | if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && | 2808 | if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && |
2686 | compare_ptrs_to_packet(rold, rcur)) | 2809 | compare_ptrs_to_packet(rold, rcur)) |
2687 | continue; | 2810 | continue; |
@@ -2816,15 +2939,22 @@ static int do_check(struct bpf_verifier_env *env) | |||
2816 | goto process_bpf_exit; | 2939 | goto process_bpf_exit; |
2817 | } | 2940 | } |
2818 | 2941 | ||
2819 | if (log_level && do_print_state) { | 2942 | if (need_resched()) |
2820 | verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); | 2943 | cond_resched(); |
2944 | |||
2945 | if (log_level > 1 || (log_level && do_print_state)) { | ||
2946 | if (log_level > 1) | ||
2947 | verbose("%d:", insn_idx); | ||
2948 | else | ||
2949 | verbose("\nfrom %d to %d:", | ||
2950 | prev_insn_idx, insn_idx); | ||
2821 | print_verifier_state(&env->cur_state); | 2951 | print_verifier_state(&env->cur_state); |
2822 | do_print_state = false; | 2952 | do_print_state = false; |
2823 | } | 2953 | } |
2824 | 2954 | ||
2825 | if (log_level) { | 2955 | if (log_level) { |
2826 | verbose("%d: ", insn_idx); | 2956 | verbose("%d: ", insn_idx); |
2827 | print_bpf_insn(insn); | 2957 | print_bpf_insn(env, insn); |
2828 | } | 2958 | } |
2829 | 2959 | ||
2830 | err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); | 2960 | err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); |
@@ -2960,7 +3090,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
2960 | return -EINVAL; | 3090 | return -EINVAL; |
2961 | } | 3091 | } |
2962 | 3092 | ||
2963 | err = check_call(env, insn->imm); | 3093 | err = check_call(env, insn->imm, insn_idx); |
2964 | if (err) | 3094 | if (err) |
2965 | return err; | 3095 | return err; |
2966 | 3096 | ||
@@ -3044,16 +3174,33 @@ process_bpf_exit: | |||
3044 | return 0; | 3174 | return 0; |
3045 | } | 3175 | } |
3046 | 3176 | ||
3177 | static int check_map_prealloc(struct bpf_map *map) | ||
3178 | { | ||
3179 | return (map->map_type != BPF_MAP_TYPE_HASH && | ||
3180 | map->map_type != BPF_MAP_TYPE_PERCPU_HASH && | ||
3181 | map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || | ||
3182 | !(map->map_flags & BPF_F_NO_PREALLOC); | ||
3183 | } | ||
3184 | |||
3047 | static int check_map_prog_compatibility(struct bpf_map *map, | 3185 | static int check_map_prog_compatibility(struct bpf_map *map, |
3048 | struct bpf_prog *prog) | 3186 | struct bpf_prog *prog) |
3049 | 3187 | ||
3050 | { | 3188 | { |
3051 | if (prog->type == BPF_PROG_TYPE_PERF_EVENT && | 3189 | /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use |
3052 | (map->map_type == BPF_MAP_TYPE_HASH || | 3190 | * preallocated hash maps, since doing memory allocation |
3053 | map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && | 3191 | * in overflow_handler can crash depending on where nmi got |
3054 | (map->map_flags & BPF_F_NO_PREALLOC)) { | 3192 | * triggered. |
3055 | verbose("perf_event programs can only use preallocated hash map\n"); | 3193 | */ |
3056 | return -EINVAL; | 3194 | if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { |
3195 | if (!check_map_prealloc(map)) { | ||
3196 | verbose("perf_event programs can only use preallocated hash map\n"); | ||
3197 | return -EINVAL; | ||
3198 | } | ||
3199 | if (map->inner_map_meta && | ||
3200 | !check_map_prealloc(map->inner_map_meta)) { | ||
3201 | verbose("perf_event programs can only use preallocated inner hash map\n"); | ||
3202 | return -EINVAL; | ||
3203 | } | ||
3057 | } | 3204 | } |
3058 | return 0; | 3205 | return 0; |
3059 | } | 3206 | } |
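check_map_prealloc() now also covers BPF_MAP_TYPE_HASH_OF_MAPS and, via inner_map_meta, the maps stored inside a map-in-map. What it rejects for perf_event programs is the combination sketched below, shown with the same hypothetical sys_bpf() wrapper as in the earlier test-run example; field names follow union bpf_attr and the sketch is illustrative only:

    /* a map created like this may not be referenced from a
     * BPF_PROG_TYPE_PERF_EVENT program, because BPF_F_NO_PREALLOC hash
     * maps allocate elements at update time, which is unsafe from NMI
     * context (see the comment in check_map_prog_compatibility() above) */
    union bpf_attr attr;
    int map_fd;

    memset(&attr, 0, sizeof(attr));
    attr.map_type    = BPF_MAP_TYPE_HASH;
    attr.key_size    = sizeof(__u32);
    attr.value_size  = sizeof(__u64);
    attr.max_entries = 1024;
    attr.map_flags   = BPF_F_NO_PREALLOC;

    map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));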
@@ -3182,6 +3329,41 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) | |||
3182 | insn->src_reg = 0; | 3329 | insn->src_reg = 0; |
3183 | } | 3330 | } |
3184 | 3331 | ||
3332 | /* single env->prog->insnsi[off] instruction was replaced with the range | ||
3333 | * insnsi[off, off + cnt). Adjust corresponding insn_aux_data by copying | ||
3334 | * [0, off) and [off, end) to new locations, so the patched range stays zero | ||
3335 | */ | ||
3336 | static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, | ||
3337 | u32 off, u32 cnt) | ||
3338 | { | ||
3339 | struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; | ||
3340 | |||
3341 | if (cnt == 1) | ||
3342 | return 0; | ||
3343 | new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len); | ||
3344 | if (!new_data) | ||
3345 | return -ENOMEM; | ||
3346 | memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); | ||
3347 | memcpy(new_data + off + cnt - 1, old_data + off, | ||
3348 | sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); | ||
3349 | env->insn_aux_data = new_data; | ||
3350 | vfree(old_data); | ||
3351 | return 0; | ||
3352 | } | ||
3353 | |||
3354 | static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, | ||
3355 | const struct bpf_insn *patch, u32 len) | ||
3356 | { | ||
3357 | struct bpf_prog *new_prog; | ||
3358 | |||
3359 | new_prog = bpf_patch_insn_single(env->prog, off, patch, len); | ||
3360 | if (!new_prog) | ||
3361 | return NULL; | ||
3362 | if (adjust_insn_aux_data(env, new_prog->len, off, len)) | ||
3363 | return NULL; | ||
3364 | return new_prog; | ||
3365 | } | ||
3366 | |||
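A worked example of the index bookkeeping in adjust_insn_aux_data() above: suppose the program has 10 instructions and instruction 3 is replaced by a 4-instruction sequence (off = 3, cnt = 4), giving a new length of 13. The aux array is rebuilt as

    new[0..2]  = old[0..2]   /* everything before the patch */
    new[3..5]  = zeroed      /* the newly inserted instructions */
    new[6..12] = old[3..9]   /* the patched slot and everything after it */

so the original entry for the patched instruction lands on the last instruction of its replacement, and the inserted instructions start out with zeroed aux data, which is what the comment means by "the patched range stays zero".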
3185 | /* convert load instructions that access fields of 'struct __sk_buff' | 3367 | /* convert load instructions that access fields of 'struct __sk_buff' |
3186 | * into sequence of instructions that access fields of 'struct sk_buff' | 3368 | * into sequence of instructions that access fields of 'struct sk_buff' |
3187 | */ | 3369 | */ |
@@ -3201,10 +3383,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) | |||
3201 | verbose("bpf verifier is misconfigured\n"); | 3383 | verbose("bpf verifier is misconfigured\n"); |
3202 | return -EINVAL; | 3384 | return -EINVAL; |
3203 | } else if (cnt) { | 3385 | } else if (cnt) { |
3204 | new_prog = bpf_patch_insn_single(env->prog, 0, | 3386 | new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); |
3205 | insn_buf, cnt); | ||
3206 | if (!new_prog) | 3387 | if (!new_prog) |
3207 | return -ENOMEM; | 3388 | return -ENOMEM; |
3389 | |||
3208 | env->prog = new_prog; | 3390 | env->prog = new_prog; |
3209 | delta += cnt - 1; | 3391 | delta += cnt - 1; |
3210 | } | 3392 | } |
@@ -3229,7 +3411,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) | |||
3229 | else | 3411 | else |
3230 | continue; | 3412 | continue; |
3231 | 3413 | ||
3232 | if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) | 3414 | if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) |
3233 | continue; | 3415 | continue; |
3234 | 3416 | ||
3235 | cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); | 3417 | cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); |
@@ -3238,8 +3420,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) | |||
3238 | return -EINVAL; | 3420 | return -EINVAL; |
3239 | } | 3421 | } |
3240 | 3422 | ||
3241 | new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, | 3423 | new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); |
3242 | cnt); | ||
3243 | if (!new_prog) | 3424 | if (!new_prog) |
3244 | return -ENOMEM; | 3425 | return -ENOMEM; |
3245 | 3426 | ||
@@ -3253,6 +3434,89 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) | |||
3253 | return 0; | 3434 | return 0; |
3254 | } | 3435 | } |
3255 | 3436 | ||
3437 | /* fixup insn->imm field of bpf_call instructions | ||
3438 | * and inline eligible helpers as explicit sequence of BPF instructions | ||
3439 | * | ||
3440 | * this function is called after eBPF program passed verification | ||
3441 | */ | ||
3442 | static int fixup_bpf_calls(struct bpf_verifier_env *env) | ||
3443 | { | ||
3444 | struct bpf_prog *prog = env->prog; | ||
3445 | struct bpf_insn *insn = prog->insnsi; | ||
3446 | const struct bpf_func_proto *fn; | ||
3447 | const int insn_cnt = prog->len; | ||
3448 | struct bpf_insn insn_buf[16]; | ||
3449 | struct bpf_prog *new_prog; | ||
3450 | struct bpf_map *map_ptr; | ||
3451 | int i, cnt, delta = 0; | ||
3452 | |||
3453 | for (i = 0; i < insn_cnt; i++, insn++) { | ||
3454 | if (insn->code != (BPF_JMP | BPF_CALL)) | ||
3455 | continue; | ||
3456 | |||
3457 | if (insn->imm == BPF_FUNC_get_route_realm) | ||
3458 | prog->dst_needed = 1; | ||
3459 | if (insn->imm == BPF_FUNC_get_prandom_u32) | ||
3460 | bpf_user_rnd_init_once(); | ||
3461 | if (insn->imm == BPF_FUNC_tail_call) { | ||
3462 | /* If we tail call into other programs, we | ||
3463 | * cannot make any assumptions since they can | ||
3464 | * be replaced dynamically during runtime in | ||
3465 | * the program array. | ||
3466 | */ | ||
3467 | prog->cb_access = 1; | ||
3468 | |||
3469 | /* mark bpf_tail_call as different opcode to avoid | ||
3470 | * conditional branch in the interpreter for every normal | ||
3471 | * call and to prevent accidental JITing by JIT compiler | ||
3472 | * that doesn't support bpf_tail_call yet | ||
3473 | */ | ||
3474 | insn->imm = 0; | ||
3475 | insn->code |= BPF_X; | ||
3476 | continue; | ||
3477 | } | ||
3478 | |||
3479 | if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) { | ||
3480 | map_ptr = env->insn_aux_data[i + delta].map_ptr; | ||
3481 | if (map_ptr == BPF_MAP_PTR_POISON || | ||
3482 | !map_ptr->ops->map_gen_lookup) | ||
3483 | goto patch_call_imm; | ||
3484 | |||
3485 | cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); | ||
3486 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { | ||
3487 | verbose("bpf verifier is misconfigured\n"); | ||
3488 | return -EINVAL; | ||
3489 | } | ||
3490 | |||
3491 | new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, | ||
3492 | cnt); | ||
3493 | if (!new_prog) | ||
3494 | return -ENOMEM; | ||
3495 | |||
3496 | delta += cnt - 1; | ||
3497 | |||
3498 | /* keep walking new program and skip insns we just inserted */ | ||
3499 | env->prog = prog = new_prog; | ||
3500 | insn = new_prog->insnsi + i + delta; | ||
3501 | continue; | ||
3502 | } | ||
3503 | |||
3504 | patch_call_imm: | ||
3505 | fn = prog->aux->ops->get_func_proto(insn->imm); | ||
3506 | /* all functions that have prototype and verifier allowed | ||
3507 | * programs to call them, must be real in-kernel functions | ||
3508 | */ | ||
3509 | if (!fn->func) { | ||
3510 | verbose("kernel subsystem misconfigured func %s#%d\n", | ||
3511 | func_id_name(insn->imm), insn->imm); | ||
3512 | return -EFAULT; | ||
3513 | } | ||
3514 | insn->imm = fn->func - __bpf_call_base; | ||
3515 | } | ||
3516 | |||
3517 | return 0; | ||
3518 | } | ||
3519 | |||
3256 | static void free_states(struct bpf_verifier_env *env) | 3520 | static void free_states(struct bpf_verifier_env *env) |
3257 | { | 3521 | { |
3258 | struct bpf_verifier_state_list *sl, *sln; | 3522 | struct bpf_verifier_state_list *sl, *sln; |
@@ -3320,6 +3584,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | |||
3320 | } else { | 3584 | } else { |
3321 | log_level = 0; | 3585 | log_level = 0; |
3322 | } | 3586 | } |
3587 | if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT) | ||
3588 | env->strict_alignment = true; | ||
3589 | else | ||
3590 | env->strict_alignment = false; | ||
3323 | 3591 | ||
3324 | ret = replace_map_fd_with_map_ptr(env); | 3592 | ret = replace_map_fd_with_map_ptr(env); |
3325 | if (ret < 0) | 3593 | if (ret < 0) |
@@ -3348,6 +3616,9 @@ skip_full_check: | |||
3348 | /* program is valid, convert *(u32*)(ctx + off) accesses */ | 3616 | /* program is valid, convert *(u32*)(ctx + off) accesses */ |
3349 | ret = convert_ctx_accesses(env); | 3617 | ret = convert_ctx_accesses(env); |
3350 | 3618 | ||
3619 | if (ret == 0) | ||
3620 | ret = fixup_bpf_calls(env); | ||
3621 | |||
3351 | if (log_level && log_len >= log_size - 1) { | 3622 | if (log_level && log_len >= log_size - 1) { |
3352 | BUG_ON(log_len >= log_size); | 3623 | BUG_ON(log_len >= log_size); |
3353 | /* verifier log exceeded user supplied buffer */ | 3624 | /* verifier log exceeded user supplied buffer */ |
@@ -3422,6 +3693,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, | |||
3422 | mutex_lock(&bpf_verifier_lock); | 3693 | mutex_lock(&bpf_verifier_lock); |
3423 | 3694 | ||
3424 | log_level = 0; | 3695 | log_level = 0; |
3696 | env->strict_alignment = false; | ||
3425 | 3697 | ||
3426 | env->explored_states = kcalloc(env->prog->len, | 3698 | env->explored_states = kcalloc(env->prog->len, |
3427 | sizeof(struct bpf_verifier_state_list *), | 3699 | sizeof(struct bpf_verifier_state_list *), |
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 9203bfb05603..00f4d6bf048f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h | |||
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/kernfs.h> | 5 | #include <linux/kernfs.h> |
6 | #include <linux/workqueue.h> | 6 | #include <linux/workqueue.h> |
7 | #include <linux/list.h> | 7 | #include <linux/list.h> |
8 | #include <linux/refcount.h> | ||
8 | 9 | ||
9 | /* | 10 | /* |
10 | * A cgroup can be associated with multiple css_sets as different tasks may | 11 | * A cgroup can be associated with multiple css_sets as different tasks may |
@@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset) | |||
134 | * can see it. Similar to atomic_dec_and_lock(), but for an | 135 | * can see it. Similar to atomic_dec_and_lock(), but for an |
135 | * rwlock | 136 | * rwlock |
136 | */ | 137 | */ |
137 | if (atomic_add_unless(&cset->refcount, -1, 1)) | 138 | if (refcount_dec_not_one(&cset->refcount)) |
138 | return; | 139 | return; |
139 | 140 | ||
140 | spin_lock_irqsave(&css_set_lock, flags); | 141 | spin_lock_irqsave(&css_set_lock, flags); |
@@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset) | |||
147 | */ | 148 | */ |
148 | static inline void get_css_set(struct css_set *cset) | 149 | static inline void get_css_set(struct css_set *cset) |
149 | { | 150 | { |
150 | atomic_inc(&cset->refcount); | 151 | refcount_inc(&cset->refcount); |
151 | } | 152 | } |
152 | 153 | ||
153 | bool cgroup_ssid_enabled(int ssid); | 154 | bool cgroup_ssid_enabled(int ssid); |
@@ -163,7 +164,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, | |||
163 | 164 | ||
164 | void cgroup_free_root(struct cgroup_root *root); | 165 | void cgroup_free_root(struct cgroup_root *root); |
165 | void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); | 166 | void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); |
166 | int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); | 167 | int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags); |
167 | int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); | 168 | int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); |
168 | struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, | 169 | struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, |
169 | struct cgroup_root *root, unsigned long magic, | 170 | struct cgroup_root *root, unsigned long magic, |
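The cgroup refcount changes in this header and the following files convert the css_set and cgroup_namespace counters from atomic_t to refcount_t, which saturates instead of wrapping on overflow or underflow. The conversion is mechanical; the mapping used across these hunks is roughly:

    ATOMIC_INIT(n)                   ->  REFCOUNT_INIT(n)
    atomic_set(&r, n)                ->  refcount_set(&r, n)
    atomic_inc(&r)                   ->  refcount_inc(&r)
    atomic_read(&r)                  ->  refcount_read(&r)
    atomic_dec_and_test(&r)          ->  refcount_dec_and_test(&r)
    atomic_add_unless(&r, -1, 1)     ->  refcount_dec_not_one(&r)

with the struct fields changing type from atomic_t to refcount_t and <linux/refcount.h> added to the includes.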
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 1dc22f6b49f5..85d75152402d 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c | |||
@@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp) | |||
346 | 346 | ||
347 | spin_lock_irq(&css_set_lock); | 347 | spin_lock_irq(&css_set_lock); |
348 | list_for_each_entry(link, &cgrp->cset_links, cset_link) | 348 | list_for_each_entry(link, &cgrp->cset_links, cset_link) |
349 | count += atomic_read(&link->cset->refcount); | 349 | count += refcount_read(&link->cset->refcount); |
350 | spin_unlock_irq(&css_set_lock); | 350 | spin_unlock_irq(&css_set_lock); |
351 | return count; | 351 | return count; |
352 | } | 352 | } |
@@ -1072,6 +1072,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, | |||
1072 | struct cgroup_subsys *ss; | 1072 | struct cgroup_subsys *ss; |
1073 | struct dentry *dentry; | 1073 | struct dentry *dentry; |
1074 | int i, ret; | 1074 | int i, ret; |
1075 | bool new_root = false; | ||
1075 | 1076 | ||
1076 | cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); | 1077 | cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); |
1077 | 1078 | ||
@@ -1181,10 +1182,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, | |||
1181 | ret = -ENOMEM; | 1182 | ret = -ENOMEM; |
1182 | goto out_unlock; | 1183 | goto out_unlock; |
1183 | } | 1184 | } |
1185 | new_root = true; | ||
1184 | 1186 | ||
1185 | init_cgroup_root(root, &opts); | 1187 | init_cgroup_root(root, &opts); |
1186 | 1188 | ||
1187 | ret = cgroup_setup_root(root, opts.subsys_mask); | 1189 | ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD); |
1188 | if (ret) | 1190 | if (ret) |
1189 | cgroup_free_root(root); | 1191 | cgroup_free_root(root); |
1190 | 1192 | ||
@@ -1201,6 +1203,18 @@ out_free: | |||
1201 | CGROUP_SUPER_MAGIC, ns); | 1203 | CGROUP_SUPER_MAGIC, ns); |
1202 | 1204 | ||
1203 | /* | 1205 | /* |
1206 | * There's a race window after we release cgroup_mutex and before | ||
1207 | * allocating a superblock. Make sure a concurrent process won't | ||
1208 | * be able to re-use the root during this window by delaying the | ||
1209 | * initialization of root refcnt. | ||
1210 | */ | ||
1211 | if (new_root) { | ||
1212 | mutex_lock(&cgroup_mutex); | ||
1213 | percpu_ref_reinit(&root->cgrp.self.refcnt); | ||
1214 | mutex_unlock(&cgroup_mutex); | ||
1215 | } | ||
1216 | |||
1217 | /* | ||
1204 | * If @pinned_sb, we're reusing an existing root and holding an | 1218 | * If @pinned_sb, we're reusing an existing root and holding an |
1205 | * extra ref on its sb. Mount is complete. Put the extra ref. | 1219 | * extra ref on its sb. Mount is complete. Put the extra ref. |
1206 | */ | 1220 | */ |
@@ -1286,7 +1300,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, | |||
1286 | u64 count; | 1300 | u64 count; |
1287 | 1301 | ||
1288 | rcu_read_lock(); | 1302 | rcu_read_lock(); |
1289 | count = atomic_read(&task_css_set(current)->refcount); | 1303 | count = refcount_read(&task_css_set(current)->refcount); |
1290 | rcu_read_unlock(); | 1304 | rcu_read_unlock(); |
1291 | return count; | 1305 | return count; |
1292 | } | 1306 | } |
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 687f5e0194ef..c3c9a0e1b3c9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c | |||
@@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly; | |||
189 | 189 | ||
190 | /* cgroup namespace for init task */ | 190 | /* cgroup namespace for init task */ |
191 | struct cgroup_namespace init_cgroup_ns = { | 191 | struct cgroup_namespace init_cgroup_ns = { |
192 | .count = { .counter = 2, }, | 192 | .count = REFCOUNT_INIT(2), |
193 | .user_ns = &init_user_ns, | 193 | .user_ns = &init_user_ns, |
194 | .ns.ops = &cgroupns_operations, | 194 | .ns.ops = &cgroupns_operations, |
195 | .ns.inum = PROC_CGROUP_INIT_INO, | 195 | .ns.inum = PROC_CGROUP_INIT_INO, |
@@ -436,7 +436,12 @@ out_unlock: | |||
436 | return css; | 436 | return css; |
437 | } | 437 | } |
438 | 438 | ||
439 | static void cgroup_get(struct cgroup *cgrp) | 439 | static void __maybe_unused cgroup_get(struct cgroup *cgrp) |
440 | { | ||
441 | css_get(&cgrp->self); | ||
442 | } | ||
443 | |||
444 | static void cgroup_get_live(struct cgroup *cgrp) | ||
440 | { | 445 | { |
441 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); | 446 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); |
442 | css_get(&cgrp->self); | 447 | css_get(&cgrp->self); |
@@ -554,7 +559,7 @@ EXPORT_SYMBOL_GPL(of_css); | |||
554 | * haven't been created. | 559 | * haven't been created. |
555 | */ | 560 | */ |
556 | struct css_set init_css_set = { | 561 | struct css_set init_css_set = { |
557 | .refcount = ATOMIC_INIT(1), | 562 | .refcount = REFCOUNT_INIT(1), |
558 | .tasks = LIST_HEAD_INIT(init_css_set.tasks), | 563 | .tasks = LIST_HEAD_INIT(init_css_set.tasks), |
559 | .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), | 564 | .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), |
560 | .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), | 565 | .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), |
@@ -724,7 +729,7 @@ void put_css_set_locked(struct css_set *cset) | |||
724 | 729 | ||
725 | lockdep_assert_held(&css_set_lock); | 730 | lockdep_assert_held(&css_set_lock); |
726 | 731 | ||
727 | if (!atomic_dec_and_test(&cset->refcount)) | 732 | if (!refcount_dec_and_test(&cset->refcount)) |
728 | return; | 733 | return; |
729 | 734 | ||
730 | /* This css_set is dead. unlink it and release cgroup and css refs */ | 735 | /* This css_set is dead. unlink it and release cgroup and css refs */ |
@@ -932,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, | |||
932 | list_add_tail(&link->cgrp_link, &cset->cgrp_links); | 937 | list_add_tail(&link->cgrp_link, &cset->cgrp_links); |
933 | 938 | ||
934 | if (cgroup_parent(cgrp)) | 939 | if (cgroup_parent(cgrp)) |
935 | cgroup_get(cgrp); | 940 | cgroup_get_live(cgrp); |
936 | } | 941 | } |
937 | 942 | ||
938 | /** | 943 | /** |
@@ -977,7 +982,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
977 | return NULL; | 982 | return NULL; |
978 | } | 983 | } |
979 | 984 | ||
980 | atomic_set(&cset->refcount, 1); | 985 | refcount_set(&cset->refcount, 1); |
981 | INIT_LIST_HEAD(&cset->tasks); | 986 | INIT_LIST_HEAD(&cset->tasks); |
982 | INIT_LIST_HEAD(&cset->mg_tasks); | 987 | INIT_LIST_HEAD(&cset->mg_tasks); |
983 | INIT_LIST_HEAD(&cset->task_iters); | 988 | INIT_LIST_HEAD(&cset->task_iters); |
@@ -1640,7 +1645,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) | |||
1640 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); | 1645 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); |
1641 | } | 1646 | } |
1642 | 1647 | ||
1643 | int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) | 1648 | int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) |
1644 | { | 1649 | { |
1645 | LIST_HEAD(tmp_links); | 1650 | LIST_HEAD(tmp_links); |
1646 | struct cgroup *root_cgrp = &root->cgrp; | 1651 | struct cgroup *root_cgrp = &root->cgrp; |
@@ -1656,8 +1661,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) | |||
1656 | root_cgrp->id = ret; | 1661 | root_cgrp->id = ret; |
1657 | root_cgrp->ancestor_ids[0] = ret; | 1662 | root_cgrp->ancestor_ids[0] = ret; |
1658 | 1663 | ||
1659 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, | 1664 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, |
1660 | GFP_KERNEL); | 1665 | ref_flags, GFP_KERNEL); |
1661 | if (ret) | 1666 | if (ret) |
1662 | goto out; | 1667 | goto out; |
1663 | 1668 | ||
@@ -1802,7 +1807,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1802 | return ERR_PTR(-EINVAL); | 1807 | return ERR_PTR(-EINVAL); |
1803 | } | 1808 | } |
1804 | cgrp_dfl_visible = true; | 1809 | cgrp_dfl_visible = true; |
1805 | cgroup_get(&cgrp_dfl_root.cgrp); | 1810 | cgroup_get_live(&cgrp_dfl_root.cgrp); |
1806 | 1811 | ||
1807 | dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, | 1812 | dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, |
1808 | CGROUP2_SUPER_MAGIC, ns); | 1813 | CGROUP2_SUPER_MAGIC, ns); |
@@ -2576,7 +2581,7 @@ restart: | |||
2576 | if (!css || !percpu_ref_is_dying(&css->refcnt)) | 2581 | if (!css || !percpu_ref_is_dying(&css->refcnt)) |
2577 | continue; | 2582 | continue; |
2578 | 2583 | ||
2579 | cgroup_get(dsct); | 2584 | cgroup_get_live(dsct); |
2580 | prepare_to_wait(&dsct->offline_waitq, &wait, | 2585 | prepare_to_wait(&dsct->offline_waitq, &wait, |
2581 | TASK_UNINTERRUPTIBLE); | 2586 | TASK_UNINTERRUPTIBLE); |
2582 | 2587 | ||
@@ -3947,7 +3952,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, | |||
3947 | { | 3952 | { |
3948 | lockdep_assert_held(&cgroup_mutex); | 3953 | lockdep_assert_held(&cgroup_mutex); |
3949 | 3954 | ||
3950 | cgroup_get(cgrp); | 3955 | cgroup_get_live(cgrp); |
3951 | 3956 | ||
3952 | memset(css, 0, sizeof(*css)); | 3957 | memset(css, 0, sizeof(*css)); |
3953 | css->cgroup = cgrp; | 3958 | css->cgroup = cgrp; |
@@ -4123,7 +4128,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) | |||
4123 | /* allocation complete, commit to creation */ | 4128 | /* allocation complete, commit to creation */ |
4124 | list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); | 4129 | list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); |
4125 | atomic_inc(&root->nr_cgrps); | 4130 | atomic_inc(&root->nr_cgrps); |
4126 | cgroup_get(parent); | 4131 | cgroup_get_live(parent); |
4127 | 4132 | ||
4128 | /* | 4133 | /* |
4129 | * @cgrp is now fully operational. If something fails after this | 4134 | * @cgrp is now fully operational. If something fails after this |
@@ -4513,7 +4518,7 @@ int __init cgroup_init(void) | |||
4513 | hash_add(css_set_table, &init_css_set.hlist, | 4518 | hash_add(css_set_table, &init_css_set.hlist, |
4514 | css_set_hash(init_css_set.subsys)); | 4519 | css_set_hash(init_css_set.subsys)); |
4515 | 4520 | ||
4516 | BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); | 4521 | BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); |
4517 | 4522 | ||
4518 | mutex_unlock(&cgroup_mutex); | 4523 | mutex_unlock(&cgroup_mutex); |
4519 | 4524 | ||
@@ -4947,7 +4952,7 @@ struct cgroup *cgroup_get_from_path(const char *path) | |||
4947 | if (kn) { | 4952 | if (kn) { |
4948 | if (kernfs_type(kn) == KERNFS_DIR) { | 4953 | if (kernfs_type(kn) == KERNFS_DIR) { |
4949 | cgrp = kn->priv; | 4954 | cgrp = kn->priv; |
4950 | cgroup_get(cgrp); | 4955 | cgroup_get_live(cgrp); |
4951 | } else { | 4956 | } else { |
4952 | cgrp = ERR_PTR(-ENOTDIR); | 4957 | cgrp = ERR_PTR(-ENOTDIR); |
4953 | } | 4958 | } |
@@ -5027,6 +5032,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) | |||
5027 | 5032 | ||
5028 | /* Socket clone path */ | 5033 | /* Socket clone path */ |
5029 | if (skcd->val) { | 5034 | if (skcd->val) { |
5035 | /* | ||
5036 | * We might be cloning a socket which is left in an empty | ||
5037 | * cgroup and the cgroup might have already been rmdir'd. | ||
5038 | * Don't use cgroup_get_live(). | ||
5039 | */ | ||
5030 | cgroup_get(sock_cgroup_ptr(skcd)); | 5040 | cgroup_get(sock_cgroup_ptr(skcd)); |
5031 | return; | 5041 | return; |
5032 | } | 5042 | } |
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0f41292be0fb..f6501f4f6040 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c | |||
@@ -2121,10 +2121,8 @@ int __init cpuset_init(void) | |||
2121 | { | 2121 | { |
2122 | int err = 0; | 2122 | int err = 0; |
2123 | 2123 | ||
2124 | if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) | 2124 | BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); |
2125 | BUG(); | 2125 | BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); |
2126 | if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) | ||
2127 | BUG(); | ||
2128 | 2126 | ||
2129 | cpumask_setall(top_cpuset.cpus_allowed); | 2127 | cpumask_setall(top_cpuset.cpus_allowed); |
2130 | nodes_setall(top_cpuset.mems_allowed); | 2128 | nodes_setall(top_cpuset.mems_allowed); |
@@ -2139,8 +2137,7 @@ int __init cpuset_init(void) | |||
2139 | if (err < 0) | 2137 | if (err < 0) |
2140 | return err; | 2138 | return err; |
2141 | 2139 | ||
2142 | if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) | 2140 | BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); |
2143 | BUG(); | ||
2144 | 2141 | ||
2145 | return 0; | 2142 | return 0; |
2146 | } | 2143 | } |
@@ -2354,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2354 | rebuild_sched_domains(); | 2351 | rebuild_sched_domains(); |
2355 | } | 2352 | } |
2356 | 2353 | ||
2357 | void cpuset_update_active_cpus(bool cpu_online) | 2354 | void cpuset_update_active_cpus(void) |
2358 | { | 2355 | { |
2359 | /* | 2356 | /* |
2360 | * We're inside cpu hotplug critical region which usually nests | 2357 | * We're inside cpu hotplug critical region which usually nests |
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 96d38dab6fb2..66129eb4371d 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c | |||
@@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) | |||
31 | kfree(new_ns); | 31 | kfree(new_ns); |
32 | return ERR_PTR(ret); | 32 | return ERR_PTR(ret); |
33 | } | 33 | } |
34 | atomic_set(&new_ns->count, 1); | 34 | refcount_set(&new_ns->count, 1); |
35 | new_ns->ns.ops = &cgroupns_operations; | 35 | new_ns->ns.ops = &cgroupns_operations; |
36 | return new_ns; | 36 | return new_ns; |
37 | } | 37 | } |
diff --git a/kernel/compat.c b/kernel/compat.c index 19aec5d98108..933bcb31ae10 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -108,8 +108,8 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, | |||
108 | COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, | 108 | COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, |
109 | struct timezone __user *, tz) | 109 | struct timezone __user *, tz) |
110 | { | 110 | { |
111 | struct timespec64 new_ts; | ||
111 | struct timeval user_tv; | 112 | struct timeval user_tv; |
112 | struct timespec new_ts; | ||
113 | struct timezone new_tz; | 113 | struct timezone new_tz; |
114 | 114 | ||
115 | if (tv) { | 115 | if (tv) { |
@@ -123,7 +123,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, | |||
123 | return -EFAULT; | 123 | return -EFAULT; |
124 | } | 124 | } |
125 | 125 | ||
126 | return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); | 126 | return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); |
127 | } | 127 | } |
128 | 128 | ||
129 | static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) | 129 | static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) |
@@ -240,18 +240,20 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, | |||
240 | struct compat_timespec __user *, rmtp) | 240 | struct compat_timespec __user *, rmtp) |
241 | { | 241 | { |
242 | struct timespec tu, rmt; | 242 | struct timespec tu, rmt; |
243 | struct timespec64 tu64; | ||
243 | mm_segment_t oldfs; | 244 | mm_segment_t oldfs; |
244 | long ret; | 245 | long ret; |
245 | 246 | ||
246 | if (compat_get_timespec(&tu, rqtp)) | 247 | if (compat_get_timespec(&tu, rqtp)) |
247 | return -EFAULT; | 248 | return -EFAULT; |
248 | 249 | ||
249 | if (!timespec_valid(&tu)) | 250 | tu64 = timespec_to_timespec64(tu); |
251 | if (!timespec64_valid(&tu64)) | ||
250 | return -EINVAL; | 252 | return -EINVAL; |
251 | 253 | ||
252 | oldfs = get_fs(); | 254 | oldfs = get_fs(); |
253 | set_fs(KERNEL_DS); | 255 | set_fs(KERNEL_DS); |
254 | ret = hrtimer_nanosleep(&tu, | 256 | ret = hrtimer_nanosleep(&tu64, |
255 | rmtp ? (struct timespec __user *)&rmt : NULL, | 257 | rmtp ? (struct timespec __user *)&rmt : NULL, |
256 | HRTIMER_MODE_REL, CLOCK_MONOTONIC); | 258 | HRTIMER_MODE_REL, CLOCK_MONOTONIC); |
257 | set_fs(oldfs); | 259 | set_fs(oldfs); |
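Both compat.c hunks convert the 32-bit struct timespec received from user space into a struct timespec64 before calling the 64-bit helpers do_sys_settimeofday64() and hrtimer_nanosleep(), presumably as part of the wider timespec64/y2038 conversion. A standalone user-space sketch of that widening, and of the 32-bit overflow it sidesteps (struct names and field widths only mimic a 32-bit kernel target):

#include <stdint.h>
#include <stdio.h>

struct ts32 { int32_t tv_sec; int32_t tv_nsec; };       /* old layout on a 32-bit target */
struct ts64 { int64_t tv_sec; long    tv_nsec; };       /* y2038-safe layout */

/* cf. timespec_to_timespec64(): widen the seconds field, copy the nanoseconds. */
static struct ts64 ts32_to_ts64(struct ts32 in)
{
        struct ts64 out = { .tv_sec = in.tv_sec, .tv_nsec = in.tv_nsec };
        return out;
}

int main(void)
{
        /* 2^31 - 1 seconds after the epoch: 2038-01-19 03:14:07 UTC */
        struct ts32 last = { .tv_sec = INT32_MAX, .tv_nsec = 0 };
        struct ts64 wide = ts32_to_ts64(last);

        /* One more second wraps negative in 32 bits but stays sane in 64 bits. */
        int32_t wrapped = (int32_t)((uint32_t)last.tv_sec + 1u);

        printf("32-bit: %d  64-bit: %lld\n", (int)wrapped, (long long)(wide.tv_sec + 1));
        return 0;
}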
diff --git a/kernel/cpu.c b/kernel/cpu.c index 37b223e4fc05..9ae6fbe5b5cf 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init); | |||
1125 | 1125 | ||
1126 | #endif /* CONFIG_PM_SLEEP_SMP */ | 1126 | #endif /* CONFIG_PM_SLEEP_SMP */ |
1127 | 1127 | ||
1128 | int __boot_cpu_id; | ||
1129 | |||
1128 | #endif /* CONFIG_SMP */ | 1130 | #endif /* CONFIG_SMP */ |
1129 | 1131 | ||
1130 | /* Boot processor state steps */ | 1132 | /* Boot processor state steps */ |
@@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void) | |||
1815 | set_cpu_active(cpu, true); | 1817 | set_cpu_active(cpu, true); |
1816 | set_cpu_present(cpu, true); | 1818 | set_cpu_present(cpu, true); |
1817 | set_cpu_possible(cpu, true); | 1819 | set_cpu_possible(cpu, true); |
1820 | |||
1821 | #ifdef CONFIG_SMP | ||
1822 | __boot_cpu_id = cpu; | ||
1823 | #endif | ||
1818 | } | 1824 | } |
1819 | 1825 | ||
1820 | /* | 1826 | /* |
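The cpu.c hunks record the CPU the kernel came up on in the new __boot_cpu_id, set once from boot_cpu_init(). A hypothetical consumer could look like the sketch below; is_boot_cpu() is an invented name, not an interface added by this diff.

#ifdef CONFIG_SMP
extern int __boot_cpu_id;

/* Hypothetical helper built on the __boot_cpu_id introduced above. */
static inline bool is_boot_cpu(unsigned int cpu)
{
        return cpu == (unsigned int)__boot_cpu_id;
}
#else
/* On a UP kernel there is only one CPU, and it is the boot CPU. */
static inline bool is_boot_cpu(unsigned int cpu)
{
        return cpu == 0;
}
#endif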
diff --git a/kernel/crash_core.c b/kernel/crash_core.c new file mode 100644 index 000000000000..fcbd568f1e95 --- /dev/null +++ b/kernel/crash_core.c | |||
@@ -0,0 +1,439 @@ | |||
1 | /* | ||
2 | * crash_core.c - kernel crash support code. | ||
3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/crash_core.h> | ||
10 | #include <linux/utsname.h> | ||
11 | #include <linux/vmalloc.h> | ||
12 | |||
13 | #include <asm/page.h> | ||
14 | #include <asm/sections.h> | ||
15 | |||
16 | /* vmcoreinfo stuff */ | ||
17 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; | ||
18 | u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; | ||
19 | size_t vmcoreinfo_size; | ||
20 | size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); | ||
21 | |||
22 | /* | ||
23 | * Parsing of the "crashkernel" command line | ||
24 | * | ||
25 | * This code is intended to be called from architecture-specific code | ||
26 | */ | ||
27 | |||
28 | |||
29 | /* | ||
30 | * This function parses command lines in the format | ||
31 | * | ||
32 | * crashkernel=ramsize-range:size[,...][@offset] | ||
33 | * | ||
34 | * The function returns 0 on success and -EINVAL on failure. | ||
35 | */ | ||
36 | static int __init parse_crashkernel_mem(char *cmdline, | ||
37 | unsigned long long system_ram, | ||
38 | unsigned long long *crash_size, | ||
39 | unsigned long long *crash_base) | ||
40 | { | ||
41 | char *cur = cmdline, *tmp; | ||
42 | |||
43 | /* for each entry of the comma-separated list */ | ||
44 | do { | ||
45 | unsigned long long start, end = ULLONG_MAX, size; | ||
46 | |||
47 | /* get the start of the range */ | ||
48 | start = memparse(cur, &tmp); | ||
49 | if (cur == tmp) { | ||
50 | pr_warn("crashkernel: Memory value expected\n"); | ||
51 | return -EINVAL; | ||
52 | } | ||
53 | cur = tmp; | ||
54 | if (*cur != '-') { | ||
55 | pr_warn("crashkernel: '-' expected\n"); | ||
56 | return -EINVAL; | ||
57 | } | ||
58 | cur++; | ||
59 | |||
60 | /* if no ':' is here, then we read the end */ | ||
61 | if (*cur != ':') { | ||
62 | end = memparse(cur, &tmp); | ||
63 | if (cur == tmp) { | ||
64 | pr_warn("crashkernel: Memory value expected\n"); | ||
65 | return -EINVAL; | ||
66 | } | ||
67 | cur = tmp; | ||
68 | if (end <= start) { | ||
69 | pr_warn("crashkernel: end <= start\n"); | ||
70 | return -EINVAL; | ||
71 | } | ||
72 | } | ||
73 | |||
74 | if (*cur != ':') { | ||
75 | pr_warn("crashkernel: ':' expected\n"); | ||
76 | return -EINVAL; | ||
77 | } | ||
78 | cur++; | ||
79 | |||
80 | size = memparse(cur, &tmp); | ||
81 | if (cur == tmp) { | ||
82 | pr_warn("Memory value expected\n"); | ||
83 | return -EINVAL; | ||
84 | } | ||
85 | cur = tmp; | ||
86 | if (size >= system_ram) { | ||
87 | pr_warn("crashkernel: invalid size\n"); | ||
88 | return -EINVAL; | ||
89 | } | ||
90 | |||
91 | /* match ? */ | ||
92 | if (system_ram >= start && system_ram < end) { | ||
93 | *crash_size = size; | ||
94 | break; | ||
95 | } | ||
96 | } while (*cur++ == ','); | ||
97 | |||
98 | if (*crash_size > 0) { | ||
99 | while (*cur && *cur != ' ' && *cur != '@') | ||
100 | cur++; | ||
101 | if (*cur == '@') { | ||
102 | cur++; | ||
103 | *crash_base = memparse(cur, &tmp); | ||
104 | if (cur == tmp) { | ||
105 | pr_warn("Memory value expected after '@'\n"); | ||
106 | return -EINVAL; | ||
107 | } | ||
108 | } | ||
109 | } | ||
110 | |||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * This function parses "simple" (old) crashkernel command lines like | ||
116 | * | ||
117 | * crashkernel=size[@offset] | ||
118 | * | ||
119 | * It returns 0 on success and -EINVAL on failure. | ||
120 | */ | ||
121 | static int __init parse_crashkernel_simple(char *cmdline, | ||
122 | unsigned long long *crash_size, | ||
123 | unsigned long long *crash_base) | ||
124 | { | ||
125 | char *cur = cmdline; | ||
126 | |||
127 | *crash_size = memparse(cmdline, &cur); | ||
128 | if (cmdline == cur) { | ||
129 | pr_warn("crashkernel: memory value expected\n"); | ||
130 | return -EINVAL; | ||
131 | } | ||
132 | |||
133 | if (*cur == '@') | ||
134 | *crash_base = memparse(cur+1, &cur); | ||
135 | else if (*cur != ' ' && *cur != '\0') { | ||
136 | pr_warn("crashkernel: unrecognized char: %c\n", *cur); | ||
137 | return -EINVAL; | ||
138 | } | ||
139 | |||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | #define SUFFIX_HIGH 0 | ||
144 | #define SUFFIX_LOW 1 | ||
145 | #define SUFFIX_NULL 2 | ||
146 | static __initdata char *suffix_tbl[] = { | ||
147 | [SUFFIX_HIGH] = ",high", | ||
148 | [SUFFIX_LOW] = ",low", | ||
149 | [SUFFIX_NULL] = NULL, | ||
150 | }; | ||
151 | |||
152 | /* | ||
153 | * This function parses "suffix" crashkernel command lines like | ||
154 | * | ||
155 | * crashkernel=size,[high|low] | ||
156 | * | ||
157 | * It returns 0 on success and -EINVAL on failure. | ||
158 | */ | ||
159 | static int __init parse_crashkernel_suffix(char *cmdline, | ||
160 | unsigned long long *crash_size, | ||
161 | const char *suffix) | ||
162 | { | ||
163 | char *cur = cmdline; | ||
164 | |||
165 | *crash_size = memparse(cmdline, &cur); | ||
166 | if (cmdline == cur) { | ||
167 | pr_warn("crashkernel: memory value expected\n"); | ||
168 | return -EINVAL; | ||
169 | } | ||
170 | |||
171 | /* check with suffix */ | ||
172 | if (strncmp(cur, suffix, strlen(suffix))) { | ||
173 | pr_warn("crashkernel: unrecognized char: %c\n", *cur); | ||
174 | return -EINVAL; | ||
175 | } | ||
176 | cur += strlen(suffix); | ||
177 | if (*cur != ' ' && *cur != '\0') { | ||
178 | pr_warn("crashkernel: unrecognized char: %c\n", *cur); | ||
179 | return -EINVAL; | ||
180 | } | ||
181 | |||
182 | return 0; | ||
183 | } | ||
184 | |||
185 | static __init char *get_last_crashkernel(char *cmdline, | ||
186 | const char *name, | ||
187 | const char *suffix) | ||
188 | { | ||
189 | char *p = cmdline, *ck_cmdline = NULL; | ||
190 | |||
191 | /* find crashkernel and use the last one if there are more */ | ||
192 | p = strstr(p, name); | ||
193 | while (p) { | ||
194 | char *end_p = strchr(p, ' '); | ||
195 | char *q; | ||
196 | |||
197 | if (!end_p) | ||
198 | end_p = p + strlen(p); | ||
199 | |||
200 | if (!suffix) { | ||
201 | int i; | ||
202 | |||
203 | /* skip the one with any known suffix */ | ||
204 | for (i = 0; suffix_tbl[i]; i++) { | ||
205 | q = end_p - strlen(suffix_tbl[i]); | ||
206 | if (!strncmp(q, suffix_tbl[i], | ||
207 | strlen(suffix_tbl[i]))) | ||
208 | goto next; | ||
209 | } | ||
210 | ck_cmdline = p; | ||
211 | } else { | ||
212 | q = end_p - strlen(suffix); | ||
213 | if (!strncmp(q, suffix, strlen(suffix))) | ||
214 | ck_cmdline = p; | ||
215 | } | ||
216 | next: | ||
217 | p = strstr(p+1, name); | ||
218 | } | ||
219 | |||
220 | if (!ck_cmdline) | ||
221 | return NULL; | ||
222 | |||
223 | return ck_cmdline; | ||
224 | } | ||
225 | |||
226 | static int __init __parse_crashkernel(char *cmdline, | ||
227 | unsigned long long system_ram, | ||
228 | unsigned long long *crash_size, | ||
229 | unsigned long long *crash_base, | ||
230 | const char *name, | ||
231 | const char *suffix) | ||
232 | { | ||
233 | char *first_colon, *first_space; | ||
234 | char *ck_cmdline; | ||
235 | |||
236 | BUG_ON(!crash_size || !crash_base); | ||
237 | *crash_size = 0; | ||
238 | *crash_base = 0; | ||
239 | |||
240 | ck_cmdline = get_last_crashkernel(cmdline, name, suffix); | ||
241 | |||
242 | if (!ck_cmdline) | ||
243 | return -EINVAL; | ||
244 | |||
245 | ck_cmdline += strlen(name); | ||
246 | |||
247 | if (suffix) | ||
248 | return parse_crashkernel_suffix(ck_cmdline, crash_size, | ||
249 | suffix); | ||
250 | /* | ||
251 | * if the commandline contains a ':', then that's the extended | ||
252 | * syntax -- if not, it must be the classic syntax | ||
253 | */ | ||
254 | first_colon = strchr(ck_cmdline, ':'); | ||
255 | first_space = strchr(ck_cmdline, ' '); | ||
256 | if (first_colon && (!first_space || first_colon < first_space)) | ||
257 | return parse_crashkernel_mem(ck_cmdline, system_ram, | ||
258 | crash_size, crash_base); | ||
259 | |||
260 | return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * This function is the entry point for command line parsing and should be | ||
265 | * called from the arch-specific code. | ||
266 | */ | ||
267 | int __init parse_crashkernel(char *cmdline, | ||
268 | unsigned long long system_ram, | ||
269 | unsigned long long *crash_size, | ||
270 | unsigned long long *crash_base) | ||
271 | { | ||
272 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
273 | "crashkernel=", NULL); | ||
274 | } | ||
275 | |||
276 | int __init parse_crashkernel_high(char *cmdline, | ||
277 | unsigned long long system_ram, | ||
278 | unsigned long long *crash_size, | ||
279 | unsigned long long *crash_base) | ||
280 | { | ||
281 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
282 | "crashkernel=", suffix_tbl[SUFFIX_HIGH]); | ||
283 | } | ||
284 | |||
285 | int __init parse_crashkernel_low(char *cmdline, | ||
286 | unsigned long long system_ram, | ||
287 | unsigned long long *crash_size, | ||
288 | unsigned long long *crash_base) | ||
289 | { | ||
290 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
291 | "crashkernel=", suffix_tbl[SUFFIX_LOW]); | ||
292 | } | ||
293 | |||
294 | Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, | ||
295 | void *data, size_t data_len) | ||
296 | { | ||
297 | struct elf_note *note = (struct elf_note *)buf; | ||
298 | |||
299 | note->n_namesz = strlen(name) + 1; | ||
300 | note->n_descsz = data_len; | ||
301 | note->n_type = type; | ||
302 | buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); | ||
303 | memcpy(buf, name, note->n_namesz); | ||
304 | buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); | ||
305 | memcpy(buf, data, data_len); | ||
306 | buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); | ||
307 | |||
308 | return buf; | ||
309 | } | ||
310 | |||
311 | void final_note(Elf_Word *buf) | ||
312 | { | ||
313 | memset(buf, 0, sizeof(struct elf_note)); | ||
314 | } | ||
315 | |||
316 | static void update_vmcoreinfo_note(void) | ||
317 | { | ||
318 | u32 *buf = vmcoreinfo_note; | ||
319 | |||
320 | if (!vmcoreinfo_size) | ||
321 | return; | ||
322 | buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, | ||
323 | vmcoreinfo_size); | ||
324 | final_note(buf); | ||
325 | } | ||
326 | |||
327 | void crash_save_vmcoreinfo(void) | ||
328 | { | ||
329 | vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); | ||
330 | update_vmcoreinfo_note(); | ||
331 | } | ||
332 | |||
333 | void vmcoreinfo_append_str(const char *fmt, ...) | ||
334 | { | ||
335 | va_list args; | ||
336 | char buf[0x50]; | ||
337 | size_t r; | ||
338 | |||
339 | va_start(args, fmt); | ||
340 | r = vscnprintf(buf, sizeof(buf), fmt, args); | ||
341 | va_end(args); | ||
342 | |||
343 | r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); | ||
344 | |||
345 | memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); | ||
346 | |||
347 | vmcoreinfo_size += r; | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * provide an empty default implementation here -- architecture | ||
352 | * code may override this | ||
353 | */ | ||
354 | void __weak arch_crash_save_vmcoreinfo(void) | ||
355 | {} | ||
356 | |||
357 | phys_addr_t __weak paddr_vmcoreinfo_note(void) | ||
358 | { | ||
359 | return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); | ||
360 | } | ||
361 | |||
362 | static int __init crash_save_vmcoreinfo_init(void) | ||
363 | { | ||
364 | VMCOREINFO_OSRELEASE(init_uts_ns.name.release); | ||
365 | VMCOREINFO_PAGESIZE(PAGE_SIZE); | ||
366 | |||
367 | VMCOREINFO_SYMBOL(init_uts_ns); | ||
368 | VMCOREINFO_SYMBOL(node_online_map); | ||
369 | #ifdef CONFIG_MMU | ||
370 | VMCOREINFO_SYMBOL(swapper_pg_dir); | ||
371 | #endif | ||
372 | VMCOREINFO_SYMBOL(_stext); | ||
373 | VMCOREINFO_SYMBOL(vmap_area_list); | ||
374 | |||
375 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
376 | VMCOREINFO_SYMBOL(mem_map); | ||
377 | VMCOREINFO_SYMBOL(contig_page_data); | ||
378 | #endif | ||
379 | #ifdef CONFIG_SPARSEMEM | ||
380 | VMCOREINFO_SYMBOL(mem_section); | ||
381 | VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); | ||
382 | VMCOREINFO_STRUCT_SIZE(mem_section); | ||
383 | VMCOREINFO_OFFSET(mem_section, section_mem_map); | ||
384 | #endif | ||
385 | VMCOREINFO_STRUCT_SIZE(page); | ||
386 | VMCOREINFO_STRUCT_SIZE(pglist_data); | ||
387 | VMCOREINFO_STRUCT_SIZE(zone); | ||
388 | VMCOREINFO_STRUCT_SIZE(free_area); | ||
389 | VMCOREINFO_STRUCT_SIZE(list_head); | ||
390 | VMCOREINFO_SIZE(nodemask_t); | ||
391 | VMCOREINFO_OFFSET(page, flags); | ||
392 | VMCOREINFO_OFFSET(page, _refcount); | ||
393 | VMCOREINFO_OFFSET(page, mapping); | ||
394 | VMCOREINFO_OFFSET(page, lru); | ||
395 | VMCOREINFO_OFFSET(page, _mapcount); | ||
396 | VMCOREINFO_OFFSET(page, private); | ||
397 | VMCOREINFO_OFFSET(page, compound_dtor); | ||
398 | VMCOREINFO_OFFSET(page, compound_order); | ||
399 | VMCOREINFO_OFFSET(page, compound_head); | ||
400 | VMCOREINFO_OFFSET(pglist_data, node_zones); | ||
401 | VMCOREINFO_OFFSET(pglist_data, nr_zones); | ||
402 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
403 | VMCOREINFO_OFFSET(pglist_data, node_mem_map); | ||
404 | #endif | ||
405 | VMCOREINFO_OFFSET(pglist_data, node_start_pfn); | ||
406 | VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); | ||
407 | VMCOREINFO_OFFSET(pglist_data, node_id); | ||
408 | VMCOREINFO_OFFSET(zone, free_area); | ||
409 | VMCOREINFO_OFFSET(zone, vm_stat); | ||
410 | VMCOREINFO_OFFSET(zone, spanned_pages); | ||
411 | VMCOREINFO_OFFSET(free_area, free_list); | ||
412 | VMCOREINFO_OFFSET(list_head, next); | ||
413 | VMCOREINFO_OFFSET(list_head, prev); | ||
414 | VMCOREINFO_OFFSET(vmap_area, va_start); | ||
415 | VMCOREINFO_OFFSET(vmap_area, list); | ||
416 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); | ||
417 | log_buf_vmcoreinfo_setup(); | ||
418 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); | ||
419 | VMCOREINFO_NUMBER(NR_FREE_PAGES); | ||
420 | VMCOREINFO_NUMBER(PG_lru); | ||
421 | VMCOREINFO_NUMBER(PG_private); | ||
422 | VMCOREINFO_NUMBER(PG_swapcache); | ||
423 | VMCOREINFO_NUMBER(PG_slab); | ||
424 | #ifdef CONFIG_MEMORY_FAILURE | ||
425 | VMCOREINFO_NUMBER(PG_hwpoison); | ||
426 | #endif | ||
427 | VMCOREINFO_NUMBER(PG_head_mask); | ||
428 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | ||
429 | #ifdef CONFIG_HUGETLB_PAGE | ||
430 | VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); | ||
431 | #endif | ||
432 | |||
433 | arch_crash_save_vmcoreinfo(); | ||
434 | update_vmcoreinfo_note(); | ||
435 | |||
436 | return 0; | ||
437 | } | ||
438 | |||
439 | subsys_initcall(crash_save_vmcoreinfo_init); | ||
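The new kernel/crash_core.c centralises parsing of the crashkernel= variants documented in its comments -- crashkernel=size[@offset], crashkernel=range1:size1[,range2:size2,...][@offset] and crashkernel=size,high / crashkernel=size,low -- plus the vmcoreinfo ELF note generation. As a user-space sketch of the simple form, with parse_size() standing in for the kernel's memparse() (invented helper name, reduced error handling):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Accept a decimal/hex number with an optional K/M/G suffix, like memparse(). */
static unsigned long long parse_size(const char *s, char **end)
{
        unsigned long long val = strtoull(s, end, 0);

        switch (**end) {
        case 'G': case 'g': val <<= 10; /* fall through */
        case 'M': case 'm': val <<= 10; /* fall through */
        case 'K': case 'k': val <<= 10; (*end)++; break;
        }
        return val;
}

/* Mirrors the shape of parse_crashkernel_simple() above: size[@offset]. */
static int parse_crashkernel_simple(const char *cmdline,
                                    unsigned long long *crash_size,
                                    unsigned long long *crash_base)
{
        char *cur;

        *crash_size = parse_size(cmdline, &cur);
        if (cmdline == cur)
                return -EINVAL;         /* no number at all */

        if (*cur == '@')
                *crash_base = parse_size(cur + 1, &cur);
        else if (*cur != ' ' && *cur != '\0')
                return -EINVAL;         /* trailing junk */

        return 0;
}

int main(void)
{
        unsigned long long size = 0, base = 0;

        if (!parse_crashkernel_simple("256M@64M", &size, &base))
                printf("size=%llu base=%llu\n", size, base);
        return 0;
}

For example, crashkernel=256M@64M asks for a 256 MiB reservation starting at the 64 MiB mark for the crash kernel.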
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c04917cad1bf..1b2be63c8528 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c | |||
@@ -229,12 +229,18 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, | |||
229 | } | 229 | } |
230 | 230 | ||
231 | if (regs) { | 231 | if (regs) { |
232 | mm_segment_t fs; | ||
233 | |||
232 | if (crosstask) | 234 | if (crosstask) |
233 | goto exit_put; | 235 | goto exit_put; |
234 | 236 | ||
235 | if (add_mark) | 237 | if (add_mark) |
236 | perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); | 238 | perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); |
239 | |||
240 | fs = get_fs(); | ||
241 | set_fs(USER_DS); | ||
237 | perf_callchain_user(&ctx, regs); | 242 | perf_callchain_user(&ctx, regs); |
243 | set_fs(fs); | ||
238 | } | 244 | } |
239 | } | 245 | } |
240 | 246 | ||
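The callchain.c hunk brackets the user-space unwind with get_fs()/set_fs(USER_DS) so that perf_callchain_user() reads the user stack with the user address limit even if the event fired in a context running under KERNEL_DS. The save/override/restore discipline, shown with a plain thread-local variable standing in for the kernel's mm_segment_t (purely illustrative):

#include <stdio.h>

enum addr_limit { USER_DS, KERNEL_DS };

static _Thread_local enum addr_limit current_limit = KERNEL_DS;

static enum addr_limit get_limit(void)        { return current_limit; }
static void set_limit(enum addr_limit limit)  { current_limit = limit; }

static void unwind_user_stack(void)
{
        /* only user pointers would be dereferenced here */
        printf("unwinding with limit=%s\n",
               current_limit == USER_DS ? "USER_DS" : "KERNEL_DS");
}

int main(void)
{
        enum addr_limit saved = get_limit();    /* fs = get_fs()          */

        set_limit(USER_DS);                     /* set_fs(USER_DS)        */
        unwind_user_stack();                    /* perf_callchain_user()  */
        set_limit(saved);                       /* set_fs(fs)             */
        return 0;
}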
diff --git a/kernel/events/core.c b/kernel/events/core.c index ff01cba86f43..6e75a5c9412d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -48,6 +48,8 @@ | |||
48 | #include <linux/parser.h> | 48 | #include <linux/parser.h> |
49 | #include <linux/sched/clock.h> | 49 | #include <linux/sched/clock.h> |
50 | #include <linux/sched/mm.h> | 50 | #include <linux/sched/mm.h> |
51 | #include <linux/proc_ns.h> | ||
52 | #include <linux/mount.h> | ||
51 | 53 | ||
52 | #include "internal.h" | 54 | #include "internal.h" |
53 | 55 | ||
@@ -379,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); | |||
379 | 381 | ||
380 | static atomic_t nr_mmap_events __read_mostly; | 382 | static atomic_t nr_mmap_events __read_mostly; |
381 | static atomic_t nr_comm_events __read_mostly; | 383 | static atomic_t nr_comm_events __read_mostly; |
384 | static atomic_t nr_namespaces_events __read_mostly; | ||
382 | static atomic_t nr_task_events __read_mostly; | 385 | static atomic_t nr_task_events __read_mostly; |
383 | static atomic_t nr_freq_events __read_mostly; | 386 | static atomic_t nr_freq_events __read_mostly; |
384 | static atomic_t nr_switch_events __read_mostly; | 387 | static atomic_t nr_switch_events __read_mostly; |
@@ -3991,6 +3994,8 @@ static void unaccount_event(struct perf_event *event) | |||
3991 | atomic_dec(&nr_mmap_events); | 3994 | atomic_dec(&nr_mmap_events); |
3992 | if (event->attr.comm) | 3995 | if (event->attr.comm) |
3993 | atomic_dec(&nr_comm_events); | 3996 | atomic_dec(&nr_comm_events); |
3997 | if (event->attr.namespaces) | ||
3998 | atomic_dec(&nr_namespaces_events); | ||
3994 | if (event->attr.task) | 3999 | if (event->attr.task) |
3995 | atomic_dec(&nr_task_events); | 4000 | atomic_dec(&nr_task_events); |
3996 | if (event->attr.freq) | 4001 | if (event->attr.freq) |
@@ -6491,6 +6496,7 @@ static void perf_event_task(struct task_struct *task, | |||
6491 | void perf_event_fork(struct task_struct *task) | 6496 | void perf_event_fork(struct task_struct *task) |
6492 | { | 6497 | { |
6493 | perf_event_task(task, NULL, 1); | 6498 | perf_event_task(task, NULL, 1); |
6499 | perf_event_namespaces(task); | ||
6494 | } | 6500 | } |
6495 | 6501 | ||
6496 | /* | 6502 | /* |
@@ -6593,6 +6599,132 @@ void perf_event_comm(struct task_struct *task, bool exec) | |||
6593 | } | 6599 | } |
6594 | 6600 | ||
6595 | /* | 6601 | /* |
6602 | * namespaces tracking | ||
6603 | */ | ||
6604 | |||
6605 | struct perf_namespaces_event { | ||
6606 | struct task_struct *task; | ||
6607 | |||
6608 | struct { | ||
6609 | struct perf_event_header header; | ||
6610 | |||
6611 | u32 pid; | ||
6612 | u32 tid; | ||
6613 | u64 nr_namespaces; | ||
6614 | struct perf_ns_link_info link_info[NR_NAMESPACES]; | ||
6615 | } event_id; | ||
6616 | }; | ||
6617 | |||
6618 | static int perf_event_namespaces_match(struct perf_event *event) | ||
6619 | { | ||
6620 | return event->attr.namespaces; | ||
6621 | } | ||
6622 | |||
6623 | static void perf_event_namespaces_output(struct perf_event *event, | ||
6624 | void *data) | ||
6625 | { | ||
6626 | struct perf_namespaces_event *namespaces_event = data; | ||
6627 | struct perf_output_handle handle; | ||
6628 | struct perf_sample_data sample; | ||
6629 | int ret; | ||
6630 | |||
6631 | if (!perf_event_namespaces_match(event)) | ||
6632 | return; | ||
6633 | |||
6634 | perf_event_header__init_id(&namespaces_event->event_id.header, | ||
6635 | &sample, event); | ||
6636 | ret = perf_output_begin(&handle, event, | ||
6637 | namespaces_event->event_id.header.size); | ||
6638 | if (ret) | ||
6639 | return; | ||
6640 | |||
6641 | namespaces_event->event_id.pid = perf_event_pid(event, | ||
6642 | namespaces_event->task); | ||
6643 | namespaces_event->event_id.tid = perf_event_tid(event, | ||
6644 | namespaces_event->task); | ||
6645 | |||
6646 | perf_output_put(&handle, namespaces_event->event_id); | ||
6647 | |||
6648 | perf_event__output_id_sample(event, &handle, &sample); | ||
6649 | |||
6650 | perf_output_end(&handle); | ||
6651 | } | ||
6652 | |||
6653 | static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, | ||
6654 | struct task_struct *task, | ||
6655 | const struct proc_ns_operations *ns_ops) | ||
6656 | { | ||
6657 | struct path ns_path; | ||
6658 | struct inode *ns_inode; | ||
6659 | void *error; | ||
6660 | |||
6661 | error = ns_get_path(&ns_path, task, ns_ops); | ||
6662 | if (!error) { | ||
6663 | ns_inode = ns_path.dentry->d_inode; | ||
6664 | ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); | ||
6665 | ns_link_info->ino = ns_inode->i_ino; | ||
6666 | } | ||
6667 | } | ||
6668 | |||
6669 | void perf_event_namespaces(struct task_struct *task) | ||
6670 | { | ||
6671 | struct perf_namespaces_event namespaces_event; | ||
6672 | struct perf_ns_link_info *ns_link_info; | ||
6673 | |||
6674 | if (!atomic_read(&nr_namespaces_events)) | ||
6675 | return; | ||
6676 | |||
6677 | namespaces_event = (struct perf_namespaces_event){ | ||
6678 | .task = task, | ||
6679 | .event_id = { | ||
6680 | .header = { | ||
6681 | .type = PERF_RECORD_NAMESPACES, | ||
6682 | .misc = 0, | ||
6683 | .size = sizeof(namespaces_event.event_id), | ||
6684 | }, | ||
6685 | /* .pid */ | ||
6686 | /* .tid */ | ||
6687 | .nr_namespaces = NR_NAMESPACES, | ||
6688 | /* .link_info[NR_NAMESPACES] */ | ||
6689 | }, | ||
6690 | }; | ||
6691 | |||
6692 | ns_link_info = namespaces_event.event_id.link_info; | ||
6693 | |||
6694 | perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], | ||
6695 | task, &mntns_operations); | ||
6696 | |||
6697 | #ifdef CONFIG_USER_NS | ||
6698 | perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], | ||
6699 | task, &userns_operations); | ||
6700 | #endif | ||
6701 | #ifdef CONFIG_NET_NS | ||
6702 | perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], | ||
6703 | task, &netns_operations); | ||
6704 | #endif | ||
6705 | #ifdef CONFIG_UTS_NS | ||
6706 | perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], | ||
6707 | task, &utsns_operations); | ||
6708 | #endif | ||
6709 | #ifdef CONFIG_IPC_NS | ||
6710 | perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], | ||
6711 | task, &ipcns_operations); | ||
6712 | #endif | ||
6713 | #ifdef CONFIG_PID_NS | ||
6714 | perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], | ||
6715 | task, &pidns_operations); | ||
6716 | #endif | ||
6717 | #ifdef CONFIG_CGROUPS | ||
6718 | perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], | ||
6719 | task, &cgroupns_operations); | ||
6720 | #endif | ||
6721 | |||
6722 | perf_iterate_sb(perf_event_namespaces_output, | ||
6723 | &namespaces_event, | ||
6724 | NULL); | ||
6725 | } | ||
6726 | |||
6727 | /* | ||
6596 | * mmap tracking | 6728 | * mmap tracking |
6597 | */ | 6729 | */ |
6598 | 6730 | ||
@@ -9146,6 +9278,8 @@ static void account_event(struct perf_event *event) | |||
9146 | atomic_inc(&nr_mmap_events); | 9278 | atomic_inc(&nr_mmap_events); |
9147 | if (event->attr.comm) | 9279 | if (event->attr.comm) |
9148 | atomic_inc(&nr_comm_events); | 9280 | atomic_inc(&nr_comm_events); |
9281 | if (event->attr.namespaces) | ||
9282 | atomic_inc(&nr_namespaces_events); | ||
9149 | if (event->attr.task) | 9283 | if (event->attr.task) |
9150 | atomic_inc(&nr_task_events); | 9284 | atomic_inc(&nr_task_events); |
9151 | if (event->attr.freq) | 9285 | if (event->attr.freq) |
@@ -9691,6 +9825,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
9691 | return -EACCES; | 9825 | return -EACCES; |
9692 | } | 9826 | } |
9693 | 9827 | ||
9828 | if (attr.namespaces) { | ||
9829 | if (!capable(CAP_SYS_ADMIN)) | ||
9830 | return -EACCES; | ||
9831 | } | ||
9832 | |||
9694 | if (attr.freq) { | 9833 | if (attr.freq) { |
9695 | if (attr.sample_freq > sysctl_perf_event_sample_rate) | 9834 | if (attr.sample_freq > sysctl_perf_event_sample_rate) |
9696 | return -EINVAL; | 9835 | return -EINVAL; |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 257fa460b846..2831480c63a2 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -297,6 +297,19 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) | |||
297 | rb->paused = 1; | 297 | rb->paused = 1; |
298 | } | 298 | } |
299 | 299 | ||
300 | void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) | ||
301 | { | ||
302 | /* | ||
303 | * OVERWRITE is determined by perf_aux_output_end() and can't | ||
304 | * be passed in directly. | ||
305 | */ | ||
306 | if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE)) | ||
307 | return; | ||
308 | |||
309 | handle->aux_flags |= flags; | ||
310 | } | ||
311 | EXPORT_SYMBOL_GPL(perf_aux_output_flag); | ||
312 | |||
300 | /* | 313 | /* |
301 | * This is called before hardware starts writing to the AUX area to | 314 | * This is called before hardware starts writing to the AUX area to |
302 | * obtain an output handle and make sure there's room in the buffer. | 315 | * obtain an output handle and make sure there's room in the buffer. |
@@ -360,6 +373,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, | |||
360 | handle->event = event; | 373 | handle->event = event; |
361 | handle->head = aux_head; | 374 | handle->head = aux_head; |
362 | handle->size = 0; | 375 | handle->size = 0; |
376 | handle->aux_flags = 0; | ||
363 | 377 | ||
364 | /* | 378 | /* |
365 | * In overwrite mode, AUX data stores do not depend on aux_tail, | 379 | * In overwrite mode, AUX data stores do not depend on aux_tail, |
@@ -408,34 +422,32 @@ err: | |||
408 | * of the AUX buffer management code is that after pmu::stop(), the AUX | 422 | * of the AUX buffer management code is that after pmu::stop(), the AUX |
409 | * transaction must be stopped and therefore drop the AUX reference count. | 423 | * transaction must be stopped and therefore drop the AUX reference count. |
410 | */ | 424 | */ |
411 | void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, | 425 | void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) |
412 | bool truncated) | ||
413 | { | 426 | { |
427 | bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED); | ||
414 | struct ring_buffer *rb = handle->rb; | 428 | struct ring_buffer *rb = handle->rb; |
415 | bool wakeup = truncated; | ||
416 | unsigned long aux_head; | 429 | unsigned long aux_head; |
417 | u64 flags = 0; | ||
418 | |||
419 | if (truncated) | ||
420 | flags |= PERF_AUX_FLAG_TRUNCATED; | ||
421 | 430 | ||
422 | /* in overwrite mode, driver provides aux_head via handle */ | 431 | /* in overwrite mode, driver provides aux_head via handle */ |
423 | if (rb->aux_overwrite) { | 432 | if (rb->aux_overwrite) { |
424 | flags |= PERF_AUX_FLAG_OVERWRITE; | 433 | handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; |
425 | 434 | ||
426 | aux_head = handle->head; | 435 | aux_head = handle->head; |
427 | local_set(&rb->aux_head, aux_head); | 436 | local_set(&rb->aux_head, aux_head); |
428 | } else { | 437 | } else { |
438 | handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; | ||
439 | |||
429 | aux_head = local_read(&rb->aux_head); | 440 | aux_head = local_read(&rb->aux_head); |
430 | local_add(size, &rb->aux_head); | 441 | local_add(size, &rb->aux_head); |
431 | } | 442 | } |
432 | 443 | ||
433 | if (size || flags) { | 444 | if (size || handle->aux_flags) { |
434 | /* | 445 | /* |
435 | * Only send RECORD_AUX if we have something useful to communicate | 446 | * Only send RECORD_AUX if we have something useful to communicate |
436 | */ | 447 | */ |
437 | 448 | ||
438 | perf_event_aux_event(handle->event, aux_head, size, flags); | 449 | perf_event_aux_event(handle->event, aux_head, size, |
450 | handle->aux_flags); | ||
439 | } | 451 | } |
440 | 452 | ||
441 | aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); | 453 | aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); |
@@ -446,7 +458,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, | |||
446 | } | 458 | } |
447 | 459 | ||
448 | if (wakeup) { | 460 | if (wakeup) { |
449 | if (truncated) | 461 | if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) |
450 | handle->event->pending_disable = 1; | 462 | handle->event->pending_disable = 1; |
451 | perf_output_wakeup(handle); | 463 | perf_output_wakeup(handle); |
452 | } | 464 | } |
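The ring_buffer.c hunks replace the truncated argument of perf_aux_output_end() with flags accumulated on the handle through the new perf_aux_output_flag(). A hedged sketch of how a PMU driver's interrupt path would use the reworked API; everything prefixed mypmu_ is invented for illustration and the snippet is not meant to compile outside the kernel.

struct mypmu {                                  /* invented driver state */
        struct perf_output_handle handle;
        struct perf_event *event;
        /* ... device registers, buffers ... */
};

static void mypmu_irq_handler(struct mypmu *pmu)
{
        struct perf_output_handle *handle = &pmu->handle;
        unsigned long size = mypmu_read_bytes_written(pmu);     /* invented */

        /*
         * With the old API this was perf_aux_output_end(handle, size, true);
         * now the condition is reported as a flag on the handle...
         */
        if (mypmu_buffer_wrapped(pmu))                          /* invented */
                perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);

        /* ...and the transaction is closed with just the size. */
        perf_aux_output_end(handle, size);

        /* start the next transaction before re-enabling the hardware */
        if (!perf_aux_output_begin(handle, pmu->event))
                return;

        mypmu_enable(pmu);                                      /* invented */
}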
diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..aa1076c5e4a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -87,6 +87,7 @@ | |||
87 | #include <linux/compiler.h> | 87 | #include <linux/compiler.h> |
88 | #include <linux/sysctl.h> | 88 | #include <linux/sysctl.h> |
89 | #include <linux/kcov.h> | 89 | #include <linux/kcov.h> |
90 | #include <linux/livepatch.h> | ||
90 | 91 | ||
91 | #include <asm/pgtable.h> | 92 | #include <asm/pgtable.h> |
92 | #include <asm/pgalloc.h> | 93 | #include <asm/pgalloc.h> |
@@ -178,6 +179,24 @@ void __weak arch_release_thread_stack(unsigned long *stack) | |||
178 | */ | 179 | */ |
179 | #define NR_CACHED_STACKS 2 | 180 | #define NR_CACHED_STACKS 2 |
180 | static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); | 181 | static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); |
182 | |||
183 | static int free_vm_stack_cache(unsigned int cpu) | ||
184 | { | ||
185 | struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); | ||
186 | int i; | ||
187 | |||
188 | for (i = 0; i < NR_CACHED_STACKS; i++) { | ||
189 | struct vm_struct *vm_stack = cached_vm_stacks[i]; | ||
190 | |||
191 | if (!vm_stack) | ||
192 | continue; | ||
193 | |||
194 | vfree(vm_stack->addr); | ||
195 | cached_vm_stacks[i] = NULL; | ||
196 | } | ||
197 | |||
198 | return 0; | ||
199 | } | ||
181 | #endif | 200 | #endif |
182 | 201 | ||
183 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | 202 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) |
@@ -202,7 +221,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | |||
202 | 221 | ||
203 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, | 222 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, |
204 | VMALLOC_START, VMALLOC_END, | 223 | VMALLOC_START, VMALLOC_END, |
205 | THREADINFO_GFP | __GFP_HIGHMEM, | 224 | THREADINFO_GFP, |
206 | PAGE_KERNEL, | 225 | PAGE_KERNEL, |
207 | 0, node, __builtin_return_address(0)); | 226 | 0, node, __builtin_return_address(0)); |
208 | 227 | ||
@@ -466,6 +485,11 @@ void __init fork_init(void) | |||
466 | for (i = 0; i < UCOUNT_COUNTS; i++) { | 485 | for (i = 0; i < UCOUNT_COUNTS; i++) { |
467 | init_user_ns.ucount_max[i] = max_threads/2; | 486 | init_user_ns.ucount_max[i] = max_threads/2; |
468 | } | 487 | } |
488 | |||
489 | #ifdef CONFIG_VMAP_STACK | ||
490 | cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", | ||
491 | NULL, free_vm_stack_cache); | ||
492 | #endif | ||
469 | } | 493 | } |
470 | 494 | ||
471 | int __weak arch_dup_task_struct(struct task_struct *dst, | 495 | int __weak arch_dup_task_struct(struct task_struct *dst, |
@@ -536,7 +560,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
536 | set_task_stack_end_magic(tsk); | 560 | set_task_stack_end_magic(tsk); |
537 | 561 | ||
538 | #ifdef CONFIG_CC_STACKPROTECTOR | 562 | #ifdef CONFIG_CC_STACKPROTECTOR |
539 | tsk->stack_canary = get_random_int(); | 563 | tsk->stack_canary = get_random_long(); |
540 | #endif | 564 | #endif |
541 | 565 | ||
542 | /* | 566 | /* |
@@ -1313,7 +1337,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) | |||
1313 | if (atomic_dec_and_test(&sighand->count)) { | 1337 | if (atomic_dec_and_test(&sighand->count)) { |
1314 | signalfd_cleanup(sighand); | 1338 | signalfd_cleanup(sighand); |
1315 | /* | 1339 | /* |
1316 | * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it | 1340 | * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it |
1317 | * without an RCU grace period, see __lock_task_sighand(). | 1341 | * without an RCU grace period, see __lock_task_sighand(). |
1318 | */ | 1342 | */ |
1319 | kmem_cache_free(sighand_cachep, sighand); | 1343 | kmem_cache_free(sighand_cachep, sighand); |
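The comment rename above tracks the SLAB_DESTROY_BY_RCU -> SLAB_TYPESAFE_BY_RCU renaming done elsewhere in this series: objects from such a cache may be freed and reused for a new object of the same type without waiting for an RCU grace period, so lockless lookups must re-validate after locking. A condensed sketch of the pattern __lock_task_sighand() relies on (simplified; the real function also deals with irq state and exit races):

static struct sighand_struct *lock_task_sighand_sketch(struct task_struct *tsk,
                                                       unsigned long *flags)
{
        struct sighand_struct *sighand;

        rcu_read_lock();
        for (;;) {
                sighand = rcu_dereference(tsk->sighand);
                if (!sighand)
                        break;                  /* task is exiting */

                spin_lock_irqsave(&sighand->siglock, *flags);
                if (likely(sighand == tsk->sighand))
                        break;                  /* still the task's sighand: done */

                /* the object was recycled under us; drop the lock and retry */
                spin_unlock_irqrestore(&sighand->siglock, *flags);
        }
        rcu_read_unlock();

        return sighand;                         /* NULL, or locked and validated */
}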
@@ -1438,6 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
1438 | #ifdef CONFIG_RT_MUTEXES | 1462 | #ifdef CONFIG_RT_MUTEXES |
1439 | p->pi_waiters = RB_ROOT; | 1463 | p->pi_waiters = RB_ROOT; |
1440 | p->pi_waiters_leftmost = NULL; | 1464 | p->pi_waiters_leftmost = NULL; |
1465 | p->pi_top_task = NULL; | ||
1441 | p->pi_blocked_on = NULL; | 1466 | p->pi_blocked_on = NULL; |
1442 | #endif | 1467 | #endif |
1443 | } | 1468 | } |
@@ -1679,9 +1704,12 @@ static __latent_entropy struct task_struct *copy_process( | |||
1679 | goto bad_fork_cleanup_perf; | 1704 | goto bad_fork_cleanup_perf; |
1680 | /* copy all the process information */ | 1705 | /* copy all the process information */ |
1681 | shm_init_task(p); | 1706 | shm_init_task(p); |
1682 | retval = copy_semundo(clone_flags, p); | 1707 | retval = security_task_alloc(p, clone_flags); |
1683 | if (retval) | 1708 | if (retval) |
1684 | goto bad_fork_cleanup_audit; | 1709 | goto bad_fork_cleanup_audit; |
1710 | retval = copy_semundo(clone_flags, p); | ||
1711 | if (retval) | ||
1712 | goto bad_fork_cleanup_security; | ||
1685 | retval = copy_files(clone_flags, p); | 1713 | retval = copy_files(clone_flags, p); |
1686 | if (retval) | 1714 | if (retval) |
1687 | goto bad_fork_cleanup_semundo; | 1715 | goto bad_fork_cleanup_semundo; |
@@ -1797,6 +1825,8 @@ static __latent_entropy struct task_struct *copy_process( | |||
1797 | p->parent_exec_id = current->self_exec_id; | 1825 | p->parent_exec_id = current->self_exec_id; |
1798 | } | 1826 | } |
1799 | 1827 | ||
1828 | klp_copy_process(p); | ||
1829 | |||
1800 | spin_lock(¤t->sighand->siglock); | 1830 | spin_lock(¤t->sighand->siglock); |
1801 | 1831 | ||
1802 | /* | 1832 | /* |
@@ -1815,11 +1845,13 @@ static __latent_entropy struct task_struct *copy_process( | |||
1815 | */ | 1845 | */ |
1816 | recalc_sigpending(); | 1846 | recalc_sigpending(); |
1817 | if (signal_pending(current)) { | 1847 | if (signal_pending(current)) { |
1818 | spin_unlock(¤t->sighand->siglock); | ||
1819 | write_unlock_irq(&tasklist_lock); | ||
1820 | retval = -ERESTARTNOINTR; | 1848 | retval = -ERESTARTNOINTR; |
1821 | goto bad_fork_cancel_cgroup; | 1849 | goto bad_fork_cancel_cgroup; |
1822 | } | 1850 | } |
1851 | if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { | ||
1852 | retval = -ENOMEM; | ||
1853 | goto bad_fork_cancel_cgroup; | ||
1854 | } | ||
1823 | 1855 | ||
1824 | if (likely(p->pid)) { | 1856 | if (likely(p->pid)) { |
1825 | ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); | 1857 | ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); |
@@ -1877,6 +1909,8 @@ static __latent_entropy struct task_struct *copy_process( | |||
1877 | return p; | 1909 | return p; |
1878 | 1910 | ||
1879 | bad_fork_cancel_cgroup: | 1911 | bad_fork_cancel_cgroup: |
1912 | spin_unlock(¤t->sighand->siglock); | ||
1913 | write_unlock_irq(&tasklist_lock); | ||
1880 | cgroup_cancel_fork(p); | 1914 | cgroup_cancel_fork(p); |
1881 | bad_fork_free_pid: | 1915 | bad_fork_free_pid: |
1882 | cgroup_threadgroup_change_end(current); | 1916 | cgroup_threadgroup_change_end(current); |
@@ -1903,6 +1937,8 @@ bad_fork_cleanup_files: | |||
1903 | exit_files(p); /* blocking */ | 1937 | exit_files(p); /* blocking */ |
1904 | bad_fork_cleanup_semundo: | 1938 | bad_fork_cleanup_semundo: |
1905 | exit_sem(p); | 1939 | exit_sem(p); |
1940 | bad_fork_cleanup_security: | ||
1941 | security_task_free(p); | ||
1906 | bad_fork_cleanup_audit: | 1942 | bad_fork_cleanup_audit: |
1907 | audit_free(p); | 1943 | audit_free(p); |
1908 | bad_fork_cleanup_perf: | 1944 | bad_fork_cleanup_perf: |
@@ -2144,7 +2180,7 @@ void __init proc_caches_init(void) | |||
2144 | { | 2180 | { |
2145 | sighand_cachep = kmem_cache_create("sighand_cache", | 2181 | sighand_cachep = kmem_cache_create("sighand_cache", |
2146 | sizeof(struct sighand_struct), 0, | 2182 | sizeof(struct sighand_struct), 0, |
2147 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| | 2183 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| |
2148 | SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); | 2184 | SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); |
2149 | signal_cachep = kmem_cache_create("signal_cache", | 2185 | signal_cachep = kmem_cache_create("signal_cache", |
2150 | sizeof(struct signal_struct), 0, | 2186 | sizeof(struct signal_struct), 0, |
@@ -2352,6 +2388,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
2352 | } | 2388 | } |
2353 | } | 2389 | } |
2354 | 2390 | ||
2391 | perf_event_namespaces(current); | ||
2392 | |||
2355 | bad_unshare_cleanup_cred: | 2393 | bad_unshare_cleanup_cred: |
2356 | if (new_cred) | 2394 | if (new_cred) |
2357 | put_cred(new_cred); | 2395 | put_cred(new_cred); |
diff --git a/kernel/futex.c b/kernel/futex.c index 45858ec73941..357348a6cf6b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -802,7 +802,7 @@ static int refill_pi_state_cache(void) | |||
802 | return 0; | 802 | return 0; |
803 | } | 803 | } |
804 | 804 | ||
805 | static struct futex_pi_state * alloc_pi_state(void) | 805 | static struct futex_pi_state *alloc_pi_state(void) |
806 | { | 806 | { |
807 | struct futex_pi_state *pi_state = current->pi_state_cache; | 807 | struct futex_pi_state *pi_state = current->pi_state_cache; |
808 | 808 | ||
@@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void) | |||
812 | return pi_state; | 812 | return pi_state; |
813 | } | 813 | } |
814 | 814 | ||
815 | static void get_pi_state(struct futex_pi_state *pi_state) | ||
816 | { | ||
817 | WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); | ||
818 | } | ||
819 | |||
815 | /* | 820 | /* |
816 | * Drops a reference to the pi_state object and frees or caches it | 821 | * Drops a reference to the pi_state object and frees or caches it |
817 | * when the last reference is gone. | 822 | * when the last reference is gone. |
@@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) | |||
856 | * Look up the task based on what TID userspace gave us. | 861 | * Look up the task based on what TID userspace gave us. |
857 | * We don't trust it. | 862 | * We don't trust it. |
858 | */ | 863 | */ |
859 | static struct task_struct * futex_find_get_task(pid_t pid) | 864 | static struct task_struct *futex_find_get_task(pid_t pid) |
860 | { | 865 | { |
861 | struct task_struct *p; | 866 | struct task_struct *p; |
862 | 867 | ||
@@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr) | |||
916 | pi_state->owner = NULL; | 921 | pi_state->owner = NULL; |
917 | raw_spin_unlock_irq(&curr->pi_lock); | 922 | raw_spin_unlock_irq(&curr->pi_lock); |
918 | 923 | ||
919 | rt_mutex_unlock(&pi_state->pi_mutex); | 924 | get_pi_state(pi_state); |
920 | |||
921 | spin_unlock(&hb->lock); | 925 | spin_unlock(&hb->lock); |
922 | 926 | ||
927 | rt_mutex_futex_unlock(&pi_state->pi_mutex); | ||
928 | put_pi_state(pi_state); | ||
929 | |||
923 | raw_spin_lock_irq(&curr->pi_lock); | 930 | raw_spin_lock_irq(&curr->pi_lock); |
924 | } | 931 | } |
925 | raw_spin_unlock_irq(&curr->pi_lock); | 932 | raw_spin_unlock_irq(&curr->pi_lock); |
@@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr) | |||
973 | * | 980 | * |
974 | * [10] There is no transient state which leaves owner and user space | 981 | * [10] There is no transient state which leaves owner and user space |
975 | * TID out of sync. | 982 | * TID out of sync. |
983 | * | ||
984 | * | ||
985 | * Serialization and lifetime rules: | ||
986 | * | ||
987 | * hb->lock: | ||
988 | * | ||
989 | * hb -> futex_q, relation | ||
990 | * futex_q -> pi_state, relation | ||
991 | * | ||
992 | * (cannot be raw because hb can contain arbitrary amount | ||
993 | * of futex_q's) | ||
994 | * | ||
995 | * pi_mutex->wait_lock: | ||
996 | * | ||
997 | * {uval, pi_state} | ||
998 | * | ||
999 | * (and pi_mutex 'obviously') | ||
1000 | * | ||
1001 | * p->pi_lock: | ||
1002 | * | ||
1003 | * p->pi_state_list -> pi_state->list, relation | ||
1004 | * | ||
1005 | * pi_state->refcount: | ||
1006 | * | ||
1007 | * pi_state lifetime | ||
1008 | * | ||
1009 | * | ||
1010 | * Lock order: | ||
1011 | * | ||
1012 | * hb->lock | ||
1013 | * pi_mutex->wait_lock | ||
1014 | * p->pi_lock | ||
1015 | * | ||
976 | */ | 1016 | */ |
977 | 1017 | ||
978 | /* | 1018 | /* |
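A condensed sketch of how the following attach_to_pi_state() changes apply this lock order: hb->lock is already held by the caller and keeps the futex_q -> pi_state relation stable, wait_lock serialises {uval, pi_state}, and the user-space value is re-read under wait_lock before a reference is taken (the -EFAULT/-EAGAIN/-EINVAL distinctions of the real code are collapsed here).

static int attach_sketch(u32 __user *uaddr, u32 uval,
                         struct futex_pi_state *pi_state)
{
        u32 uval2;

        /* caller holds hb->lock, so pi_state cannot go away under us */

        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

        /* uval was read before wait_lock was taken; it may be stale */
        if (get_futex_value_locked(&uval2, uaddr) || uval != uval2) {
                raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
                return -EAGAIN;         /* let the caller retry from scratch */
        }

        get_pi_state(pi_state);         /* pin the lifetime before dropping wait_lock */
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        return 0;
}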
@@ -980,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr) | |||
980 | * the pi_state against the user space value. If correct, attach to | 1020 | * the pi_state against the user space value. If correct, attach to |
981 | * it. | 1021 | * it. |
982 | */ | 1022 | */ |
983 | static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, | 1023 | static int attach_to_pi_state(u32 __user *uaddr, u32 uval, |
1024 | struct futex_pi_state *pi_state, | ||
984 | struct futex_pi_state **ps) | 1025 | struct futex_pi_state **ps) |
985 | { | 1026 | { |
986 | pid_t pid = uval & FUTEX_TID_MASK; | 1027 | pid_t pid = uval & FUTEX_TID_MASK; |
1028 | u32 uval2; | ||
1029 | int ret; | ||
987 | 1030 | ||
988 | /* | 1031 | /* |
989 | * Userspace might have messed up non-PI and PI futexes [3] | 1032 | * Userspace might have messed up non-PI and PI futexes [3] |
@@ -991,9 +1034,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, | |||
991 | if (unlikely(!pi_state)) | 1034 | if (unlikely(!pi_state)) |
992 | return -EINVAL; | 1035 | return -EINVAL; |
993 | 1036 | ||
1037 | /* | ||
1038 | * We get here with hb->lock held, and having found a | ||
1039 | * futex_top_waiter(). This means that futex_lock_pi() of said futex_q | ||
1040 | * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), | ||
1041 | * which in turn means that futex_lock_pi() still has a reference on | ||
1042 | * our pi_state. | ||
1043 | * | ||
1044 | * The waiter holding a reference on @pi_state also protects against | ||
1045 | * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() | ||
1046 | * and futex_wait_requeue_pi() as it cannot go to 0 and consequently | ||
1047 | * free pi_state before we can take a reference ourselves. | ||
1048 | */ | ||
994 | WARN_ON(!atomic_read(&pi_state->refcount)); | 1049 | WARN_ON(!atomic_read(&pi_state->refcount)); |
995 | 1050 | ||
996 | /* | 1051 | /* |
1052 | * Now that we have a pi_state, we can acquire wait_lock | ||
1053 | * and do the state validation. | ||
1054 | */ | ||
1055 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); | ||
1056 | |||
1057 | /* | ||
1058 | * Since {uval, pi_state} is serialized by wait_lock, and our current | ||
1059 | * uval was read without holding it, it can have changed. Verify it | ||
1060 | * still is what we expect it to be, otherwise retry the entire | ||
1061 | * operation. | ||
1062 | */ | ||
1063 | if (get_futex_value_locked(&uval2, uaddr)) | ||
1064 | goto out_efault; | ||
1065 | |||
1066 | if (uval != uval2) | ||
1067 | goto out_eagain; | ||
1068 | |||
1069 | /* | ||
997 | * Handle the owner died case: | 1070 | * Handle the owner died case: |
998 | */ | 1071 | */ |
999 | if (uval & FUTEX_OWNER_DIED) { | 1072 | if (uval & FUTEX_OWNER_DIED) { |
@@ -1008,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, | |||
1008 | * is not 0. Inconsistent state. [5] | 1081 | * is not 0. Inconsistent state. [5] |
1009 | */ | 1082 | */ |
1010 | if (pid) | 1083 | if (pid) |
1011 | return -EINVAL; | 1084 | goto out_einval; |
1012 | /* | 1085 | /* |
1013 | * Take a ref on the state and return success. [4] | 1086 | * Take a ref on the state and return success. [4] |
1014 | */ | 1087 | */ |
1015 | goto out_state; | 1088 | goto out_attach; |
1016 | } | 1089 | } |
1017 | 1090 | ||
1018 | /* | 1091 | /* |
@@ -1024,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, | |||
1024 | * Take a ref on the state and return success. [6] | 1097 | * Take a ref on the state and return success. [6] |
1025 | */ | 1098 | */ |
1026 | if (!pid) | 1099 | if (!pid) |
1027 | goto out_state; | 1100 | goto out_attach; |
1028 | } else { | 1101 | } else { |
1029 | /* | 1102 | /* |
1030 | * If the owner died bit is not set, then the pi_state | 1103 | * If the owner died bit is not set, then the pi_state |
1031 | * must have an owner. [7] | 1104 | * must have an owner. [7] |
1032 | */ | 1105 | */ |
1033 | if (!pi_state->owner) | 1106 | if (!pi_state->owner) |
1034 | return -EINVAL; | 1107 | goto out_einval; |
1035 | } | 1108 | } |
1036 | 1109 | ||
1037 | /* | 1110 | /* |
@@ -1040,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, | |||
1040 | * user space TID. [9/10] | 1113 | * user space TID. [9/10] |
1041 | */ | 1114 | */ |
1042 | if (pid != task_pid_vnr(pi_state->owner)) | 1115 | if (pid != task_pid_vnr(pi_state->owner)) |
1043 | return -EINVAL; | 1116 | goto out_einval; |
1044 | out_state: | 1117 | |
1045 | atomic_inc(&pi_state->refcount); | 1118 | out_attach: |
1119 | get_pi_state(pi_state); | ||
1120 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | ||
1046 | *ps = pi_state; | 1121 | *ps = pi_state; |
1047 | return 0; | 1122 | return 0; |
1123 | |||
1124 | out_einval: | ||
1125 | ret = -EINVAL; | ||
1126 | goto out_error; | ||
1127 | |||
1128 | out_eagain: | ||
1129 | ret = -EAGAIN; | ||
1130 | goto out_error; | ||
1131 | |||
1132 | out_efault: | ||
1133 | ret = -EFAULT; | ||
1134 | goto out_error; | ||
1135 | |||
1136 | out_error: | ||
1137 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | ||
1138 | return ret; | ||
1048 | } | 1139 | } |
1049 | 1140 | ||
1050 | /* | 1141 | /* |
@@ -1095,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, | |||
1095 | 1186 | ||
1096 | /* | 1187 | /* |
1097 | * No existing pi state. First waiter. [2] | 1188 | * No existing pi state. First waiter. [2] |
1189 | * | ||
1190 | * This creates pi_state, we have hb->lock held, this means nothing can | ||
1191 | * observe this state, wait_lock is irrelevant. | ||
1098 | */ | 1192 | */ |
1099 | pi_state = alloc_pi_state(); | 1193 | pi_state = alloc_pi_state(); |
1100 | 1194 | ||
@@ -1119,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, | |||
1119 | return 0; | 1213 | return 0; |
1120 | } | 1214 | } |
1121 | 1215 | ||
1122 | static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | 1216 | static int lookup_pi_state(u32 __user *uaddr, u32 uval, |
1217 | struct futex_hash_bucket *hb, | ||
1123 | union futex_key *key, struct futex_pi_state **ps) | 1218 | union futex_key *key, struct futex_pi_state **ps) |
1124 | { | 1219 | { |
1125 | struct futex_q *match = futex_top_waiter(hb, key); | 1220 | struct futex_q *top_waiter = futex_top_waiter(hb, key); |
1126 | 1221 | ||
1127 | /* | 1222 | /* |
1128 | * If there is a waiter on that futex, validate it and | 1223 | * If there is a waiter on that futex, validate it and |
1129 | * attach to the pi_state when the validation succeeds. | 1224 | * attach to the pi_state when the validation succeeds. |
1130 | */ | 1225 | */ |
1131 | if (match) | 1226 | if (top_waiter) |
1132 | return attach_to_pi_state(uval, match->pi_state, ps); | 1227 | return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); |
1133 | 1228 | ||
1134 | /* | 1229 | /* |
1135 | * We are the first waiter - try to look up the owner based on | 1230 | * We are the first waiter - try to look up the owner based on |
@@ -1148,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) | |||
1148 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) | 1243 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) |
1149 | return -EFAULT; | 1244 | return -EFAULT; |
1150 | 1245 | ||
1151 | /*If user space value changed, let the caller retry */ | 1246 | /* If user space value changed, let the caller retry */ |
1152 | return curval != uval ? -EAGAIN : 0; | 1247 | return curval != uval ? -EAGAIN : 0; |
1153 | } | 1248 | } |
1154 | 1249 | ||
@@ -1176,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | |||
1176 | struct task_struct *task, int set_waiters) | 1271 | struct task_struct *task, int set_waiters) |
1177 | { | 1272 | { |
1178 | u32 uval, newval, vpid = task_pid_vnr(task); | 1273 | u32 uval, newval, vpid = task_pid_vnr(task); |
1179 | struct futex_q *match; | 1274 | struct futex_q *top_waiter; |
1180 | int ret; | 1275 | int ret; |
1181 | 1276 | ||
1182 | /* | 1277 | /* |
@@ -1202,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | |||
1202 | * Lookup existing state first. If it exists, try to attach to | 1297 | * Lookup existing state first. If it exists, try to attach to |
1203 | * its pi_state. | 1298 | * its pi_state. |
1204 | */ | 1299 | */ |
1205 | match = futex_top_waiter(hb, key); | 1300 | top_waiter = futex_top_waiter(hb, key); |
1206 | if (match) | 1301 | if (top_waiter) |
1207 | return attach_to_pi_state(uval, match->pi_state, ps); | 1302 | return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); |
1208 | 1303 | ||
1209 | /* | 1304 | /* |
1210 | * No waiter and user TID is 0. We are here because the | 1305 | * No waiter and user TID is 0. We are here because the |
@@ -1285,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) | |||
1285 | wake_q_add(wake_q, p); | 1380 | wake_q_add(wake_q, p); |
1286 | __unqueue_futex(q); | 1381 | __unqueue_futex(q); |
1287 | /* | 1382 | /* |
1288 | * The waiting task can free the futex_q as soon as | 1383 | * The waiting task can free the futex_q as soon as q->lock_ptr = NULL |
1289 | * q->lock_ptr = NULL is written, without taking any locks. A | 1384 | * is written, without taking any locks. This is possible in the event |
1290 | * memory barrier is required here to prevent the following | 1385 | * of a spurious wakeup, for example. A memory barrier is required here |
1291 | * store to lock_ptr from getting ahead of the plist_del. | 1386 | * to prevent the following store to lock_ptr from getting ahead of the |
1387 | * plist_del in __unqueue_futex(). | ||
1292 | */ | 1388 | */ |
1293 | smp_wmb(); | 1389 | smp_store_release(&q->lock_ptr, NULL); |
1294 | q->lock_ptr = NULL; | ||
1295 | } | 1390 | } |
1296 | 1391 | ||
1297 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, | 1392 | /* |
1298 | struct futex_hash_bucket *hb) | 1393 | * Caller must hold a reference on @pi_state. |
1394 | */ | ||
1395 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) | ||
1299 | { | 1396 | { |
1300 | struct task_struct *new_owner; | ||
1301 | struct futex_pi_state *pi_state = this->pi_state; | ||
1302 | u32 uninitialized_var(curval), newval; | 1397 | u32 uninitialized_var(curval), newval; |
1398 | struct task_struct *new_owner; | ||
1399 | bool postunlock = false; | ||
1303 | DEFINE_WAKE_Q(wake_q); | 1400 | DEFINE_WAKE_Q(wake_q); |
1304 | bool deboost; | ||
1305 | int ret = 0; | 1401 | int ret = 0; |
1306 | 1402 | ||
1307 | if (!pi_state) | ||
1308 | return -EINVAL; | ||
1309 | |||
1310 | /* | ||
1311 | * If current does not own the pi_state then the futex is | ||
1312 | * inconsistent and user space fiddled with the futex value. | ||
1313 | */ | ||
1314 | if (pi_state->owner != current) | ||
1315 | return -EINVAL; | ||
1316 | |||
1317 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); | ||
1318 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | 1403 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); |
1404 | if (WARN_ON_ONCE(!new_owner)) { | ||
1405 | /* | ||
1406 | * As per the comment in futex_unlock_pi() this should not happen. | ||
1407 | * | ||
1408 | * When this happens, give up our locks and try again, giving | ||
1409 | * the futex_lock_pi() instance time to complete, either by | ||
1410 | * waiting on the rtmutex or removing itself from the futex | ||
1411 | * queue. | ||
1412 | */ | ||
1413 | ret = -EAGAIN; | ||
1414 | goto out_unlock; | ||
1415 | } | ||
1319 | 1416 | ||
1320 | /* | 1417 | /* |
1321 | * It is possible that the next waiter (the one that brought | 1418 | * We pass it to the next owner. The WAITERS bit is always kept |
1322 | * this owner to the kernel) timed out and is no longer | 1419 | * enabled while there is PI state around. We cleanup the owner |
1323 | * waiting on the lock. | 1420 | * died bit, because we are the owner. |
1324 | */ | ||
1325 | if (!new_owner) | ||
1326 | new_owner = this->task; | ||
1327 | |||
1328 | /* | ||
1329 | * We pass it to the next owner. The WAITERS bit is always | ||
1330 | * kept enabled while there is PI state around. We cleanup the | ||
1331 | * owner died bit, because we are the owner. | ||
1332 | */ | 1421 | */ |
1333 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); | 1422 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); |
1334 | 1423 | ||
@@ -1337,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, | |||
1337 | 1426 | ||
1338 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { | 1427 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { |
1339 | ret = -EFAULT; | 1428 | ret = -EFAULT; |
1429 | |||
1340 | } else if (curval != uval) { | 1430 | } else if (curval != uval) { |
1341 | /* | 1431 | /* |
1342 | * If a unconditional UNLOCK_PI operation (user space did not | 1432 | * If a unconditional UNLOCK_PI operation (user space did not |
@@ -1349,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, | |||
1349 | else | 1439 | else |
1350 | ret = -EINVAL; | 1440 | ret = -EINVAL; |
1351 | } | 1441 | } |
1352 | if (ret) { | 1442 | |
1353 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | 1443 | if (ret) |
1354 | return ret; | 1444 | goto out_unlock; |
1355 | } | 1445 | |
1446 | /* | ||
1447 | * This is a point of no return; once we modify the uval there is no | ||
1448 | * going back and subsequent operations must not fail. | ||
1449 | */ | ||
1356 | 1450 | ||
1357 | raw_spin_lock(&pi_state->owner->pi_lock); | 1451 | raw_spin_lock(&pi_state->owner->pi_lock); |
1358 | WARN_ON(list_empty(&pi_state->list)); | 1452 | WARN_ON(list_empty(&pi_state->list)); |
@@ -1365,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, | |||
1365 | pi_state->owner = new_owner; | 1459 | pi_state->owner = new_owner; |
1366 | raw_spin_unlock(&new_owner->pi_lock); | 1460 | raw_spin_unlock(&new_owner->pi_lock); |
1367 | 1461 | ||
1368 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | 1462 | postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); |
1369 | 1463 | ||
1370 | deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); | 1464 | out_unlock: |
1465 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | ||
1371 | 1466 | ||
1372 | /* | 1467 | if (postunlock) |
1373 | * First unlock HB so the waiter does not spin on it once he got woken | 1468 | rt_mutex_postunlock(&wake_q); |
1374 | * up. Second wake up the waiter before the priority is adjusted. If we | ||
1375 | * deboost first (and lose our higher priority), then the task might get | ||
1376 | * scheduled away before the wake up can take place. | ||
1377 | */ | ||
1378 | spin_unlock(&hb->lock); | ||
1379 | wake_up_q(&wake_q); | ||
1380 | if (deboost) | ||
1381 | rt_mutex_adjust_prio(current); | ||
1382 | 1469 | ||
1383 | return 0; | 1470 | return ret; |
1384 | } | 1471 | } |
1385 | 1472 | ||
1386 | /* | 1473 | /* |
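Note on the reworked wake_futex_pi(): the caller now takes pi_state->pi_mutex.wait_lock and drops hb->lock before calling in, the cmpxchg of the user-space value is the explicit point of no return, and the actual wake-up/deboost is deferred to rt_mutex_postunlock() once wait_lock is released. The user-space side of the PI futex word this path rewrites is sketched below; sys_futex(), pi_lock() and pi_unlock() are illustrative wrapper names (not kernel or glibc APIs), gettid() needs _GNU_SOURCE and a recent glibc, and error handling is elided.

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint32_t lock_word;      /* 0 == unlocked; otherwise owner TID (| FUTEX_WAITERS) */

static long sys_futex(void *uaddr, int op, uint32_t val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void pi_lock(void)
{
        uint32_t expected = 0, tid = (uint32_t)gettid();

        /* fast path: 0 -> TID; on contention let the kernel queue us */
        if (!atomic_compare_exchange_strong(&lock_word, &expected, tid))
                sys_futex(&lock_word, FUTEX_LOCK_PI, 0);
}

static void pi_unlock(void)
{
        uint32_t tid = (uint32_t)gettid();

        /* fast path: TID -> 0; if FUTEX_WAITERS is set, the kernel hands the lock off */
        if (!atomic_compare_exchange_strong(&lock_word, &tid, 0))
                sys_futex(&lock_word, FUTEX_UNLOCK_PI, 0);
}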
@@ -1826,7 +1913,7 @@ retry_private: | |||
1826 | * If that call succeeds then we have pi_state and an | 1913 | * If that call succeeds then we have pi_state and an |
1827 | * initial refcount on it. | 1914 | * initial refcount on it. |
1828 | */ | 1915 | */ |
1829 | ret = lookup_pi_state(ret, hb2, &key2, &pi_state); | 1916 | ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); |
1830 | } | 1917 | } |
1831 | 1918 | ||
1832 | switch (ret) { | 1919 | switch (ret) { |
@@ -1909,7 +1996,7 @@ retry_private: | |||
1909 | * refcount on the pi_state and store the pointer in | 1996 | * refcount on the pi_state and store the pointer in |
1910 | * the futex_q object of the waiter. | 1997 | * the futex_q object of the waiter. |
1911 | */ | 1998 | */ |
1912 | atomic_inc(&pi_state->refcount); | 1999 | get_pi_state(pi_state); |
1913 | this->pi_state = pi_state; | 2000 | this->pi_state = pi_state; |
1914 | ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, | 2001 | ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, |
1915 | this->rt_waiter, | 2002 | this->rt_waiter, |
@@ -2009,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb) | |||
2009 | hb_waiters_dec(hb); | 2096 | hb_waiters_dec(hb); |
2010 | } | 2097 | } |
2011 | 2098 | ||
2012 | /** | 2099 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) |
2013 | * queue_me() - Enqueue the futex_q on the futex_hash_bucket | ||
2014 | * @q: The futex_q to enqueue | ||
2015 | * @hb: The destination hash bucket | ||
2016 | * | ||
2017 | * The hb->lock must be held by the caller, and is released here. A call to | ||
2018 | * queue_me() is typically paired with exactly one call to unqueue_me(). The | ||
2019 | * exceptions involve the PI related operations, which may use unqueue_me_pi() | ||
2020 | * or nothing if the unqueue is done as part of the wake process and the unqueue | ||
2021 | * state is implicit in the state of woken task (see futex_wait_requeue_pi() for | ||
2022 | * an example). | ||
2023 | */ | ||
2024 | static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | ||
2025 | __releases(&hb->lock) | ||
2026 | { | 2100 | { |
2027 | int prio; | 2101 | int prio; |
2028 | 2102 | ||
@@ -2039,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | |||
2039 | plist_node_init(&q->list, prio); | 2113 | plist_node_init(&q->list, prio); |
2040 | plist_add(&q->list, &hb->chain); | 2114 | plist_add(&q->list, &hb->chain); |
2041 | q->task = current; | 2115 | q->task = current; |
2116 | } | ||
2117 | |||
2118 | /** | ||
2119 | * queue_me() - Enqueue the futex_q on the futex_hash_bucket | ||
2120 | * @q: The futex_q to enqueue | ||
2121 | * @hb: The destination hash bucket | ||
2122 | * | ||
2123 | * The hb->lock must be held by the caller, and is released here. A call to | ||
2124 | * queue_me() is typically paired with exactly one call to unqueue_me(). The | ||
2125 | * exceptions involve the PI related operations, which may use unqueue_me_pi() | ||
2126 | * or nothing if the unqueue is done as part of the wake process and the unqueue | ||
2127 | * state is implicit in the state of woken task (see futex_wait_requeue_pi() for | ||
2128 | * an example). | ||
2129 | */ | ||
2130 | static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | ||
2131 | __releases(&hb->lock) | ||
2132 | { | ||
2133 | __queue_me(q, hb); | ||
2042 | spin_unlock(&hb->lock); | 2134 | spin_unlock(&hb->lock); |
2043 | } | 2135 | } |
2044 | 2136 | ||
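The enqueue step is split so that futex_lock_pi() can queue itself while still holding hb->lock: __queue_me() only inserts the futex_q into the hash bucket, while queue_me() keeps its old contract of also dropping the lock. A generic, non-kernel sketch of that split, with illustrative names:

#include <pthread.h>

struct node {
        struct node *next;
};

struct bucket {
        pthread_mutex_t lock;
        struct node *head;
};

static void __enqueue(struct bucket *b, struct node *n)    /* b->lock held by caller */
{
        n->next = b->head;
        b->head = n;
}

static void enqueue(struct bucket *b, struct node *n)      /* also drops b->lock */
{
        __enqueue(b, n);
        pthread_mutex_unlock(&b->lock);
}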
@@ -2125,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
2125 | { | 2217 | { |
2126 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; | 2218 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; |
2127 | struct futex_pi_state *pi_state = q->pi_state; | 2219 | struct futex_pi_state *pi_state = q->pi_state; |
2128 | struct task_struct *oldowner = pi_state->owner; | ||
2129 | u32 uval, uninitialized_var(curval), newval; | 2220 | u32 uval, uninitialized_var(curval), newval; |
2221 | struct task_struct *oldowner; | ||
2130 | int ret; | 2222 | int ret; |
2131 | 2223 | ||
2224 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); | ||
2225 | |||
2226 | oldowner = pi_state->owner; | ||
2132 | /* Owner died? */ | 2227 | /* Owner died? */ |
2133 | if (!pi_state->owner) | 2228 | if (!pi_state->owner) |
2134 | newtid |= FUTEX_OWNER_DIED; | 2229 | newtid |= FUTEX_OWNER_DIED; |
@@ -2136,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
2136 | /* | 2231 | /* |
2137 | * We are here either because we stole the rtmutex from the | 2232 | * We are here either because we stole the rtmutex from the |
2138 | * previous highest priority waiter or we are the highest priority | 2233 | * previous highest priority waiter or we are the highest priority |
2139 | * waiter but failed to get the rtmutex the first time. | 2234 | * waiter but have failed to get the rtmutex the first time. |
2235 | * | ||
2140 | * We have to replace the newowner TID in the user space variable. | 2236 | * We have to replace the newowner TID in the user space variable. |
2141 | * This must be atomic as we have to preserve the owner died bit here. | 2237 | * This must be atomic as we have to preserve the owner died bit here. |
2142 | * | 2238 | * |
@@ -2144,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
2144 | * because we can fault here. Imagine swapped out pages or a fork | 2240 | * because we can fault here. Imagine swapped out pages or a fork |
2145 | * that marked all the anonymous memory readonly for cow. | 2241 | * that marked all the anonymous memory readonly for cow. |
2146 | * | 2242 | * |
2147 | * Modifying pi_state _before_ the user space value would | 2243 | * Modifying pi_state _before_ the user space value would leave the |
2148 | * leave the pi_state in an inconsistent state when we fault | 2244 | * pi_state in an inconsistent state when we fault here, because we |
2149 | * here, because we need to drop the hash bucket lock to | 2245 | * need to drop the locks to handle the fault. This might be observed |
2150 | * handle the fault. This might be observed in the PID check | 2246 | * in the PID check in lookup_pi_state. |
2151 | * in lookup_pi_state. | ||
2152 | */ | 2247 | */ |
2153 | retry: | 2248 | retry: |
2154 | if (get_futex_value_locked(&uval, uaddr)) | 2249 | if (get_futex_value_locked(&uval, uaddr)) |
2155 | goto handle_fault; | 2250 | goto handle_fault; |
2156 | 2251 | ||
2157 | while (1) { | 2252 | for (;;) { |
2158 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | 2253 | newval = (uval & FUTEX_OWNER_DIED) | newtid; |
2159 | 2254 | ||
2160 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) | 2255 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
@@ -2169,47 +2264,60 @@ retry: | |||
2169 | * itself. | 2264 | * itself. |
2170 | */ | 2265 | */ |
2171 | if (pi_state->owner != NULL) { | 2266 | if (pi_state->owner != NULL) { |
2172 | raw_spin_lock_irq(&pi_state->owner->pi_lock); | 2267 | raw_spin_lock(&pi_state->owner->pi_lock); |
2173 | WARN_ON(list_empty(&pi_state->list)); | 2268 | WARN_ON(list_empty(&pi_state->list)); |
2174 | list_del_init(&pi_state->list); | 2269 | list_del_init(&pi_state->list); |
2175 | raw_spin_unlock_irq(&pi_state->owner->pi_lock); | 2270 | raw_spin_unlock(&pi_state->owner->pi_lock); |
2176 | } | 2271 | } |
2177 | 2272 | ||
2178 | pi_state->owner = newowner; | 2273 | pi_state->owner = newowner; |
2179 | 2274 | ||
2180 | raw_spin_lock_irq(&newowner->pi_lock); | 2275 | raw_spin_lock(&newowner->pi_lock); |
2181 | WARN_ON(!list_empty(&pi_state->list)); | 2276 | WARN_ON(!list_empty(&pi_state->list)); |
2182 | list_add(&pi_state->list, &newowner->pi_state_list); | 2277 | list_add(&pi_state->list, &newowner->pi_state_list); |
2183 | raw_spin_unlock_irq(&newowner->pi_lock); | 2278 | raw_spin_unlock(&newowner->pi_lock); |
2279 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | ||
2280 | |||
2184 | return 0; | 2281 | return 0; |
2185 | 2282 | ||
2186 | /* | 2283 | /* |
2187 | * To handle the page fault we need to drop the hash bucket | 2284 | * To handle the page fault we need to drop the locks here. That gives |
2188 | * lock here. That gives the other task (either the highest priority | 2285 | * the other task (either the highest priority waiter itself or the |
2189 | * waiter itself or the task which stole the rtmutex) the | 2286 | * task which stole the rtmutex) the chance to try the fixup of the |
2190 | * chance to try the fixup of the pi_state. So once we are | 2287 | * pi_state. So once we are back from handling the fault we need to |
2191 | * back from handling the fault we need to check the pi_state | 2288 | * check the pi_state after reacquiring the locks and before trying to |
2192 | * after reacquiring the hash bucket lock and before trying to | 2289 | * do another fixup. When the fixup has been done already we simply |
2193 | * do another fixup. When the fixup has been done already we | 2290 | * return. |
2194 | * simply return. | 2291 | * |
2292 | * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely | ||
2293 | * drop hb->lock since the caller owns the hb -> futex_q relation. | ||
2294 | * Dropping the pi_mutex->wait_lock requires the state revalidate. | ||
2195 | */ | 2295 | */ |
2196 | handle_fault: | 2296 | handle_fault: |
2297 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | ||
2197 | spin_unlock(q->lock_ptr); | 2298 | spin_unlock(q->lock_ptr); |
2198 | 2299 | ||
2199 | ret = fault_in_user_writeable(uaddr); | 2300 | ret = fault_in_user_writeable(uaddr); |
2200 | 2301 | ||
2201 | spin_lock(q->lock_ptr); | 2302 | spin_lock(q->lock_ptr); |
2303 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); | ||
2202 | 2304 | ||
2203 | /* | 2305 | /* |
2204 | * Check if someone else fixed it for us: | 2306 | * Check if someone else fixed it for us: |
2205 | */ | 2307 | */ |
2206 | if (pi_state->owner != oldowner) | 2308 | if (pi_state->owner != oldowner) { |
2207 | return 0; | 2309 | ret = 0; |
2310 | goto out_unlock; | ||
2311 | } | ||
2208 | 2312 | ||
2209 | if (ret) | 2313 | if (ret) |
2210 | return ret; | 2314 | goto out_unlock; |
2211 | 2315 | ||
2212 | goto retry; | 2316 | goto retry; |
2317 | |||
2318 | out_unlock: | ||
2319 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | ||
2320 | return ret; | ||
2213 | } | 2321 | } |
2214 | 2322 | ||
2215 | static long futex_wait_restart(struct restart_block *restart); | 2323 | static long futex_wait_restart(struct restart_block *restart); |
@@ -2231,13 +2339,16 @@ static long futex_wait_restart(struct restart_block *restart); | |||
2231 | */ | 2339 | */ |
2232 | static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) | 2340 | static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) |
2233 | { | 2341 | { |
2234 | struct task_struct *owner; | ||
2235 | int ret = 0; | 2342 | int ret = 0; |
2236 | 2343 | ||
2237 | if (locked) { | 2344 | if (locked) { |
2238 | /* | 2345 | /* |
2239 | * Got the lock. We might not be the anticipated owner if we | 2346 | * Got the lock. We might not be the anticipated owner if we |
2240 | * did a lock-steal - fix up the PI-state in that case: | 2347 | * did a lock-steal - fix up the PI-state in that case: |
2348 | * | ||
2349 | * We can safely read pi_state->owner without holding wait_lock | ||
2350 | * because we now own the rt_mutex, only the owner will attempt | ||
2351 | * to change it. | ||
2241 | */ | 2352 | */ |
2242 | if (q->pi_state->owner != current) | 2353 | if (q->pi_state->owner != current) |
2243 | ret = fixup_pi_state_owner(uaddr, q, current); | 2354 | ret = fixup_pi_state_owner(uaddr, q, current); |
@@ -2245,43 +2356,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) | |||
2245 | } | 2356 | } |
2246 | 2357 | ||
2247 | /* | 2358 | /* |
2248 | * Catch the rare case, where the lock was released when we were on the | ||
2249 | * way back before we locked the hash bucket. | ||
2250 | */ | ||
2251 | if (q->pi_state->owner == current) { | ||
2252 | /* | ||
2253 | * Try to get the rt_mutex now. This might fail as some other | ||
2254 | * task acquired the rt_mutex after we removed ourself from the | ||
2255 | * rt_mutex waiters list. | ||
2256 | */ | ||
2257 | if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { | ||
2258 | locked = 1; | ||
2259 | goto out; | ||
2260 | } | ||
2261 | |||
2262 | /* | ||
2263 | * pi_state is incorrect, some other task did a lock steal and | ||
2264 | * we returned due to timeout or signal without taking the | ||
2265 | * rt_mutex. Too late. | ||
2266 | */ | ||
2267 | raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); | ||
2268 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); | ||
2269 | if (!owner) | ||
2270 | owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); | ||
2271 | raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); | ||
2272 | ret = fixup_pi_state_owner(uaddr, q, owner); | ||
2273 | goto out; | ||
2274 | } | ||
2275 | |||
2276 | /* | ||
2277 | * Paranoia check. If we did not take the lock, then we should not be | 2359 | * Paranoia check. If we did not take the lock, then we should not be |
2278 | * the owner of the rt_mutex. | 2360 | * the owner of the rt_mutex. |
2279 | */ | 2361 | */ |
2280 | if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) | 2362 | if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { |
2281 | printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " | 2363 | printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " |
2282 | "pi-state %p\n", ret, | 2364 | "pi-state %p\n", ret, |
2283 | q->pi_state->pi_mutex.owner, | 2365 | q->pi_state->pi_mutex.owner, |
2284 | q->pi_state->owner); | 2366 | q->pi_state->owner); |
2367 | } | ||
2285 | 2368 | ||
2286 | out: | 2369 | out: |
2287 | return ret ? ret : locked; | 2370 | return ret ? ret : locked; |
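fixup_pi_state_owner() now serializes on pi_mutex.wait_lock itself, drops both locks around the user-space fault, and re-checks pi_state->owner before retrying; the user-space word is rewritten with a cmpxchg loop that preserves the owner-died bit. The core of that loop in plain C11 atomics (OWNER_DIED stands in for FUTEX_OWNER_DIED, 0x40000000 in the uapi header; set_owner() is an illustrative name):

#include <stdatomic.h>
#include <stdint.h>

#define OWNER_DIED 0x40000000u          /* value of FUTEX_OWNER_DIED */

static void set_owner(_Atomic uint32_t *word, uint32_t newtid)
{
        uint32_t uval = atomic_load(word);
        uint32_t newval;

        do {
                newval = (uval & OWNER_DIED) | newtid;
                /* on failure, uval is refreshed with the current value */
        } while (!atomic_compare_exchange_weak(word, &uval, newval));
}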
@@ -2505,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, | |||
2505 | ktime_t *time, int trylock) | 2588 | ktime_t *time, int trylock) |
2506 | { | 2589 | { |
2507 | struct hrtimer_sleeper timeout, *to = NULL; | 2590 | struct hrtimer_sleeper timeout, *to = NULL; |
2591 | struct futex_pi_state *pi_state = NULL; | ||
2592 | struct rt_mutex_waiter rt_waiter; | ||
2508 | struct futex_hash_bucket *hb; | 2593 | struct futex_hash_bucket *hb; |
2509 | struct futex_q q = futex_q_init; | 2594 | struct futex_q q = futex_q_init; |
2510 | int res, ret; | 2595 | int res, ret; |
@@ -2557,25 +2642,68 @@ retry_private: | |||
2557 | } | 2642 | } |
2558 | } | 2643 | } |
2559 | 2644 | ||
2645 | WARN_ON(!q.pi_state); | ||
2646 | |||
2560 | /* | 2647 | /* |
2561 | * Only actually queue now that the atomic ops are done: | 2648 | * Only actually queue now that the atomic ops are done: |
2562 | */ | 2649 | */ |
2563 | queue_me(&q, hb); | 2650 | __queue_me(&q, hb); |
2564 | 2651 | ||
2565 | WARN_ON(!q.pi_state); | 2652 | if (trylock) { |
2566 | /* | 2653 | ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); |
2567 | * Block on the PI mutex: | ||
2568 | */ | ||
2569 | if (!trylock) { | ||
2570 | ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); | ||
2571 | } else { | ||
2572 | ret = rt_mutex_trylock(&q.pi_state->pi_mutex); | ||
2573 | /* Fixup the trylock return value: */ | 2654 | /* Fixup the trylock return value: */ |
2574 | ret = ret ? 0 : -EWOULDBLOCK; | 2655 | ret = ret ? 0 : -EWOULDBLOCK; |
2656 | goto no_block; | ||
2575 | } | 2657 | } |
2576 | 2658 | ||
2659 | rt_mutex_init_waiter(&rt_waiter); | ||
2660 | |||
2661 | /* | ||
2662 | * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not | ||
2663 | * hold it while doing rt_mutex_start_proxy(), because then it will | ||
2664 | * include hb->lock in the blocking chain, even through we'll not in | ||
2665 | * fact hold it while blocking. This will lead it to report -EDEADLK | ||
2666 | * and BUG when futex_unlock_pi() interleaves with this. | ||
2667 | * | ||
2668 | * Therefore acquire wait_lock while holding hb->lock, but drop the | ||
2669 | * latter before calling rt_mutex_start_proxy_lock(). This still fully | ||
2670 | * serializes against futex_unlock_pi() as that does the exact same | ||
2671 | * lock handoff sequence. | ||
2672 | */ | ||
2673 | raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); | ||
2674 | spin_unlock(q.lock_ptr); | ||
2675 | ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); | ||
2676 | raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); | ||
2677 | |||
2678 | if (ret) { | ||
2679 | if (ret == 1) | ||
2680 | ret = 0; | ||
2681 | |||
2682 | spin_lock(q.lock_ptr); | ||
2683 | goto no_block; | ||
2684 | } | ||
2685 | |||
2686 | |||
2687 | if (unlikely(to)) | ||
2688 | hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); | ||
2689 | |||
2690 | ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); | ||
2691 | |||
2577 | spin_lock(q.lock_ptr); | 2692 | spin_lock(q.lock_ptr); |
2578 | /* | 2693 | /* |
2694 | * If we failed to acquire the lock (signal/timeout), we must | ||
2695 | * first acquire the hb->lock before removing the lock from the | ||
2696 | * rt_mutex waitqueue, such that we can keep the hb and rt_mutex | ||
2697 | * wait lists consistent. | ||
2698 | * | ||
2699 | * In particular; it is important that futex_unlock_pi() can not | ||
2700 | * observe this inconsistency. | ||
2701 | */ | ||
2702 | if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) | ||
2703 | ret = 0; | ||
2704 | |||
2705 | no_block: | ||
2706 | /* | ||
2579 | * Fixup the pi_state owner and possibly acquire the lock if we | 2707 | * Fixup the pi_state owner and possibly acquire the lock if we |
2580 | * haven't already. | 2708 | * haven't already. |
2581 | */ | 2709 | */ |
@@ -2591,12 +2719,19 @@ retry_private: | |||
2591 | * If fixup_owner() faulted and was unable to handle the fault, unlock | 2719 | * If fixup_owner() faulted and was unable to handle the fault, unlock |
2592 | * it and return the fault to userspace. | 2720 | * it and return the fault to userspace. |
2593 | */ | 2721 | */ |
2594 | if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) | 2722 | if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { |
2595 | rt_mutex_unlock(&q.pi_state->pi_mutex); | 2723 | pi_state = q.pi_state; |
2724 | get_pi_state(pi_state); | ||
2725 | } | ||
2596 | 2726 | ||
2597 | /* Unqueue and drop the lock */ | 2727 | /* Unqueue and drop the lock */ |
2598 | unqueue_me_pi(&q); | 2728 | unqueue_me_pi(&q); |
2599 | 2729 | ||
2730 | if (pi_state) { | ||
2731 | rt_mutex_futex_unlock(&pi_state->pi_mutex); | ||
2732 | put_pi_state(pi_state); | ||
2733 | } | ||
2734 | |||
2600 | goto out_put_key; | 2735 | goto out_put_key; |
2601 | 2736 | ||
2602 | out_unlock_put_key: | 2737 | out_unlock_put_key: |
@@ -2605,8 +2740,10 @@ out_unlock_put_key: | |||
2605 | out_put_key: | 2740 | out_put_key: |
2606 | put_futex_key(&q.key); | 2741 | put_futex_key(&q.key); |
2607 | out: | 2742 | out: |
2608 | if (to) | 2743 | if (to) { |
2744 | hrtimer_cancel(&to->timer); | ||
2609 | destroy_hrtimer_on_stack(&to->timer); | 2745 | destroy_hrtimer_on_stack(&to->timer); |
2746 | } | ||
2610 | return ret != -EINTR ? ret : -ERESTARTNOINTR; | 2747 | return ret != -EINTR ? ret : -ERESTARTNOINTR; |
2611 | 2748 | ||
2612 | uaddr_faulted: | 2749 | uaddr_faulted: |
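futex_lock_pi() now blocks through the rt_mutex proxy-lock API: the proxy lock is started under pi_mutex.wait_lock with hb->lock already dropped, the wait happens in rt_mutex_wait_proxy_lock(), and a failed wait is cleaned up only after hb->lock has been re-taken. The ordering trick described in the comment in the hunk above (take the inner lock before releasing the outer one, so the outer lock never enters the blocking chain) in a generic pthread sketch with illustrative names:

#include <pthread.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;   /* plays hb->lock  */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;   /* plays wait_lock */

static void lock_handoff(void)
{
        pthread_mutex_lock(&outer);
        pthread_mutex_lock(&inner);     /* serialize against the unlock side */
        pthread_mutex_unlock(&outer);   /* never held while we block below   */
        /* ... enqueue ourselves / block while holding only "inner" ... */
        pthread_mutex_unlock(&inner);
}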
@@ -2633,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) | |||
2633 | u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); | 2770 | u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); |
2634 | union futex_key key = FUTEX_KEY_INIT; | 2771 | union futex_key key = FUTEX_KEY_INIT; |
2635 | struct futex_hash_bucket *hb; | 2772 | struct futex_hash_bucket *hb; |
2636 | struct futex_q *match; | 2773 | struct futex_q *top_waiter; |
2637 | int ret; | 2774 | int ret; |
2638 | 2775 | ||
2639 | retry: | 2776 | retry: |
@@ -2657,12 +2794,37 @@ retry: | |||
2657 | * all and we at least want to know if user space fiddled | 2794 | * all and we at least want to know if user space fiddled |
2658 | * with the futex value instead of blindly unlocking. | 2795 | * with the futex value instead of blindly unlocking. |
2659 | */ | 2796 | */ |
2660 | match = futex_top_waiter(hb, &key); | 2797 | top_waiter = futex_top_waiter(hb, &key); |
2661 | if (match) { | 2798 | if (top_waiter) { |
2662 | ret = wake_futex_pi(uaddr, uval, match, hb); | 2799 | struct futex_pi_state *pi_state = top_waiter->pi_state; |
2800 | |||
2801 | ret = -EINVAL; | ||
2802 | if (!pi_state) | ||
2803 | goto out_unlock; | ||
2804 | |||
2663 | /* | 2805 | /* |
2664 | * In case of success wake_futex_pi dropped the hash | 2806 | * If current does not own the pi_state then the futex is |
2665 | * bucket lock. | 2807 | * inconsistent and user space fiddled with the futex value. |
2808 | */ | ||
2809 | if (pi_state->owner != current) | ||
2810 | goto out_unlock; | ||
2811 | |||
2812 | get_pi_state(pi_state); | ||
2813 | /* | ||
2814 | * By taking wait_lock while still holding hb->lock, we ensure | ||
2815 | * there is no point where we hold neither; and therefore | ||
2816 | * wake_futex_pi() must observe a state consistent with what we | ||
2817 | * observed. | ||
2818 | */ | ||
2819 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); | ||
2820 | spin_unlock(&hb->lock); | ||
2821 | |||
2822 | ret = wake_futex_pi(uaddr, uval, pi_state); | ||
2823 | |||
2824 | put_pi_state(pi_state); | ||
2825 | |||
2826 | /* | ||
2827 | * Success, we're done! No tricky corner cases. | ||
2666 | */ | 2828 | */ |
2667 | if (!ret) | 2829 | if (!ret) |
2668 | goto out_putkey; | 2830 | goto out_putkey; |
@@ -2677,7 +2839,6 @@ retry: | |||
2677 | * setting the FUTEX_WAITERS bit. Try again. | 2839 | * setting the FUTEX_WAITERS bit. Try again. |
2678 | */ | 2840 | */ |
2679 | if (ret == -EAGAIN) { | 2841 | if (ret == -EAGAIN) { |
2680 | spin_unlock(&hb->lock); | ||
2681 | put_futex_key(&key); | 2842 | put_futex_key(&key); |
2682 | goto retry; | 2843 | goto retry; |
2683 | } | 2844 | } |
@@ -2685,7 +2846,7 @@ retry: | |||
2685 | * wake_futex_pi has detected invalid state. Tell user | 2846 | * wake_futex_pi has detected invalid state. Tell user |
2686 | * space. | 2847 | * space. |
2687 | */ | 2848 | */ |
2688 | goto out_unlock; | 2849 | goto out_putkey; |
2689 | } | 2850 | } |
2690 | 2851 | ||
2691 | /* | 2852 | /* |
@@ -2695,8 +2856,10 @@ retry: | |||
2695 | * preserve the WAITERS bit not the OWNER_DIED one. We are the | 2856 | * preserve the WAITERS bit not the OWNER_DIED one. We are the |
2696 | * owner. | 2857 | * owner. |
2697 | */ | 2858 | */ |
2698 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) | 2859 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { |
2860 | spin_unlock(&hb->lock); | ||
2699 | goto pi_faulted; | 2861 | goto pi_faulted; |
2862 | } | ||
2700 | 2863 | ||
2701 | /* | 2864 | /* |
2702 | * If uval has changed, let user space handle it. | 2865 | * If uval has changed, let user space handle it. |
@@ -2710,7 +2873,6 @@ out_putkey: | |||
2710 | return ret; | 2873 | return ret; |
2711 | 2874 | ||
2712 | pi_faulted: | 2875 | pi_faulted: |
2713 | spin_unlock(&hb->lock); | ||
2714 | put_futex_key(&key); | 2876 | put_futex_key(&key); |
2715 | 2877 | ||
2716 | ret = fault_in_user_writeable(uaddr); | 2878 | ret = fault_in_user_writeable(uaddr); |
@@ -2814,6 +2976,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2814 | u32 __user *uaddr2) | 2976 | u32 __user *uaddr2) |
2815 | { | 2977 | { |
2816 | struct hrtimer_sleeper timeout, *to = NULL; | 2978 | struct hrtimer_sleeper timeout, *to = NULL; |
2979 | struct futex_pi_state *pi_state = NULL; | ||
2817 | struct rt_mutex_waiter rt_waiter; | 2980 | struct rt_mutex_waiter rt_waiter; |
2818 | struct futex_hash_bucket *hb; | 2981 | struct futex_hash_bucket *hb; |
2819 | union futex_key key2 = FUTEX_KEY_INIT; | 2982 | union futex_key key2 = FUTEX_KEY_INIT; |
@@ -2840,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2840 | * The waiter is allocated on our stack, manipulated by the requeue | 3003 | * The waiter is allocated on our stack, manipulated by the requeue |
2841 | * code while we sleep on uaddr. | 3004 | * code while we sleep on uaddr. |
2842 | */ | 3005 | */ |
2843 | debug_rt_mutex_init_waiter(&rt_waiter); | 3006 | rt_mutex_init_waiter(&rt_waiter); |
2844 | RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); | ||
2845 | RB_CLEAR_NODE(&rt_waiter.tree_entry); | ||
2846 | rt_waiter.task = NULL; | ||
2847 | 3007 | ||
2848 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); | 3008 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); |
2849 | if (unlikely(ret != 0)) | 3009 | if (unlikely(ret != 0)) |
@@ -2898,8 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2898 | if (q.pi_state && (q.pi_state->owner != current)) { | 3058 | if (q.pi_state && (q.pi_state->owner != current)) { |
2899 | spin_lock(q.lock_ptr); | 3059 | spin_lock(q.lock_ptr); |
2900 | ret = fixup_pi_state_owner(uaddr2, &q, current); | 3060 | ret = fixup_pi_state_owner(uaddr2, &q, current); |
2901 | if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) | 3061 | if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { |
2902 | rt_mutex_unlock(&q.pi_state->pi_mutex); | 3062 | pi_state = q.pi_state; |
3063 | get_pi_state(pi_state); | ||
3064 | } | ||
2903 | /* | 3065 | /* |
2904 | * Drop the reference to the pi state which | 3066 | * Drop the reference to the pi state which |
2905 | * the requeue_pi() code acquired for us. | 3067 | * the requeue_pi() code acquired for us. |
@@ -2917,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2917 | */ | 3079 | */ |
2918 | WARN_ON(!q.pi_state); | 3080 | WARN_ON(!q.pi_state); |
2919 | pi_mutex = &q.pi_state->pi_mutex; | 3081 | pi_mutex = &q.pi_state->pi_mutex; |
2920 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); | 3082 | ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); |
2921 | debug_rt_mutex_free_waiter(&rt_waiter); | ||
2922 | 3083 | ||
2923 | spin_lock(q.lock_ptr); | 3084 | spin_lock(q.lock_ptr); |
3085 | if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) | ||
3086 | ret = 0; | ||
3087 | |||
3088 | debug_rt_mutex_free_waiter(&rt_waiter); | ||
2924 | /* | 3089 | /* |
2925 | * Fixup the pi_state owner and possibly acquire the lock if we | 3090 | * Fixup the pi_state owner and possibly acquire the lock if we |
2926 | * haven't already. | 3091 | * haven't already. |
@@ -2938,13 +3103,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2938 | * the fault, unlock the rt_mutex and return the fault to | 3103 | * the fault, unlock the rt_mutex and return the fault to |
2939 | * userspace. | 3104 | * userspace. |
2940 | */ | 3105 | */ |
2941 | if (ret && rt_mutex_owner(pi_mutex) == current) | 3106 | if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { |
2942 | rt_mutex_unlock(pi_mutex); | 3107 | pi_state = q.pi_state; |
3108 | get_pi_state(pi_state); | ||
3109 | } | ||
2943 | 3110 | ||
2944 | /* Unqueue and drop the lock. */ | 3111 | /* Unqueue and drop the lock. */ |
2945 | unqueue_me_pi(&q); | 3112 | unqueue_me_pi(&q); |
2946 | } | 3113 | } |
2947 | 3114 | ||
3115 | if (pi_state) { | ||
3116 | rt_mutex_futex_unlock(&pi_state->pi_mutex); | ||
3117 | put_pi_state(pi_state); | ||
3118 | } | ||
3119 | |||
2948 | if (ret == -EINTR) { | 3120 | if (ret == -EINTR) { |
2949 | /* | 3121 | /* |
2950 | * We've already been requeued, but cannot restart by calling | 3122 | * We've already been requeued, but cannot restart by calling |
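Both futex_lock_pi() and futex_wait_requeue_pi() now defer the rt_mutex unlock until after unqueue_me_pi(), keeping the pi_state alive across that window with get_pi_state()/put_pi_state(). That is the usual pattern of pinning an object with a reference before dropping the lock that protects it; a hypothetical userspace sketch (obj_get(), obj_put() and use_pinned() are illustrative names):

#include <stdatomic.h>
#include <stdlib.h>
#include <pthread.h>

struct obj {
        _Atomic int refcount;           /* starts at 1 for the owner */
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *current_obj;         /* protected by table_lock */

static void obj_get(struct obj *o) { atomic_fetch_add(&o->refcount, 1); }

static void obj_put(struct obj *o)
{
        if (atomic_fetch_sub(&o->refcount, 1) == 1)
                free(o);
}

static void use_pinned(void)
{
        struct obj *o;

        pthread_mutex_lock(&table_lock);
        o = current_obj;
        if (o)
                obj_get(o);             /* pin before dropping the lock */
        pthread_mutex_unlock(&table_lock);

        if (!o)
                return;
        /* ... operate on o without table_lock held ... */
        obj_put(o);                     /* may be the final reference */
}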
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 2f9df37940a0..c51a49c9be70 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c | |||
@@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) | |||
98 | } | 98 | } |
99 | EXPORT_SYMBOL(__gcov_merge_icall_topn); | 99 | EXPORT_SYMBOL(__gcov_merge_icall_topn); |
100 | 100 | ||
101 | void __gcov_exit(void) | ||
102 | { | ||
103 | /* Unused. */ | ||
104 | } | ||
105 | EXPORT_SYMBOL(__gcov_exit); | ||
106 | |||
101 | /** | 107 | /** |
102 | * gcov_enable_events - enable event reporting through gcov_event() | 108 | * gcov_enable_events - enable event reporting through gcov_event() |
103 | * | 109 | * |
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 6a5c239c7669..46a18e72bce6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c | |||
@@ -18,7 +18,9 @@ | |||
18 | #include <linux/vmalloc.h> | 18 | #include <linux/vmalloc.h> |
19 | #include "gcov.h" | 19 | #include "gcov.h" |
20 | 20 | ||
21 | #if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) | 21 | #if (__GNUC__ >= 7) |
22 | #define GCOV_COUNTERS 9 | ||
23 | #elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) | ||
22 | #define GCOV_COUNTERS 10 | 24 | #define GCOV_COUNTERS 10 |
23 | #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 | 25 | #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 |
24 | #define GCOV_COUNTERS 9 | 26 | #define GCOV_COUNTERS 9 |
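GCC 7 changes the gcov format again, so the version ladder gains a ">= 7" branch selecting nine counters, and the __gcov_exit() stub added in base.c satisfies a symbol newer compilers emit. A quick userspace check of which value the ladder picks for the compiler in use (values copied from this hunk; the pre-4.9 fallback is not shown here, so it is left as -1):

#include <stdio.h>

int main(void)
{
#if (__GNUC__ >= 7)
        int counters = 9;
#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
        int counters = 10;
#elif (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
        int counters = 9;
#else
        int counters = -1;              /* older fallback not shown in this hunk */
#endif
        printf("gcc %d.%d -> GCOV_COUNTERS %d\n", __GNUC__, __GNUC_MINOR__, counters);
        return 0;
}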
diff --git a/kernel/groups.c b/kernel/groups.c index 8dd7a61b7115..d09727692a2a 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
@@ -18,7 +18,7 @@ struct group_info *groups_alloc(int gidsetsize) | |||
18 | len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; | 18 | len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; |
19 | gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); | 19 | gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); |
20 | if (!gi) | 20 | if (!gi) |
21 | gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); | 21 | gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); |
22 | if (!gi) | 22 | if (!gi) |
23 | return NULL; | 23 | return NULL; |
24 | 24 | ||
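groups_alloc() keeps the "try kmalloc, fall back to __vmalloc" pattern but drops __GFP_HIGHMEM, which the vmalloc path now supplies implicitly. A kernel-context sketch (not standalone) of the same intent expressed with the kvmalloc() helpers; whether groups_alloc() itself was later converted this way is not shown in this hunk:

	gi = kvmalloc(len, GFP_KERNEL_ACCOUNT);
	if (!gi)
		return NULL;
	/* ... */
	kvfree(gi);	/* replaces the matching kfree()/vfree() logic on the free side */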
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index f0f8e2a8496f..751593ed7c0b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -43,6 +43,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_ | |||
43 | int __read_mostly sysctl_hung_task_warnings = 10; | 43 | int __read_mostly sysctl_hung_task_warnings = 10; |
44 | 44 | ||
45 | static int __read_mostly did_panic; | 45 | static int __read_mostly did_panic; |
46 | static bool hung_task_show_lock; | ||
46 | 47 | ||
47 | static struct task_struct *watchdog_task; | 48 | static struct task_struct *watchdog_task; |
48 | 49 | ||
@@ -120,12 +121,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
120 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 121 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" |
121 | " disables this message.\n"); | 122 | " disables this message.\n"); |
122 | sched_show_task(t); | 123 | sched_show_task(t); |
123 | debug_show_all_locks(); | 124 | hung_task_show_lock = true; |
124 | } | 125 | } |
125 | 126 | ||
126 | touch_nmi_watchdog(); | 127 | touch_nmi_watchdog(); |
127 | 128 | ||
128 | if (sysctl_hung_task_panic) { | 129 | if (sysctl_hung_task_panic) { |
130 | if (hung_task_show_lock) | ||
131 | debug_show_all_locks(); | ||
129 | trigger_all_cpu_backtrace(); | 132 | trigger_all_cpu_backtrace(); |
130 | panic("hung_task: blocked tasks"); | 133 | panic("hung_task: blocked tasks"); |
131 | } | 134 | } |
@@ -172,6 +175,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) | |||
172 | if (test_taint(TAINT_DIE) || did_panic) | 175 | if (test_taint(TAINT_DIE) || did_panic) |
173 | return; | 176 | return; |
174 | 177 | ||
178 | hung_task_show_lock = false; | ||
175 | rcu_read_lock(); | 179 | rcu_read_lock(); |
176 | for_each_process_thread(g, t) { | 180 | for_each_process_thread(g, t) { |
177 | if (!max_count--) | 181 | if (!max_count--) |
@@ -187,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) | |||
187 | } | 191 | } |
188 | unlock: | 192 | unlock: |
189 | rcu_read_unlock(); | 193 | rcu_read_unlock(); |
194 | if (hung_task_show_lock) | ||
195 | debug_show_all_locks(); | ||
190 | } | 196 | } |
191 | 197 | ||
192 | static long hung_timeout_jiffies(unsigned long last_checked, | 198 | static long hung_timeout_jiffies(unsigned long last_checked, |
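check_hung_uninterruptible_tasks() no longer calls debug_show_all_locks() once per suspect task; the per-task check only records that a dump is wanted, and the expensive, noisy dump runs once after the scan (or just before a hung-task panic). The shape of that change in a trivial runnable sketch (names and the "looks hung" predicate are made up):

#include <stdbool.h>
#include <stdio.h>

static bool dump_wanted;

static void check_one(int i)
{
        if (i % 7 == 0) {               /* stand-in for "this task looks hung" */
                printf("task %d appears hung\n", i);
                dump_wanted = true;     /* defer the expensive global dump */
        }
}

int main(void)
{
        dump_wanted = false;
        for (int i = 1; i <= 20; i++)
                check_one(i);
        if (dump_wanted)
                printf("dumping all lock state once\n");
        return 0;
}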
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index be3c34e4f2ac..c94da688ee9b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -348,7 +348,10 @@ void handle_nested_irq(unsigned int irq) | |||
348 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); | 348 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
349 | raw_spin_unlock_irq(&desc->lock); | 349 | raw_spin_unlock_irq(&desc->lock); |
350 | 350 | ||
351 | action_ret = action->thread_fn(action->irq, action->dev_id); | 351 | action_ret = IRQ_NONE; |
352 | for_each_action_of_desc(desc, action) | ||
353 | action_ret |= action->thread_fn(action->irq, action->dev_id); | ||
354 | |||
352 | if (!noirqdebug) | 355 | if (!noirqdebug) |
353 | note_interrupt(desc, action_ret); | 356 | note_interrupt(desc, action_ret); |
354 | 357 | ||
@@ -877,8 +880,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, | |||
877 | if (!desc) | 880 | if (!desc) |
878 | return; | 881 | return; |
879 | 882 | ||
880 | __irq_do_set_handler(desc, handle, 1, NULL); | ||
881 | desc->irq_common_data.handler_data = data; | 883 | desc->irq_common_data.handler_data = data; |
884 | __irq_do_set_handler(desc, handle, 1, NULL); | ||
882 | 885 | ||
883 | irq_put_desc_busunlock(desc, flags); | 886 | irq_put_desc_busunlock(desc, flags); |
884 | } | 887 | } |
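Two independent fixes above: handle_nested_irq() now runs every action of a shared nested interrupt and ORs the return values, and irq_set_chained_handler_and_data() publishes the handler data before installing the handler so the first invocation cannot observe stale data. The ordering point in a generic callback-registration sketch (illustrative types; real concurrent code also relies on the locking and barriers the irq core provides around this assignment):

struct handler {
        void (*fn)(void *data);
        void *data;
};

static void install(struct handler *h, void (*fn)(void *), void *data)
{
        h->data = data;   /* publish the data the callback will read ...      */
        h->fn = fn;       /* ... before the callback can possibly be invoked */
}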
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a4afe5cc5af1..070be980c37a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -852,7 +852,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) | |||
852 | * This code is triggered unconditionally. Check the affinity | 852 | * This code is triggered unconditionally. Check the affinity |
853 | * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. | 853 | * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. |
854 | */ | 854 | */ |
855 | if (desc->irq_common_data.affinity) | 855 | if (cpumask_available(desc->irq_common_data.affinity)) |
856 | cpumask_copy(mask, desc->irq_common_data.affinity); | 856 | cpumask_copy(mask, desc->irq_common_data.affinity); |
857 | else | 857 | else |
858 | valid = false; | 858 | valid = false; |
@@ -1212,8 +1212,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1212 | * set the trigger type must match. Also all must | 1212 | * set the trigger type must match. Also all must |
1213 | * agree on ONESHOT. | 1213 | * agree on ONESHOT. |
1214 | */ | 1214 | */ |
1215 | unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data); | ||
1216 | |||
1215 | if (!((old->flags & new->flags) & IRQF_SHARED) || | 1217 | if (!((old->flags & new->flags) & IRQF_SHARED) || |
1216 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || | 1218 | (oldtype != (new->flags & IRQF_TRIGGER_MASK)) || |
1217 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) | 1219 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) |
1218 | goto mismatch; | 1220 | goto mismatch; |
1219 | 1221 | ||
@@ -1557,7 +1559,7 @@ void remove_irq(unsigned int irq, struct irqaction *act) | |||
1557 | struct irq_desc *desc = irq_to_desc(irq); | 1559 | struct irq_desc *desc = irq_to_desc(irq); |
1558 | 1560 | ||
1559 | if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) | 1561 | if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) |
1560 | __free_irq(irq, act->dev_id); | 1562 | __free_irq(irq, act->dev_id); |
1561 | } | 1563 | } |
1562 | EXPORT_SYMBOL_GPL(remove_irq); | 1564 | EXPORT_SYMBOL_GPL(remove_irq); |
1563 | 1565 | ||
@@ -1574,20 +1576,27 @@ EXPORT_SYMBOL_GPL(remove_irq); | |||
1574 | * have completed. | 1576 | * have completed. |
1575 | * | 1577 | * |
1576 | * This function must not be called from interrupt context. | 1578 | * This function must not be called from interrupt context. |
1579 | * | ||
1580 | * Returns the devname argument passed to request_irq. | ||
1577 | */ | 1581 | */ |
1578 | void free_irq(unsigned int irq, void *dev_id) | 1582 | const void *free_irq(unsigned int irq, void *dev_id) |
1579 | { | 1583 | { |
1580 | struct irq_desc *desc = irq_to_desc(irq); | 1584 | struct irq_desc *desc = irq_to_desc(irq); |
1585 | struct irqaction *action; | ||
1586 | const char *devname; | ||
1581 | 1587 | ||
1582 | if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) | 1588 | if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) |
1583 | return; | 1589 | return NULL; |
1584 | 1590 | ||
1585 | #ifdef CONFIG_SMP | 1591 | #ifdef CONFIG_SMP |
1586 | if (WARN_ON(desc->affinity_notify)) | 1592 | if (WARN_ON(desc->affinity_notify)) |
1587 | desc->affinity_notify = NULL; | 1593 | desc->affinity_notify = NULL; |
1588 | #endif | 1594 | #endif |
1589 | 1595 | ||
1590 | kfree(__free_irq(irq, dev_id)); | 1596 | action = __free_irq(irq, dev_id); |
1597 | devname = action->name; | ||
1598 | kfree(action); | ||
1599 | return devname; | ||
1591 | } | 1600 | } |
1592 | EXPORT_SYMBOL(free_irq); | 1601 | EXPORT_SYMBOL(free_irq); |
1593 | 1602 | ||
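free_irq() now returns the devname string that was passed to request_irq(), so a caller that owns the name can reclaim or log it after the irqaction is gone; the shared-IRQ mismatch check also compares against irqd_get_trigger_type() rather than the raw flag bits. A hypothetical caller of the new return value (kernel-context sketch, not standalone):

	const void *devname = free_irq(irq, dev_id);

	pr_debug("released IRQ %u, originally requested as %s\n",
		 irq, devname ? (const char *)devname : "<none>");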
diff --git a/kernel/kcov.c b/kernel/kcov.c index 85e5546cd791..cd771993f96f 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c | |||
@@ -60,15 +60,8 @@ void notrace __sanitizer_cov_trace_pc(void) | |||
60 | /* | 60 | /* |
61 | * We are interested in code coverage as a function of a syscall inputs, | 61 | * We are interested in code coverage as a function of a syscall inputs, |
62 | * so we ignore code executed in interrupts. | 62 | * so we ignore code executed in interrupts. |
63 | * The checks for whether we are in an interrupt are open-coded, because | ||
64 | * 1. We can't use in_interrupt() here, since it also returns true | ||
65 | * when we are inside local_bh_disable() section. | ||
66 | * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), | ||
67 | * since that leads to slower generated code (three separate tests, | ||
68 | * one for each of the flags). | ||
69 | */ | 63 | */ |
70 | if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET | 64 | if (!t || !in_task()) |
71 | | NMI_MASK))) | ||
72 | return; | 65 | return; |
73 | mode = READ_ONCE(t->kcov_mode); | 66 | mode = READ_ONCE(t->kcov_mode); |
74 | if (mode == KCOV_MODE_TRACE) { | 67 | if (mode == KCOV_MODE_TRACE) { |
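The open-coded preempt_count() test in __sanitizer_cov_trace_pc() is replaced by in_task(), which encodes the same "not in NMI/hardirq/serving-softirq" condition without also excluding local_bh_disable() sections the way in_interrupt() would. For reference, a sketch of what in_task() expands to (modulo the exact definition in <linux/preempt.h>):

/* roughly: */
#define in_task()	(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))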
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bfe62d5b3872..ae1a3ba24df5 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c | |||
@@ -51,12 +51,6 @@ DEFINE_MUTEX(kexec_mutex); | |||
51 | /* Per cpu memory for storing cpu states in case of system crash. */ | 51 | /* Per cpu memory for storing cpu states in case of system crash. */ |
52 | note_buf_t __percpu *crash_notes; | 52 | note_buf_t __percpu *crash_notes; |
53 | 53 | ||
54 | /* vmcoreinfo stuff */ | ||
55 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; | ||
56 | u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; | ||
57 | size_t vmcoreinfo_size; | ||
58 | size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); | ||
59 | |||
60 | /* Flag to indicate we are going to kexec a new kernel */ | 54 | /* Flag to indicate we are going to kexec a new kernel */ |
61 | bool kexec_in_progress = false; | 55 | bool kexec_in_progress = false; |
62 | 56 | ||
@@ -996,34 +990,6 @@ unlock: | |||
996 | return ret; | 990 | return ret; |
997 | } | 991 | } |
998 | 992 | ||
999 | static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, | ||
1000 | size_t data_len) | ||
1001 | { | ||
1002 | struct elf_note note; | ||
1003 | |||
1004 | note.n_namesz = strlen(name) + 1; | ||
1005 | note.n_descsz = data_len; | ||
1006 | note.n_type = type; | ||
1007 | memcpy(buf, ¬e, sizeof(note)); | ||
1008 | buf += (sizeof(note) + 3)/4; | ||
1009 | memcpy(buf, name, note.n_namesz); | ||
1010 | buf += (note.n_namesz + 3)/4; | ||
1011 | memcpy(buf, data, note.n_descsz); | ||
1012 | buf += (note.n_descsz + 3)/4; | ||
1013 | |||
1014 | return buf; | ||
1015 | } | ||
1016 | |||
1017 | static void final_note(u32 *buf) | ||
1018 | { | ||
1019 | struct elf_note note; | ||
1020 | |||
1021 | note.n_namesz = 0; | ||
1022 | note.n_descsz = 0; | ||
1023 | note.n_type = 0; | ||
1024 | memcpy(buf, ¬e, sizeof(note)); | ||
1025 | } | ||
1026 | |||
1027 | void crash_save_cpu(struct pt_regs *regs, int cpu) | 993 | void crash_save_cpu(struct pt_regs *regs, int cpu) |
1028 | { | 994 | { |
1029 | struct elf_prstatus prstatus; | 995 | struct elf_prstatus prstatus; |
@@ -1085,403 +1051,6 @@ subsys_initcall(crash_notes_memory_init); | |||
1085 | 1051 | ||
1086 | 1052 | ||
1087 | /* | 1053 | /* |
1088 | * parsing the "crashkernel" commandline | ||
1089 | * | ||
1090 | * this code is intended to be called from architecture specific code | ||
1091 | */ | ||
1092 | |||
1093 | |||
1094 | /* | ||
1095 | * This function parses command lines in the format | ||
1096 | * | ||
1097 | * crashkernel=ramsize-range:size[,...][@offset] | ||
1098 | * | ||
1099 | * The function returns 0 on success and -EINVAL on failure. | ||
1100 | */ | ||
1101 | static int __init parse_crashkernel_mem(char *cmdline, | ||
1102 | unsigned long long system_ram, | ||
1103 | unsigned long long *crash_size, | ||
1104 | unsigned long long *crash_base) | ||
1105 | { | ||
1106 | char *cur = cmdline, *tmp; | ||
1107 | |||
1108 | /* for each entry of the comma-separated list */ | ||
1109 | do { | ||
1110 | unsigned long long start, end = ULLONG_MAX, size; | ||
1111 | |||
1112 | /* get the start of the range */ | ||
1113 | start = memparse(cur, &tmp); | ||
1114 | if (cur == tmp) { | ||
1115 | pr_warn("crashkernel: Memory value expected\n"); | ||
1116 | return -EINVAL; | ||
1117 | } | ||
1118 | cur = tmp; | ||
1119 | if (*cur != '-') { | ||
1120 | pr_warn("crashkernel: '-' expected\n"); | ||
1121 | return -EINVAL; | ||
1122 | } | ||
1123 | cur++; | ||
1124 | |||
1125 | /* if no ':' is here, then we read the end */ | ||
1126 | if (*cur != ':') { | ||
1127 | end = memparse(cur, &tmp); | ||
1128 | if (cur == tmp) { | ||
1129 | pr_warn("crashkernel: Memory value expected\n"); | ||
1130 | return -EINVAL; | ||
1131 | } | ||
1132 | cur = tmp; | ||
1133 | if (end <= start) { | ||
1134 | pr_warn("crashkernel: end <= start\n"); | ||
1135 | return -EINVAL; | ||
1136 | } | ||
1137 | } | ||
1138 | |||
1139 | if (*cur != ':') { | ||
1140 | pr_warn("crashkernel: ':' expected\n"); | ||
1141 | return -EINVAL; | ||
1142 | } | ||
1143 | cur++; | ||
1144 | |||
1145 | size = memparse(cur, &tmp); | ||
1146 | if (cur == tmp) { | ||
1147 | pr_warn("Memory value expected\n"); | ||
1148 | return -EINVAL; | ||
1149 | } | ||
1150 | cur = tmp; | ||
1151 | if (size >= system_ram) { | ||
1152 | pr_warn("crashkernel: invalid size\n"); | ||
1153 | return -EINVAL; | ||
1154 | } | ||
1155 | |||
1156 | /* match ? */ | ||
1157 | if (system_ram >= start && system_ram < end) { | ||
1158 | *crash_size = size; | ||
1159 | break; | ||
1160 | } | ||
1161 | } while (*cur++ == ','); | ||
1162 | |||
1163 | if (*crash_size > 0) { | ||
1164 | while (*cur && *cur != ' ' && *cur != '@') | ||
1165 | cur++; | ||
1166 | if (*cur == '@') { | ||
1167 | cur++; | ||
1168 | *crash_base = memparse(cur, &tmp); | ||
1169 | if (cur == tmp) { | ||
1170 | pr_warn("Memory value expected after '@'\n"); | ||
1171 | return -EINVAL; | ||
1172 | } | ||
1173 | } | ||
1174 | } | ||
1175 | |||
1176 | return 0; | ||
1177 | } | ||
1178 | |||
1179 | /* | ||
1180 | * That function parses "simple" (old) crashkernel command lines like | ||
1181 | * | ||
1182 | * crashkernel=size[@offset] | ||
1183 | * | ||
1184 | * It returns 0 on success and -EINVAL on failure. | ||
1185 | */ | ||
1186 | static int __init parse_crashkernel_simple(char *cmdline, | ||
1187 | unsigned long long *crash_size, | ||
1188 | unsigned long long *crash_base) | ||
1189 | { | ||
1190 | char *cur = cmdline; | ||
1191 | |||
1192 | *crash_size = memparse(cmdline, &cur); | ||
1193 | if (cmdline == cur) { | ||
1194 | pr_warn("crashkernel: memory value expected\n"); | ||
1195 | return -EINVAL; | ||
1196 | } | ||
1197 | |||
1198 | if (*cur == '@') | ||
1199 | *crash_base = memparse(cur+1, &cur); | ||
1200 | else if (*cur != ' ' && *cur != '\0') { | ||
1201 | pr_warn("crashkernel: unrecognized char: %c\n", *cur); | ||
1202 | return -EINVAL; | ||
1203 | } | ||
1204 | |||
1205 | return 0; | ||
1206 | } | ||
1207 | |||
1208 | #define SUFFIX_HIGH 0 | ||
1209 | #define SUFFIX_LOW 1 | ||
1210 | #define SUFFIX_NULL 2 | ||
1211 | static __initdata char *suffix_tbl[] = { | ||
1212 | [SUFFIX_HIGH] = ",high", | ||
1213 | [SUFFIX_LOW] = ",low", | ||
1214 | [SUFFIX_NULL] = NULL, | ||
1215 | }; | ||
1216 | |||
1217 | /* | ||
1218 | * That function parses "suffix" crashkernel command lines like | ||
1219 | * | ||
1220 | * crashkernel=size,[high|low] | ||
1221 | * | ||
1222 | * It returns 0 on success and -EINVAL on failure. | ||
1223 | */ | ||
1224 | static int __init parse_crashkernel_suffix(char *cmdline, | ||
1225 | unsigned long long *crash_size, | ||
1226 | const char *suffix) | ||
1227 | { | ||
1228 | char *cur = cmdline; | ||
1229 | |||
1230 | *crash_size = memparse(cmdline, &cur); | ||
1231 | if (cmdline == cur) { | ||
1232 | pr_warn("crashkernel: memory value expected\n"); | ||
1233 | return -EINVAL; | ||
1234 | } | ||
1235 | |||
1236 | /* check with suffix */ | ||
1237 | if (strncmp(cur, suffix, strlen(suffix))) { | ||
1238 | pr_warn("crashkernel: unrecognized char: %c\n", *cur); | ||
1239 | return -EINVAL; | ||
1240 | } | ||
1241 | cur += strlen(suffix); | ||
1242 | if (*cur != ' ' && *cur != '\0') { | ||
1243 | pr_warn("crashkernel: unrecognized char: %c\n", *cur); | ||
1244 | return -EINVAL; | ||
1245 | } | ||
1246 | |||
1247 | return 0; | ||
1248 | } | ||
1249 | |||
1250 | static __init char *get_last_crashkernel(char *cmdline, | ||
1251 | const char *name, | ||
1252 | const char *suffix) | ||
1253 | { | ||
1254 | char *p = cmdline, *ck_cmdline = NULL; | ||
1255 | |||
1256 | /* find crashkernel and use the last one if there are more */ | ||
1257 | p = strstr(p, name); | ||
1258 | while (p) { | ||
1259 | char *end_p = strchr(p, ' '); | ||
1260 | char *q; | ||
1261 | |||
1262 | if (!end_p) | ||
1263 | end_p = p + strlen(p); | ||
1264 | |||
1265 | if (!suffix) { | ||
1266 | int i; | ||
1267 | |||
1268 | /* skip the one with any known suffix */ | ||
1269 | for (i = 0; suffix_tbl[i]; i++) { | ||
1270 | q = end_p - strlen(suffix_tbl[i]); | ||
1271 | if (!strncmp(q, suffix_tbl[i], | ||
1272 | strlen(suffix_tbl[i]))) | ||
1273 | goto next; | ||
1274 | } | ||
1275 | ck_cmdline = p; | ||
1276 | } else { | ||
1277 | q = end_p - strlen(suffix); | ||
1278 | if (!strncmp(q, suffix, strlen(suffix))) | ||
1279 | ck_cmdline = p; | ||
1280 | } | ||
1281 | next: | ||
1282 | p = strstr(p+1, name); | ||
1283 | } | ||
1284 | |||
1285 | if (!ck_cmdline) | ||
1286 | return NULL; | ||
1287 | |||
1288 | return ck_cmdline; | ||
1289 | } | ||
1290 | |||
1291 | static int __init __parse_crashkernel(char *cmdline, | ||
1292 | unsigned long long system_ram, | ||
1293 | unsigned long long *crash_size, | ||
1294 | unsigned long long *crash_base, | ||
1295 | const char *name, | ||
1296 | const char *suffix) | ||
1297 | { | ||
1298 | char *first_colon, *first_space; | ||
1299 | char *ck_cmdline; | ||
1300 | |||
1301 | BUG_ON(!crash_size || !crash_base); | ||
1302 | *crash_size = 0; | ||
1303 | *crash_base = 0; | ||
1304 | |||
1305 | ck_cmdline = get_last_crashkernel(cmdline, name, suffix); | ||
1306 | |||
1307 | if (!ck_cmdline) | ||
1308 | return -EINVAL; | ||
1309 | |||
1310 | ck_cmdline += strlen(name); | ||
1311 | |||
1312 | if (suffix) | ||
1313 | return parse_crashkernel_suffix(ck_cmdline, crash_size, | ||
1314 | suffix); | ||
1315 | /* | ||
1316 | * if the commandline contains a ':', then that's the extended | ||
1317 | * syntax -- if not, it must be the classic syntax | ||
1318 | */ | ||
1319 | first_colon = strchr(ck_cmdline, ':'); | ||
1320 | first_space = strchr(ck_cmdline, ' '); | ||
1321 | if (first_colon && (!first_space || first_colon < first_space)) | ||
1322 | return parse_crashkernel_mem(ck_cmdline, system_ram, | ||
1323 | crash_size, crash_base); | ||
1324 | |||
1325 | return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); | ||
1326 | } | ||
1327 | |||
1328 | /* | ||
1329 | * That function is the entry point for command line parsing and should be | ||
1330 | * called from the arch-specific code. | ||
1331 | */ | ||
1332 | int __init parse_crashkernel(char *cmdline, | ||
1333 | unsigned long long system_ram, | ||
1334 | unsigned long long *crash_size, | ||
1335 | unsigned long long *crash_base) | ||
1336 | { | ||
1337 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1338 | "crashkernel=", NULL); | ||
1339 | } | ||
1340 | |||
1341 | int __init parse_crashkernel_high(char *cmdline, | ||
1342 | unsigned long long system_ram, | ||
1343 | unsigned long long *crash_size, | ||
1344 | unsigned long long *crash_base) | ||
1345 | { | ||
1346 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1347 | "crashkernel=", suffix_tbl[SUFFIX_HIGH]); | ||
1348 | } | ||
1349 | |||
1350 | int __init parse_crashkernel_low(char *cmdline, | ||
1351 | unsigned long long system_ram, | ||
1352 | unsigned long long *crash_size, | ||
1353 | unsigned long long *crash_base) | ||
1354 | { | ||
1355 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1356 | "crashkernel=", suffix_tbl[SUFFIX_LOW]); | ||
1357 | } | ||
1358 | |||
1359 | static void update_vmcoreinfo_note(void) | ||
1360 | { | ||
1361 | u32 *buf = vmcoreinfo_note; | ||
1362 | |||
1363 | if (!vmcoreinfo_size) | ||
1364 | return; | ||
1365 | buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, | ||
1366 | vmcoreinfo_size); | ||
1367 | final_note(buf); | ||
1368 | } | ||
1369 | |||
1370 | void crash_save_vmcoreinfo(void) | ||
1371 | { | ||
1372 | vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); | ||
1373 | update_vmcoreinfo_note(); | ||
1374 | } | ||
1375 | |||
1376 | void vmcoreinfo_append_str(const char *fmt, ...) | ||
1377 | { | ||
1378 | va_list args; | ||
1379 | char buf[0x50]; | ||
1380 | size_t r; | ||
1381 | |||
1382 | va_start(args, fmt); | ||
1383 | r = vscnprintf(buf, sizeof(buf), fmt, args); | ||
1384 | va_end(args); | ||
1385 | |||
1386 | r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); | ||
1387 | |||
1388 | memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); | ||
1389 | |||
1390 | vmcoreinfo_size += r; | ||
1391 | } | ||
1392 | |||
1393 | /* | ||
1394 | * provide an empty default implementation here -- architecture | ||
1395 | * code may override this | ||
1396 | */ | ||
1397 | void __weak arch_crash_save_vmcoreinfo(void) | ||
1398 | {} | ||
1399 | |||
1400 | phys_addr_t __weak paddr_vmcoreinfo_note(void) | ||
1401 | { | ||
1402 | return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); | ||
1403 | } | ||
1404 | |||
1405 | static int __init crash_save_vmcoreinfo_init(void) | ||
1406 | { | ||
1407 | VMCOREINFO_OSRELEASE(init_uts_ns.name.release); | ||
1408 | VMCOREINFO_PAGESIZE(PAGE_SIZE); | ||
1409 | |||
1410 | VMCOREINFO_SYMBOL(init_uts_ns); | ||
1411 | VMCOREINFO_SYMBOL(node_online_map); | ||
1412 | #ifdef CONFIG_MMU | ||
1413 | VMCOREINFO_SYMBOL(swapper_pg_dir); | ||
1414 | #endif | ||
1415 | VMCOREINFO_SYMBOL(_stext); | ||
1416 | VMCOREINFO_SYMBOL(vmap_area_list); | ||
1417 | |||
1418 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
1419 | VMCOREINFO_SYMBOL(mem_map); | ||
1420 | VMCOREINFO_SYMBOL(contig_page_data); | ||
1421 | #endif | ||
1422 | #ifdef CONFIG_SPARSEMEM | ||
1423 | VMCOREINFO_SYMBOL(mem_section); | ||
1424 | VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); | ||
1425 | VMCOREINFO_STRUCT_SIZE(mem_section); | ||
1426 | VMCOREINFO_OFFSET(mem_section, section_mem_map); | ||
1427 | #endif | ||
1428 | VMCOREINFO_STRUCT_SIZE(page); | ||
1429 | VMCOREINFO_STRUCT_SIZE(pglist_data); | ||
1430 | VMCOREINFO_STRUCT_SIZE(zone); | ||
1431 | VMCOREINFO_STRUCT_SIZE(free_area); | ||
1432 | VMCOREINFO_STRUCT_SIZE(list_head); | ||
1433 | VMCOREINFO_SIZE(nodemask_t); | ||
1434 | VMCOREINFO_OFFSET(page, flags); | ||
1435 | VMCOREINFO_OFFSET(page, _refcount); | ||
1436 | VMCOREINFO_OFFSET(page, mapping); | ||
1437 | VMCOREINFO_OFFSET(page, lru); | ||
1438 | VMCOREINFO_OFFSET(page, _mapcount); | ||
1439 | VMCOREINFO_OFFSET(page, private); | ||
1440 | VMCOREINFO_OFFSET(page, compound_dtor); | ||
1441 | VMCOREINFO_OFFSET(page, compound_order); | ||
1442 | VMCOREINFO_OFFSET(page, compound_head); | ||
1443 | VMCOREINFO_OFFSET(pglist_data, node_zones); | ||
1444 | VMCOREINFO_OFFSET(pglist_data, nr_zones); | ||
1445 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
1446 | VMCOREINFO_OFFSET(pglist_data, node_mem_map); | ||
1447 | #endif | ||
1448 | VMCOREINFO_OFFSET(pglist_data, node_start_pfn); | ||
1449 | VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); | ||
1450 | VMCOREINFO_OFFSET(pglist_data, node_id); | ||
1451 | VMCOREINFO_OFFSET(zone, free_area); | ||
1452 | VMCOREINFO_OFFSET(zone, vm_stat); | ||
1453 | VMCOREINFO_OFFSET(zone, spanned_pages); | ||
1454 | VMCOREINFO_OFFSET(free_area, free_list); | ||
1455 | VMCOREINFO_OFFSET(list_head, next); | ||
1456 | VMCOREINFO_OFFSET(list_head, prev); | ||
1457 | VMCOREINFO_OFFSET(vmap_area, va_start); | ||
1458 | VMCOREINFO_OFFSET(vmap_area, list); | ||
1459 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); | ||
1460 | log_buf_kexec_setup(); | ||
1461 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); | ||
1462 | VMCOREINFO_NUMBER(NR_FREE_PAGES); | ||
1463 | VMCOREINFO_NUMBER(PG_lru); | ||
1464 | VMCOREINFO_NUMBER(PG_private); | ||
1465 | VMCOREINFO_NUMBER(PG_swapcache); | ||
1466 | VMCOREINFO_NUMBER(PG_slab); | ||
1467 | #ifdef CONFIG_MEMORY_FAILURE | ||
1468 | VMCOREINFO_NUMBER(PG_hwpoison); | ||
1469 | #endif | ||
1470 | VMCOREINFO_NUMBER(PG_head_mask); | ||
1471 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | ||
1472 | #ifdef CONFIG_HUGETLB_PAGE | ||
1473 | VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); | ||
1474 | #endif | ||
1475 | |||
1476 | arch_crash_save_vmcoreinfo(); | ||
1477 | update_vmcoreinfo_note(); | ||
1478 | |||
1479 | return 0; | ||
1480 | } | ||
1481 | |||
1482 | subsys_initcall(crash_save_vmcoreinfo_init); | ||
1483 | |||
1484 | /* | ||
1485 | * Move into place and start executing a preloaded standalone | 1054 | * Move into place and start executing a preloaded standalone |
1486 | * executable. If nothing was preloaded return an error. | 1055 | * executable. If nothing was preloaded return an error. |
1487 | */ | 1056 | */ |
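The crashkernel= command-line parsing and the vmcoreinfo note machinery are removed from kexec_core.c here; in the same series they move into a dedicated crash_core file rather than being dropped. For reference, the command-line forms the removed parser accepted, gathered from the parse_crashkernel_simple/_mem/_suffix helpers above:

/* Command-line forms accepted by the parser removed above: */
static const char * const crashkernel_examples[] = {
        "crashkernel=128M",                  /* simple: size[@offset]             */
        "crashkernel=128M@16M",              /* simple, with a fixed base address */
        "crashkernel=512M-2G:64M,2G-:128M",  /* ranged: ramsize-range:size[,...]  */
        "crashkernel=256M,high",             /* suffix: size,high or size,low     */
};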
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 699c5bc51a92..2d2d3a568e4e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -58,15 +58,6 @@ | |||
58 | #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) | 58 | #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) |
59 | 59 | ||
60 | 60 | ||
61 | /* | ||
62 | * Some oddball architectures like 64bit powerpc have function descriptors | ||
63 | * so this must be overridable. | ||
64 | */ | ||
65 | #ifndef kprobe_lookup_name | ||
66 | #define kprobe_lookup_name(name, addr) \ | ||
67 | addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) | ||
68 | #endif | ||
69 | |||
70 | static int kprobes_initialized; | 61 | static int kprobes_initialized; |
71 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 62 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
72 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 63 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
@@ -81,6 +72,12 @@ static struct { | |||
81 | raw_spinlock_t lock ____cacheline_aligned_in_smp; | 72 | raw_spinlock_t lock ____cacheline_aligned_in_smp; |
82 | } kretprobe_table_locks[KPROBE_TABLE_SIZE]; | 73 | } kretprobe_table_locks[KPROBE_TABLE_SIZE]; |
83 | 74 | ||
75 | kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, | ||
76 | unsigned int __unused) | ||
77 | { | ||
78 | return ((kprobe_opcode_t *)(kallsyms_lookup_name(name))); | ||
79 | } | ||
80 | |||
84 | static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) | 81 | static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) |
85 | { | 82 | { |
86 | return &(kretprobe_table_locks[hash].lock); | 83 | return &(kretprobe_table_locks[hash].lock); |
@@ -598,7 +595,7 @@ static void kprobe_optimizer(struct work_struct *work) | |||
598 | } | 595 | } |
599 | 596 | ||
600 | /* Wait for completing optimization and unoptimization */ | 597 | /* Wait for completing optimization and unoptimization */ |
601 | static void wait_for_kprobe_optimizer(void) | 598 | void wait_for_kprobe_optimizer(void) |
602 | { | 599 | { |
603 | mutex_lock(&kprobe_mutex); | 600 | mutex_lock(&kprobe_mutex); |
604 | 601 | ||
@@ -746,13 +743,20 @@ static void kill_optimized_kprobe(struct kprobe *p) | |||
746 | arch_remove_optimized_kprobe(op); | 743 | arch_remove_optimized_kprobe(op); |
747 | } | 744 | } |
748 | 745 | ||
746 | static inline | ||
747 | void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) | ||
748 | { | ||
749 | if (!kprobe_ftrace(p)) | ||
750 | arch_prepare_optimized_kprobe(op, p); | ||
751 | } | ||
752 | |||
749 | /* Try to prepare optimized instructions */ | 753 | /* Try to prepare optimized instructions */ |
750 | static void prepare_optimized_kprobe(struct kprobe *p) | 754 | static void prepare_optimized_kprobe(struct kprobe *p) |
751 | { | 755 | { |
752 | struct optimized_kprobe *op; | 756 | struct optimized_kprobe *op; |
753 | 757 | ||
754 | op = container_of(p, struct optimized_kprobe, kp); | 758 | op = container_of(p, struct optimized_kprobe, kp); |
755 | arch_prepare_optimized_kprobe(op, p); | 759 | __prepare_optimized_kprobe(op, p); |
756 | } | 760 | } |
757 | 761 | ||
758 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | 762 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ |
@@ -766,7 +770,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | |||
766 | 770 | ||
767 | INIT_LIST_HEAD(&op->list); | 771 | INIT_LIST_HEAD(&op->list); |
768 | op->kp.addr = p->addr; | 772 | op->kp.addr = p->addr; |
769 | arch_prepare_optimized_kprobe(op, p); | 773 | __prepare_optimized_kprobe(op, p); |
770 | 774 | ||
771 | return &op->kp; | 775 | return &op->kp; |
772 | } | 776 | } |
@@ -1391,21 +1395,19 @@ bool within_kprobe_blacklist(unsigned long addr) | |||
1391 | * This returns encoded errors if it fails to look up the symbol or if an | 1395 | * This returns encoded errors if it fails to look up the symbol or if an |
1392 | * invalid combination of parameters is given. | 1396 | * invalid combination of parameters is given. |
1393 | */ | 1397 | */ |
1394 | static kprobe_opcode_t *kprobe_addr(struct kprobe *p) | 1398 | static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr, |
1399 | const char *symbol_name, unsigned int offset) | ||
1395 | { | 1400 | { |
1396 | kprobe_opcode_t *addr = p->addr; | 1401 | if ((symbol_name && addr) || (!symbol_name && !addr)) |
1397 | |||
1398 | if ((p->symbol_name && p->addr) || | ||
1399 | (!p->symbol_name && !p->addr)) | ||
1400 | goto invalid; | 1402 | goto invalid; |
1401 | 1403 | ||
1402 | if (p->symbol_name) { | 1404 | if (symbol_name) { |
1403 | kprobe_lookup_name(p->symbol_name, addr); | 1405 | addr = kprobe_lookup_name(symbol_name, offset); |
1404 | if (!addr) | 1406 | if (!addr) |
1405 | return ERR_PTR(-ENOENT); | 1407 | return ERR_PTR(-ENOENT); |
1406 | } | 1408 | } |
1407 | 1409 | ||
1408 | addr = (kprobe_opcode_t *)(((char *)addr) + p->offset); | 1410 | addr = (kprobe_opcode_t *)(((char *)addr) + offset); |
1409 | if (addr) | 1411 | if (addr) |
1410 | return addr; | 1412 | return addr; |
1411 | 1413 | ||
@@ -1413,6 +1415,11 @@ invalid: | |||
1413 | return ERR_PTR(-EINVAL); | 1415 | return ERR_PTR(-EINVAL); |
1414 | } | 1416 | } |
1415 | 1417 | ||
1418 | static kprobe_opcode_t *kprobe_addr(struct kprobe *p) | ||
1419 | { | ||
1420 | return _kprobe_addr(p->addr, p->symbol_name, p->offset); | ||
1421 | } | ||
1422 | |||
1416 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ | 1423 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ |
1417 | static struct kprobe *__get_valid_kprobe(struct kprobe *p) | 1424 | static struct kprobe *__get_valid_kprobe(struct kprobe *p) |
1418 | { | 1425 | { |
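For context, the refactored _kprobe_addr() above enforces that a probe point is named in exactly one of two ways. A minimal sketch of a probe registered by symbol name (the probed symbol is an arbitrary example, not something this patch touches):

#include <linux/module.h>
#include <linux/kprobes.h>

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("pre_handler: probe hit at %p\n", p->addr);
        return 0;
}

/*
 * Set either .symbol_name (plus an optional .offset) or .addr, never both;
 * setting both or neither makes _kprobe_addr() return ERR_PTR(-EINVAL).
 */
static struct kprobe kp = {
        .symbol_name    = "do_sys_open",        /* example symbol */
        .offset         = 0,
        .pre_handler    = demo_pre,
};

static int __init demo_init(void)
{
        return register_kprobe(&kp);
}

static void __exit demo_exit(void)
{
        unregister_kprobe(&kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");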
@@ -1740,11 +1747,12 @@ void unregister_kprobes(struct kprobe **kps, int num) | |||
1740 | } | 1747 | } |
1741 | EXPORT_SYMBOL_GPL(unregister_kprobes); | 1748 | EXPORT_SYMBOL_GPL(unregister_kprobes); |
1742 | 1749 | ||
1743 | int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self, | 1750 | int __weak kprobe_exceptions_notify(struct notifier_block *self, |
1744 | unsigned long val, void *data) | 1751 | unsigned long val, void *data) |
1745 | { | 1752 | { |
1746 | return NOTIFY_DONE; | 1753 | return NOTIFY_DONE; |
1747 | } | 1754 | } |
1755 | NOKPROBE_SYMBOL(kprobe_exceptions_notify); | ||
1748 | 1756 | ||
1749 | static struct notifier_block kprobe_exceptions_nb = { | 1757 | static struct notifier_block kprobe_exceptions_nb = { |
1750 | .notifier_call = kprobe_exceptions_notify, | 1758 | .notifier_call = kprobe_exceptions_notify, |
@@ -1875,6 +1883,25 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) | |||
1875 | } | 1883 | } |
1876 | NOKPROBE_SYMBOL(pre_handler_kretprobe); | 1884 | NOKPROBE_SYMBOL(pre_handler_kretprobe); |
1877 | 1885 | ||
1886 | bool __weak arch_function_offset_within_entry(unsigned long offset) | ||
1887 | { | ||
1888 | return !offset; | ||
1889 | } | ||
1890 | |||
1891 | bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) | ||
1892 | { | ||
1893 | kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); | ||
1894 | |||
1895 | if (IS_ERR(kp_addr)) | ||
1896 | return false; | ||
1897 | |||
1898 | if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) || | ||
1899 | !arch_function_offset_within_entry(offset)) | ||
1900 | return false; | ||
1901 | |||
1902 | return true; | ||
1903 | } | ||
1904 | |||
1878 | int register_kretprobe(struct kretprobe *rp) | 1905 | int register_kretprobe(struct kretprobe *rp) |
1879 | { | 1906 | { |
1880 | int ret = 0; | 1907 | int ret = 0; |
@@ -1882,6 +1909,9 @@ int register_kretprobe(struct kretprobe *rp) | |||
1882 | int i; | 1909 | int i; |
1883 | void *addr; | 1910 | void *addr; |
1884 | 1911 | ||
1912 | if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) | ||
1913 | return -EINVAL; | ||
1914 | |||
1885 | if (kretprobe_blacklist_size) { | 1915 | if (kretprobe_blacklist_size) { |
1886 | addr = kprobe_addr(&rp->kp); | 1916 | addr = kprobe_addr(&rp->kp); |
1887 | if (IS_ERR(addr)) | 1917 | if (IS_ERR(addr)) |
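A short sketch of what the new register_kretprobe() check rejects: a kretprobe whose resolved address does not land on the function entry now fails with -EINVAL (symbol and offset are illustrative assumptions):

#include <linux/kprobes.h>

static int demo_ret(struct kretprobe_instance *ri, struct pt_regs *regs)
{
        return 0;
}

static struct kretprobe rp = {
        .kp.symbol_name = "do_fork",    /* example symbol */
        .kp.offset      = 8,            /* not the entry point */
        .handler        = demo_ret,
        .maxactive      = 20,
};

/*
 * register_kretprobe(&rp) now returns -EINVAL unless
 * arch_function_offset_within_entry() accepts the offset of the resolved
 * address within its symbol; with the __weak default above, only offset 0
 * passes.
 */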
@@ -2153,6 +2183,12 @@ static int kprobes_module_callback(struct notifier_block *nb, | |||
2153 | * The vaddr this probe is installed will soon | 2183 | * The vaddr this probe is installed will soon |
2154 | * be vfreed but not synced to disk. Hence, | 2184 | * be vfreed but not synced to disk. Hence, |
2155 | * disarming the breakpoint isn't needed. | 2185 | * disarming the breakpoint isn't needed. |
2186 | * | ||
2187 | * Note, this will also move any optimized probes | ||
2188 | * that are pending to be removed from their | ||
2189 | * corresponding lists to the freeing_list and | ||
2190 | * will not be touched by the delayed | ||
2191 | * kprobe_optimizer work handler. | ||
2156 | */ | 2192 | */ |
2157 | kill_kprobe(p); | 2193 | kill_kprobe(p); |
2158 | } | 2194 | } |
@@ -2192,8 +2228,8 @@ static int __init init_kprobes(void) | |||
2192 | if (kretprobe_blacklist_size) { | 2228 | if (kretprobe_blacklist_size) { |
2193 | /* lookup the function address from its name */ | 2229 | /* lookup the function address from its name */ |
2194 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { | 2230 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { |
2195 | kprobe_lookup_name(kretprobe_blacklist[i].name, | 2231 | kretprobe_blacklist[i].addr = |
2196 | kretprobe_blacklist[i].addr); | 2232 | kprobe_lookup_name(kretprobe_blacklist[i].name, 0); |
2197 | if (!kretprobe_blacklist[i].addr) | 2233 | if (!kretprobe_blacklist[i].addr) |
2198 | printk("kretprobe: lookup failed: %s\n", | 2234 | printk("kretprobe: lookup failed: %s\n", |
2199 | kretprobe_blacklist[i].name); | 2235 | kretprobe_blacklist[i].name); |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0999679d6f26..23cd70651238 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -125,6 +125,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, | |||
125 | } | 125 | } |
126 | KERNEL_ATTR_RW(kexec_crash_size); | 126 | KERNEL_ATTR_RW(kexec_crash_size); |
127 | 127 | ||
128 | #endif /* CONFIG_KEXEC_CORE */ | ||
129 | |||
130 | #ifdef CONFIG_CRASH_CORE | ||
131 | |||
128 | static ssize_t vmcoreinfo_show(struct kobject *kobj, | 132 | static ssize_t vmcoreinfo_show(struct kobject *kobj, |
129 | struct kobj_attribute *attr, char *buf) | 133 | struct kobj_attribute *attr, char *buf) |
130 | { | 134 | { |
@@ -134,7 +138,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, | |||
134 | } | 138 | } |
135 | KERNEL_ATTR_RO(vmcoreinfo); | 139 | KERNEL_ATTR_RO(vmcoreinfo); |
136 | 140 | ||
137 | #endif /* CONFIG_KEXEC_CORE */ | 141 | #endif /* CONFIG_CRASH_CORE */ |
138 | 142 | ||
139 | /* whether file capabilities are enabled */ | 143 | /* whether file capabilities are enabled */ |
140 | static ssize_t fscaps_show(struct kobject *kobj, | 144 | static ssize_t fscaps_show(struct kobject *kobj, |
@@ -219,6 +223,8 @@ static struct attribute * kernel_attrs[] = { | |||
219 | &kexec_loaded_attr.attr, | 223 | &kexec_loaded_attr.attr, |
220 | &kexec_crash_loaded_attr.attr, | 224 | &kexec_crash_loaded_attr.attr, |
221 | &kexec_crash_size_attr.attr, | 225 | &kexec_crash_size_attr.attr, |
226 | #endif | ||
227 | #ifdef CONFIG_CRASH_CORE | ||
222 | &vmcoreinfo_attr.attr, | 228 | &vmcoreinfo_attr.attr, |
223 | #endif | 229 | #endif |
224 | #ifndef CONFIG_TINY_RCU | 230 | #ifndef CONFIG_TINY_RCU |
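With the attribute moved under CONFIG_CRASH_CORE, /sys/kernel/vmcoreinfo remains available even on kernels built without CONFIG_KEXEC_CORE. A small userspace sketch that dumps it (the output format, typically the physical address and size of the vmcoreinfo ELF note, is an assumption here since vmcoreinfo_show()'s body is not part of this hunk):

/* read_vmcoreinfo.c -- userspace example, not part of this patch */
#include <stdio.h>

int main(void)
{
        char buf[128];
        FILE *f = fopen("/sys/kernel/vmcoreinfo", "r");

        if (!f) {
                perror("/sys/kernel/vmcoreinfo");
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("vmcoreinfo note: %s", buf);
        fclose(f);
        return 0;
}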
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index e8780c0901d9..2b8bdb1925da 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile | |||
@@ -1,3 +1,3 @@ | |||
1 | obj-$(CONFIG_LIVEPATCH) += livepatch.o | 1 | obj-$(CONFIG_LIVEPATCH) += livepatch.o |
2 | 2 | ||
3 | livepatch-objs := core.o | 3 | livepatch-objs := core.o patch.o transition.o |
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index af4643873e71..b9628e43c78f 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c | |||
@@ -24,61 +24,31 @@ | |||
24 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
25 | #include <linux/mutex.h> | 25 | #include <linux/mutex.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/ftrace.h> | ||
28 | #include <linux/list.h> | 27 | #include <linux/list.h> |
29 | #include <linux/kallsyms.h> | 28 | #include <linux/kallsyms.h> |
30 | #include <linux/livepatch.h> | 29 | #include <linux/livepatch.h> |
31 | #include <linux/elf.h> | 30 | #include <linux/elf.h> |
32 | #include <linux/moduleloader.h> | 31 | #include <linux/moduleloader.h> |
32 | #include <linux/completion.h> | ||
33 | #include <asm/cacheflush.h> | 33 | #include <asm/cacheflush.h> |
34 | 34 | #include "core.h" | |
35 | /** | 35 | #include "patch.h" |
36 | * struct klp_ops - structure for tracking registered ftrace ops structs | 36 | #include "transition.h" |
37 | * | ||
38 | * A single ftrace_ops is shared between all enabled replacement functions | ||
39 | * (klp_func structs) which have the same old_addr. This allows the switch | ||
40 | * between function versions to happen instantaneously by updating the klp_ops | ||
41 | * struct's func_stack list. The winner is the klp_func at the top of the | ||
42 | * func_stack (front of the list). | ||
43 | * | ||
44 | * @node: node for the global klp_ops list | ||
45 | * @func_stack: list head for the stack of klp_func's (active func is on top) | ||
46 | * @fops: registered ftrace ops struct | ||
47 | */ | ||
48 | struct klp_ops { | ||
49 | struct list_head node; | ||
50 | struct list_head func_stack; | ||
51 | struct ftrace_ops fops; | ||
52 | }; | ||
53 | 37 | ||
54 | /* | 38 | /* |
55 | * The klp_mutex protects the global lists and state transitions of any | 39 | * klp_mutex is a coarse lock which serializes access to klp data. All |
56 | * structure reachable from them. References to any structure must be obtained | 40 | * accesses to klp-related variables and structures must have mutex protection, |
57 | * under mutex protection (except in klp_ftrace_handler(), which uses RCU to | 41 | * except within the following functions which carefully avoid the need for it: |
58 | * ensure it gets consistent data). | 42 | * |
43 | * - klp_ftrace_handler() | ||
44 | * - klp_update_patch_state() | ||
59 | */ | 45 | */ |
60 | static DEFINE_MUTEX(klp_mutex); | 46 | DEFINE_MUTEX(klp_mutex); |
61 | 47 | ||
62 | static LIST_HEAD(klp_patches); | 48 | static LIST_HEAD(klp_patches); |
63 | static LIST_HEAD(klp_ops); | ||
64 | 49 | ||
65 | static struct kobject *klp_root_kobj; | 50 | static struct kobject *klp_root_kobj; |
66 | 51 | ||
67 | static struct klp_ops *klp_find_ops(unsigned long old_addr) | ||
68 | { | ||
69 | struct klp_ops *ops; | ||
70 | struct klp_func *func; | ||
71 | |||
72 | list_for_each_entry(ops, &klp_ops, node) { | ||
73 | func = list_first_entry(&ops->func_stack, struct klp_func, | ||
74 | stack_node); | ||
75 | if (func->old_addr == old_addr) | ||
76 | return ops; | ||
77 | } | ||
78 | |||
79 | return NULL; | ||
80 | } | ||
81 | |||
82 | static bool klp_is_module(struct klp_object *obj) | 52 | static bool klp_is_module(struct klp_object *obj) |
83 | { | 53 | { |
84 | return obj->name; | 54 | return obj->name; |
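Since klp_mutex is now shared between core.c, patch.c and transition.c through core.h, the locking rule in the comment above reduces to one pattern for any code outside the two lockless paths. A sketch only; example_helper() is hypothetical:

#include <linux/mutex.h>
#include "core.h"       /* extern struct mutex klp_mutex; */

static void example_helper(void)        /* hypothetical */
{
        mutex_lock(&klp_mutex);
        /* ... read or modify klp_patches, patch/object/func state ... */
        mutex_unlock(&klp_mutex);
}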
@@ -117,7 +87,6 @@ static void klp_find_object_module(struct klp_object *obj) | |||
117 | mutex_unlock(&module_mutex); | 87 | mutex_unlock(&module_mutex); |
118 | } | 88 | } |
119 | 89 | ||
120 | /* klp_mutex must be held by caller */ | ||
121 | static bool klp_is_patch_registered(struct klp_patch *patch) | 90 | static bool klp_is_patch_registered(struct klp_patch *patch) |
122 | { | 91 | { |
123 | struct klp_patch *mypatch; | 92 | struct klp_patch *mypatch; |
@@ -182,7 +151,10 @@ static int klp_find_object_symbol(const char *objname, const char *name, | |||
182 | }; | 151 | }; |
183 | 152 | ||
184 | mutex_lock(&module_mutex); | 153 | mutex_lock(&module_mutex); |
185 | kallsyms_on_each_symbol(klp_find_callback, &args); | 154 | if (objname) |
155 | module_kallsyms_on_each_symbol(klp_find_callback, &args); | ||
156 | else | ||
157 | kallsyms_on_each_symbol(klp_find_callback, &args); | ||
186 | mutex_unlock(&module_mutex); | 158 | mutex_unlock(&module_mutex); |
187 | 159 | ||
188 | /* | 160 | /* |
@@ -233,7 +205,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) | |||
233 | for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { | 205 | for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { |
234 | sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); | 206 | sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); |
235 | if (sym->st_shndx != SHN_LIVEPATCH) { | 207 | if (sym->st_shndx != SHN_LIVEPATCH) { |
236 | pr_err("symbol %s is not marked as a livepatch symbol", | 208 | pr_err("symbol %s is not marked as a livepatch symbol\n", |
237 | strtab + sym->st_name); | 209 | strtab + sym->st_name); |
238 | return -EINVAL; | 210 | return -EINVAL; |
239 | } | 211 | } |
@@ -243,7 +215,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) | |||
243 | ".klp.sym.%55[^.].%127[^,],%lu", | 215 | ".klp.sym.%55[^.].%127[^,],%lu", |
244 | objname, symname, &sympos); | 216 | objname, symname, &sympos); |
245 | if (cnt != 3) { | 217 | if (cnt != 3) { |
246 | pr_err("symbol %s has an incorrectly formatted name", | 218 | pr_err("symbol %s has an incorrectly formatted name\n", |
247 | strtab + sym->st_name); | 219 | strtab + sym->st_name); |
248 | return -EINVAL; | 220 | return -EINVAL; |
249 | } | 221 | } |
@@ -288,7 +260,7 @@ static int klp_write_object_relocations(struct module *pmod, | |||
288 | */ | 260 | */ |
289 | cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); | 261 | cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); |
290 | if (cnt != 1) { | 262 | if (cnt != 1) { |
291 | pr_err("section %s has an incorrectly formatted name", | 263 | pr_err("section %s has an incorrectly formatted name\n", |
292 | secname); | 264 | secname); |
293 | ret = -EINVAL; | 265 | ret = -EINVAL; |
294 | break; | 266 | break; |
@@ -311,191 +283,30 @@ static int klp_write_object_relocations(struct module *pmod, | |||
311 | return ret; | 283 | return ret; |
312 | } | 284 | } |
313 | 285 | ||
314 | static void notrace klp_ftrace_handler(unsigned long ip, | ||
315 | unsigned long parent_ip, | ||
316 | struct ftrace_ops *fops, | ||
317 | struct pt_regs *regs) | ||
318 | { | ||
319 | struct klp_ops *ops; | ||
320 | struct klp_func *func; | ||
321 | |||
322 | ops = container_of(fops, struct klp_ops, fops); | ||
323 | |||
324 | rcu_read_lock(); | ||
325 | func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, | ||
326 | stack_node); | ||
327 | if (WARN_ON_ONCE(!func)) | ||
328 | goto unlock; | ||
329 | |||
330 | klp_arch_set_pc(regs, (unsigned long)func->new_func); | ||
331 | unlock: | ||
332 | rcu_read_unlock(); | ||
333 | } | ||
334 | |||
335 | /* | ||
336 | * Convert a function address into the appropriate ftrace location. | ||
337 | * | ||
338 | * Usually this is just the address of the function, but on some architectures | ||
339 | * it's more complicated so allow them to provide a custom behaviour. | ||
340 | */ | ||
341 | #ifndef klp_get_ftrace_location | ||
342 | static unsigned long klp_get_ftrace_location(unsigned long faddr) | ||
343 | { | ||
344 | return faddr; | ||
345 | } | ||
346 | #endif | ||
347 | |||
348 | static void klp_disable_func(struct klp_func *func) | ||
349 | { | ||
350 | struct klp_ops *ops; | ||
351 | |||
352 | if (WARN_ON(func->state != KLP_ENABLED)) | ||
353 | return; | ||
354 | if (WARN_ON(!func->old_addr)) | ||
355 | return; | ||
356 | |||
357 | ops = klp_find_ops(func->old_addr); | ||
358 | if (WARN_ON(!ops)) | ||
359 | return; | ||
360 | |||
361 | if (list_is_singular(&ops->func_stack)) { | ||
362 | unsigned long ftrace_loc; | ||
363 | |||
364 | ftrace_loc = klp_get_ftrace_location(func->old_addr); | ||
365 | if (WARN_ON(!ftrace_loc)) | ||
366 | return; | ||
367 | |||
368 | WARN_ON(unregister_ftrace_function(&ops->fops)); | ||
369 | WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0)); | ||
370 | |||
371 | list_del_rcu(&func->stack_node); | ||
372 | list_del(&ops->node); | ||
373 | kfree(ops); | ||
374 | } else { | ||
375 | list_del_rcu(&func->stack_node); | ||
376 | } | ||
377 | |||
378 | func->state = KLP_DISABLED; | ||
379 | } | ||
380 | |||
381 | static int klp_enable_func(struct klp_func *func) | ||
382 | { | ||
383 | struct klp_ops *ops; | ||
384 | int ret; | ||
385 | |||
386 | if (WARN_ON(!func->old_addr)) | ||
387 | return -EINVAL; | ||
388 | |||
389 | if (WARN_ON(func->state != KLP_DISABLED)) | ||
390 | return -EINVAL; | ||
391 | |||
392 | ops = klp_find_ops(func->old_addr); | ||
393 | if (!ops) { | ||
394 | unsigned long ftrace_loc; | ||
395 | |||
396 | ftrace_loc = klp_get_ftrace_location(func->old_addr); | ||
397 | if (!ftrace_loc) { | ||
398 | pr_err("failed to find location for function '%s'\n", | ||
399 | func->old_name); | ||
400 | return -EINVAL; | ||
401 | } | ||
402 | |||
403 | ops = kzalloc(sizeof(*ops), GFP_KERNEL); | ||
404 | if (!ops) | ||
405 | return -ENOMEM; | ||
406 | |||
407 | ops->fops.func = klp_ftrace_handler; | ||
408 | ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | | ||
409 | FTRACE_OPS_FL_DYNAMIC | | ||
410 | FTRACE_OPS_FL_IPMODIFY; | ||
411 | |||
412 | list_add(&ops->node, &klp_ops); | ||
413 | |||
414 | INIT_LIST_HEAD(&ops->func_stack); | ||
415 | list_add_rcu(&func->stack_node, &ops->func_stack); | ||
416 | |||
417 | ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0); | ||
418 | if (ret) { | ||
419 | pr_err("failed to set ftrace filter for function '%s' (%d)\n", | ||
420 | func->old_name, ret); | ||
421 | goto err; | ||
422 | } | ||
423 | |||
424 | ret = register_ftrace_function(&ops->fops); | ||
425 | if (ret) { | ||
426 | pr_err("failed to register ftrace handler for function '%s' (%d)\n", | ||
427 | func->old_name, ret); | ||
428 | ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0); | ||
429 | goto err; | ||
430 | } | ||
431 | |||
432 | |||
433 | } else { | ||
434 | list_add_rcu(&func->stack_node, &ops->func_stack); | ||
435 | } | ||
436 | |||
437 | func->state = KLP_ENABLED; | ||
438 | |||
439 | return 0; | ||
440 | |||
441 | err: | ||
442 | list_del_rcu(&func->stack_node); | ||
443 | list_del(&ops->node); | ||
444 | kfree(ops); | ||
445 | return ret; | ||
446 | } | ||
447 | |||
448 | static void klp_disable_object(struct klp_object *obj) | ||
449 | { | ||
450 | struct klp_func *func; | ||
451 | |||
452 | klp_for_each_func(obj, func) | ||
453 | if (func->state == KLP_ENABLED) | ||
454 | klp_disable_func(func); | ||
455 | |||
456 | obj->state = KLP_DISABLED; | ||
457 | } | ||
458 | |||
459 | static int klp_enable_object(struct klp_object *obj) | ||
460 | { | ||
461 | struct klp_func *func; | ||
462 | int ret; | ||
463 | |||
464 | if (WARN_ON(obj->state != KLP_DISABLED)) | ||
465 | return -EINVAL; | ||
466 | |||
467 | if (WARN_ON(!klp_is_object_loaded(obj))) | ||
468 | return -EINVAL; | ||
469 | |||
470 | klp_for_each_func(obj, func) { | ||
471 | ret = klp_enable_func(func); | ||
472 | if (ret) { | ||
473 | klp_disable_object(obj); | ||
474 | return ret; | ||
475 | } | ||
476 | } | ||
477 | obj->state = KLP_ENABLED; | ||
478 | |||
479 | return 0; | ||
480 | } | ||
481 | |||
482 | static int __klp_disable_patch(struct klp_patch *patch) | 286 | static int __klp_disable_patch(struct klp_patch *patch) |
483 | { | 287 | { |
484 | struct klp_object *obj; | 288 | if (klp_transition_patch) |
289 | return -EBUSY; | ||
485 | 290 | ||
486 | /* enforce stacking: only the last enabled patch can be disabled */ | 291 | /* enforce stacking: only the last enabled patch can be disabled */ |
487 | if (!list_is_last(&patch->list, &klp_patches) && | 292 | if (!list_is_last(&patch->list, &klp_patches) && |
488 | list_next_entry(patch, list)->state == KLP_ENABLED) | 293 | list_next_entry(patch, list)->enabled) |
489 | return -EBUSY; | 294 | return -EBUSY; |
490 | 295 | ||
491 | pr_notice("disabling patch '%s'\n", patch->mod->name); | 296 | klp_init_transition(patch, KLP_UNPATCHED); |
492 | 297 | ||
493 | klp_for_each_object(patch, obj) { | 298 | /* |
494 | if (obj->state == KLP_ENABLED) | 299 | * Enforce the order of the func->transition writes in |
495 | klp_disable_object(obj); | 300 | * klp_init_transition() and the TIF_PATCH_PENDING writes in |
496 | } | 301 | * klp_start_transition(). In the rare case where klp_ftrace_handler() |
302 | * is called shortly after klp_update_patch_state() switches the task, | ||
303 | * this ensures the handler sees that func->transition is set. | ||
304 | */ | ||
305 | smp_wmb(); | ||
497 | 306 | ||
498 | patch->state = KLP_DISABLED; | 307 | klp_start_transition(); |
308 | klp_try_complete_transition(); | ||
309 | patch->enabled = false; | ||
499 | 310 | ||
500 | return 0; | 311 | return 0; |
501 | } | 312 | } |
@@ -519,7 +330,7 @@ int klp_disable_patch(struct klp_patch *patch) | |||
519 | goto err; | 330 | goto err; |
520 | } | 331 | } |
521 | 332 | ||
522 | if (patch->state == KLP_DISABLED) { | 333 | if (!patch->enabled) { |
523 | ret = -EINVAL; | 334 | ret = -EINVAL; |
524 | goto err; | 335 | goto err; |
525 | } | 336 | } |
@@ -537,32 +348,61 @@ static int __klp_enable_patch(struct klp_patch *patch) | |||
537 | struct klp_object *obj; | 348 | struct klp_object *obj; |
538 | int ret; | 349 | int ret; |
539 | 350 | ||
540 | if (WARN_ON(patch->state != KLP_DISABLED)) | 351 | if (klp_transition_patch) |
352 | return -EBUSY; | ||
353 | |||
354 | if (WARN_ON(patch->enabled)) | ||
541 | return -EINVAL; | 355 | return -EINVAL; |
542 | 356 | ||
543 | /* enforce stacking: only the first disabled patch can be enabled */ | 357 | /* enforce stacking: only the first disabled patch can be enabled */ |
544 | if (patch->list.prev != &klp_patches && | 358 | if (patch->list.prev != &klp_patches && |
545 | list_prev_entry(patch, list)->state == KLP_DISABLED) | 359 | !list_prev_entry(patch, list)->enabled) |
546 | return -EBUSY; | 360 | return -EBUSY; |
547 | 361 | ||
362 | /* | ||
363 | * A reference is taken on the patch module to prevent it from being | ||
364 | * unloaded. | ||
365 | * | ||
366 | * Note: For immediate (no consistency model) patches we don't allow | ||
367 | * patch modules to unload since there is no safe/sane method to | ||
368 | * determine if a thread is still running in the patched code contained | ||
369 | * in the patch module once the ftrace registration is successful. | ||
370 | */ | ||
371 | if (!try_module_get(patch->mod)) | ||
372 | return -ENODEV; | ||
373 | |||
548 | pr_notice("enabling patch '%s'\n", patch->mod->name); | 374 | pr_notice("enabling patch '%s'\n", patch->mod->name); |
549 | 375 | ||
376 | klp_init_transition(patch, KLP_PATCHED); | ||
377 | |||
378 | /* | ||
379 | * Enforce the order of the func->transition writes in | ||
380 | * klp_init_transition() and the ops->func_stack writes in | ||
381 | * klp_patch_object(), so that klp_ftrace_handler() will see the | ||
382 | * func->transition updates before the handler is registered and the | ||
383 | * new funcs become visible to the handler. | ||
384 | */ | ||
385 | smp_wmb(); | ||
386 | |||
550 | klp_for_each_object(patch, obj) { | 387 | klp_for_each_object(patch, obj) { |
551 | if (!klp_is_object_loaded(obj)) | 388 | if (!klp_is_object_loaded(obj)) |
552 | continue; | 389 | continue; |
553 | 390 | ||
554 | ret = klp_enable_object(obj); | 391 | ret = klp_patch_object(obj); |
555 | if (ret) | 392 | if (ret) { |
556 | goto unregister; | 393 | pr_warn("failed to enable patch '%s'\n", |
394 | patch->mod->name); | ||
395 | |||
396 | klp_cancel_transition(); | ||
397 | return ret; | ||
398 | } | ||
557 | } | 399 | } |
558 | 400 | ||
559 | patch->state = KLP_ENABLED; | 401 | klp_start_transition(); |
402 | klp_try_complete_transition(); | ||
403 | patch->enabled = true; | ||
560 | 404 | ||
561 | return 0; | 405 | return 0; |
562 | |||
563 | unregister: | ||
564 | WARN_ON(__klp_disable_patch(patch)); | ||
565 | return ret; | ||
566 | } | 406 | } |
567 | 407 | ||
568 | /** | 408 | /** |
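For orientation, a minimal livepatch module against the reworked API (enabled/patched booleans plus the transition machinery), modeled on samples/livepatch/livepatch-sample.c; the patched function and its replacement output are illustrative only:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/livepatch.h>
#include <linux/seq_file.h>

static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
{
        seq_printf(m, "%s\n", "this has been live patched");
        return 0;
}

static struct klp_func funcs[] = {
        {
                .old_name = "cmdline_proc_show",
                .new_func = livepatch_cmdline_proc_show,
        }, { }
};

static struct klp_object objs[] = {
        {
                /* a NULL name means the object is vmlinux */
                .funcs = funcs,
        }, { }
};

static struct klp_patch patch = {
        .mod = THIS_MODULE,
        .objs = objs,
};

static int livepatch_init(void)
{
        int ret;

        ret = klp_register_patch(&patch);
        if (ret)
                return ret;
        ret = klp_enable_patch(&patch);
        if (ret) {
                WARN_ON(klp_unregister_patch(&patch));
                return ret;
        }
        return 0;
}

static void livepatch_exit(void)
{
        WARN_ON(klp_unregister_patch(&patch));
}

module_init(livepatch_init);
module_exit(livepatch_exit);
MODULE_LICENSE("GPL");
MODULE_INFO(livepatch, "Y");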
@@ -599,6 +439,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); | |||
599 | * /sys/kernel/livepatch | 439 | * /sys/kernel/livepatch |
600 | * /sys/kernel/livepatch/<patch> | 440 | * /sys/kernel/livepatch/<patch> |
601 | * /sys/kernel/livepatch/<patch>/enabled | 441 | * /sys/kernel/livepatch/<patch>/enabled |
442 | * /sys/kernel/livepatch/<patch>/transition | ||
602 | * /sys/kernel/livepatch/<patch>/<object> | 443 | * /sys/kernel/livepatch/<patch>/<object> |
603 | * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> | 444 | * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> |
604 | */ | 445 | */ |
@@ -608,26 +449,34 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
608 | { | 449 | { |
609 | struct klp_patch *patch; | 450 | struct klp_patch *patch; |
610 | int ret; | 451 | int ret; |
611 | unsigned long val; | 452 | bool enabled; |
612 | 453 | ||
613 | ret = kstrtoul(buf, 10, &val); | 454 | ret = kstrtobool(buf, &enabled); |
614 | if (ret) | 455 | if (ret) |
615 | return -EINVAL; | 456 | return ret; |
616 | |||
617 | if (val != KLP_DISABLED && val != KLP_ENABLED) | ||
618 | return -EINVAL; | ||
619 | 457 | ||
620 | patch = container_of(kobj, struct klp_patch, kobj); | 458 | patch = container_of(kobj, struct klp_patch, kobj); |
621 | 459 | ||
622 | mutex_lock(&klp_mutex); | 460 | mutex_lock(&klp_mutex); |
623 | 461 | ||
624 | if (val == patch->state) { | 462 | if (!klp_is_patch_registered(patch)) { |
463 | /* | ||
464 | * The module with the patch could have disappeared in the | ||
465 | * meantime, or it may not be properly initialized yet. | ||
466 | */ | ||
467 | ret = -EINVAL; | ||
468 | goto err; | ||
469 | } | ||
470 | |||
471 | if (patch->enabled == enabled) { | ||
625 | /* already in requested state */ | 472 | /* already in requested state */ |
626 | ret = -EINVAL; | 473 | ret = -EINVAL; |
627 | goto err; | 474 | goto err; |
628 | } | 475 | } |
629 | 476 | ||
630 | if (val == KLP_ENABLED) { | 477 | if (patch == klp_transition_patch) { |
478 | klp_reverse_transition(); | ||
479 | } else if (enabled) { | ||
631 | ret = __klp_enable_patch(patch); | 480 | ret = __klp_enable_patch(patch); |
632 | if (ret) | 481 | if (ret) |
633 | goto err; | 482 | goto err; |
@@ -652,21 +501,33 @@ static ssize_t enabled_show(struct kobject *kobj, | |||
652 | struct klp_patch *patch; | 501 | struct klp_patch *patch; |
653 | 502 | ||
654 | patch = container_of(kobj, struct klp_patch, kobj); | 503 | patch = container_of(kobj, struct klp_patch, kobj); |
655 | return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state); | 504 | return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->enabled); |
505 | } | ||
506 | |||
507 | static ssize_t transition_show(struct kobject *kobj, | ||
508 | struct kobj_attribute *attr, char *buf) | ||
509 | { | ||
510 | struct klp_patch *patch; | ||
511 | |||
512 | patch = container_of(kobj, struct klp_patch, kobj); | ||
513 | return snprintf(buf, PAGE_SIZE-1, "%d\n", | ||
514 | patch == klp_transition_patch); | ||
656 | } | 515 | } |
657 | 516 | ||
658 | static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); | 517 | static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); |
518 | static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); | ||
659 | static struct attribute *klp_patch_attrs[] = { | 519 | static struct attribute *klp_patch_attrs[] = { |
660 | &enabled_kobj_attr.attr, | 520 | &enabled_kobj_attr.attr, |
521 | &transition_kobj_attr.attr, | ||
661 | NULL | 522 | NULL |
662 | }; | 523 | }; |
663 | 524 | ||
664 | static void klp_kobj_release_patch(struct kobject *kobj) | 525 | static void klp_kobj_release_patch(struct kobject *kobj) |
665 | { | 526 | { |
666 | /* | 527 | struct klp_patch *patch; |
667 | * Once we have a consistency model we'll need to module_put() the | 528 | |
668 | * patch module here. See klp_register_patch() for more details. | 529 | patch = container_of(kobj, struct klp_patch, kobj); |
669 | */ | 530 | complete(&patch->finish); |
670 | } | 531 | } |
671 | 532 | ||
672 | static struct kobj_type klp_ktype_patch = { | 533 | static struct kobj_type klp_ktype_patch = { |
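The enabled and transition attributes above are meant to be driven from userspace; a small sketch, assuming the patch module is named livepatch_sample (so its directory is /sys/kernel/livepatch/livepatch_sample):

/* toggle_livepatch.c -- userspace sketch, path name is an assumption */
#include <stdio.h>

int main(void)
{
        const char *path = "/sys/kernel/livepatch/livepatch_sample/enabled";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return 1;
        }
        /*
         * "0" disables the patch (or reverses an in-progress transition),
         * "1" enables it; kstrtobool() also accepts y/n spellings.
         */
        fputs("0\n", f);
        fclose(f);
        return 0;
}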
@@ -737,7 +598,6 @@ static void klp_free_patch(struct klp_patch *patch) | |||
737 | klp_free_objects_limited(patch, NULL); | 598 | klp_free_objects_limited(patch, NULL); |
738 | if (!list_empty(&patch->list)) | 599 | if (!list_empty(&patch->list)) |
739 | list_del(&patch->list); | 600 | list_del(&patch->list); |
740 | kobject_put(&patch->kobj); | ||
741 | } | 601 | } |
742 | 602 | ||
743 | static int klp_init_func(struct klp_object *obj, struct klp_func *func) | 603 | static int klp_init_func(struct klp_object *obj, struct klp_func *func) |
@@ -746,7 +606,8 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) | |||
746 | return -EINVAL; | 606 | return -EINVAL; |
747 | 607 | ||
748 | INIT_LIST_HEAD(&func->stack_node); | 608 | INIT_LIST_HEAD(&func->stack_node); |
749 | func->state = KLP_DISABLED; | 609 | func->patched = false; |
610 | func->transition = false; | ||
750 | 611 | ||
751 | /* The format for the sysfs directory is <function,sympos> where sympos | 612 | /* The format for the sysfs directory is <function,sympos> where sympos |
752 | * is the nth occurrence of this symbol in kallsyms for the patched | 613 | * is the nth occurrence of this symbol in kallsyms for the patched |
@@ -787,6 +648,22 @@ static int klp_init_object_loaded(struct klp_patch *patch, | |||
787 | &func->old_addr); | 648 | &func->old_addr); |
788 | if (ret) | 649 | if (ret) |
789 | return ret; | 650 | return ret; |
651 | |||
652 | ret = kallsyms_lookup_size_offset(func->old_addr, | ||
653 | &func->old_size, NULL); | ||
654 | if (!ret) { | ||
655 | pr_err("kallsyms size lookup failed for '%s'\n", | ||
656 | func->old_name); | ||
657 | return -ENOENT; | ||
658 | } | ||
659 | |||
660 | ret = kallsyms_lookup_size_offset((unsigned long)func->new_func, | ||
661 | &func->new_size, NULL); | ||
662 | if (!ret) { | ||
663 | pr_err("kallsyms size lookup failed for '%s' replacement\n", | ||
664 | func->old_name); | ||
665 | return -ENOENT; | ||
666 | } | ||
790 | } | 667 | } |
791 | 668 | ||
792 | return 0; | 669 | return 0; |
@@ -801,7 +678,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) | |||
801 | if (!obj->funcs) | 678 | if (!obj->funcs) |
802 | return -EINVAL; | 679 | return -EINVAL; |
803 | 680 | ||
804 | obj->state = KLP_DISABLED; | 681 | obj->patched = false; |
805 | obj->mod = NULL; | 682 | obj->mod = NULL; |
806 | 683 | ||
807 | klp_find_object_module(obj); | 684 | klp_find_object_module(obj); |
@@ -842,12 +719,15 @@ static int klp_init_patch(struct klp_patch *patch) | |||
842 | 719 | ||
843 | mutex_lock(&klp_mutex); | 720 | mutex_lock(&klp_mutex); |
844 | 721 | ||
845 | patch->state = KLP_DISABLED; | 722 | patch->enabled = false; |
723 | init_completion(&patch->finish); | ||
846 | 724 | ||
847 | ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, | 725 | ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, |
848 | klp_root_kobj, "%s", patch->mod->name); | 726 | klp_root_kobj, "%s", patch->mod->name); |
849 | if (ret) | 727 | if (ret) { |
850 | goto unlock; | 728 | mutex_unlock(&klp_mutex); |
729 | return ret; | ||
730 | } | ||
851 | 731 | ||
852 | klp_for_each_object(patch, obj) { | 732 | klp_for_each_object(patch, obj) { |
853 | ret = klp_init_object(patch, obj); | 733 | ret = klp_init_object(patch, obj); |
@@ -863,9 +743,12 @@ static int klp_init_patch(struct klp_patch *patch) | |||
863 | 743 | ||
864 | free: | 744 | free: |
865 | klp_free_objects_limited(patch, obj); | 745 | klp_free_objects_limited(patch, obj); |
866 | kobject_put(&patch->kobj); | 746 | |
867 | unlock: | ||
868 | mutex_unlock(&klp_mutex); | 747 | mutex_unlock(&klp_mutex); |
748 | |||
749 | kobject_put(&patch->kobj); | ||
750 | wait_for_completion(&patch->finish); | ||
751 | |||
869 | return ret; | 752 | return ret; |
870 | } | 753 | } |
871 | 754 | ||
@@ -879,23 +762,29 @@ unlock: | |||
879 | */ | 762 | */ |
880 | int klp_unregister_patch(struct klp_patch *patch) | 763 | int klp_unregister_patch(struct klp_patch *patch) |
881 | { | 764 | { |
882 | int ret = 0; | 765 | int ret; |
883 | 766 | ||
884 | mutex_lock(&klp_mutex); | 767 | mutex_lock(&klp_mutex); |
885 | 768 | ||
886 | if (!klp_is_patch_registered(patch)) { | 769 | if (!klp_is_patch_registered(patch)) { |
887 | ret = -EINVAL; | 770 | ret = -EINVAL; |
888 | goto out; | 771 | goto err; |
889 | } | 772 | } |
890 | 773 | ||
891 | if (patch->state == KLP_ENABLED) { | 774 | if (patch->enabled) { |
892 | ret = -EBUSY; | 775 | ret = -EBUSY; |
893 | goto out; | 776 | goto err; |
894 | } | 777 | } |
895 | 778 | ||
896 | klp_free_patch(patch); | 779 | klp_free_patch(patch); |
897 | 780 | ||
898 | out: | 781 | mutex_unlock(&klp_mutex); |
782 | |||
783 | kobject_put(&patch->kobj); | ||
784 | wait_for_completion(&patch->finish); | ||
785 | |||
786 | return 0; | ||
787 | err: | ||
899 | mutex_unlock(&klp_mutex); | 788 | mutex_unlock(&klp_mutex); |
900 | return ret; | 789 | return ret; |
901 | } | 790 | } |
@@ -908,17 +797,18 @@ EXPORT_SYMBOL_GPL(klp_unregister_patch); | |||
908 | * Initializes the data structure associated with the patch and | 797 | * Initializes the data structure associated with the patch and |
909 | * creates the sysfs interface. | 798 | * creates the sysfs interface. |
910 | * | 799 | * |
800 | * There is no need to take the reference on the patch module here. It is done | ||
801 | * later when the patch is enabled. | ||
802 | * | ||
911 | * Return: 0 on success, otherwise error | 803 | * Return: 0 on success, otherwise error |
912 | */ | 804 | */ |
913 | int klp_register_patch(struct klp_patch *patch) | 805 | int klp_register_patch(struct klp_patch *patch) |
914 | { | 806 | { |
915 | int ret; | ||
916 | |||
917 | if (!patch || !patch->mod) | 807 | if (!patch || !patch->mod) |
918 | return -EINVAL; | 808 | return -EINVAL; |
919 | 809 | ||
920 | if (!is_livepatch_module(patch->mod)) { | 810 | if (!is_livepatch_module(patch->mod)) { |
921 | pr_err("module %s is not marked as a livepatch module", | 811 | pr_err("module %s is not marked as a livepatch module\n", |
922 | patch->mod->name); | 812 | patch->mod->name); |
923 | return -EINVAL; | 813 | return -EINVAL; |
924 | } | 814 | } |
@@ -927,20 +817,16 @@ int klp_register_patch(struct klp_patch *patch) | |||
927 | return -ENODEV; | 817 | return -ENODEV; |
928 | 818 | ||
929 | /* | 819 | /* |
930 | * A reference is taken on the patch module to prevent it from being | 820 | * Architectures without reliable stack traces have to set |
931 | * unloaded. Right now, we don't allow patch modules to unload since | 821 | * patch->immediate because there's currently no way to patch kthreads |
932 | * there is currently no method to determine if a thread is still | 822 | * with the consistency model. |
933 | * running in the patched code contained in the patch module once | ||
934 | * the ftrace registration is successful. | ||
935 | */ | 823 | */ |
936 | if (!try_module_get(patch->mod)) | 824 | if (!klp_have_reliable_stack() && !patch->immediate) { |
937 | return -ENODEV; | 825 | pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); |
938 | 826 | return -ENOSYS; | |
939 | ret = klp_init_patch(patch); | 827 | } |
940 | if (ret) | ||
941 | module_put(patch->mod); | ||
942 | 828 | ||
943 | return ret; | 829 | return klp_init_patch(patch); |
944 | } | 830 | } |
945 | EXPORT_SYMBOL_GPL(klp_register_patch); | 831 | EXPORT_SYMBOL_GPL(klp_register_patch); |
946 | 832 | ||
@@ -975,13 +861,17 @@ int klp_module_coming(struct module *mod) | |||
975 | goto err; | 861 | goto err; |
976 | } | 862 | } |
977 | 863 | ||
978 | if (patch->state == KLP_DISABLED) | 864 | /* |
865 | * Only patch the module if the patch is enabled or is | ||
866 | * in transition. | ||
867 | */ | ||
868 | if (!patch->enabled && patch != klp_transition_patch) | ||
979 | break; | 869 | break; |
980 | 870 | ||
981 | pr_notice("applying patch '%s' to loading module '%s'\n", | 871 | pr_notice("applying patch '%s' to loading module '%s'\n", |
982 | patch->mod->name, obj->mod->name); | 872 | patch->mod->name, obj->mod->name); |
983 | 873 | ||
984 | ret = klp_enable_object(obj); | 874 | ret = klp_patch_object(obj); |
985 | if (ret) { | 875 | if (ret) { |
986 | pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", | 876 | pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", |
987 | patch->mod->name, obj->mod->name, ret); | 877 | patch->mod->name, obj->mod->name, ret); |
@@ -1032,10 +922,14 @@ void klp_module_going(struct module *mod) | |||
1032 | if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) | 922 | if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) |
1033 | continue; | 923 | continue; |
1034 | 924 | ||
1035 | if (patch->state != KLP_DISABLED) { | 925 | /* |
926 | * Only unpatch the module if the patch is enabled or | ||
927 | * is in transition. | ||
928 | */ | ||
929 | if (patch->enabled || patch == klp_transition_patch) { | ||
1036 | pr_notice("reverting patch '%s' on unloading module '%s'\n", | 930 | pr_notice("reverting patch '%s' on unloading module '%s'\n", |
1037 | patch->mod->name, obj->mod->name); | 931 | patch->mod->name, obj->mod->name); |
1038 | klp_disable_object(obj); | 932 | klp_unpatch_object(obj); |
1039 | } | 933 | } |
1040 | 934 | ||
1041 | klp_free_object_loaded(obj); | 935 | klp_free_object_loaded(obj); |
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h new file mode 100644 index 000000000000..c74f24c47837 --- /dev/null +++ b/kernel/livepatch/core.h | |||
@@ -0,0 +1,6 @@ | |||
1 | #ifndef _LIVEPATCH_CORE_H | ||
2 | #define _LIVEPATCH_CORE_H | ||
3 | |||
4 | extern struct mutex klp_mutex; | ||
5 | |||
6 | #endif /* _LIVEPATCH_CORE_H */ | ||
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c new file mode 100644 index 000000000000..f8269036bf0b --- /dev/null +++ b/kernel/livepatch/patch.c | |||
@@ -0,0 +1,272 @@ | |||
1 | /* | ||
2 | * patch.c - livepatch patching functions | ||
3 | * | ||
4 | * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> | ||
5 | * Copyright (C) 2014 SUSE | ||
6 | * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License | ||
10 | * as published by the Free Software Foundation; either version 2 | ||
11 | * of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
20 | */ | ||
21 | |||
22 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
23 | |||
24 | #include <linux/livepatch.h> | ||
25 | #include <linux/list.h> | ||
26 | #include <linux/ftrace.h> | ||
27 | #include <linux/rculist.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/bug.h> | ||
30 | #include <linux/printk.h> | ||
31 | #include "patch.h" | ||
32 | #include "transition.h" | ||
33 | |||
34 | static LIST_HEAD(klp_ops); | ||
35 | |||
36 | struct klp_ops *klp_find_ops(unsigned long old_addr) | ||
37 | { | ||
38 | struct klp_ops *ops; | ||
39 | struct klp_func *func; | ||
40 | |||
41 | list_for_each_entry(ops, &klp_ops, node) { | ||
42 | func = list_first_entry(&ops->func_stack, struct klp_func, | ||
43 | stack_node); | ||
44 | if (func->old_addr == old_addr) | ||
45 | return ops; | ||
46 | } | ||
47 | |||
48 | return NULL; | ||
49 | } | ||
50 | |||
51 | static void notrace klp_ftrace_handler(unsigned long ip, | ||
52 | unsigned long parent_ip, | ||
53 | struct ftrace_ops *fops, | ||
54 | struct pt_regs *regs) | ||
55 | { | ||
56 | struct klp_ops *ops; | ||
57 | struct klp_func *func; | ||
58 | int patch_state; | ||
59 | |||
60 | ops = container_of(fops, struct klp_ops, fops); | ||
61 | |||
62 | rcu_read_lock(); | ||
63 | |||
64 | func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, | ||
65 | stack_node); | ||
66 | |||
67 | /* | ||
68 | * func should never be NULL because preemption should be disabled here | ||
69 | * and unregister_ftrace_function() does the equivalent of a | ||
70 | * synchronize_sched() before the func_stack removal. | ||
71 | */ | ||
72 | if (WARN_ON_ONCE(!func)) | ||
73 | goto unlock; | ||
74 | |||
75 | /* | ||
76 | * In the enable path, enforce the order of the ops->func_stack and | ||
77 | * func->transition reads. The corresponding write barrier is in | ||
78 | * __klp_enable_patch(). | ||
79 | * | ||
80 | * (Note that this barrier technically isn't needed in the disable | ||
81 | * path. In the rare case where klp_update_patch_state() runs before | ||
82 | * this handler, its TIF_PATCH_PENDING read and this func->transition | ||
83 | * read need to be ordered. But klp_update_patch_state() already | ||
84 | * enforces that.) | ||
85 | */ | ||
86 | smp_rmb(); | ||
87 | |||
88 | if (unlikely(func->transition)) { | ||
89 | |||
90 | /* | ||
91 | * Enforce the order of the func->transition and | ||
92 | * current->patch_state reads. Otherwise we could read an | ||
93 | * out-of-date task state and pick the wrong function. The | ||
94 | * corresponding write barrier is in klp_init_transition(). | ||
95 | */ | ||
96 | smp_rmb(); | ||
97 | |||
98 | patch_state = current->patch_state; | ||
99 | |||
100 | WARN_ON_ONCE(patch_state == KLP_UNDEFINED); | ||
101 | |||
102 | if (patch_state == KLP_UNPATCHED) { | ||
103 | /* | ||
104 | * Use the previously patched version of the function. | ||
105 | * If no previous patches exist, continue with the | ||
106 | * original function. | ||
107 | */ | ||
108 | func = list_entry_rcu(func->stack_node.next, | ||
109 | struct klp_func, stack_node); | ||
110 | |||
111 | if (&func->stack_node == &ops->func_stack) | ||
112 | goto unlock; | ||
113 | } | ||
114 | } | ||
115 | |||
116 | klp_arch_set_pc(regs, (unsigned long)func->new_func); | ||
117 | unlock: | ||
118 | rcu_read_unlock(); | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * Convert a function address into the appropriate ftrace location. | ||
123 | * | ||
124 | * Usually this is just the address of the function, but on some architectures | ||
125 | * it's more complicated so allow them to provide a custom behaviour. | ||
126 | */ | ||
127 | #ifndef klp_get_ftrace_location | ||
128 | static unsigned long klp_get_ftrace_location(unsigned long faddr) | ||
129 | { | ||
130 | return faddr; | ||
131 | } | ||
132 | #endif | ||
133 | |||
134 | static void klp_unpatch_func(struct klp_func *func) | ||
135 | { | ||
136 | struct klp_ops *ops; | ||
137 | |||
138 | if (WARN_ON(!func->patched)) | ||
139 | return; | ||
140 | if (WARN_ON(!func->old_addr)) | ||
141 | return; | ||
142 | |||
143 | ops = klp_find_ops(func->old_addr); | ||
144 | if (WARN_ON(!ops)) | ||
145 | return; | ||
146 | |||
147 | if (list_is_singular(&ops->func_stack)) { | ||
148 | unsigned long ftrace_loc; | ||
149 | |||
150 | ftrace_loc = klp_get_ftrace_location(func->old_addr); | ||
151 | if (WARN_ON(!ftrace_loc)) | ||
152 | return; | ||
153 | |||
154 | WARN_ON(unregister_ftrace_function(&ops->fops)); | ||
155 | WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0)); | ||
156 | |||
157 | list_del_rcu(&func->stack_node); | ||
158 | list_del(&ops->node); | ||
159 | kfree(ops); | ||
160 | } else { | ||
161 | list_del_rcu(&func->stack_node); | ||
162 | } | ||
163 | |||
164 | func->patched = false; | ||
165 | } | ||
166 | |||
167 | static int klp_patch_func(struct klp_func *func) | ||
168 | { | ||
169 | struct klp_ops *ops; | ||
170 | int ret; | ||
171 | |||
172 | if (WARN_ON(!func->old_addr)) | ||
173 | return -EINVAL; | ||
174 | |||
175 | if (WARN_ON(func->patched)) | ||
176 | return -EINVAL; | ||
177 | |||
178 | ops = klp_find_ops(func->old_addr); | ||
179 | if (!ops) { | ||
180 | unsigned long ftrace_loc; | ||
181 | |||
182 | ftrace_loc = klp_get_ftrace_location(func->old_addr); | ||
183 | if (!ftrace_loc) { | ||
184 | pr_err("failed to find location for function '%s'\n", | ||
185 | func->old_name); | ||
186 | return -EINVAL; | ||
187 | } | ||
188 | |||
189 | ops = kzalloc(sizeof(*ops), GFP_KERNEL); | ||
190 | if (!ops) | ||
191 | return -ENOMEM; | ||
192 | |||
193 | ops->fops.func = klp_ftrace_handler; | ||
194 | ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | | ||
195 | FTRACE_OPS_FL_DYNAMIC | | ||
196 | FTRACE_OPS_FL_IPMODIFY; | ||
197 | |||
198 | list_add(&ops->node, &klp_ops); | ||
199 | |||
200 | INIT_LIST_HEAD(&ops->func_stack); | ||
201 | list_add_rcu(&func->stack_node, &ops->func_stack); | ||
202 | |||
203 | ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0); | ||
204 | if (ret) { | ||
205 | pr_err("failed to set ftrace filter for function '%s' (%d)\n", | ||
206 | func->old_name, ret); | ||
207 | goto err; | ||
208 | } | ||
209 | |||
210 | ret = register_ftrace_function(&ops->fops); | ||
211 | if (ret) { | ||
212 | pr_err("failed to register ftrace handler for function '%s' (%d)\n", | ||
213 | func->old_name, ret); | ||
214 | ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0); | ||
215 | goto err; | ||
216 | } | ||
217 | |||
218 | |||
219 | } else { | ||
220 | list_add_rcu(&func->stack_node, &ops->func_stack); | ||
221 | } | ||
222 | |||
223 | func->patched = true; | ||
224 | |||
225 | return 0; | ||
226 | |||
227 | err: | ||
228 | list_del_rcu(&func->stack_node); | ||
229 | list_del(&ops->node); | ||
230 | kfree(ops); | ||
231 | return ret; | ||
232 | } | ||
233 | |||
234 | void klp_unpatch_object(struct klp_object *obj) | ||
235 | { | ||
236 | struct klp_func *func; | ||
237 | |||
238 | klp_for_each_func(obj, func) | ||
239 | if (func->patched) | ||
240 | klp_unpatch_func(func); | ||
241 | |||
242 | obj->patched = false; | ||
243 | } | ||
244 | |||
245 | int klp_patch_object(struct klp_object *obj) | ||
246 | { | ||
247 | struct klp_func *func; | ||
248 | int ret; | ||
249 | |||
250 | if (WARN_ON(obj->patched)) | ||
251 | return -EINVAL; | ||
252 | |||
253 | klp_for_each_func(obj, func) { | ||
254 | ret = klp_patch_func(func); | ||
255 | if (ret) { | ||
256 | klp_unpatch_object(obj); | ||
257 | return ret; | ||
258 | } | ||
259 | } | ||
260 | obj->patched = true; | ||
261 | |||
262 | return 0; | ||
263 | } | ||
264 | |||
265 | void klp_unpatch_objects(struct klp_patch *patch) | ||
266 | { | ||
267 | struct klp_object *obj; | ||
268 | |||
269 | klp_for_each_object(patch, obj) | ||
270 | if (obj->patched) | ||
271 | klp_unpatch_object(obj); | ||
272 | } | ||
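klp_get_ftrace_location() above is deliberately overridable. A hypothetical architecture whose ftrace call site is not at the very first byte of the function could provide something like the following in its asm/livepatch.h (the 4-byte offset is invented for illustration):

/* hypothetical arch override -- not part of this patch */
#define klp_get_ftrace_location klp_get_ftrace_location
static inline unsigned long klp_get_ftrace_location(unsigned long faddr)
{
        /*
         * On this imaginary architecture the mcount/ftrace call sits a
         * fixed 4 bytes into the function prologue.
         */
        return faddr + 4;
}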
diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h new file mode 100644 index 000000000000..0db227170c36 --- /dev/null +++ b/kernel/livepatch/patch.h | |||
@@ -0,0 +1,33 @@ | |||
1 | #ifndef _LIVEPATCH_PATCH_H | ||
2 | #define _LIVEPATCH_PATCH_H | ||
3 | |||
4 | #include <linux/livepatch.h> | ||
5 | #include <linux/list.h> | ||
6 | #include <linux/ftrace.h> | ||
7 | |||
8 | /** | ||
9 | * struct klp_ops - structure for tracking registered ftrace ops structs | ||
10 | * | ||
11 | * A single ftrace_ops is shared between all enabled replacement functions | ||
12 | * (klp_func structs) which have the same old_addr. This allows the switch | ||
13 | * between function versions to happen instantaneously by updating the klp_ops | ||
14 | * struct's func_stack list. The winner is the klp_func at the top of the | ||
15 | * func_stack (front of the list). | ||
16 | * | ||
17 | * @node: node for the global klp_ops list | ||
18 | * @func_stack: list head for the stack of klp_func's (active func is on top) | ||
19 | * @fops: registered ftrace ops struct | ||
20 | */ | ||
21 | struct klp_ops { | ||
22 | struct list_head node; | ||
23 | struct list_head func_stack; | ||
24 | struct ftrace_ops fops; | ||
25 | }; | ||
26 | |||
27 | struct klp_ops *klp_find_ops(unsigned long old_addr); | ||
28 | |||
29 | int klp_patch_object(struct klp_object *obj); | ||
30 | void klp_unpatch_object(struct klp_object *obj); | ||
31 | void klp_unpatch_objects(struct klp_patch *patch); | ||
32 | |||
33 | #endif /* _LIVEPATCH_PATCH_H */ | ||
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c new file mode 100644 index 000000000000..adc0cc64aa4b --- /dev/null +++ b/kernel/livepatch/transition.c | |||
@@ -0,0 +1,553 @@ | |||
1 | /* | ||
2 | * transition.c - Kernel Live Patching transition functions | ||
3 | * | ||
4 | * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version 2 | ||
9 | * of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
18 | */ | ||
19 | |||
20 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
21 | |||
22 | #include <linux/cpu.h> | ||
23 | #include <linux/stacktrace.h> | ||
24 | #include "core.h" | ||
25 | #include "patch.h" | ||
26 | #include "transition.h" | ||
27 | #include "../sched/sched.h" | ||
28 | |||
29 | #define MAX_STACK_ENTRIES 100 | ||
30 | #define STACK_ERR_BUF_SIZE 128 | ||
31 | |||
32 | struct klp_patch *klp_transition_patch; | ||
33 | |||
34 | static int klp_target_state = KLP_UNDEFINED; | ||
35 | |||
36 | /* | ||
37 | * This work can be performed periodically to finish patching or unpatching any | ||
38 | * "straggler" tasks which failed to transition in the first attempt. | ||
39 | */ | ||
40 | static void klp_transition_work_fn(struct work_struct *work) | ||
41 | { | ||
42 | mutex_lock(&klp_mutex); | ||
43 | |||
44 | if (klp_transition_patch) | ||
45 | klp_try_complete_transition(); | ||
46 | |||
47 | mutex_unlock(&klp_mutex); | ||
48 | } | ||
49 | static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn); | ||
50 | |||
51 | /* | ||
52 | * The transition to the target patch state is complete. Clean up the data | ||
53 | * structures. | ||
54 | */ | ||
55 | static void klp_complete_transition(void) | ||
56 | { | ||
57 | struct klp_object *obj; | ||
58 | struct klp_func *func; | ||
59 | struct task_struct *g, *task; | ||
60 | unsigned int cpu; | ||
61 | bool immediate_func = false; | ||
62 | |||
63 | if (klp_target_state == KLP_UNPATCHED) { | ||
64 | /* | ||
65 | * All tasks have transitioned to KLP_UNPATCHED so we can now | ||
66 | * remove the new functions from the func_stack. | ||
67 | */ | ||
68 | klp_unpatch_objects(klp_transition_patch); | ||
69 | |||
70 | /* | ||
71 | * Make sure klp_ftrace_handler() can no longer see functions | ||
72 | * from this patch on the ops->func_stack. Otherwise, after | ||
73 | * func->transition gets cleared, the handler may choose a | ||
74 | * removed function. | ||
75 | */ | ||
76 | synchronize_rcu(); | ||
77 | } | ||
78 | |||
79 | if (klp_transition_patch->immediate) | ||
80 | goto done; | ||
81 | |||
82 | klp_for_each_object(klp_transition_patch, obj) { | ||
83 | klp_for_each_func(obj, func) { | ||
84 | func->transition = false; | ||
85 | if (func->immediate) | ||
86 | immediate_func = true; | ||
87 | } | ||
88 | } | ||
89 | |||
90 | if (klp_target_state == KLP_UNPATCHED && !immediate_func) | ||
91 | module_put(klp_transition_patch->mod); | ||
92 | |||
93 | /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ | ||
94 | if (klp_target_state == KLP_PATCHED) | ||
95 | synchronize_rcu(); | ||
96 | |||
97 | read_lock(&tasklist_lock); | ||
98 | for_each_process_thread(g, task) { | ||
99 | WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); | ||
100 | task->patch_state = KLP_UNDEFINED; | ||
101 | } | ||
102 | read_unlock(&tasklist_lock); | ||
103 | |||
104 | for_each_possible_cpu(cpu) { | ||
105 | task = idle_task(cpu); | ||
106 | WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); | ||
107 | task->patch_state = KLP_UNDEFINED; | ||
108 | } | ||
109 | |||
110 | done: | ||
111 | klp_target_state = KLP_UNDEFINED; | ||
112 | klp_transition_patch = NULL; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * This is called in the error path, to cancel a transition before it has | ||
117 | * started, i.e. klp_init_transition() has been called but | ||
118 | * klp_start_transition() hasn't. If the transition *has* been started, | ||
119 | * klp_reverse_transition() should be used instead. | ||
120 | */ | ||
121 | void klp_cancel_transition(void) | ||
122 | { | ||
123 | if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED)) | ||
124 | return; | ||
125 | |||
126 | klp_target_state = KLP_UNPATCHED; | ||
127 | klp_complete_transition(); | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * Switch the patched state of the task to the set of functions in the target | ||
132 | * patch state. | ||
133 | * | ||
134 | * NOTE: If task is not 'current', the caller must ensure the task is inactive. | ||
135 | * Otherwise klp_ftrace_handler() might read the wrong 'patch_state' value. | ||
136 | */ | ||
137 | void klp_update_patch_state(struct task_struct *task) | ||
138 | { | ||
139 | rcu_read_lock(); | ||
140 | |||
141 | /* | ||
142 | * This test_and_clear_tsk_thread_flag() call also serves as a read | ||
143 | * barrier (smp_rmb) for two cases: | ||
144 | * | ||
145 | * 1) Enforce the order of the TIF_PATCH_PENDING read and the | ||
146 | * klp_target_state read. The corresponding write barrier is in | ||
147 | * klp_init_transition(). | ||
148 | * | ||
149 | * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read | ||
150 | * of func->transition, if klp_ftrace_handler() is called later on | ||
151 | * the same CPU. See __klp_disable_patch(). | ||
152 | */ | ||
153 | if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING)) | ||
154 | task->patch_state = READ_ONCE(klp_target_state); | ||
155 | |||
156 | rcu_read_unlock(); | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * Determine whether the given stack trace includes any references to a | ||
161 | * to-be-patched or to-be-unpatched function. | ||
162 | */ | ||
163 | static int klp_check_stack_func(struct klp_func *func, | ||
164 | struct stack_trace *trace) | ||
165 | { | ||
166 | unsigned long func_addr, func_size, address; | ||
167 | struct klp_ops *ops; | ||
168 | int i; | ||
169 | |||
170 | if (func->immediate) | ||
171 | return 0; | ||
172 | |||
173 | for (i = 0; i < trace->nr_entries; i++) { | ||
174 | address = trace->entries[i]; | ||
175 | |||
176 | if (klp_target_state == KLP_UNPATCHED) { | ||
177 | /* | ||
178 | * Check for the to-be-unpatched function | ||
179 | * (the func itself). | ||
180 | */ | ||
181 | func_addr = (unsigned long)func->new_func; | ||
182 | func_size = func->new_size; | ||
183 | } else { | ||
184 | /* | ||
185 | * Check for the to-be-patched function | ||
186 | * (the previous func). | ||
187 | */ | ||
188 | ops = klp_find_ops(func->old_addr); | ||
189 | |||
190 | if (list_is_singular(&ops->func_stack)) { | ||
191 | /* original function */ | ||
192 | func_addr = func->old_addr; | ||
193 | func_size = func->old_size; | ||
194 | } else { | ||
195 | /* previously patched function */ | ||
196 | struct klp_func *prev; | ||
197 | |||
198 | prev = list_next_entry(func, stack_node); | ||
199 | func_addr = (unsigned long)prev->new_func; | ||
200 | func_size = prev->new_size; | ||
201 | } | ||
202 | } | ||
203 | |||
204 | if (address >= func_addr && address < func_addr + func_size) | ||
205 | return -EAGAIN; | ||
206 | } | ||
207 | |||
208 | return 0; | ||
209 | } | ||
210 | |||
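The core of klp_check_stack_func() is a half-open range test: a saved return address blocks the switch if it lands anywhere inside the relevant function body. A small userspace sketch of that test (addresses and sizes below are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Half-open containment test, mirroring the check in klp_check_stack_func(). */
static bool addr_in_func(unsigned long addr, unsigned long func_addr,
			 unsigned long func_size)
{
	return addr >= func_addr && addr < func_addr + func_size;
}

int main(void)
{
	/* Hypothetical function body spanning [0x1000, 0x1080). */
	unsigned long func_addr = 0x1000, func_size = 0x80;
	unsigned long trace[] = { 0x2000, 0x1010, 0x3000 };

	for (unsigned i = 0; i < sizeof(trace) / sizeof(trace[0]); i++)
		if (addr_in_func(trace[i], func_addr, func_size))
			printf("entry %u blocks the transition (-EAGAIN)\n", i);
	return 0;
}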
211 | /* | ||
212 | * Determine whether it's safe to transition the task to the target patch state | ||
213 | * by looking for any to-be-patched or to-be-unpatched functions on its stack. | ||
214 | */ | ||
215 | static int klp_check_stack(struct task_struct *task, char *err_buf) | ||
216 | { | ||
217 | static unsigned long entries[MAX_STACK_ENTRIES]; | ||
218 | struct stack_trace trace; | ||
219 | struct klp_object *obj; | ||
220 | struct klp_func *func; | ||
221 | int ret; | ||
222 | |||
223 | trace.skip = 0; | ||
224 | trace.nr_entries = 0; | ||
225 | trace.max_entries = MAX_STACK_ENTRIES; | ||
226 | trace.entries = entries; | ||
227 | ret = save_stack_trace_tsk_reliable(task, &trace); | ||
228 | WARN_ON_ONCE(ret == -ENOSYS); | ||
229 | if (ret) { | ||
230 | snprintf(err_buf, STACK_ERR_BUF_SIZE, | ||
231 | "%s: %s:%d has an unreliable stack\n", | ||
232 | __func__, task->comm, task->pid); | ||
233 | return ret; | ||
234 | } | ||
235 | |||
236 | klp_for_each_object(klp_transition_patch, obj) { | ||
237 | if (!obj->patched) | ||
238 | continue; | ||
239 | klp_for_each_func(obj, func) { | ||
240 | ret = klp_check_stack_func(func, &trace); | ||
241 | if (ret) { | ||
242 | snprintf(err_buf, STACK_ERR_BUF_SIZE, | ||
243 | "%s: %s:%d is sleeping on function %s\n", | ||
244 | __func__, task->comm, task->pid, | ||
245 | func->old_name); | ||
246 | return ret; | ||
247 | } | ||
248 | } | ||
249 | } | ||
250 | |||
251 | return 0; | ||
252 | } | ||
253 | |||
254 | /* | ||
255 | * Try to safely switch a task to the target patch state. If it's currently | ||
256 | * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or | ||
257 | * if the stack is unreliable, return false. | ||
258 | */ | ||
259 | static bool klp_try_switch_task(struct task_struct *task) | ||
260 | { | ||
261 | struct rq *rq; | ||
262 | struct rq_flags flags; | ||
263 | int ret; | ||
264 | bool success = false; | ||
265 | char err_buf[STACK_ERR_BUF_SIZE]; | ||
266 | |||
267 | err_buf[0] = '\0'; | ||
268 | |||
269 | /* check if this task has already switched over */ | ||
270 | if (task->patch_state == klp_target_state) | ||
271 | return true; | ||
272 | |||
273 | /* | ||
274 | * For arches which don't have reliable stack traces, we have to rely | ||
275 | * on other methods (e.g., switching tasks at kernel exit). | ||
276 | */ | ||
277 | if (!klp_have_reliable_stack()) | ||
278 | return false; | ||
279 | |||
280 | /* | ||
281 | * Now try to check the stack for any to-be-patched or to-be-unpatched | ||
282 | * functions. If all goes well, switch the task to the target patch | ||
283 | * state. | ||
284 | */ | ||
285 | rq = task_rq_lock(task, &flags); | ||
286 | |||
287 | if (task_running(rq, task) && task != current) { | ||
288 | snprintf(err_buf, STACK_ERR_BUF_SIZE, | ||
289 | "%s: %s:%d is running\n", __func__, task->comm, | ||
290 | task->pid); | ||
291 | goto done; | ||
292 | } | ||
293 | |||
294 | ret = klp_check_stack(task, err_buf); | ||
295 | if (ret) | ||
296 | goto done; | ||
297 | |||
298 | success = true; | ||
299 | |||
300 | clear_tsk_thread_flag(task, TIF_PATCH_PENDING); | ||
301 | task->patch_state = klp_target_state; | ||
302 | |||
303 | done: | ||
304 | task_rq_unlock(rq, task, &flags); | ||
305 | |||
306 | /* | ||
307 | * Due to console deadlock issues, pr_debug() can't be used while | ||
308 | * holding the task rq lock. Instead we have to use a temporary buffer | ||
309 | * and print the debug message after releasing the lock. | ||
310 | */ | ||
311 | if (err_buf[0] != '\0') | ||
312 | pr_debug("%s", err_buf); | ||
313 | |||
314 | return success; | ||
315 | |||
316 | } | ||
317 | |||
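The pattern at the end of klp_try_switch_task() — format the message into a local buffer while the run-queue lock is held and only print it after the lock is dropped — is useful whenever the logging path may itself take locks. A userspace sketch of the same idea, with a pthread mutex standing in for the rq lock:

#include <pthread.h>
#include <stdio.h>

#define ERR_BUF_SIZE 128

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

static int do_checked_work(int task_ok, char *err_buf)
{
	int success = 0;

	err_buf[0] = '\0';

	pthread_mutex_lock(&rq_lock);
	if (!task_ok)
		/* Defer the message: printing under the lock could deadlock. */
		snprintf(err_buf, ERR_BUF_SIZE, "task is running, cannot switch\n");
	else
		success = 1;
	pthread_mutex_unlock(&rq_lock);

	/* Safe to print now that the lock has been released. */
	if (err_buf[0] != '\0')
		fputs(err_buf, stderr);

	return success;
}

int main(void)
{
	char buf[ERR_BUF_SIZE];

	printf("switched: %d\n", do_checked_work(0, buf));
	printf("switched: %d\n", do_checked_work(1, buf));
	return 0;
}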
318 | /* | ||
319 | * Try to switch all remaining tasks to the target patch state by walking the | ||
320 | * stacks of sleeping tasks and looking for any to-be-patched or | ||
321 | * to-be-unpatched functions. If such functions are found, the task can't be | ||
322 | * switched yet. | ||
323 | * | ||
324 | * If any tasks are still stuck in the initial patch state, schedule a retry. | ||
325 | */ | ||
326 | void klp_try_complete_transition(void) | ||
327 | { | ||
328 | unsigned int cpu; | ||
329 | struct task_struct *g, *task; | ||
330 | bool complete = true; | ||
331 | |||
332 | WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); | ||
333 | |||
334 | /* | ||
335 | * If the patch can be applied or reverted immediately, skip the | ||
336 | * per-task transitions. | ||
337 | */ | ||
338 | if (klp_transition_patch->immediate) | ||
339 | goto success; | ||
340 | |||
341 | /* | ||
342 | * Try to switch the tasks to the target patch state by walking their | ||
343 | * stacks and looking for any to-be-patched or to-be-unpatched | ||
344 | * functions. If such functions are found on a stack, or if the stack | ||
345 | * is deemed unreliable, the task can't be switched yet. | ||
346 | * | ||
347 | * Usually this will transition most (or all) of the tasks on a system | ||
348 | * unless the patch includes changes to a very common function. | ||
349 | */ | ||
350 | read_lock(&tasklist_lock); | ||
351 | for_each_process_thread(g, task) | ||
352 | if (!klp_try_switch_task(task)) | ||
353 | complete = false; | ||
354 | read_unlock(&tasklist_lock); | ||
355 | |||
356 | /* | ||
357 | * Ditto for the idle "swapper" tasks. | ||
358 | */ | ||
359 | get_online_cpus(); | ||
360 | for_each_possible_cpu(cpu) { | ||
361 | task = idle_task(cpu); | ||
362 | if (cpu_online(cpu)) { | ||
363 | if (!klp_try_switch_task(task)) | ||
364 | complete = false; | ||
365 | } else if (task->patch_state != klp_target_state) { | ||
366 | /* offline idle tasks can be switched immediately */ | ||
367 | clear_tsk_thread_flag(task, TIF_PATCH_PENDING); | ||
368 | task->patch_state = klp_target_state; | ||
369 | } | ||
370 | } | ||
371 | put_online_cpus(); | ||
372 | |||
373 | if (!complete) { | ||
374 | /* | ||
375 | * Some tasks weren't able to be switched over. Try again | ||
376 | * later and/or wait for other methods like kernel exit | ||
377 | * switching. | ||
378 | */ | ||
379 | schedule_delayed_work(&klp_transition_work, | ||
380 | round_jiffies_relative(HZ)); | ||
381 | return; | ||
382 | } | ||
383 | |||
384 | success: | ||
385 | pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, | ||
386 | klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); | ||
387 | |||
388 | /* we're done, now cleanup the data structures */ | ||
389 | klp_complete_transition(); | ||
390 | } | ||
391 | |||
392 | /* | ||
393 | * Start the transition to the specified target patch state so tasks can begin | ||
394 | * switching to it. | ||
395 | */ | ||
396 | void klp_start_transition(void) | ||
397 | { | ||
398 | struct task_struct *g, *task; | ||
399 | unsigned int cpu; | ||
400 | |||
401 | WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); | ||
402 | |||
403 | pr_notice("'%s': %s...\n", klp_transition_patch->mod->name, | ||
404 | klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); | ||
405 | |||
406 | /* | ||
407 | * If the patch can be applied or reverted immediately, skip the | ||
408 | * per-task transitions. | ||
409 | */ | ||
410 | if (klp_transition_patch->immediate) | ||
411 | return; | ||
412 | |||
413 | /* | ||
414 | * Mark all normal tasks as needing a patch state update. They'll | ||
415 | * switch either in klp_try_complete_transition() or as they exit the | ||
416 | * kernel. | ||
417 | */ | ||
418 | read_lock(&tasklist_lock); | ||
419 | for_each_process_thread(g, task) | ||
420 | if (task->patch_state != klp_target_state) | ||
421 | set_tsk_thread_flag(task, TIF_PATCH_PENDING); | ||
422 | read_unlock(&tasklist_lock); | ||
423 | |||
424 | /* | ||
425 | * Mark all idle tasks as needing a patch state update. They'll switch | ||
426 | * either in klp_try_complete_transition() or at the idle loop switch | ||
427 | * point. | ||
428 | */ | ||
429 | for_each_possible_cpu(cpu) { | ||
430 | task = idle_task(cpu); | ||
431 | if (task->patch_state != klp_target_state) | ||
432 | set_tsk_thread_flag(task, TIF_PATCH_PENDING); | ||
433 | } | ||
434 | } | ||
435 | |||
436 | /* | ||
437 | * Initialize the global target patch state and all tasks to the initial patch | ||
438 | * state, and initialize all function transition states to true in preparation | ||
439 | * for patching or unpatching. | ||
440 | */ | ||
441 | void klp_init_transition(struct klp_patch *patch, int state) | ||
442 | { | ||
443 | struct task_struct *g, *task; | ||
444 | unsigned int cpu; | ||
445 | struct klp_object *obj; | ||
446 | struct klp_func *func; | ||
447 | int initial_state = !state; | ||
448 | |||
449 | WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED); | ||
450 | |||
451 | klp_transition_patch = patch; | ||
452 | |||
453 | /* | ||
454 | * Set the global target patch state which tasks will switch to. This | ||
455 | * has no effect until the TIF_PATCH_PENDING flags get set later. | ||
456 | */ | ||
457 | klp_target_state = state; | ||
458 | |||
459 | /* | ||
460 | * If the patch can be applied or reverted immediately, skip the | ||
461 | * per-task transitions. | ||
462 | */ | ||
463 | if (patch->immediate) | ||
464 | return; | ||
465 | |||
466 | /* | ||
467 | * Initialize all tasks to the initial patch state to prepare them for | ||
468 | * switching to the target state. | ||
469 | */ | ||
470 | read_lock(&tasklist_lock); | ||
471 | for_each_process_thread(g, task) { | ||
472 | WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); | ||
473 | task->patch_state = initial_state; | ||
474 | } | ||
475 | read_unlock(&tasklist_lock); | ||
476 | |||
477 | /* | ||
478 | * Ditto for the idle "swapper" tasks. | ||
479 | */ | ||
480 | for_each_possible_cpu(cpu) { | ||
481 | task = idle_task(cpu); | ||
482 | WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); | ||
483 | task->patch_state = initial_state; | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Enforce the order of the task->patch_state initializations and the | ||
488 | * func->transition updates to ensure that klp_ftrace_handler() doesn't | ||
489 | * see a func in transition with a task->patch_state of KLP_UNDEFINED. | ||
490 | * | ||
491 | * Also enforce the order of the klp_target_state write and future | ||
492 | * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't | ||
493 | * set a task->patch_state to KLP_UNDEFINED. | ||
494 | */ | ||
495 | smp_wmb(); | ||
496 | |||
497 | /* | ||
498 | * Set the func transition states so klp_ftrace_handler() will know to | ||
499 | * switch to the transition logic. | ||
500 | * | ||
501 | * When patching, the funcs aren't yet in the func_stack and will be | ||
502 | * made visible to the ftrace handler shortly by the calls to | ||
503 | * klp_patch_object(). | ||
504 | * | ||
505 | * When unpatching, the funcs are already in the func_stack and so are | ||
506 | * already visible to the ftrace handler. | ||
507 | */ | ||
508 | klp_for_each_object(patch, obj) | ||
509 | klp_for_each_func(obj, func) | ||
510 | func->transition = true; | ||
511 | } | ||
512 | |||
513 | /* | ||
514 | * This function can be called in the middle of an existing transition to | ||
515 | * reverse the direction of the target patch state. This can be done to | ||
516 | * effectively cancel an existing enable or disable operation if there are any | ||
517 | * tasks which are stuck in the initial patch state. | ||
518 | */ | ||
519 | void klp_reverse_transition(void) | ||
520 | { | ||
521 | unsigned int cpu; | ||
522 | struct task_struct *g, *task; | ||
523 | |||
524 | klp_transition_patch->enabled = !klp_transition_patch->enabled; | ||
525 | |||
526 | klp_target_state = !klp_target_state; | ||
527 | |||
528 | /* | ||
529 | * Clear all TIF_PATCH_PENDING flags to prevent races caused by | ||
530 | * klp_update_patch_state() running in parallel with | ||
531 | * klp_start_transition(). | ||
532 | */ | ||
533 | read_lock(&tasklist_lock); | ||
534 | for_each_process_thread(g, task) | ||
535 | clear_tsk_thread_flag(task, TIF_PATCH_PENDING); | ||
536 | read_unlock(&tasklist_lock); | ||
537 | |||
538 | for_each_possible_cpu(cpu) | ||
539 | clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING); | ||
540 | |||
541 | /* Let any remaining calls to klp_update_patch_state() complete */ | ||
542 | synchronize_rcu(); | ||
543 | |||
544 | klp_start_transition(); | ||
545 | } | ||
546 | |||
547 | /* Called from copy_process() during fork */ | ||
548 | void klp_copy_process(struct task_struct *child) | ||
549 | { | ||
550 | child->patch_state = current->patch_state; | ||
551 | |||
552 | /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */ | ||
553 | } | ||
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h new file mode 100644 index 000000000000..ce09b326546c --- /dev/null +++ b/kernel/livepatch/transition.h | |||
@@ -0,0 +1,14 @@ | |||
1 | #ifndef _LIVEPATCH_TRANSITION_H | ||
2 | #define _LIVEPATCH_TRANSITION_H | ||
3 | |||
4 | #include <linux/livepatch.h> | ||
5 | |||
6 | extern struct klp_patch *klp_transition_patch; | ||
7 | |||
8 | void klp_init_transition(struct klp_patch *patch, int state); | ||
9 | void klp_cancel_transition(void); | ||
10 | void klp_start_transition(void); | ||
11 | void klp_try_complete_transition(void); | ||
12 | void klp_reverse_transition(void); | ||
13 | |||
14 | #endif /* _LIVEPATCH_TRANSITION_H */ | ||
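Taken together, transition.h describes a small state machine: klp_init_transition() picks the target state, klp_start_transition() marks tasks as pending, klp_try_complete_transition() retries until every task has switched, and klp_cancel_transition()/klp_reverse_transition() cover the error and abort paths. A userspace mock of the call ordering — stub functions only, no claim about kernel behaviour, single retry counter for brevity:

#include <stdbool.h>
#include <stdio.h>

enum { KLP_UNDEFINED = -1, KLP_UNPATCHED = 0, KLP_PATCHED = 1 };

static int target_state = KLP_UNDEFINED;
static int tasks_pending;

static void mock_init_transition(int state)  { target_state = state; }
static void mock_start_transition(void)      { tasks_pending = 3; }

/* Pretend one task per call can be switched over. */
static bool mock_try_complete_transition(void)
{
	if (tasks_pending > 0)
		tasks_pending--;
	return tasks_pending == 0;
}

int main(void)
{
	mock_init_transition(KLP_PATCHED);
	mock_start_transition();

	/* The kernel reschedules delayed work; here we simply loop. */
	while (!mock_try_complete_transition())
		printf("some tasks still pending, retrying\n");

	target_state = KLP_UNDEFINED;
	printf("patching complete, target reset to %d\n", target_state);
	return 0;
}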
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a95e5d1f4a9c..c0e31bfee25c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/sched.h> | 30 | #include <linux/sched.h> |
31 | #include <linux/sched/clock.h> | 31 | #include <linux/sched/clock.h> |
32 | #include <linux/sched/task.h> | 32 | #include <linux/sched/task.h> |
33 | #include <linux/sched/mm.h> | ||
33 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
34 | #include <linux/module.h> | 35 | #include <linux/module.h> |
35 | #include <linux/proc_fs.h> | 36 | #include <linux/proc_fs.h> |
@@ -660,6 +661,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
660 | struct lockdep_subclass_key *key; | 661 | struct lockdep_subclass_key *key; |
661 | struct hlist_head *hash_head; | 662 | struct hlist_head *hash_head; |
662 | struct lock_class *class; | 663 | struct lock_class *class; |
664 | bool is_static = false; | ||
663 | 665 | ||
664 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { | 666 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { |
665 | debug_locks_off(); | 667 | debug_locks_off(); |
@@ -673,10 +675,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
673 | 675 | ||
674 | /* | 676 | /* |
675 | * Static locks do not have their class-keys yet - for them the key | 677 | * Static locks do not have their class-keys yet - for them the key |
676 | * is the lock object itself: | 678 | * is the lock object itself. If the lock is in the per cpu area, |
679 | * the canonical address of the lock (per cpu offset removed) is | ||
680 | * used. | ||
677 | */ | 681 | */ |
678 | if (unlikely(!lock->key)) | 682 | if (unlikely(!lock->key)) { |
679 | lock->key = (void *)lock; | 683 | unsigned long can_addr, addr = (unsigned long)lock; |
684 | |||
685 | if (__is_kernel_percpu_address(addr, &can_addr)) | ||
686 | lock->key = (void *)can_addr; | ||
687 | else if (__is_module_percpu_address(addr, &can_addr)) | ||
688 | lock->key = (void *)can_addr; | ||
689 | else if (static_obj(lock)) | ||
690 | lock->key = (void *)lock; | ||
691 | else | ||
692 | return ERR_PTR(-EINVAL); | ||
693 | is_static = true; | ||
694 | } | ||
680 | 695 | ||
681 | /* | 696 | /* |
682 | * NOTE: the class-key must be unique. For dynamic locks, a static | 697 | * NOTE: the class-key must be unique. For dynamic locks, a static |
@@ -708,7 +723,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
708 | } | 723 | } |
709 | } | 724 | } |
710 | 725 | ||
711 | return NULL; | 726 | return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); |
712 | } | 727 | } |
713 | 728 | ||
714 | /* | 729 | /* |
@@ -726,19 +741,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
726 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); | 741 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); |
727 | 742 | ||
728 | class = look_up_lock_class(lock, subclass); | 743 | class = look_up_lock_class(lock, subclass); |
729 | if (likely(class)) | 744 | if (likely(!IS_ERR_OR_NULL(class))) |
730 | goto out_set_class_cache; | 745 | goto out_set_class_cache; |
731 | 746 | ||
732 | /* | 747 | /* |
733 | * Debug-check: all keys must be persistent! | 748 | * Debug-check: all keys must be persistent! |
734 | */ | 749 | */ |
735 | if (!static_obj(lock->key)) { | 750 | if (IS_ERR(class)) { |
736 | debug_locks_off(); | 751 | debug_locks_off(); |
737 | printk("INFO: trying to register non-static key.\n"); | 752 | printk("INFO: trying to register non-static key.\n"); |
738 | printk("the code is fine but needs lockdep annotation.\n"); | 753 | printk("the code is fine but needs lockdep annotation.\n"); |
739 | printk("turning off the locking correctness validator.\n"); | 754 | printk("turning off the locking correctness validator.\n"); |
740 | dump_stack(); | 755 | dump_stack(); |
741 | |||
742 | return NULL; | 756 | return NULL; |
743 | } | 757 | } |
744 | 758 | ||
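The look_up_lock_class() change keys statically allocated per-CPU locks by their canonical address, i.e. the address with the per-CPU offset removed, so every CPU's copy of the same lock maps to a single lock class. A simplified userspace model of that canonicalisation — the offset table and addresses below are invented for illustration, not the kernel's __per_cpu_offset:

#include <stdio.h>

#define NR_CPUS 4

/* Invented stand-in for the kernel's per-CPU offsets. */
static unsigned long per_cpu_offset[NR_CPUS] = { 0x0, 0x10000, 0x20000, 0x30000 };

/*
 * Given the address of a CPU-local copy of a variable, return the canonical
 * address so that all copies share one lockdep key.
 */
static unsigned long canonical_addr(unsigned long addr, int cpu)
{
	return addr - per_cpu_offset[cpu];
}

int main(void)
{
	unsigned long base = 0x4000;	/* hypothetical canonical address */

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		unsigned long cpu_copy = base + per_cpu_offset[cpu];

		printf("cpu%d copy at %#lx -> key %#lx\n",
		       cpu, cpu_copy, canonical_addr(cpu_copy, cpu));
	}
	return 0;
}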
@@ -1144,10 +1158,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, | |||
1144 | return 0; | 1158 | return 0; |
1145 | 1159 | ||
1146 | printk("\n"); | 1160 | printk("\n"); |
1147 | printk("======================================================\n"); | 1161 | pr_warn("======================================================\n"); |
1148 | printk("[ INFO: possible circular locking dependency detected ]\n"); | 1162 | pr_warn("WARNING: possible circular locking dependency detected\n"); |
1149 | print_kernel_ident(); | 1163 | print_kernel_ident(); |
1150 | printk("-------------------------------------------------------\n"); | 1164 | pr_warn("------------------------------------------------------\n"); |
1151 | printk("%s/%d is trying to acquire lock:\n", | 1165 | printk("%s/%d is trying to acquire lock:\n", |
1152 | curr->comm, task_pid_nr(curr)); | 1166 | curr->comm, task_pid_nr(curr)); |
1153 | print_lock(check_src); | 1167 | print_lock(check_src); |
@@ -1482,11 +1496,11 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1482 | return 0; | 1496 | return 0; |
1483 | 1497 | ||
1484 | printk("\n"); | 1498 | printk("\n"); |
1485 | printk("======================================================\n"); | 1499 | pr_warn("=====================================================\n"); |
1486 | printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", | 1500 | pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", |
1487 | irqclass, irqclass); | 1501 | irqclass, irqclass); |
1488 | print_kernel_ident(); | 1502 | print_kernel_ident(); |
1489 | printk("------------------------------------------------------\n"); | 1503 | pr_warn("-----------------------------------------------------\n"); |
1490 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", | 1504 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", |
1491 | curr->comm, task_pid_nr(curr), | 1505 | curr->comm, task_pid_nr(curr), |
1492 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, | 1506 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, |
@@ -1711,10 +1725,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
1711 | return 0; | 1725 | return 0; |
1712 | 1726 | ||
1713 | printk("\n"); | 1727 | printk("\n"); |
1714 | printk("=============================================\n"); | 1728 | pr_warn("============================================\n"); |
1715 | printk("[ INFO: possible recursive locking detected ]\n"); | 1729 | pr_warn("WARNING: possible recursive locking detected\n"); |
1716 | print_kernel_ident(); | 1730 | print_kernel_ident(); |
1717 | printk("---------------------------------------------\n"); | 1731 | pr_warn("--------------------------------------------\n"); |
1718 | printk("%s/%d is trying to acquire lock:\n", | 1732 | printk("%s/%d is trying to acquire lock:\n", |
1719 | curr->comm, task_pid_nr(curr)); | 1733 | curr->comm, task_pid_nr(curr)); |
1720 | print_lock(next); | 1734 | print_lock(next); |
@@ -2061,10 +2075,10 @@ static void print_collision(struct task_struct *curr, | |||
2061 | struct lock_chain *chain) | 2075 | struct lock_chain *chain) |
2062 | { | 2076 | { |
2063 | printk("\n"); | 2077 | printk("\n"); |
2064 | printk("======================\n"); | 2078 | pr_warn("============================\n"); |
2065 | printk("[chain_key collision ]\n"); | 2079 | pr_warn("WARNING: chain_key collision\n"); |
2066 | print_kernel_ident(); | 2080 | print_kernel_ident(); |
2067 | printk("----------------------\n"); | 2081 | pr_warn("----------------------------\n"); |
2068 | printk("%s/%d: ", current->comm, task_pid_nr(current)); | 2082 | printk("%s/%d: ", current->comm, task_pid_nr(current)); |
2069 | printk("Hash chain already cached but the contents don't match!\n"); | 2083 | printk("Hash chain already cached but the contents don't match!\n"); |
2070 | 2084 | ||
@@ -2360,10 +2374,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
2360 | return 0; | 2374 | return 0; |
2361 | 2375 | ||
2362 | printk("\n"); | 2376 | printk("\n"); |
2363 | printk("=================================\n"); | 2377 | pr_warn("================================\n"); |
2364 | printk("[ INFO: inconsistent lock state ]\n"); | 2378 | pr_warn("WARNING: inconsistent lock state\n"); |
2365 | print_kernel_ident(); | 2379 | print_kernel_ident(); |
2366 | printk("---------------------------------\n"); | 2380 | pr_warn("--------------------------------\n"); |
2367 | 2381 | ||
2368 | printk("inconsistent {%s} -> {%s} usage.\n", | 2382 | printk("inconsistent {%s} -> {%s} usage.\n", |
2369 | usage_str[prev_bit], usage_str[new_bit]); | 2383 | usage_str[prev_bit], usage_str[new_bit]); |
@@ -2425,10 +2439,10 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2425 | return 0; | 2439 | return 0; |
2426 | 2440 | ||
2427 | printk("\n"); | 2441 | printk("\n"); |
2428 | printk("=========================================================\n"); | 2442 | pr_warn("========================================================\n"); |
2429 | printk("[ INFO: possible irq lock inversion dependency detected ]\n"); | 2443 | pr_warn("WARNING: possible irq lock inversion dependency detected\n"); |
2430 | print_kernel_ident(); | 2444 | print_kernel_ident(); |
2431 | printk("---------------------------------------------------------\n"); | 2445 | pr_warn("--------------------------------------------------------\n"); |
2432 | printk("%s/%d just changed the state of lock:\n", | 2446 | printk("%s/%d just changed the state of lock:\n", |
2433 | curr->comm, task_pid_nr(curr)); | 2447 | curr->comm, task_pid_nr(curr)); |
2434 | print_lock(this); | 2448 | print_lock(this); |
@@ -2863,6 +2877,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) | |||
2863 | if (unlikely(!debug_locks)) | 2877 | if (unlikely(!debug_locks)) |
2864 | return; | 2878 | return; |
2865 | 2879 | ||
2880 | gfp_mask = current_gfp_context(gfp_mask); | ||
2881 | |||
2866 | /* no reclaim without waiting on it */ | 2882 | /* no reclaim without waiting on it */ |
2867 | if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) | 2883 | if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) |
2868 | return; | 2884 | return; |
@@ -2872,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) | |||
2872 | return; | 2888 | return; |
2873 | 2889 | ||
2874 | /* We're only interested in __GFP_FS allocations for now */ | 2890 | /* We're only interested in __GFP_FS allocations for now */ |

2875 | if (!(gfp_mask & __GFP_FS)) | 2891 | if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) |
2876 | return; | 2892 | return; |
2877 | 2893 | ||
2878 | /* | 2894 | /* |
@@ -2881,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) | |||
2881 | if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) | 2897 | if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) |
2882 | return; | 2898 | return; |
2883 | 2899 | ||
2900 | /* Disable lockdep if explicitly requested */ | ||
2901 | if (gfp_mask & __GFP_NOLOCKDEP) | ||
2902 | return; | ||
2903 | |||
2884 | mark_held_locks(curr, RECLAIM_FS); | 2904 | mark_held_locks(curr, RECLAIM_FS); |
2885 | } | 2905 | } |
2886 | 2906 | ||
@@ -3170,10 +3190,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr, | |||
3170 | return 0; | 3190 | return 0; |
3171 | 3191 | ||
3172 | printk("\n"); | 3192 | printk("\n"); |
3173 | printk("==================================\n"); | 3193 | pr_warn("==================================\n"); |
3174 | printk("[ BUG: Nested lock was not taken ]\n"); | 3194 | pr_warn("WARNING: Nested lock was not taken\n"); |
3175 | print_kernel_ident(); | 3195 | print_kernel_ident(); |
3176 | printk("----------------------------------\n"); | 3196 | pr_warn("----------------------------------\n"); |
3177 | 3197 | ||
3178 | printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); | 3198 | printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); |
3179 | print_lock(hlock); | 3199 | print_lock(hlock); |
@@ -3383,10 +3403,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3383 | return 0; | 3403 | return 0; |
3384 | 3404 | ||
3385 | printk("\n"); | 3405 | printk("\n"); |
3386 | printk("=====================================\n"); | 3406 | pr_warn("=====================================\n"); |
3387 | printk("[ BUG: bad unlock balance detected! ]\n"); | 3407 | pr_warn("WARNING: bad unlock balance detected!\n"); |
3388 | print_kernel_ident(); | 3408 | print_kernel_ident(); |
3389 | printk("-------------------------------------\n"); | 3409 | pr_warn("-------------------------------------\n"); |
3390 | printk("%s/%d is trying to release lock (", | 3410 | printk("%s/%d is trying to release lock (", |
3391 | curr->comm, task_pid_nr(curr)); | 3411 | curr->comm, task_pid_nr(curr)); |
3392 | print_lockdep_cache(lock); | 3412 | print_lockdep_cache(lock); |
@@ -3419,7 +3439,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) | |||
3419 | * Clearly if the lock hasn't been acquired _ever_, we're not | 3439 | * Clearly if the lock hasn't been acquired _ever_, we're not |
3420 | * holding it either, so report failure. | 3440 | * holding it either, so report failure. |
3421 | */ | 3441 | */ |
3422 | if (!class) | 3442 | if (IS_ERR_OR_NULL(class)) |
3423 | return 0; | 3443 | return 0; |
3424 | 3444 | ||
3425 | /* | 3445 | /* |
@@ -3437,13 +3457,67 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) | |||
3437 | return 0; | 3457 | return 0; |
3438 | } | 3458 | } |
3439 | 3459 | ||
3460 | /* @depth must not be zero */ | ||
3461 | static struct held_lock *find_held_lock(struct task_struct *curr, | ||
3462 | struct lockdep_map *lock, | ||
3463 | unsigned int depth, int *idx) | ||
3464 | { | ||
3465 | struct held_lock *ret, *hlock, *prev_hlock; | ||
3466 | int i; | ||
3467 | |||
3468 | i = depth - 1; | ||
3469 | hlock = curr->held_locks + i; | ||
3470 | ret = hlock; | ||
3471 | if (match_held_lock(hlock, lock)) | ||
3472 | goto out; | ||
3473 | |||
3474 | ret = NULL; | ||
3475 | for (i--, prev_hlock = hlock--; | ||
3476 | i >= 0; | ||
3477 | i--, prev_hlock = hlock--) { | ||
3478 | /* | ||
3479 | * We must not cross into another context: | ||
3480 | */ | ||
3481 | if (prev_hlock->irq_context != hlock->irq_context) { | ||
3482 | ret = NULL; | ||
3483 | break; | ||
3484 | } | ||
3485 | if (match_held_lock(hlock, lock)) { | ||
3486 | ret = hlock; | ||
3487 | break; | ||
3488 | } | ||
3489 | } | ||
3490 | |||
3491 | out: | ||
3492 | *idx = i; | ||
3493 | return ret; | ||
3494 | } | ||
3495 | |||
3496 | static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, | ||
3497 | int idx) | ||
3498 | { | ||
3499 | struct held_lock *hlock; | ||
3500 | |||
3501 | for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { | ||
3502 | if (!__lock_acquire(hlock->instance, | ||
3503 | hlock_class(hlock)->subclass, | ||
3504 | hlock->trylock, | ||
3505 | hlock->read, hlock->check, | ||
3506 | hlock->hardirqs_off, | ||
3507 | hlock->nest_lock, hlock->acquire_ip, | ||
3508 | hlock->references, hlock->pin_count)) | ||
3509 | return 1; | ||
3510 | } | ||
3511 | return 0; | ||
3512 | } | ||
3513 | |||
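find_held_lock() centralises the backwards scan over curr->held_locks that several call sites previously open-coded: start at the top of the held-lock stack, stop if the scan would cross into a different irq context, and report the index so the caller can pop and re-acquire the locks above the match via reacquire_held_locks(). A small userspace sketch of that scan over a plain array (the struct fields are simplified):

#include <stdio.h>

struct held {
	int id;			/* stands in for the lockdep_map identity */
	int irq_context;	/* locks from different contexts must not be crossed */
};

/* Scan from the top of the stack; return the index of @id or -1. */
static int find_held(const struct held *stack, unsigned int depth, int id)
{
	for (int i = (int)depth - 1; i >= 0; i--) {
		/* Do not cross into another context. */
		if (i < (int)depth - 1 &&
		    stack[i].irq_context != stack[i + 1].irq_context)
			return -1;
		if (stack[i].id == id)
			return i;
	}
	return -1;
}

int main(void)
{
	struct held stack[] = {
		{ .id = 1, .irq_context = 0 },
		{ .id = 2, .irq_context = 0 },
		{ .id = 3, .irq_context = 1 },
	};

	printf("id 3 (top of stack) found at index %d\n", find_held(stack, 3, 3));
	printf("id 2 found at index %d (blocked: would cross irq contexts)\n",
	       find_held(stack, 3, 2));
	return 0;
}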
3440 | static int | 3514 | static int |
3441 | __lock_set_class(struct lockdep_map *lock, const char *name, | 3515 | __lock_set_class(struct lockdep_map *lock, const char *name, |
3442 | struct lock_class_key *key, unsigned int subclass, | 3516 | struct lock_class_key *key, unsigned int subclass, |
3443 | unsigned long ip) | 3517 | unsigned long ip) |
3444 | { | 3518 | { |
3445 | struct task_struct *curr = current; | 3519 | struct task_struct *curr = current; |
3446 | struct held_lock *hlock, *prev_hlock; | 3520 | struct held_lock *hlock; |
3447 | struct lock_class *class; | 3521 | struct lock_class *class; |
3448 | unsigned int depth; | 3522 | unsigned int depth; |
3449 | int i; | 3523 | int i; |
@@ -3456,21 +3530,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, | |||
3456 | if (DEBUG_LOCKS_WARN_ON(!depth)) | 3530 | if (DEBUG_LOCKS_WARN_ON(!depth)) |
3457 | return 0; | 3531 | return 0; |
3458 | 3532 | ||
3459 | prev_hlock = NULL; | 3533 | hlock = find_held_lock(curr, lock, depth, &i); |
3460 | for (i = depth-1; i >= 0; i--) { | 3534 | if (!hlock) |
3461 | hlock = curr->held_locks + i; | 3535 | return print_unlock_imbalance_bug(curr, lock, ip); |
3462 | /* | ||
3463 | * We must not cross into another context: | ||
3464 | */ | ||
3465 | if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) | ||
3466 | break; | ||
3467 | if (match_held_lock(hlock, lock)) | ||
3468 | goto found_it; | ||
3469 | prev_hlock = hlock; | ||
3470 | } | ||
3471 | return print_unlock_imbalance_bug(curr, lock, ip); | ||
3472 | 3536 | ||
3473 | found_it: | ||
3474 | lockdep_init_map(lock, name, key, 0); | 3537 | lockdep_init_map(lock, name, key, 0); |
3475 | class = register_lock_class(lock, subclass, 0); | 3538 | class = register_lock_class(lock, subclass, 0); |
3476 | hlock->class_idx = class - lock_classes + 1; | 3539 | hlock->class_idx = class - lock_classes + 1; |
@@ -3478,15 +3541,46 @@ found_it: | |||
3478 | curr->lockdep_depth = i; | 3541 | curr->lockdep_depth = i; |
3479 | curr->curr_chain_key = hlock->prev_chain_key; | 3542 | curr->curr_chain_key = hlock->prev_chain_key; |
3480 | 3543 | ||
3481 | for (; i < depth; i++) { | 3544 | if (reacquire_held_locks(curr, depth, i)) |
3482 | hlock = curr->held_locks + i; | 3545 | return 0; |
3483 | if (!__lock_acquire(hlock->instance, | 3546 | |
3484 | hlock_class(hlock)->subclass, hlock->trylock, | 3547 | /* |
3485 | hlock->read, hlock->check, hlock->hardirqs_off, | 3548 | * I took it apart and put it back together again, except now I have |
3486 | hlock->nest_lock, hlock->acquire_ip, | 3549 | * these 'spare' parts.. where shall I put them. |
3487 | hlock->references, hlock->pin_count)) | 3550 | */ |
3488 | return 0; | 3551 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) |
3489 | } | 3552 | return 0; |
3553 | return 1; | ||
3554 | } | ||
3555 | |||
3556 | static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) | ||
3557 | { | ||
3558 | struct task_struct *curr = current; | ||
3559 | struct held_lock *hlock; | ||
3560 | unsigned int depth; | ||
3561 | int i; | ||
3562 | |||
3563 | depth = curr->lockdep_depth; | ||
3564 | /* | ||
3565 | * This function is about downgrading a held lock, yet we're not | ||
3566 | * actually holding any locks. Naughty user! | ||
3567 | */ | ||
3568 | if (DEBUG_LOCKS_WARN_ON(!depth)) | ||
3569 | return 0; | ||
3570 | |||
3571 | hlock = find_held_lock(curr, lock, depth, &i); | ||
3572 | if (!hlock) | ||
3573 | return print_unlock_imbalance_bug(curr, lock, ip); | ||
3574 | |||
3575 | curr->lockdep_depth = i; | ||
3576 | curr->curr_chain_key = hlock->prev_chain_key; | ||
3577 | |||
3578 | WARN(hlock->read, "downgrading a read lock"); | ||
3579 | hlock->read = 1; | ||
3580 | hlock->acquire_ip = ip; | ||
3581 | |||
3582 | if (reacquire_held_locks(curr, depth, i)) | ||
3583 | return 0; | ||
3490 | 3584 | ||
3491 | /* | 3585 | /* |
3492 | * I took it apart and put it back together again, except now I have | 3586 | * I took it apart and put it back together again, except now I have |
@@ -3508,7 +3602,7 @@ static int | |||
3508 | __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | 3602 | __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) |
3509 | { | 3603 | { |
3510 | struct task_struct *curr = current; | 3604 | struct task_struct *curr = current; |
3511 | struct held_lock *hlock, *prev_hlock; | 3605 | struct held_lock *hlock; |
3512 | unsigned int depth; | 3606 | unsigned int depth; |
3513 | int i; | 3607 | int i; |
3514 | 3608 | ||
@@ -3527,21 +3621,10 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | |||
3527 | * Check whether the lock exists in the current stack | 3621 | * Check whether the lock exists in the current stack |
3528 | * of held locks: | 3622 | * of held locks: |
3529 | */ | 3623 | */ |
3530 | prev_hlock = NULL; | 3624 | hlock = find_held_lock(curr, lock, depth, &i); |
3531 | for (i = depth-1; i >= 0; i--) { | 3625 | if (!hlock) |
3532 | hlock = curr->held_locks + i; | 3626 | return print_unlock_imbalance_bug(curr, lock, ip); |
3533 | /* | ||
3534 | * We must not cross into another context: | ||
3535 | */ | ||
3536 | if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) | ||
3537 | break; | ||
3538 | if (match_held_lock(hlock, lock)) | ||
3539 | goto found_it; | ||
3540 | prev_hlock = hlock; | ||
3541 | } | ||
3542 | return print_unlock_imbalance_bug(curr, lock, ip); | ||
3543 | 3627 | ||
3544 | found_it: | ||
3545 | if (hlock->instance == lock) | 3628 | if (hlock->instance == lock) |
3546 | lock_release_holdtime(hlock); | 3629 | lock_release_holdtime(hlock); |
3547 | 3630 | ||
@@ -3568,15 +3651,8 @@ found_it: | |||
3568 | curr->lockdep_depth = i; | 3651 | curr->lockdep_depth = i; |
3569 | curr->curr_chain_key = hlock->prev_chain_key; | 3652 | curr->curr_chain_key = hlock->prev_chain_key; |
3570 | 3653 | ||
3571 | for (i++; i < depth; i++) { | 3654 | if (reacquire_held_locks(curr, depth, i + 1)) |
3572 | hlock = curr->held_locks + i; | 3655 | return 0; |
3573 | if (!__lock_acquire(hlock->instance, | ||
3574 | hlock_class(hlock)->subclass, hlock->trylock, | ||
3575 | hlock->read, hlock->check, hlock->hardirqs_off, | ||
3576 | hlock->nest_lock, hlock->acquire_ip, | ||
3577 | hlock->references, hlock->pin_count)) | ||
3578 | return 0; | ||
3579 | } | ||
3580 | 3656 | ||
3581 | /* | 3657 | /* |
3582 | * We had N bottles of beer on the wall, we drank one, but now | 3658 | * We had N bottles of beer on the wall, we drank one, but now |
@@ -3741,6 +3817,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name, | |||
3741 | } | 3817 | } |
3742 | EXPORT_SYMBOL_GPL(lock_set_class); | 3818 | EXPORT_SYMBOL_GPL(lock_set_class); |
3743 | 3819 | ||
3820 | void lock_downgrade(struct lockdep_map *lock, unsigned long ip) | ||
3821 | { | ||
3822 | unsigned long flags; | ||
3823 | |||
3824 | if (unlikely(current->lockdep_recursion)) | ||
3825 | return; | ||
3826 | |||
3827 | raw_local_irq_save(flags); | ||
3828 | current->lockdep_recursion = 1; | ||
3829 | check_flags(flags); | ||
3830 | if (__lock_downgrade(lock, ip)) | ||
3831 | check_chain_key(current); | ||
3832 | current->lockdep_recursion = 0; | ||
3833 | raw_local_irq_restore(flags); | ||
3834 | } | ||
3835 | EXPORT_SYMBOL_GPL(lock_downgrade); | ||
3836 | |||
3744 | /* | 3837 | /* |
3745 | * We are not always called with irqs disabled - do that here, | 3838 | * We are not always called with irqs disabled - do that here, |
3746 | * and also avoid lockdep recursion: | 3839 | * and also avoid lockdep recursion: |
@@ -3861,13 +3954,15 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock); | |||
3861 | 3954 | ||
3862 | void lockdep_set_current_reclaim_state(gfp_t gfp_mask) | 3955 | void lockdep_set_current_reclaim_state(gfp_t gfp_mask) |
3863 | { | 3956 | { |
3864 | current->lockdep_reclaim_gfp = gfp_mask; | 3957 | current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); |
3865 | } | 3958 | } |
3959 | EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); | ||
3866 | 3960 | ||
3867 | void lockdep_clear_current_reclaim_state(void) | 3961 | void lockdep_clear_current_reclaim_state(void) |
3868 | { | 3962 | { |
3869 | current->lockdep_reclaim_gfp = 0; | 3963 | current->lockdep_reclaim_gfp = 0; |
3870 | } | 3964 | } |
3965 | EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state); | ||
3871 | 3966 | ||
3872 | #ifdef CONFIG_LOCK_STAT | 3967 | #ifdef CONFIG_LOCK_STAT |
3873 | static int | 3968 | static int |
@@ -3880,10 +3975,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3880 | return 0; | 3975 | return 0; |
3881 | 3976 | ||
3882 | printk("\n"); | 3977 | printk("\n"); |
3883 | printk("=================================\n"); | 3978 | pr_warn("=================================\n"); |
3884 | printk("[ BUG: bad contention detected! ]\n"); | 3979 | pr_warn("WARNING: bad contention detected!\n"); |
3885 | print_kernel_ident(); | 3980 | print_kernel_ident(); |
3886 | printk("---------------------------------\n"); | 3981 | pr_warn("---------------------------------\n"); |
3887 | printk("%s/%d is trying to contend lock (", | 3982 | printk("%s/%d is trying to contend lock (", |
3888 | curr->comm, task_pid_nr(curr)); | 3983 | curr->comm, task_pid_nr(curr)); |
3889 | print_lockdep_cache(lock); | 3984 | print_lockdep_cache(lock); |
@@ -3903,7 +3998,7 @@ static void | |||
3903 | __lock_contended(struct lockdep_map *lock, unsigned long ip) | 3998 | __lock_contended(struct lockdep_map *lock, unsigned long ip) |
3904 | { | 3999 | { |
3905 | struct task_struct *curr = current; | 4000 | struct task_struct *curr = current; |
3906 | struct held_lock *hlock, *prev_hlock; | 4001 | struct held_lock *hlock; |
3907 | struct lock_class_stats *stats; | 4002 | struct lock_class_stats *stats; |
3908 | unsigned int depth; | 4003 | unsigned int depth; |
3909 | int i, contention_point, contending_point; | 4004 | int i, contention_point, contending_point; |
@@ -3916,22 +4011,12 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) | |||
3916 | if (DEBUG_LOCKS_WARN_ON(!depth)) | 4011 | if (DEBUG_LOCKS_WARN_ON(!depth)) |
3917 | return; | 4012 | return; |
3918 | 4013 | ||
3919 | prev_hlock = NULL; | 4014 | hlock = find_held_lock(curr, lock, depth, &i); |
3920 | for (i = depth-1; i >= 0; i--) { | 4015 | if (!hlock) { |
3921 | hlock = curr->held_locks + i; | 4016 | print_lock_contention_bug(curr, lock, ip); |
3922 | /* | 4017 | return; |
3923 | * We must not cross into another context: | ||
3924 | */ | ||
3925 | if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) | ||
3926 | break; | ||
3927 | if (match_held_lock(hlock, lock)) | ||
3928 | goto found_it; | ||
3929 | prev_hlock = hlock; | ||
3930 | } | 4018 | } |
3931 | print_lock_contention_bug(curr, lock, ip); | ||
3932 | return; | ||
3933 | 4019 | ||
3934 | found_it: | ||
3935 | if (hlock->instance != lock) | 4020 | if (hlock->instance != lock) |
3936 | return; | 4021 | return; |
3937 | 4022 | ||
@@ -3955,7 +4040,7 @@ static void | |||
3955 | __lock_acquired(struct lockdep_map *lock, unsigned long ip) | 4040 | __lock_acquired(struct lockdep_map *lock, unsigned long ip) |
3956 | { | 4041 | { |
3957 | struct task_struct *curr = current; | 4042 | struct task_struct *curr = current; |
3958 | struct held_lock *hlock, *prev_hlock; | 4043 | struct held_lock *hlock; |
3959 | struct lock_class_stats *stats; | 4044 | struct lock_class_stats *stats; |
3960 | unsigned int depth; | 4045 | unsigned int depth; |
3961 | u64 now, waittime = 0; | 4046 | u64 now, waittime = 0; |
@@ -3969,22 +4054,12 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) | |||
3969 | if (DEBUG_LOCKS_WARN_ON(!depth)) | 4054 | if (DEBUG_LOCKS_WARN_ON(!depth)) |
3970 | return; | 4055 | return; |
3971 | 4056 | ||
3972 | prev_hlock = NULL; | 4057 | hlock = find_held_lock(curr, lock, depth, &i); |
3973 | for (i = depth-1; i >= 0; i--) { | 4058 | if (!hlock) { |
3974 | hlock = curr->held_locks + i; | 4059 | print_lock_contention_bug(curr, lock, _RET_IP_); |
3975 | /* | 4060 | return; |
3976 | * We must not cross into another context: | ||
3977 | */ | ||
3978 | if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) | ||
3979 | break; | ||
3980 | if (match_held_lock(hlock, lock)) | ||
3981 | goto found_it; | ||
3982 | prev_hlock = hlock; | ||
3983 | } | 4061 | } |
3984 | print_lock_contention_bug(curr, lock, _RET_IP_); | ||
3985 | return; | ||
3986 | 4062 | ||
3987 | found_it: | ||
3988 | if (hlock->instance != lock) | 4063 | if (hlock->instance != lock) |
3989 | return; | 4064 | return; |
3990 | 4065 | ||
@@ -4172,7 +4247,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
4172 | * If the class exists we look it up and zap it: | 4247 | * If the class exists we look it up and zap it: |
4173 | */ | 4248 | */ |
4174 | class = look_up_lock_class(lock, j); | 4249 | class = look_up_lock_class(lock, j); |
4175 | if (class) | 4250 | if (!IS_ERR_OR_NULL(class)) |
4176 | zap_class(class); | 4251 | zap_class(class); |
4177 | } | 4252 | } |
4178 | /* | 4253 | /* |
@@ -4244,10 +4319,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
4244 | return; | 4319 | return; |
4245 | 4320 | ||
4246 | printk("\n"); | 4321 | printk("\n"); |
4247 | printk("=========================\n"); | 4322 | pr_warn("=========================\n"); |
4248 | printk("[ BUG: held lock freed! ]\n"); | 4323 | pr_warn("WARNING: held lock freed!\n"); |
4249 | print_kernel_ident(); | 4324 | print_kernel_ident(); |
4250 | printk("-------------------------\n"); | 4325 | pr_warn("-------------------------\n"); |
4251 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | 4326 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", |
4252 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); | 4327 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); |
4253 | print_lock(hlock); | 4328 | print_lock(hlock); |
@@ -4302,11 +4377,11 @@ static void print_held_locks_bug(void) | |||
4302 | return; | 4377 | return; |
4303 | 4378 | ||
4304 | printk("\n"); | 4379 | printk("\n"); |
4305 | printk("=====================================\n"); | 4380 | pr_warn("====================================\n"); |
4306 | printk("[ BUG: %s/%d still has locks held! ]\n", | 4381 | pr_warn("WARNING: %s/%d still has locks held!\n", |
4307 | current->comm, task_pid_nr(current)); | 4382 | current->comm, task_pid_nr(current)); |
4308 | print_kernel_ident(); | 4383 | print_kernel_ident(); |
4309 | printk("-------------------------------------\n"); | 4384 | pr_warn("------------------------------------\n"); |
4310 | lockdep_print_held_locks(current); | 4385 | lockdep_print_held_locks(current); |
4311 | printk("\nstack backtrace:\n"); | 4386 | printk("\nstack backtrace:\n"); |
4312 | dump_stack(); | 4387 | dump_stack(); |
@@ -4371,7 +4446,7 @@ retry: | |||
4371 | } while_each_thread(g, p); | 4446 | } while_each_thread(g, p); |
4372 | 4447 | ||
4373 | printk("\n"); | 4448 | printk("\n"); |
4374 | printk("=============================================\n\n"); | 4449 | pr_warn("=============================================\n\n"); |
4375 | 4450 | ||
4376 | if (unlock) | 4451 | if (unlock) |
4377 | read_unlock(&tasklist_lock); | 4452 | read_unlock(&tasklist_lock); |
@@ -4401,10 +4476,10 @@ asmlinkage __visible void lockdep_sys_exit(void) | |||
4401 | if (!debug_locks_off()) | 4476 | if (!debug_locks_off()) |
4402 | return; | 4477 | return; |
4403 | printk("\n"); | 4478 | printk("\n"); |
4404 | printk("================================================\n"); | 4479 | pr_warn("================================================\n"); |
4405 | printk("[ BUG: lock held when returning to user space! ]\n"); | 4480 | pr_warn("WARNING: lock held when returning to user space!\n"); |
4406 | print_kernel_ident(); | 4481 | print_kernel_ident(); |
4407 | printk("------------------------------------------------\n"); | 4482 | pr_warn("------------------------------------------------\n"); |
4408 | printk("%s/%d is leaving the kernel with locks still held!\n", | 4483 | printk("%s/%d is leaving the kernel with locks still held!\n", |
4409 | curr->comm, curr->pid); | 4484 | curr->comm, curr->pid); |
4410 | lockdep_print_held_locks(curr); | 4485 | lockdep_print_held_locks(curr); |
@@ -4421,13 +4496,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
4421 | #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ | 4496 | #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ |
4422 | /* Note: the following can be executed concurrently, so be careful. */ | 4497 | /* Note: the following can be executed concurrently, so be careful. */ |
4423 | printk("\n"); | 4498 | printk("\n"); |
4424 | pr_err("===============================\n"); | 4499 | pr_warn("=============================\n"); |
4425 | pr_err("[ ERR: suspicious RCU usage. ]\n"); | 4500 | pr_warn("WARNING: suspicious RCU usage\n"); |
4426 | print_kernel_ident(); | 4501 | print_kernel_ident(); |
4427 | pr_err("-------------------------------\n"); | 4502 | pr_warn("-----------------------------\n"); |
4428 | pr_err("%s:%d %s!\n", file, line, s); | 4503 | printk("%s:%d %s!\n", file, line, s); |
4429 | pr_err("\nother info that might help us debug this:\n\n"); | 4504 | printk("\nother info that might help us debug this:\n\n"); |
4430 | pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", | 4505 | printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", |
4431 | !rcu_lockdep_current_cpu_online() | 4506 | !rcu_lockdep_current_cpu_online() |
4432 | ? "RCU used illegally from offline CPU!\n" | 4507 | ? "RCU used illegally from offline CPU!\n" |
4433 | : !rcu_is_watching() | 4508 | : !rcu_is_watching() |
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 97ee9df32e0f..58e366ad36f4 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c | |||
@@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | |||
102 | return; | 102 | return; |
103 | } | 103 | } |
104 | 104 | ||
105 | printk("\n============================================\n"); | 105 | pr_warn("\n"); |
106 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | 106 | pr_warn("============================================\n"); |
107 | printk("%s\n", print_tainted()); | 107 | pr_warn("WARNING: circular locking deadlock detected!\n"); |
108 | printk( "--------------------------------------------\n"); | 108 | pr_warn("%s\n", print_tainted()); |
109 | pr_warn("--------------------------------------------\n"); | ||
109 | printk("%s/%d is deadlocking current task %s/%d\n\n", | 110 | printk("%s/%d is deadlocking current task %s/%d\n\n", |
110 | task->comm, task_pid_nr(task), | 111 | task->comm, task_pid_nr(task), |
111 | current->comm, task_pid_nr(current)); | 112 | current->comm, task_pid_nr(current)); |
@@ -174,12 +175,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) | |||
174 | lock->name = name; | 175 | lock->name = name; |
175 | } | 176 | } |
176 | 177 | ||
177 | void | ||
178 | rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) | ||
179 | { | ||
180 | } | ||
181 | |||
182 | void rt_mutex_deadlock_account_unlock(struct task_struct *task) | ||
183 | { | ||
184 | } | ||
185 | |||
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index d0519c3432b6..b585af9a1b50 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h | |||
@@ -9,9 +9,6 @@ | |||
9 | * This file contains macros used solely by rtmutex.c. Debug version. | 9 | * This file contains macros used solely by rtmutex.c. Debug version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | extern void | ||
13 | rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); | ||
14 | extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); | ||
15 | extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); | 12 | extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); |
16 | extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); | 13 | extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); |
17 | extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); | 14 | extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6edc32ecd9c5..b95509416909 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -224,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, | |||
224 | } | 224 | } |
225 | #endif | 225 | #endif |
226 | 226 | ||
227 | /* | ||
228 | * Only use with rt_mutex_waiter_{less,equal}() | ||
229 | */ | ||
230 | #define task_to_waiter(p) \ | ||
231 | &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } | ||
232 | |||
227 | static inline int | 233 | static inline int |
228 | rt_mutex_waiter_less(struct rt_mutex_waiter *left, | 234 | rt_mutex_waiter_less(struct rt_mutex_waiter *left, |
229 | struct rt_mutex_waiter *right) | 235 | struct rt_mutex_waiter *right) |
@@ -238,12 +244,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, | |||
238 | * then right waiter has a dl_prio() too. | 244 | * then right waiter has a dl_prio() too. |
239 | */ | 245 | */ |
240 | if (dl_prio(left->prio)) | 246 | if (dl_prio(left->prio)) |
241 | return dl_time_before(left->task->dl.deadline, | 247 | return dl_time_before(left->deadline, right->deadline); |
242 | right->task->dl.deadline); | ||
243 | 248 | ||
244 | return 0; | 249 | return 0; |
245 | } | 250 | } |
246 | 251 | ||
252 | static inline int | ||
253 | rt_mutex_waiter_equal(struct rt_mutex_waiter *left, | ||
254 | struct rt_mutex_waiter *right) | ||
255 | { | ||
256 | if (left->prio != right->prio) | ||
257 | return 0; | ||
258 | |||
259 | /* | ||
260 | * If both waiters have dl_prio(), we check the deadlines of the | ||
261 | * associated tasks. | ||
262 | * If left waiter has a dl_prio(), and we didn't return 0 above, | ||
263 | * then right waiter has a dl_prio() too. | ||
264 | */ | ||
265 | if (dl_prio(left->prio)) | ||
266 | return left->deadline == right->deadline; | ||
267 | |||
268 | return 1; | ||
269 | } | ||
270 | |||
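rt_mutex_waiter_less()/rt_mutex_waiter_equal() now compare the prio and deadline snapshots stored in the waiter itself rather than chasing waiter->task, and task_to_waiter() builds such a snapshot on the stack for comparisons against a task. A userspace sketch of the ordering rule (lower prio value wins; ties within the deadline class are broken by the earlier deadline), using simplified types and an invented DL_PRIO marker:

#include <stdbool.h>
#include <stdio.h>

#define DL_PRIO (-1)	/* invented marker for a deadline-class waiter */

struct waiter {
	int prio;			/* lower value = higher priority */
	unsigned long long deadline;	/* only meaningful for DL_PRIO */
};

static bool waiter_less(const struct waiter *l, const struct waiter *r)
{
	if (l->prio < r->prio)
		return true;
	if (l->prio > r->prio)
		return false;
	/* Same priority class: deadline waiters tie-break on the deadline. */
	if (l->prio == DL_PRIO)
		return l->deadline < r->deadline;
	return false;
}

int main(void)
{
	struct waiter a = { .prio = DL_PRIO, .deadline = 100 };
	struct waiter b = { .prio = DL_PRIO, .deadline = 200 };
	struct waiter c = { .prio = 10 };

	printf("a before b: %d\n", waiter_less(&a, &b));	/* earlier deadline */
	printf("a before c: %d\n", waiter_less(&a, &c));	/* DL outranks RT prio */
	return 0;
}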
247 | static void | 271 | static void |
248 | rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) | 272 | rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) |
249 | { | 273 | { |
@@ -322,72 +346,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) | |||
322 | RB_CLEAR_NODE(&waiter->pi_tree_entry); | 346 | RB_CLEAR_NODE(&waiter->pi_tree_entry); |
323 | } | 347 | } |
324 | 348 | ||
325 | /* | 349 | static void rt_mutex_adjust_prio(struct task_struct *p) |
326 | * Calculate task priority from the waiter tree priority | ||
327 | * | ||
328 | * Return task->normal_prio when the waiter tree is empty or when | ||
329 | * the waiter is not allowed to do priority boosting | ||
330 | */ | ||
331 | int rt_mutex_getprio(struct task_struct *task) | ||
332 | { | ||
333 | if (likely(!task_has_pi_waiters(task))) | ||
334 | return task->normal_prio; | ||
335 | |||
336 | return min(task_top_pi_waiter(task)->prio, | ||
337 | task->normal_prio); | ||
338 | } | ||
339 | |||
340 | struct task_struct *rt_mutex_get_top_task(struct task_struct *task) | ||
341 | { | 350 | { |
342 | if (likely(!task_has_pi_waiters(task))) | 351 | struct task_struct *pi_task = NULL; |
343 | return NULL; | ||
344 | |||
345 | return task_top_pi_waiter(task)->task; | ||
346 | } | ||
347 | 352 | ||
348 | /* | 353 | lockdep_assert_held(&p->pi_lock); |
349 | * Called by sched_setscheduler() to get the priority which will be | ||
350 | * effective after the change. | ||
351 | */ | ||
352 | int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) | ||
353 | { | ||
354 | if (!task_has_pi_waiters(task)) | ||
355 | return newprio; | ||
356 | 354 | ||
357 | if (task_top_pi_waiter(task)->task->prio <= newprio) | 355 | if (task_has_pi_waiters(p)) |
358 | return task_top_pi_waiter(task)->task->prio; | 356 | pi_task = task_top_pi_waiter(p)->task; |
359 | return newprio; | ||
360 | } | ||
361 | 357 | ||
362 | /* | 358 | rt_mutex_setprio(p, pi_task); |
363 | * Adjust the priority of a task, after its pi_waiters got modified. | ||
364 | * | ||
365 | * This can be both boosting and unboosting. task->pi_lock must be held. | ||
366 | */ | ||
367 | static void __rt_mutex_adjust_prio(struct task_struct *task) | ||
368 | { | ||
369 | int prio = rt_mutex_getprio(task); | ||
370 | |||
371 | if (task->prio != prio || dl_prio(prio)) | ||
372 | rt_mutex_setprio(task, prio); | ||
373 | } | ||
374 | |||
375 | /* | ||
376 | * Adjust task priority (undo boosting). Called from the exit path of | ||
377 | * rt_mutex_slowunlock() and rt_mutex_slowlock(). | ||
378 | * | ||
379 | * (Note: We do this outside of the protection of lock->wait_lock to | ||
380 | * allow the lock to be taken while or before we readjust the priority | ||
381 | * of task. We do not use the spin_xx_mutex() variants here as we are | ||
382 | * outside of the debug path.) | ||
383 | */ | ||
384 | void rt_mutex_adjust_prio(struct task_struct *task) | ||
385 | { | ||
386 | unsigned long flags; | ||
387 | |||
388 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
389 | __rt_mutex_adjust_prio(task); | ||
390 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
391 | } | 359 | } |
392 | 360 | ||
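The rewritten rt_mutex_adjust_prio() no longer computes an effective priority itself; it hands the top pi-waiter's task (or NULL) to rt_mutex_setprio() and lets the scheduler derive the boost. The rule it replaces is still "effective priority = min(normal priority, highest-priority waiter)", which the following userspace sketch illustrates with plain integers:

#include <stdio.h>

/* Lower value = higher priority, as in the kernel's convention. */
static int effective_prio(int normal_prio, const int *waiter_prios, int nr)
{
	int prio = normal_prio;

	for (int i = 0; i < nr; i++)
		if (waiter_prios[i] < prio)
			prio = waiter_prios[i];	/* inherit the boost */
	return prio;
}

int main(void)
{
	int waiters[] = { 30, 10, 50 };

	/* A prio-40 owner blocked on by a prio-10 waiter runs at prio 10. */
	printf("boosted to %d\n", effective_prio(40, waiters, 3));
	/* No waiter outranks the owner: no boost. */
	printf("stays at %d\n", effective_prio(5, waiters, 3));
	return 0;
}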
393 | /* | 361 | /* |
@@ -610,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
610 | * enabled we continue, but stop the requeueing in the chain | 578 | * enabled we continue, but stop the requeueing in the chain |
611 | * walk. | 579 | * walk. |
612 | */ | 580 | */ |
613 | if (waiter->prio == task->prio) { | 581 | if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { |
614 | if (!detect_deadlock) | 582 | if (!detect_deadlock) |
615 | goto out_unlock_pi; | 583 | goto out_unlock_pi; |
616 | else | 584 | else |
@@ -706,7 +674,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
706 | 674 | ||
707 | /* [7] Requeue the waiter in the lock waiter tree. */ | 675 | /* [7] Requeue the waiter in the lock waiter tree. */ |
708 | rt_mutex_dequeue(lock, waiter); | 676 | rt_mutex_dequeue(lock, waiter); |
677 | |||
678 | /* | ||
679 | * Update the waiter prio fields now that we're dequeued. | ||
680 | * | ||
681 | * These values can have changed through either: | ||
682 | * | ||
683 | * sys_sched_set_scheduler() / sys_sched_setattr() | ||
684 | * | ||
685 | * or | ||
686 | * | ||
687 | * DL CBS enforcement advancing the effective deadline. | ||
688 | * | ||
689 | * Even though pi_waiters also uses these fields, and that tree is only | ||
690 | * updated in [11], we can do this here, since we hold [L], which | ||
691 | * serializes all pi_waiters access and rb_erase() does not care about | ||
692 | * the values of the node being removed. | ||
693 | */ | ||
709 | waiter->prio = task->prio; | 694 | waiter->prio = task->prio; |
695 | waiter->deadline = task->dl.deadline; | ||
696 | |||
710 | rt_mutex_enqueue(lock, waiter); | 697 | rt_mutex_enqueue(lock, waiter); |
711 | 698 | ||
712 | /* [8] Release the task */ | 699 | /* [8] Release the task */ |
@@ -747,7 +734,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
747 | */ | 734 | */ |
748 | rt_mutex_dequeue_pi(task, prerequeue_top_waiter); | 735 | rt_mutex_dequeue_pi(task, prerequeue_top_waiter); |
749 | rt_mutex_enqueue_pi(task, waiter); | 736 | rt_mutex_enqueue_pi(task, waiter); |
750 | __rt_mutex_adjust_prio(task); | 737 | rt_mutex_adjust_prio(task); |
751 | 738 | ||
752 | } else if (prerequeue_top_waiter == waiter) { | 739 | } else if (prerequeue_top_waiter == waiter) { |
753 | /* | 740 | /* |
@@ -763,7 +750,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
763 | rt_mutex_dequeue_pi(task, waiter); | 750 | rt_mutex_dequeue_pi(task, waiter); |
764 | waiter = rt_mutex_top_waiter(lock); | 751 | waiter = rt_mutex_top_waiter(lock); |
765 | rt_mutex_enqueue_pi(task, waiter); | 752 | rt_mutex_enqueue_pi(task, waiter); |
766 | __rt_mutex_adjust_prio(task); | 753 | rt_mutex_adjust_prio(task); |
767 | } else { | 754 | } else { |
768 | /* | 755 | /* |
769 | * Nothing changed. No need to do any priority | 756 | * Nothing changed. No need to do any priority |
@@ -833,6 +820,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
833 | static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | 820 | static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, |
834 | struct rt_mutex_waiter *waiter) | 821 | struct rt_mutex_waiter *waiter) |
835 | { | 822 | { |
823 | lockdep_assert_held(&lock->wait_lock); | ||
824 | |||
836 | /* | 825 | /* |
837 | * Before testing whether we can acquire @lock, we set the | 826 | * Before testing whether we can acquire @lock, we set the |
838 | * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all | 827 | * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all |
@@ -892,7 +881,8 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |||
892 | * the top waiter priority (kernel view), | 881 | * the top waiter priority (kernel view), |
893 | * @task lost. | 882 | * @task lost. |
894 | */ | 883 | */ |
895 | if (task->prio >= rt_mutex_top_waiter(lock)->prio) | 884 | if (!rt_mutex_waiter_less(task_to_waiter(task), |
885 | rt_mutex_top_waiter(lock))) | ||
896 | return 0; | 886 | return 0; |
897 | 887 | ||
898 | /* | 888 | /* |
@@ -938,8 +928,6 @@ takeit: | |||
938 | */ | 928 | */ |
939 | rt_mutex_set_owner(lock, task); | 929 | rt_mutex_set_owner(lock, task); |
940 | 930 | ||
941 | rt_mutex_deadlock_account_lock(lock, task); | ||
942 | |||
943 | return 1; | 931 | return 1; |
944 | } | 932 | } |
945 | 933 | ||
@@ -960,6 +948,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
960 | struct rt_mutex *next_lock; | 948 | struct rt_mutex *next_lock; |
961 | int chain_walk = 0, res; | 949 | int chain_walk = 0, res; |
962 | 950 | ||
951 | lockdep_assert_held(&lock->wait_lock); | ||
952 | |||
963 | /* | 953 | /* |
964 | * Early deadlock detection. We really don't want the task to | 954 | * Early deadlock detection. We really don't want the task to |
965 | * enqueue on itself just to untangle the mess later. It's not | 955 | * enqueue on itself just to untangle the mess later. It's not |
@@ -973,10 +963,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
973 | return -EDEADLK; | 963 | return -EDEADLK; |
974 | 964 | ||
975 | raw_spin_lock(&task->pi_lock); | 965 | raw_spin_lock(&task->pi_lock); |
976 | __rt_mutex_adjust_prio(task); | 966 | rt_mutex_adjust_prio(task); |
977 | waiter->task = task; | 967 | waiter->task = task; |
978 | waiter->lock = lock; | 968 | waiter->lock = lock; |
979 | waiter->prio = task->prio; | 969 | waiter->prio = task->prio; |
970 | waiter->deadline = task->dl.deadline; | ||
980 | 971 | ||
981 | /* Get the top priority waiter on the lock */ | 972 | /* Get the top priority waiter on the lock */ |
982 | if (rt_mutex_has_waiters(lock)) | 973 | if (rt_mutex_has_waiters(lock)) |
@@ -995,7 +986,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
995 | rt_mutex_dequeue_pi(owner, top_waiter); | 986 | rt_mutex_dequeue_pi(owner, top_waiter); |
996 | rt_mutex_enqueue_pi(owner, waiter); | 987 | rt_mutex_enqueue_pi(owner, waiter); |
997 | 988 | ||
998 | __rt_mutex_adjust_prio(owner); | 989 | rt_mutex_adjust_prio(owner); |
999 | if (owner->pi_blocked_on) | 990 | if (owner->pi_blocked_on) |
1000 | chain_walk = 1; | 991 | chain_walk = 1; |
1001 | } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { | 992 | } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { |
@@ -1047,12 +1038,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, | |||
1047 | waiter = rt_mutex_top_waiter(lock); | 1038 | waiter = rt_mutex_top_waiter(lock); |
1048 | 1039 | ||
1049 | /* | 1040 | /* |
1050 | * Remove it from current->pi_waiters. We do not adjust a | 1041 | * Remove it from current->pi_waiters and deboost. |
1051 | * possible priority boost right now. We execute wakeup in the | 1042 | * |
1052 | * boosted mode and go back to normal after releasing | 1043 | * We must in fact deboost here in order to ensure we call |
1053 | * lock->wait_lock. | 1044 | * rt_mutex_setprio() to update p->pi_top_task before the |
1045 | * task unblocks. | ||
1054 | */ | 1046 | */ |
1055 | rt_mutex_dequeue_pi(current, waiter); | 1047 | rt_mutex_dequeue_pi(current, waiter); |
1048 | rt_mutex_adjust_prio(current); | ||
1056 | 1049 | ||
1057 | /* | 1050 | /* |
1058 | * As we are waking up the top waiter, and the waiter stays | 1051 | * As we are waking up the top waiter, and the waiter stays |
@@ -1064,9 +1057,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, | |||
1064 | */ | 1057 | */ |
1065 | lock->owner = (void *) RT_MUTEX_HAS_WAITERS; | 1058 | lock->owner = (void *) RT_MUTEX_HAS_WAITERS; |
1066 | 1059 | ||
1067 | raw_spin_unlock(¤t->pi_lock); | 1060 | /* |
1068 | 1061 | * We deboosted before waking the top waiter task such that we don't | |
1062 | * run two tasks with the 'same' priority (and ensure the | ||
1063 | * p->pi_top_task pointer points to a blocked task). This however can | ||
1064 | * lead to priority inversion if we would get preempted after the | ||
1065 | * deboost but before waking our donor task, hence the preempt_disable() | ||
1066 | * before unlock. | ||
1067 | * | ||
1068 | * Pairs with preempt_enable() in rt_mutex_postunlock(); | ||
1069 | */ | ||
1070 | preempt_disable(); | ||
1069 | wake_q_add(wake_q, waiter->task); | 1071 | wake_q_add(wake_q, waiter->task); |
1072 | raw_spin_unlock(¤t->pi_lock); | ||
1070 | } | 1073 | } |
1071 | 1074 | ||
1072 | /* | 1075 | /* |
@@ -1082,6 +1085,8 @@ static void remove_waiter(struct rt_mutex *lock, | |||
1082 | struct task_struct *owner = rt_mutex_owner(lock); | 1085 | struct task_struct *owner = rt_mutex_owner(lock); |
1083 | struct rt_mutex *next_lock; | 1086 | struct rt_mutex *next_lock; |
1084 | 1087 | ||
1088 | lockdep_assert_held(&lock->wait_lock); | ||
1089 | |||
1085 | raw_spin_lock(¤t->pi_lock); | 1090 | raw_spin_lock(¤t->pi_lock); |
1086 | rt_mutex_dequeue(lock, waiter); | 1091 | rt_mutex_dequeue(lock, waiter); |
1087 | current->pi_blocked_on = NULL; | 1092 | current->pi_blocked_on = NULL; |
@@ -1101,7 +1106,7 @@ static void remove_waiter(struct rt_mutex *lock, | |||
1101 | if (rt_mutex_has_waiters(lock)) | 1106 | if (rt_mutex_has_waiters(lock)) |
1102 | rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); | 1107 | rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); |
1103 | 1108 | ||
1104 | __rt_mutex_adjust_prio(owner); | 1109 | rt_mutex_adjust_prio(owner); |
1105 | 1110 | ||
1106 | /* Store the lock on which owner is blocked or NULL */ | 1111 | /* Store the lock on which owner is blocked or NULL */ |
1107 | next_lock = task_blocked_on_lock(owner); | 1112 | next_lock = task_blocked_on_lock(owner); |
@@ -1140,8 +1145,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
1140 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 1145 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
1141 | 1146 | ||
1142 | waiter = task->pi_blocked_on; | 1147 | waiter = task->pi_blocked_on; |
1143 | if (!waiter || (waiter->prio == task->prio && | 1148 | if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { |
1144 | !dl_prio(task->prio))) { | ||
1145 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 1149 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
1146 | return; | 1150 | return; |
1147 | } | 1151 | } |
@@ -1155,6 +1159,14 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
1155 | next_lock, NULL, task); | 1159 | next_lock, NULL, task); |
1156 | } | 1160 | } |
1157 | 1161 | ||
1162 | void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | ||
1163 | { | ||
1164 | debug_rt_mutex_init_waiter(waiter); | ||
1165 | RB_CLEAR_NODE(&waiter->pi_tree_entry); | ||
1166 | RB_CLEAR_NODE(&waiter->tree_entry); | ||
1167 | waiter->task = NULL; | ||
1168 | } | ||
1169 | |||
1158 | /** | 1170 | /** |
1159 | * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop | 1171 | * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop |
1160 | * @lock: the rt_mutex to take | 1172 | * @lock: the rt_mutex to take |
@@ -1237,9 +1249,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
1237 | unsigned long flags; | 1249 | unsigned long flags; |
1238 | int ret = 0; | 1250 | int ret = 0; |
1239 | 1251 | ||
1240 | debug_rt_mutex_init_waiter(&waiter); | 1252 | rt_mutex_init_waiter(&waiter); |
1241 | RB_CLEAR_NODE(&waiter.pi_tree_entry); | ||
1242 | RB_CLEAR_NODE(&waiter.tree_entry); | ||
1243 | 1253 | ||
1244 | /* | 1254 | /* |
1245 | * Technically we could use raw_spin_[un]lock_irq() here, but this can | 1255 | * Technically we could use raw_spin_[un]lock_irq() here, but this can |
@@ -1330,7 +1340,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) | |||
1330 | 1340 | ||
1331 | /* | 1341 | /* |
1332 | * Slow path to release a rt-mutex. | 1342 | * Slow path to release a rt-mutex. |
1333 | * Return whether the current task needs to undo a potential priority boosting. | 1343 | * |
1344 | * Return whether the current task needs to call rt_mutex_postunlock(). | ||
1334 | */ | 1345 | */ |
1335 | static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, | 1346 | static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, |
1336 | struct wake_q_head *wake_q) | 1347 | struct wake_q_head *wake_q) |
@@ -1342,8 +1353,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, | |||
1342 | 1353 | ||
1343 | debug_rt_mutex_unlock(lock); | 1354 | debug_rt_mutex_unlock(lock); |
1344 | 1355 | ||
1345 | rt_mutex_deadlock_account_unlock(current); | ||
1346 | |||
1347 | /* | 1356 | /* |
1348 | * We must be careful here if the fast path is enabled. If we | 1357 | * We must be careful here if the fast path is enabled. If we |
1349 | * have no waiters queued we cannot set owner to NULL here | 1358 | * have no waiters queued we cannot set owner to NULL here |
@@ -1390,11 +1399,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, | |||
1390 | * Queue the next waiter for wakeup once we release the wait_lock. | 1399 | * Queue the next waiter for wakeup once we release the wait_lock. |
1391 | */ | 1400 | */ |
1392 | mark_wakeup_next_waiter(wake_q, lock); | 1401 | mark_wakeup_next_waiter(wake_q, lock); |
1393 | |||
1394 | raw_spin_unlock_irqrestore(&lock->wait_lock, flags); | 1402 | raw_spin_unlock_irqrestore(&lock->wait_lock, flags); |
1395 | 1403 | ||
1396 | /* check PI boosting */ | 1404 | return true; /* call rt_mutex_postunlock() */ |
1397 | return true; | ||
1398 | } | 1405 | } |
1399 | 1406 | ||
1400 | /* | 1407 | /* |
@@ -1409,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, | |||
1409 | struct hrtimer_sleeper *timeout, | 1416 | struct hrtimer_sleeper *timeout, |
1410 | enum rtmutex_chainwalk chwalk)) | 1417 | enum rtmutex_chainwalk chwalk)) |
1411 | { | 1418 | { |
1412 | if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { | 1419 | if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) |
1413 | rt_mutex_deadlock_account_lock(lock, current); | ||
1414 | return 0; | 1420 | return 0; |
1415 | } else | 1421 | |
1416 | return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); | 1422 | return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); |
1417 | } | 1423 | } |
1418 | 1424 | ||
1419 | static inline int | 1425 | static inline int |
@@ -1425,24 +1431,33 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, | |||
1425 | enum rtmutex_chainwalk chwalk)) | 1431 | enum rtmutex_chainwalk chwalk)) |
1426 | { | 1432 | { |
1427 | if (chwalk == RT_MUTEX_MIN_CHAINWALK && | 1433 | if (chwalk == RT_MUTEX_MIN_CHAINWALK && |
1428 | likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { | 1434 | likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) |
1429 | rt_mutex_deadlock_account_lock(lock, current); | ||
1430 | return 0; | 1435 | return 0; |
1431 | } else | 1436 | |
1432 | return slowfn(lock, state, timeout, chwalk); | 1437 | return slowfn(lock, state, timeout, chwalk); |
1433 | } | 1438 | } |
1434 | 1439 | ||
1435 | static inline int | 1440 | static inline int |
1436 | rt_mutex_fasttrylock(struct rt_mutex *lock, | 1441 | rt_mutex_fasttrylock(struct rt_mutex *lock, |
1437 | int (*slowfn)(struct rt_mutex *lock)) | 1442 | int (*slowfn)(struct rt_mutex *lock)) |
1438 | { | 1443 | { |
1439 | if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { | 1444 | if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) |
1440 | rt_mutex_deadlock_account_lock(lock, current); | ||
1441 | return 1; | 1445 | return 1; |
1442 | } | 1446 | |
1443 | return slowfn(lock); | 1447 | return slowfn(lock); |
1444 | } | 1448 | } |
1445 | 1449 | ||
1450 | /* | ||
1451 | * Performs the wakeup of the top-waiter and re-enables preemption. | ||
1452 | */ | ||
1453 | void rt_mutex_postunlock(struct wake_q_head *wake_q) | ||
1454 | { | ||
1455 | wake_up_q(wake_q); | ||
1456 | |||
1457 | /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ | ||
1458 | preempt_enable(); | ||
1459 | } | ||
1460 | |||
1446 | static inline void | 1461 | static inline void |
1447 | rt_mutex_fastunlock(struct rt_mutex *lock, | 1462 | rt_mutex_fastunlock(struct rt_mutex *lock, |
1448 | bool (*slowfn)(struct rt_mutex *lock, | 1463 | bool (*slowfn)(struct rt_mutex *lock, |
@@ -1450,18 +1465,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock, | |||
1450 | { | 1465 | { |
1451 | DEFINE_WAKE_Q(wake_q); | 1466 | DEFINE_WAKE_Q(wake_q); |
1452 | 1467 | ||
1453 | if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { | 1468 | if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) |
1454 | rt_mutex_deadlock_account_unlock(current); | 1469 | return; |
1455 | |||
1456 | } else { | ||
1457 | bool deboost = slowfn(lock, &wake_q); | ||
1458 | |||
1459 | wake_up_q(&wake_q); | ||
1460 | 1470 | ||
1461 | /* Undo pi boosting if necessary: */ | 1471 | if (slowfn(lock, &wake_q)) |
1462 | if (deboost) | 1472 | rt_mutex_postunlock(&wake_q); |
1463 | rt_mutex_adjust_prio(current); | ||
1464 | } | ||
1465 | } | 1473 | } |
1466 | 1474 | ||
1467 | /** | 1475 | /** |
@@ -1495,16 +1503,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) | |||
1495 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); | 1503 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); |
1496 | 1504 | ||
1497 | /* | 1505 | /* |
1498 | * Futex variant with full deadlock detection. | 1506 | * Futex variant, must not use fastpath. |
1499 | */ | 1507 | */ |
1500 | int rt_mutex_timed_futex_lock(struct rt_mutex *lock, | 1508 | int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) |
1501 | struct hrtimer_sleeper *timeout) | ||
1502 | { | 1509 | { |
1503 | might_sleep(); | 1510 | return rt_mutex_slowtrylock(lock); |
1504 | |||
1505 | return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, | ||
1506 | RT_MUTEX_FULL_CHAINWALK, | ||
1507 | rt_mutex_slowlock); | ||
1508 | } | 1511 | } |
1509 | 1512 | ||
1510 | /** | 1513 | /** |
@@ -1563,20 +1566,43 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) | |||
1563 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); | 1566 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); |
1564 | 1567 | ||
1565 | /** | 1568 | /** |
1566 | * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock | 1569 | * Futex variant, that since futex variants do not use the fast-path, can be |
1567 | * @lock: the rt_mutex to be unlocked | 1570 | * simple and will not need to retry. |
1568 | * | ||
1569 | * Returns: true/false indicating whether priority adjustment is | ||
1570 | * required or not. | ||
1571 | */ | 1571 | */ |
1572 | bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, | 1572 | bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, |
1573 | struct wake_q_head *wqh) | 1573 | struct wake_q_head *wake_q) |
1574 | { | 1574 | { |
1575 | if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { | 1575 | lockdep_assert_held(&lock->wait_lock); |
1576 | rt_mutex_deadlock_account_unlock(current); | 1576 | |
1577 | return false; | 1577 | debug_rt_mutex_unlock(lock); |
1578 | |||
1579 | if (!rt_mutex_has_waiters(lock)) { | ||
1580 | lock->owner = NULL; | ||
1581 | return false; /* done */ | ||
1578 | } | 1582 | } |
1579 | return rt_mutex_slowunlock(lock, wqh); | 1583 | |
1584 | /* | ||
1585 | * We've already deboosted, mark_wakeup_next_waiter() will | ||
1586 | * retain preempt_disabled when we drop the wait_lock, to | ||
1587 | * avoid inversion prior to the wakeup. preempt_disable() | ||
1588 | * therein pairs with rt_mutex_postunlock(). | ||
1589 | */ | ||
1590 | mark_wakeup_next_waiter(wake_q, lock); | ||
1591 | |||
1592 | return true; /* call postunlock() */ | ||
1593 | } | ||
1594 | |||
1595 | void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) | ||
1596 | { | ||
1597 | DEFINE_WAKE_Q(wake_q); | ||
1598 | bool postunlock; | ||
1599 | |||
1600 | raw_spin_lock_irq(&lock->wait_lock); | ||
1601 | postunlock = __rt_mutex_futex_unlock(lock, &wake_q); | ||
1602 | raw_spin_unlock_irq(&lock->wait_lock); | ||
1603 | |||
1604 | if (postunlock) | ||
1605 | rt_mutex_postunlock(&wake_q); | ||
1580 | } | 1606 | } |
1581 | 1607 | ||
1582 | /** | 1608 | /** |
@@ -1637,7 +1663,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | |||
1637 | __rt_mutex_init(lock, NULL); | 1663 | __rt_mutex_init(lock, NULL); |
1638 | debug_rt_mutex_proxy_lock(lock, proxy_owner); | 1664 | debug_rt_mutex_proxy_lock(lock, proxy_owner); |
1639 | rt_mutex_set_owner(lock, proxy_owner); | 1665 | rt_mutex_set_owner(lock, proxy_owner); |
1640 | rt_mutex_deadlock_account_lock(lock, proxy_owner); | ||
1641 | } | 1666 | } |
1642 | 1667 | ||
1643 | /** | 1668 | /** |
@@ -1657,34 +1682,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, | |||
1657 | { | 1682 | { |
1658 | debug_rt_mutex_proxy_unlock(lock); | 1683 | debug_rt_mutex_proxy_unlock(lock); |
1659 | rt_mutex_set_owner(lock, NULL); | 1684 | rt_mutex_set_owner(lock, NULL); |
1660 | rt_mutex_deadlock_account_unlock(proxy_owner); | ||
1661 | } | 1685 | } |
1662 | 1686 | ||
1663 | /** | 1687 | int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, |
1664 | * rt_mutex_start_proxy_lock() - Start lock acquisition for another task | ||
1665 | * @lock: the rt_mutex to take | ||
1666 | * @waiter: the pre-initialized rt_mutex_waiter | ||
1667 | * @task: the task to prepare | ||
1668 | * | ||
1669 | * Returns: | ||
1670 | * 0 - task blocked on lock | ||
1671 | * 1 - acquired the lock for task, caller should wake it up | ||
1672 | * <0 - error | ||
1673 | * | ||
1674 | * Special API call for FUTEX_REQUEUE_PI support. | ||
1675 | */ | ||
1676 | int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | ||
1677 | struct rt_mutex_waiter *waiter, | 1688 | struct rt_mutex_waiter *waiter, |
1678 | struct task_struct *task) | 1689 | struct task_struct *task) |
1679 | { | 1690 | { |
1680 | int ret; | 1691 | int ret; |
1681 | 1692 | ||
1682 | raw_spin_lock_irq(&lock->wait_lock); | 1693 | if (try_to_take_rt_mutex(lock, task, NULL)) |
1683 | |||
1684 | if (try_to_take_rt_mutex(lock, task, NULL)) { | ||
1685 | raw_spin_unlock_irq(&lock->wait_lock); | ||
1686 | return 1; | 1694 | return 1; |
1687 | } | ||
1688 | 1695 | ||
1689 | /* We enforce deadlock detection for futexes */ | 1696 | /* We enforce deadlock detection for futexes */ |
1690 | ret = task_blocks_on_rt_mutex(lock, waiter, task, | 1697 | ret = task_blocks_on_rt_mutex(lock, waiter, task, |
@@ -1703,14 +1710,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
1703 | if (unlikely(ret)) | 1710 | if (unlikely(ret)) |
1704 | remove_waiter(lock, waiter); | 1711 | remove_waiter(lock, waiter); |
1705 | 1712 | ||
1706 | raw_spin_unlock_irq(&lock->wait_lock); | ||
1707 | |||
1708 | debug_rt_mutex_print_deadlock(waiter); | 1713 | debug_rt_mutex_print_deadlock(waiter); |
1709 | 1714 | ||
1710 | return ret; | 1715 | return ret; |
1711 | } | 1716 | } |
1712 | 1717 | ||
1713 | /** | 1718 | /** |
1719 | * rt_mutex_start_proxy_lock() - Start lock acquisition for another task | ||
1720 | * @lock: the rt_mutex to take | ||
1721 | * @waiter: the pre-initialized rt_mutex_waiter | ||
1722 | * @task: the task to prepare | ||
1723 | * | ||
1724 | * Returns: | ||
1725 | * 0 - task blocked on lock | ||
1726 | * 1 - acquired the lock for task, caller should wake it up | ||
1727 | * <0 - error | ||
1728 | * | ||
1729 | * Special API call for FUTEX_REQUEUE_PI support. | ||
1730 | */ | ||
1731 | int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | ||
1732 | struct rt_mutex_waiter *waiter, | ||
1733 | struct task_struct *task) | ||
1734 | { | ||
1735 | int ret; | ||
1736 | |||
1737 | raw_spin_lock_irq(&lock->wait_lock); | ||
1738 | ret = __rt_mutex_start_proxy_lock(lock, waiter, task); | ||
1739 | raw_spin_unlock_irq(&lock->wait_lock); | ||
1740 | |||
1741 | return ret; | ||
1742 | } | ||
1743 | |||
1744 | /** | ||
1714 | * rt_mutex_next_owner - return the next owner of the lock | 1745 | * rt_mutex_next_owner - return the next owner of the lock |
1715 | * | 1746 | * |
1716 | * @lock: the rt lock query | 1747 | * @lock: the rt lock query |
@@ -1731,21 +1762,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) | |||
1731 | } | 1762 | } |
1732 | 1763 | ||
1733 | /** | 1764 | /** |
1734 | * rt_mutex_finish_proxy_lock() - Complete lock acquisition | 1765 | * rt_mutex_wait_proxy_lock() - Wait for lock acquisition |
1735 | * @lock: the rt_mutex we were woken on | 1766 | * @lock: the rt_mutex we were woken on |
1736 | * @to: the timeout, null if none. hrtimer should already have | 1767 | * @to: the timeout, null if none. hrtimer should already have |
1737 | * been started. | 1768 | * been started. |
1738 | * @waiter: the pre-initialized rt_mutex_waiter | 1769 | * @waiter: the pre-initialized rt_mutex_waiter |
1739 | * | 1770 | * |
1740 | * Complete the lock acquisition started our behalf by another thread. | 1771 | * Wait for the lock acquisition started on our behalf by |
1772 | * rt_mutex_start_proxy_lock(). Upon failure, the caller must call | ||
1773 | * rt_mutex_cleanup_proxy_lock(). | ||
1741 | * | 1774 | * |
1742 | * Returns: | 1775 | * Returns: |
1743 | * 0 - success | 1776 | * 0 - success |
1744 | * <0 - error, one of -EINTR, -ETIMEDOUT | 1777 | * <0 - error, one of -EINTR, -ETIMEDOUT |
1745 | * | 1778 | * |
1746 | * Special API call for PI-futex requeue support | 1779 | * Special API call for PI-futex support |
1747 | */ | 1780 | */ |
1748 | int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | 1781 | int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, |
1749 | struct hrtimer_sleeper *to, | 1782 | struct hrtimer_sleeper *to, |
1750 | struct rt_mutex_waiter *waiter) | 1783 | struct rt_mutex_waiter *waiter) |
1751 | { | 1784 | { |
@@ -1758,8 +1791,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | |||
1758 | /* sleep on the mutex */ | 1791 | /* sleep on the mutex */ |
1759 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); | 1792 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); |
1760 | 1793 | ||
1761 | if (unlikely(ret)) | 1794 | raw_spin_unlock_irq(&lock->wait_lock); |
1795 | |||
1796 | return ret; | ||
1797 | } | ||
1798 | |||
1799 | /** | ||
1800 | * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition | ||
1801 | * @lock: the rt_mutex we were woken on | ||
1802 | * @waiter: the pre-initialized rt_mutex_waiter | ||
1803 | * | ||
1804 | * Attempt to clean up after a failed rt_mutex_wait_proxy_lock(). | ||
1805 | * | ||
1806 | * Unless we acquired the lock, we're still enqueued on the wait-list and can | ||
1807 | * in fact still be granted ownership until we're removed. Therefore we can | ||
1808 | * find we are in fact the owner and must disregard the | ||
1809 | * rt_mutex_wait_proxy_lock() failure. | ||
1810 | * | ||
1811 | * Returns: | ||
1812 | * true - did the cleanup, we are done. | ||
1813 | * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, | ||
1814 | * caller should disregard its return value. | ||
1815 | * | ||
1816 | * Special API call for PI-futex support | ||
1817 | */ | ||
1818 | bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, | ||
1819 | struct rt_mutex_waiter *waiter) | ||
1820 | { | ||
1821 | bool cleanup = false; | ||
1822 | |||
1823 | raw_spin_lock_irq(&lock->wait_lock); | ||
1824 | /* | ||
1825 | * Unless we're the owner, we're still enqueued on the wait_list. | ||
1826 | * So check if we became owner, if not, take us off the wait_list. | ||
1827 | */ | ||
1828 | if (rt_mutex_owner(lock) != current) { | ||
1762 | remove_waiter(lock, waiter); | 1829 | remove_waiter(lock, waiter); |
1830 | fixup_rt_mutex_waiters(lock); | ||
1831 | cleanup = true; | ||
1832 | } | ||
1763 | 1833 | ||
1764 | /* | 1834 | /* |
1765 | * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might | 1835 | * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might |
@@ -1769,5 +1839,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | |||
1769 | 1839 | ||
1770 | raw_spin_unlock_irq(&lock->wait_lock); | 1840 | raw_spin_unlock_irq(&lock->wait_lock); |
1771 | 1841 | ||
1772 | return ret; | 1842 | return cleanup; |
1773 | } | 1843 | } |
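
The rtmutex.c hunks above rework the futex fast-path handling and the deboost-before-wakeup ordering. From user space those paths are reached through priority-inheritance futexes; the stand-alone program below is only an illustration (not part of the patch) and contends a PTHREAD_PRIO_INHERIT mutex, which glibc backs with PI futexes and which therefore ends up in the kernel's rt_mutex futex paths shown above.

/* pi_mutex_demo.c - build: gcc -O2 -pthread pi_mutex_demo.c -o pi_mutex_demo
 *
 * Contends a PTHREAD_PRIO_INHERIT mutex from two threads. Contended
 * lock/unlock of such a mutex is implemented with PI futexes, i.e. the
 * kernel paths reworked by the rtmutex.c changes above.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t m;

static void *worker(void *arg)
{
        pthread_mutex_lock(&m);
        printf("thread %ld holds the PI mutex\n", (long)arg);
        usleep(100 * 1000);             /* hold it long enough to force contention */
        pthread_mutex_unlock(&m);
        return NULL;
}

int main(void)
{
        pthread_mutexattr_t attr;
        pthread_t t1, t2;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
        pthread_mutex_init(&m, &attr);

        pthread_create(&t1, NULL, worker, (void *)1L);
        pthread_create(&t2, NULL, worker, (void *)2L);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);

        pthread_mutex_destroy(&m);
        pthread_mutexattr_destroy(&attr);
        return 0;
}
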
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index c4060584c407..6607802efa8b 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h | |||
@@ -11,8 +11,6 @@ | |||
11 | */ | 11 | */ |
12 | 12 | ||
13 | #define rt_mutex_deadlock_check(l) (0) | 13 | #define rt_mutex_deadlock_check(l) (0) |
14 | #define rt_mutex_deadlock_account_lock(m, t) do { } while (0) | ||
15 | #define rt_mutex_deadlock_account_unlock(l) do { } while (0) | ||
16 | #define debug_rt_mutex_init_waiter(w) do { } while (0) | 14 | #define debug_rt_mutex_init_waiter(w) do { } while (0) |
17 | #define debug_rt_mutex_free_waiter(w) do { } while (0) | 15 | #define debug_rt_mutex_free_waiter(w) do { } while (0) |
18 | #define debug_rt_mutex_lock(l) do { } while (0) | 16 | #define debug_rt_mutex_lock(l) do { } while (0) |
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 856dfff5c33a..72ad45a9a794 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h | |||
@@ -34,6 +34,7 @@ struct rt_mutex_waiter { | |||
34 | struct rt_mutex *deadlock_lock; | 34 | struct rt_mutex *deadlock_lock; |
35 | #endif | 35 | #endif |
36 | int prio; | 36 | int prio; |
37 | u64 deadline; | ||
37 | }; | 38 | }; |
38 | 39 | ||
39 | /* | 40 | /* |
@@ -103,16 +104,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | |||
103 | struct task_struct *proxy_owner); | 104 | struct task_struct *proxy_owner); |
104 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, | 105 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, |
105 | struct task_struct *proxy_owner); | 106 | struct task_struct *proxy_owner); |
107 | extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); | ||
108 | extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, | ||
109 | struct rt_mutex_waiter *waiter, | ||
110 | struct task_struct *task); | ||
106 | extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | 111 | extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, |
107 | struct rt_mutex_waiter *waiter, | 112 | struct rt_mutex_waiter *waiter, |
108 | struct task_struct *task); | 113 | struct task_struct *task); |
109 | extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | 114 | extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, |
110 | struct hrtimer_sleeper *to, | 115 | struct hrtimer_sleeper *to, |
111 | struct rt_mutex_waiter *waiter); | 116 | struct rt_mutex_waiter *waiter); |
112 | extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); | 117 | extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, |
113 | extern bool rt_mutex_futex_unlock(struct rt_mutex *lock, | 118 | struct rt_mutex_waiter *waiter); |
114 | struct wake_q_head *wqh); | 119 | |
115 | extern void rt_mutex_adjust_prio(struct task_struct *task); | 120 | extern int rt_mutex_futex_trylock(struct rt_mutex *l); |
121 | |||
122 | extern void rt_mutex_futex_unlock(struct rt_mutex *lock); | ||
123 | extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, | ||
124 | struct wake_q_head *wqh); | ||
125 | |||
126 | extern void rt_mutex_postunlock(struct wake_q_head *wake_q); | ||
116 | 127 | ||
117 | #ifdef CONFIG_DEBUG_RT_MUTEXES | 128 | #ifdef CONFIG_DEBUG_RT_MUTEXES |
118 | # include "rtmutex-debug.h" | 129 | # include "rtmutex-debug.h" |
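
The new rtmutex_common.h prototypes split proxy locking into start/wait/cleanup steps. The sketch below only illustrates the contract documented in the rtmutex.c comments above (a failed rt_mutex_wait_proxy_lock() must be followed by rt_mutex_cleanup_proxy_lock()); the function name is hypothetical, and the real user is the futex requeue-PI code.

/*
 * Sketch only: pair the wait with the cleanup so a late lock grant is not
 * lost. "example_wait_requeue_pi" is a hypothetical caller.
 */
static int example_wait_requeue_pi(struct rt_mutex *pi_mutex,
                                   struct hrtimer_sleeper *to,
                                   struct rt_mutex_waiter *waiter)
{
        int ret;

        /* Sleep until we own the lock, are interrupted or time out. */
        ret = rt_mutex_wait_proxy_lock(pi_mutex, to, waiter);

        /*
         * On failure we may still have been granted the lock before we
         * could dequeue ourselves; rt_mutex_cleanup_proxy_lock() returns
         * false in that case and the failure must be disregarded.
         */
        if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, waiter))
                ret = 0;

        return ret;
}
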
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 90a74ccd85a4..4d48b1c4870d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
@@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write); | |||
124 | */ | 124 | */ |
125 | void downgrade_write(struct rw_semaphore *sem) | 125 | void downgrade_write(struct rw_semaphore *sem) |
126 | { | 126 | { |
127 | /* | 127 | lock_downgrade(&sem->dep_map, _RET_IP_); |
128 | * lockdep: a downgraded write will live on as a write | 128 | |
129 | * dependency. | ||
130 | */ | ||
131 | rwsem_set_reader_owned(sem); | 129 | rwsem_set_reader_owned(sem); |
132 | __downgrade_write(sem); | 130 | __downgrade_write(sem); |
133 | } | 131 | } |
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 6b7abb334ca6..39f56c870051 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c | |||
@@ -353,8 +353,8 @@ static int test_cycle(unsigned int ncpus) | |||
353 | struct stress { | 353 | struct stress { |
354 | struct work_struct work; | 354 | struct work_struct work; |
355 | struct ww_mutex *locks; | 355 | struct ww_mutex *locks; |
356 | unsigned long timeout; | ||
356 | int nlocks; | 357 | int nlocks; |
357 | int nloops; | ||
358 | }; | 358 | }; |
359 | 359 | ||
360 | static int *get_random_order(int count) | 360 | static int *get_random_order(int count) |
@@ -398,12 +398,11 @@ static void stress_inorder_work(struct work_struct *work) | |||
398 | if (!order) | 398 | if (!order) |
399 | return; | 399 | return; |
400 | 400 | ||
401 | ww_acquire_init(&ctx, &ww_class); | ||
402 | |||
403 | do { | 401 | do { |
404 | int contended = -1; | 402 | int contended = -1; |
405 | int n, err; | 403 | int n, err; |
406 | 404 | ||
405 | ww_acquire_init(&ctx, &ww_class); | ||
407 | retry: | 406 | retry: |
408 | err = 0; | 407 | err = 0; |
409 | for (n = 0; n < nlocks; n++) { | 408 | for (n = 0; n < nlocks; n++) { |
@@ -433,9 +432,9 @@ retry: | |||
433 | __func__, err); | 432 | __func__, err); |
434 | break; | 433 | break; |
435 | } | 434 | } |
436 | } while (--stress->nloops); | ||
437 | 435 | ||
438 | ww_acquire_fini(&ctx); | 436 | ww_acquire_fini(&ctx); |
437 | } while (!time_after(jiffies, stress->timeout)); | ||
439 | 438 | ||
440 | kfree(order); | 439 | kfree(order); |
441 | kfree(stress); | 440 | kfree(stress); |
@@ -470,9 +469,9 @@ static void stress_reorder_work(struct work_struct *work) | |||
470 | kfree(order); | 469 | kfree(order); |
471 | order = NULL; | 470 | order = NULL; |
472 | 471 | ||
473 | ww_acquire_init(&ctx, &ww_class); | ||
474 | |||
475 | do { | 472 | do { |
473 | ww_acquire_init(&ctx, &ww_class); | ||
474 | |||
476 | list_for_each_entry(ll, &locks, link) { | 475 | list_for_each_entry(ll, &locks, link) { |
477 | err = ww_mutex_lock(ll->lock, &ctx); | 476 | err = ww_mutex_lock(ll->lock, &ctx); |
478 | if (!err) | 477 | if (!err) |
@@ -495,9 +494,9 @@ static void stress_reorder_work(struct work_struct *work) | |||
495 | dummy_load(stress); | 494 | dummy_load(stress); |
496 | list_for_each_entry(ll, &locks, link) | 495 | list_for_each_entry(ll, &locks, link) |
497 | ww_mutex_unlock(ll->lock); | 496 | ww_mutex_unlock(ll->lock); |
498 | } while (--stress->nloops); | ||
499 | 497 | ||
500 | ww_acquire_fini(&ctx); | 498 | ww_acquire_fini(&ctx); |
499 | } while (!time_after(jiffies, stress->timeout)); | ||
501 | 500 | ||
502 | out: | 501 | out: |
503 | list_for_each_entry_safe(ll, ln, &locks, link) | 502 | list_for_each_entry_safe(ll, ln, &locks, link) |
@@ -523,7 +522,7 @@ static void stress_one_work(struct work_struct *work) | |||
523 | __func__, err); | 522 | __func__, err); |
524 | break; | 523 | break; |
525 | } | 524 | } |
526 | } while (--stress->nloops); | 525 | } while (!time_after(jiffies, stress->timeout)); |
527 | 526 | ||
528 | kfree(stress); | 527 | kfree(stress); |
529 | } | 528 | } |
@@ -533,7 +532,7 @@ static void stress_one_work(struct work_struct *work) | |||
533 | #define STRESS_ONE BIT(2) | 532 | #define STRESS_ONE BIT(2) |
534 | #define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) | 533 | #define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) |
535 | 534 | ||
536 | static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) | 535 | static int stress(int nlocks, int nthreads, unsigned int flags) |
537 | { | 536 | { |
538 | struct ww_mutex *locks; | 537 | struct ww_mutex *locks; |
539 | int n; | 538 | int n; |
@@ -575,7 +574,7 @@ static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) | |||
575 | INIT_WORK(&stress->work, fn); | 574 | INIT_WORK(&stress->work, fn); |
576 | stress->locks = locks; | 575 | stress->locks = locks; |
577 | stress->nlocks = nlocks; | 576 | stress->nlocks = nlocks; |
578 | stress->nloops = nloops; | 577 | stress->timeout = jiffies + 2*HZ; |
579 | 578 | ||
580 | queue_work(wq, &stress->work); | 579 | queue_work(wq, &stress->work); |
581 | nthreads--; | 580 | nthreads--; |
@@ -619,15 +618,15 @@ static int __init test_ww_mutex_init(void) | |||
619 | if (ret) | 618 | if (ret) |
620 | return ret; | 619 | return ret; |
621 | 620 | ||
622 | ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER); | 621 | ret = stress(16, 2*ncpus, STRESS_INORDER); |
623 | if (ret) | 622 | if (ret) |
624 | return ret; | 623 | return ret; |
625 | 624 | ||
626 | ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER); | 625 | ret = stress(16, 2*ncpus, STRESS_REORDER); |
627 | if (ret) | 626 | if (ret) |
628 | return ret; | 627 | return ret; |
629 | 628 | ||
630 | ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); | 629 | ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); |
631 | if (ret) | 630 | if (ret) |
632 | return ret; | 631 | return ret; |
633 | 632 | ||
diff --git a/kernel/memremap.c b/kernel/memremap.c index 07e85e5229da..23a6483c3666 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
@@ -182,18 +182,6 @@ struct page_map { | |||
182 | struct vmem_altmap altmap; | 182 | struct vmem_altmap altmap; |
183 | }; | 183 | }; |
184 | 184 | ||
185 | void get_zone_device_page(struct page *page) | ||
186 | { | ||
187 | percpu_ref_get(page->pgmap->ref); | ||
188 | } | ||
189 | EXPORT_SYMBOL(get_zone_device_page); | ||
190 | |||
191 | void put_zone_device_page(struct page *page) | ||
192 | { | ||
193 | put_dev_pagemap(page->pgmap); | ||
194 | } | ||
195 | EXPORT_SYMBOL(put_zone_device_page); | ||
196 | |||
197 | static void pgmap_radix_release(struct resource *res) | 185 | static void pgmap_radix_release(struct resource *res) |
198 | { | 186 | { |
199 | resource_size_t key, align_start, align_size, align_end; | 187 | resource_size_t key, align_start, align_size, align_end; |
@@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data) | |||
237 | struct resource *res = &page_map->res; | 225 | struct resource *res = &page_map->res; |
238 | resource_size_t align_start, align_size; | 226 | resource_size_t align_start, align_size; |
239 | struct dev_pagemap *pgmap = &page_map->pgmap; | 227 | struct dev_pagemap *pgmap = &page_map->pgmap; |
228 | unsigned long pfn; | ||
229 | |||
230 | for_each_device_pfn(pfn, page_map) | ||
231 | put_page(pfn_to_page(pfn)); | ||
240 | 232 | ||
241 | if (percpu_ref_tryget_live(pgmap->ref)) { | 233 | if (percpu_ref_tryget_live(pgmap->ref)) { |
242 | dev_WARN(dev, "%s: page mapping is still live!\n", __func__); | 234 | dev_WARN(dev, "%s: page mapping is still live!\n", __func__); |
@@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) | |||
277 | * | 269 | * |
278 | * Notes: | 270 | * Notes: |
279 | * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time | 271 | * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time |
280 | * (or devm release event). | 272 | * (or devm release event). The expected order of events is that @ref has |
273 | * been through percpu_ref_kill() before devm_memremap_pages_release(). The | ||
274 | * wait for the completion of all references being dropped and | ||
275 | * percpu_ref_exit() must occur after devm_memremap_pages_release(). | ||
281 | * | 276 | * |
282 | * 2/ @res is expected to be a host memory range that could feasibly be | 277 | * 2/ @res is expected to be a host memory range that could feasibly be |
283 | * treated as a "System RAM" range, i.e. not a device mmio range, but | 278 | * treated as a "System RAM" range, i.e. not a device mmio range, but |
@@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
379 | */ | 374 | */ |
380 | list_del(&page->lru); | 375 | list_del(&page->lru); |
381 | page->pgmap = pgmap; | 376 | page->pgmap = pgmap; |
377 | percpu_ref_get(ref); | ||
382 | } | 378 | } |
383 | devres_add(dev, page_map); | 379 | devres_add(dev, page_map); |
384 | return __va(res->start); | 380 | return __va(res->start); |
diff --git a/kernel/module.c b/kernel/module.c index 7eba6dea4f41..4a3665f8f837 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -49,6 +49,9 @@ | |||
49 | #include <linux/rculist.h> | 49 | #include <linux/rculist.h> |
50 | #include <linux/uaccess.h> | 50 | #include <linux/uaccess.h> |
51 | #include <asm/cacheflush.h> | 51 | #include <asm/cacheflush.h> |
52 | #ifdef CONFIG_STRICT_MODULE_RWX | ||
53 | #include <asm/set_memory.h> | ||
54 | #endif | ||
52 | #include <asm/mmu_context.h> | 55 | #include <asm/mmu_context.h> |
53 | #include <linux/license.h> | 56 | #include <linux/license.h> |
54 | #include <asm/sections.h> | 57 | #include <asm/sections.h> |
@@ -665,16 +668,7 @@ static void percpu_modcopy(struct module *mod, | |||
665 | memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); | 668 | memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); |
666 | } | 669 | } |
667 | 670 | ||
668 | /** | 671 | bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) |
669 | * is_module_percpu_address - test whether address is from module static percpu | ||
670 | * @addr: address to test | ||
671 | * | ||
672 | * Test whether @addr belongs to module static percpu area. | ||
673 | * | ||
674 | * RETURNS: | ||
675 | * %true if @addr is from module static percpu area | ||
676 | */ | ||
677 | bool is_module_percpu_address(unsigned long addr) | ||
678 | { | 672 | { |
679 | struct module *mod; | 673 | struct module *mod; |
680 | unsigned int cpu; | 674 | unsigned int cpu; |
@@ -688,9 +682,15 @@ bool is_module_percpu_address(unsigned long addr) | |||
688 | continue; | 682 | continue; |
689 | for_each_possible_cpu(cpu) { | 683 | for_each_possible_cpu(cpu) { |
690 | void *start = per_cpu_ptr(mod->percpu, cpu); | 684 | void *start = per_cpu_ptr(mod->percpu, cpu); |
691 | 685 | void *va = (void *)addr; | |
692 | if ((void *)addr >= start && | 686 | |
693 | (void *)addr < start + mod->percpu_size) { | 687 | if (va >= start && va < start + mod->percpu_size) { |
688 | if (can_addr) { | ||
689 | *can_addr = (unsigned long) (va - start); | ||
690 | *can_addr += (unsigned long) | ||
691 | per_cpu_ptr(mod->percpu, | ||
692 | get_boot_cpu_id()); | ||
693 | } | ||
694 | preempt_enable(); | 694 | preempt_enable(); |
695 | return true; | 695 | return true; |
696 | } | 696 | } |
@@ -701,6 +701,20 @@ bool is_module_percpu_address(unsigned long addr) | |||
701 | return false; | 701 | return false; |
702 | } | 702 | } |
703 | 703 | ||
704 | /** | ||
705 | * is_module_percpu_address - test whether address is from module static percpu | ||
706 | * @addr: address to test | ||
707 | * | ||
708 | * Test whether @addr belongs to module static percpu area. | ||
709 | * | ||
710 | * RETURNS: | ||
711 | * %true if @addr is from module static percpu area | ||
712 | */ | ||
713 | bool is_module_percpu_address(unsigned long addr) | ||
714 | { | ||
715 | return __is_module_percpu_address(addr, NULL); | ||
716 | } | ||
717 | |||
704 | #else /* ... !CONFIG_SMP */ | 718 | #else /* ... !CONFIG_SMP */ |
705 | 719 | ||
706 | static inline void __percpu *mod_percpu(struct module *mod) | 720 | static inline void __percpu *mod_percpu(struct module *mod) |
@@ -732,6 +746,11 @@ bool is_module_percpu_address(unsigned long addr) | |||
732 | return false; | 746 | return false; |
733 | } | 747 | } |
734 | 748 | ||
749 | bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) | ||
750 | { | ||
751 | return false; | ||
752 | } | ||
753 | |||
735 | #endif /* CONFIG_SMP */ | 754 | #endif /* CONFIG_SMP */ |
736 | 755 | ||
737 | #define MODINFO_ATTR(field) \ | 756 | #define MODINFO_ATTR(field) \ |
@@ -947,6 +966,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | |||
947 | return -EFAULT; | 966 | return -EFAULT; |
948 | name[MODULE_NAME_LEN-1] = '\0'; | 967 | name[MODULE_NAME_LEN-1] = '\0'; |
949 | 968 | ||
969 | audit_log_kern_module(name); | ||
970 | |||
950 | if (mutex_lock_interruptible(&module_mutex) != 0) | 971 | if (mutex_lock_interruptible(&module_mutex) != 0) |
951 | return -EINTR; | 972 | return -EINTR; |
952 | 973 | ||
@@ -2846,7 +2867,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, | |||
2846 | 2867 | ||
2847 | /* Suck in entire file: we'll want most of it. */ | 2868 | /* Suck in entire file: we'll want most of it. */ |
2848 | info->hdr = __vmalloc(info->len, | 2869 | info->hdr = __vmalloc(info->len, |
2849 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL); | 2870 | GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); |
2850 | if (!info->hdr) | 2871 | if (!info->hdr) |
2851 | return -ENOMEM; | 2872 | return -ENOMEM; |
2852 | 2873 | ||
@@ -4017,7 +4038,7 @@ unsigned long module_kallsyms_lookup_name(const char *name) | |||
4017 | 4038 | ||
4018 | /* Don't lock: we're in enough trouble already. */ | 4039 | /* Don't lock: we're in enough trouble already. */ |
4019 | preempt_disable(); | 4040 | preempt_disable(); |
4020 | if ((colon = strchr(name, ':')) != NULL) { | 4041 | if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { |
4021 | if ((mod = find_module_all(name, colon - name, false)) != NULL) | 4042 | if ((mod = find_module_all(name, colon - name, false)) != NULL) |
4022 | ret = mod_find_symname(mod, colon+1); | 4043 | ret = mod_find_symname(mod, colon+1); |
4023 | } else { | 4044 | } else { |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 782102e59eed..f6c5d330059a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/file.h> | 26 | #include <linux/file.h> |
27 | #include <linux/syscalls.h> | 27 | #include <linux/syscalls.h> |
28 | #include <linux/cgroup.h> | 28 | #include <linux/cgroup.h> |
29 | #include <linux/perf_event.h> | ||
29 | 30 | ||
30 | static struct kmem_cache *nsproxy_cachep; | 31 | static struct kmem_cache *nsproxy_cachep; |
31 | 32 | ||
@@ -262,6 +263,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
262 | goto out; | 263 | goto out; |
263 | } | 264 | } |
264 | switch_task_namespaces(tsk, new_nsproxy); | 265 | switch_task_namespaces(tsk, new_nsproxy); |
266 | |||
267 | perf_event_namespaces(tsk); | ||
265 | out: | 268 | out: |
266 | fput(file); | 269 | fput(file); |
267 | return err; | 270 | return err; |
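
The nsproxy hunk above makes setns(2) report the namespace switch to perf via perf_event_namespaces(). The small program below (illustration only; run as root) simply drives that path by joining another process's mount namespace.

/* setns_demo.c - build: gcc setns_demo.c -o setns_demo; run as root.
 * Joins the mount namespace of the given pid; on kernels with the change
 * above, the setns(2) path also emits a perf namespaces event for the
 * calling task.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        char path[64];
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }

        snprintf(path, sizeof(path), "/proc/%s/ns/mnt", argv[1]);
        fd = open(path, O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (setns(fd, CLONE_NEWNS) < 0) {
                perror("setns");
                return 1;
        }
        printf("joined the mount namespace of pid %s\n", argv[1]);
        close(fd);
        return 0;
}
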
diff --git a/kernel/padata.c b/kernel/padata.c index 3202aa17492c..ac8f1e524836 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -154,8 +154,6 @@ EXPORT_SYMBOL(padata_do_parallel); | |||
154 | * A pointer to the control struct of the next object that needs | 154 | * A pointer to the control struct of the next object that needs |
155 | * serialization, if present in one of the percpu reorder queues. | 155 | * serialization, if present in one of the percpu reorder queues. |
156 | * | 156 | * |
157 | * NULL, if all percpu reorder queues are empty. | ||
158 | * | ||
159 | * -EINPROGRESS, if the next object that needs serialization will | 157 | * -EINPROGRESS, if the next object that needs serialization will |
160 | * be parallel processed by another cpu and is not yet present in | 158 | * be parallel processed by another cpu and is not yet present in |
161 | * the cpu's reorder queue. | 159 | * the cpu's reorder queue. |
@@ -182,8 +180,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) | |||
182 | cpu = padata_index_to_cpu(pd, next_index); | 180 | cpu = padata_index_to_cpu(pd, next_index); |
183 | next_queue = per_cpu_ptr(pd->pqueue, cpu); | 181 | next_queue = per_cpu_ptr(pd->pqueue, cpu); |
184 | 182 | ||
185 | padata = NULL; | ||
186 | |||
187 | reorder = &next_queue->reorder; | 183 | reorder = &next_queue->reorder; |
188 | 184 | ||
189 | spin_lock(&reorder->lock); | 185 | spin_lock(&reorder->lock); |
@@ -235,12 +231,11 @@ static void padata_reorder(struct parallel_data *pd) | |||
235 | padata = padata_get_next(pd); | 231 | padata = padata_get_next(pd); |
236 | 232 | ||
237 | /* | 233 | /* |
238 | * All reorder queues are empty, or the next object that needs | 234 | * If the next object that needs serialization is parallel |
239 | * serialization is parallel processed by another cpu and is | 235 | * processed by another cpu and is still on its way to the |
240 | * still on it's way to the cpu's reorder queue, nothing to | 236 | * cpu's reorder queue, nothing to do for now. |
241 | * do for now. | ||
242 | */ | 237 | */ |
243 | if (!padata || PTR_ERR(padata) == -EINPROGRESS) | 238 | if (PTR_ERR(padata) == -EINPROGRESS) |
244 | break; | 239 | break; |
245 | 240 | ||
246 | /* | 241 | /* |
@@ -354,7 +349,7 @@ static int padata_setup_cpumasks(struct parallel_data *pd, | |||
354 | 349 | ||
355 | cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); | 350 | cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); |
356 | if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { | 351 | if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { |
357 | free_cpumask_var(pd->cpumask.cbcpu); | 352 | free_cpumask_var(pd->cpumask.pcpu); |
358 | return -ENOMEM; | 353 | return -ENOMEM; |
359 | } | 354 | } |
360 | 355 | ||
diff --git a/kernel/params.c b/kernel/params.c index a6d6149c0fe6..60b2d8101355 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -160,58 +160,6 @@ static int parse_one(char *param, | |||
160 | return -ENOENT; | 160 | return -ENOENT; |
161 | } | 161 | } |
162 | 162 | ||
163 | /* You can use " around spaces, but can't escape ". */ | ||
164 | /* Hyphens and underscores equivalent in parameter names. */ | ||
165 | static char *next_arg(char *args, char **param, char **val) | ||
166 | { | ||
167 | unsigned int i, equals = 0; | ||
168 | int in_quote = 0, quoted = 0; | ||
169 | char *next; | ||
170 | |||
171 | if (*args == '"') { | ||
172 | args++; | ||
173 | in_quote = 1; | ||
174 | quoted = 1; | ||
175 | } | ||
176 | |||
177 | for (i = 0; args[i]; i++) { | ||
178 | if (isspace(args[i]) && !in_quote) | ||
179 | break; | ||
180 | if (equals == 0) { | ||
181 | if (args[i] == '=') | ||
182 | equals = i; | ||
183 | } | ||
184 | if (args[i] == '"') | ||
185 | in_quote = !in_quote; | ||
186 | } | ||
187 | |||
188 | *param = args; | ||
189 | if (!equals) | ||
190 | *val = NULL; | ||
191 | else { | ||
192 | args[equals] = '\0'; | ||
193 | *val = args + equals + 1; | ||
194 | |||
195 | /* Don't include quotes in value. */ | ||
196 | if (**val == '"') { | ||
197 | (*val)++; | ||
198 | if (args[i-1] == '"') | ||
199 | args[i-1] = '\0'; | ||
200 | } | ||
201 | } | ||
202 | if (quoted && args[i-1] == '"') | ||
203 | args[i-1] = '\0'; | ||
204 | |||
205 | if (args[i]) { | ||
206 | args[i] = '\0'; | ||
207 | next = args + i + 1; | ||
208 | } else | ||
209 | next = args + i; | ||
210 | |||
211 | /* Chew up trailing spaces. */ | ||
212 | return skip_spaces(next); | ||
213 | } | ||
214 | |||
215 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ | 163 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ |
216 | char *parse_args(const char *doing, | 164 | char *parse_args(const char *doing, |
217 | char *args, | 165 | char *args, |
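
The next_arg() helper removed from kernel/params.c above encodes the kernel's command-line quoting rules: double quotes group spaces (but cannot be escaped) and '=' splits a parameter name from its value. To see those rules in action, here is a user-space harness around a verbatim copy of the removed function; the skip_spaces() stand-in and the sample string are the only additions.

/* next_arg_demo.c - build: gcc next_arg_demo.c -o next_arg_demo */
#include <ctype.h>
#include <stdio.h>

/* Minimal stand-in for the kernel's skip_spaces(). */
static char *skip_spaces(char *str)
{
        while (isspace((unsigned char)*str))
                str++;
        return str;
}

/* Verbatim copy of the helper removed from kernel/params.c above. */
static char *next_arg(char *args, char **param, char **val)
{
        unsigned int i, equals = 0;
        int in_quote = 0, quoted = 0;
        char *next;

        if (*args == '"') {
                args++;
                in_quote = 1;
                quoted = 1;
        }

        for (i = 0; args[i]; i++) {
                if (isspace((unsigned char)args[i]) && !in_quote)
                        break;
                if (equals == 0) {
                        if (args[i] == '=')
                                equals = i;
                }
                if (args[i] == '"')
                        in_quote = !in_quote;
        }

        *param = args;
        if (!equals)
                *val = NULL;
        else {
                args[equals] = '\0';
                *val = args + equals + 1;

                /* Don't include quotes in value. */
                if (**val == '"') {
                        (*val)++;
                        if (args[i-1] == '"')
                                args[i-1] = '\0';
                }
        }
        if (quoted && args[i-1] == '"')
                args[i-1] = '\0';

        if (args[i]) {
                args[i] = '\0';
                next = args + i + 1;
        } else
                next = args + i;

        /* Chew up trailing spaces. */
        return skip_spaces(next);
}

int main(void)
{
        char line[] = "foo=bar \"name with spaces\" baz=\"quoted value\"";
        char *args = line, *param, *val;

        while (*args) {
                args = next_arg(args, &param, &val);
                printf("param=\"%s\" val=\"%s\"\n", param, val ? val : "(none)");
        }
        return 0;
}
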
diff --git a/kernel/pid.c b/kernel/pid.c index 0143ac0ddceb..fd1cde1e4576 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -321,8 +321,10 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
321 | } | 321 | } |
322 | 322 | ||
323 | if (unlikely(is_child_reaper(pid))) { | 323 | if (unlikely(is_child_reaper(pid))) { |
324 | if (pid_ns_prepare_proc(ns)) | 324 | if (pid_ns_prepare_proc(ns)) { |
325 | disable_pid_allocation(ns); | ||
325 | goto out_free; | 326 | goto out_free; |
327 | } | ||
326 | } | 328 | } |
327 | 329 | ||
328 | get_pid_ns(ns); | 330 | get_pid_ns(ns); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index de461aa0bf9a..74a5a7255b4d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -277,7 +277,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
277 | * if reparented. | 277 | * if reparented. |
278 | */ | 278 | */ |
279 | for (;;) { | 279 | for (;;) { |
280 | set_current_state(TASK_UNINTERRUPTIBLE); | 280 | set_current_state(TASK_INTERRUPTIBLE); |
281 | if (pid_ns->nr_hashed == init_pids) | 281 | if (pid_ns->nr_hashed == init_pids) |
282 | break; | 282 | break; |
283 | schedule(); | 283 | schedule(); |
@@ -374,6 +374,29 @@ static struct ns_common *pidns_get(struct task_struct *task) | |||
374 | return ns ? &ns->ns : NULL; | 374 | return ns ? &ns->ns : NULL; |
375 | } | 375 | } |
376 | 376 | ||
377 | static struct ns_common *pidns_for_children_get(struct task_struct *task) | ||
378 | { | ||
379 | struct pid_namespace *ns = NULL; | ||
380 | |||
381 | task_lock(task); | ||
382 | if (task->nsproxy) { | ||
383 | ns = task->nsproxy->pid_ns_for_children; | ||
384 | get_pid_ns(ns); | ||
385 | } | ||
386 | task_unlock(task); | ||
387 | |||
388 | if (ns) { | ||
389 | read_lock(&tasklist_lock); | ||
390 | if (!ns->child_reaper) { | ||
391 | put_pid_ns(ns); | ||
392 | ns = NULL; | ||
393 | } | ||
394 | read_unlock(&tasklist_lock); | ||
395 | } | ||
396 | |||
397 | return ns ? &ns->ns : NULL; | ||
398 | } | ||
399 | |||
377 | static void pidns_put(struct ns_common *ns) | 400 | static void pidns_put(struct ns_common *ns) |
378 | { | 401 | { |
379 | put_pid_ns(to_pid_ns(ns)); | 402 | put_pid_ns(to_pid_ns(ns)); |
@@ -443,6 +466,17 @@ const struct proc_ns_operations pidns_operations = { | |||
443 | .get_parent = pidns_get_parent, | 466 | .get_parent = pidns_get_parent, |
444 | }; | 467 | }; |
445 | 468 | ||
469 | const struct proc_ns_operations pidns_for_children_operations = { | ||
470 | .name = "pid_for_children", | ||
471 | .real_ns_name = "pid", | ||
472 | .type = CLONE_NEWPID, | ||
473 | .get = pidns_for_children_get, | ||
474 | .put = pidns_put, | ||
475 | .install = pidns_install, | ||
476 | .owner = pidns_owner, | ||
477 | .get_parent = pidns_get_parent, | ||
478 | }; | ||
479 | |||
446 | static __init int pid_namespaces_init(void) | 480 | static __init int pid_namespaces_init(void) |
447 | { | 481 | { |
448 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); | 482 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); |
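
The pid_namespace.c hunk adds a /proc/<pid>/ns/pid_for_children link that names the namespace future children will be created in; after unshare(CLONE_NEWPID) it differs from /proc/<pid>/ns/pid, since the caller itself stays in its old PID namespace. The program below (illustration only; needs CAP_SYS_ADMIN and a kernel with this change) shows the two links diverging. Note that the new link only resolves once the namespace has a child reaper, hence the fork() before reading it.

/* pidns_demo.c - build: gcc pidns_demo.c -o pidns_demo; needs CAP_SYS_ADMIN. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static void show(const char *path)
{
        char buf[64];
        ssize_t n = readlink(path, buf, sizeof(buf) - 1);

        if (n < 0) {
                perror(path);
                return;
        }
        buf[n] = '\0';
        printf("%-32s -> %s\n", path, buf);
}

int main(void)
{
        pid_t child;

        if (unshare(CLONE_NEWPID) < 0) {
                perror("unshare");
                return 1;
        }

        /* The new namespace needs an init (child reaper) before the
         * pid_for_children link resolves. */
        child = fork();
        if (child == 0) {
                sleep(1);       /* pid 1 of the new namespace */
                return 0;
        }

        show("/proc/self/ns/pid");
        show("/proc/self/ns/pid_for_children");
        return 0;
}
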
diff --git a/kernel/power/process.c b/kernel/power/process.c index c7209f060eeb..78672d324a6e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -132,7 +132,7 @@ int freeze_processes(void) | |||
132 | if (!pm_freezing) | 132 | if (!pm_freezing) |
133 | atomic_inc(&system_freezing_cnt); | 133 | atomic_inc(&system_freezing_cnt); |
134 | 134 | ||
135 | pm_wakeup_clear(); | 135 | pm_wakeup_clear(true); |
136 | pr_info("Freezing user space processes ... "); | 136 | pr_info("Freezing user space processes ... "); |
137 | pm_freezing = true; | 137 | pm_freezing = true; |
138 | error = try_to_freeze_tasks(true); | 138 | error = try_to_freeze_tasks(true); |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d79a38de425a..fa46606f3356 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -36,6 +36,9 @@ | |||
36 | #include <asm/pgtable.h> | 36 | #include <asm/pgtable.h> |
37 | #include <asm/tlbflush.h> | 37 | #include <asm/tlbflush.h> |
38 | #include <asm/io.h> | 38 | #include <asm/io.h> |
39 | #ifdef CONFIG_STRICT_KERNEL_RWX | ||
40 | #include <asm/set_memory.h> | ||
41 | #endif | ||
39 | 42 | ||
40 | #include "power.h" | 43 | #include "power.h" |
41 | 44 | ||
@@ -1422,7 +1425,7 @@ static unsigned int nr_meta_pages; | |||
1422 | * Numbers of normal and highmem page frames allocated for hibernation image | 1425 | * Numbers of normal and highmem page frames allocated for hibernation image |
1423 | * before suspending devices. | 1426 | * before suspending devices. |
1424 | */ | 1427 | */ |
1425 | unsigned int alloc_normal, alloc_highmem; | 1428 | static unsigned int alloc_normal, alloc_highmem; |
1426 | /* | 1429 | /* |
1427 | * Memory bitmap used for marking saveable pages (during hibernation) or | 1430 | * Memory bitmap used for marking saveable pages (during hibernation) or |
1428 | * hibernation image pages (during restore) | 1431 | * hibernation image pages (during restore) |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 15e6baef5c73..c0248c74d6d4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -72,6 +72,8 @@ static void freeze_begin(void) | |||
72 | 72 | ||
73 | static void freeze_enter(void) | 73 | static void freeze_enter(void) |
74 | { | 74 | { |
75 | trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); | ||
76 | |||
75 | spin_lock_irq(&suspend_freeze_lock); | 77 | spin_lock_irq(&suspend_freeze_lock); |
76 | if (pm_wakeup_pending()) | 78 | if (pm_wakeup_pending()) |
77 | goto out; | 79 | goto out; |
@@ -98,6 +100,27 @@ static void freeze_enter(void) | |||
98 | out: | 100 | out: |
99 | suspend_freeze_state = FREEZE_STATE_NONE; | 101 | suspend_freeze_state = FREEZE_STATE_NONE; |
100 | spin_unlock_irq(&suspend_freeze_lock); | 102 | spin_unlock_irq(&suspend_freeze_lock); |
103 | |||
104 | trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); | ||
105 | } | ||
106 | |||
107 | static void s2idle_loop(void) | ||
108 | { | ||
109 | do { | ||
110 | freeze_enter(); | ||
111 | |||
112 | if (freeze_ops && freeze_ops->wake) | ||
113 | freeze_ops->wake(); | ||
114 | |||
115 | dpm_resume_noirq(PMSG_RESUME); | ||
116 | if (freeze_ops && freeze_ops->sync) | ||
117 | freeze_ops->sync(); | ||
118 | |||
119 | if (pm_wakeup_pending()) | ||
120 | break; | ||
121 | |||
122 | pm_wakeup_clear(false); | ||
123 | } while (!dpm_suspend_noirq(PMSG_SUSPEND)); | ||
101 | } | 124 | } |
102 | 125 | ||
103 | void freeze_wake(void) | 126 | void freeze_wake(void) |
@@ -371,10 +394,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
371 | * all the devices are suspended. | 394 | * all the devices are suspended. |
372 | */ | 395 | */ |
373 | if (state == PM_SUSPEND_FREEZE) { | 396 | if (state == PM_SUSPEND_FREEZE) { |
374 | trace_suspend_resume(TPS("machine_suspend"), state, true); | 397 | s2idle_loop(); |
375 | freeze_enter(); | 398 | goto Platform_early_resume; |
376 | trace_suspend_resume(TPS("machine_suspend"), state, false); | ||
377 | goto Platform_wake; | ||
378 | } | 399 | } |
379 | 400 | ||
380 | error = disable_nonboot_cpus(); | 401 | error = disable_nonboot_cpus(); |
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index d5760c42f042..61d41ca41844 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c | |||
@@ -2,12 +2,13 @@ | |||
2 | 2 | ||
3 | #include <linux/kernel.h> | 3 | #include <linux/kernel.h> |
4 | #include <linux/console.h> | 4 | #include <linux/console.h> |
5 | #include <linux/errno.h> | ||
5 | #include <linux/string.h> | 6 | #include <linux/string.h> |
6 | 7 | ||
7 | #include "console_cmdline.h" | 8 | #include "console_cmdline.h" |
8 | #include "braille.h" | 9 | #include "braille.h" |
9 | 10 | ||
10 | char *_braille_console_setup(char **str, char **brl_options) | 11 | int _braille_console_setup(char **str, char **brl_options) |
11 | { | 12 | { |
12 | if (!strncmp(*str, "brl,", 4)) { | 13 | if (!strncmp(*str, "brl,", 4)) { |
13 | *brl_options = ""; | 14 | *brl_options = ""; |
@@ -15,14 +16,14 @@ char *_braille_console_setup(char **str, char **brl_options) | |||
15 | } else if (!strncmp(*str, "brl=", 4)) { | 16 | } else if (!strncmp(*str, "brl=", 4)) { |
16 | *brl_options = *str + 4; | 17 | *brl_options = *str + 4; |
17 | *str = strchr(*brl_options, ','); | 18 | *str = strchr(*brl_options, ','); |
18 | if (!*str) | 19 | if (!*str) { |
19 | pr_err("need port name after brl=\n"); | 20 | pr_err("need port name after brl=\n"); |
20 | else | 21 | return -EINVAL; |
21 | *((*str)++) = 0; | 22 | } |
22 | } else | 23 | *((*str)++) = 0; |
23 | return NULL; | 24 | } |
24 | 25 | ||
25 | return *str; | 26 | return 0; |
26 | } | 27 | } |
27 | 28 | ||
28 | int | 29 | int |
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h index 769d771145c8..749a6756843a 100644 --- a/kernel/printk/braille.h +++ b/kernel/printk/braille.h | |||
@@ -9,7 +9,14 @@ braille_set_options(struct console_cmdline *c, char *brl_options) | |||
9 | c->brl_options = brl_options; | 9 | c->brl_options = brl_options; |
10 | } | 10 | } |
11 | 11 | ||
12 | char * | 12 | /* |
13 | * Setup console according to braille options. | ||
14 | * Return -EINVAL on syntax error, 0 on success (or if no braille option was | ||
15 | * actually given). | ||
16 | * Modifies str to point to the serial options. | ||
17 | * Sets brl_options to the parsed braille options. | ||
18 | */ | ||
19 | int | ||
13 | _braille_console_setup(char **str, char **brl_options); | 20 | _braille_console_setup(char **str, char **brl_options); |
14 | 21 | ||
15 | int | 22 | int |
@@ -25,10 +32,10 @@ braille_set_options(struct console_cmdline *c, char *brl_options) | |||
25 | { | 32 | { |
26 | } | 33 | } |
27 | 34 | ||
28 | static inline char * | 35 | static inline int |
29 | _braille_console_setup(char **str, char **brl_options) | 36 | _braille_console_setup(char **str, char **brl_options) |
30 | { | 37 | { |
31 | return NULL; | 38 | return 0; |
32 | } | 39 | } |
33 | 40 | ||
34 | static inline int | 41 | static inline int |
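With _braille_console_setup() now returning an int rather than a char *, a "brl=" option given without a port name can be reported to the caller instead of being silently ignored. A minimal kernel-context sketch of a caller, assuming only the braille.h declarations above; example_console_setup() is a hypothetical function, not the actual printk call site:

static int example_console_setup(char *str)
{
	char *brl_options = NULL;

	if (_braille_console_setup(&str, &brl_options))
		return -EINVAL;	/* "brl=" was given without a port name */

	/*
	 * On success, str points at the serial options and brl_options
	 * either points at the parsed braille options or is still NULL
	 * when no braille option was given.
	 */
	return 0;
}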
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2984fb0f0257..a1aecf44ab07 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -32,7 +32,7 @@ | |||
32 | #include <linux/bootmem.h> | 32 | #include <linux/bootmem.h> |
33 | #include <linux/memblock.h> | 33 | #include <linux/memblock.h> |
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/kexec.h> | 35 | #include <linux/crash_core.h> |
36 | #include <linux/kdb.h> | 36 | #include <linux/kdb.h> |
37 | #include <linux/ratelimit.h> | 37 | #include <linux/ratelimit.h> |
38 | #include <linux/kmsg_dump.h> | 38 | #include <linux/kmsg_dump.h> |
@@ -269,8 +269,8 @@ static struct console *exclusive_console; | |||
269 | #define MAX_CMDLINECONSOLES 8 | 269 | #define MAX_CMDLINECONSOLES 8 |
270 | 270 | ||
271 | static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; | 271 | static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; |
272 | static int console_cmdline_cnt; | ||
272 | 273 | ||
273 | static int selected_console = -1; | ||
274 | static int preferred_console = -1; | 274 | static int preferred_console = -1; |
275 | int console_set_on_cmdline; | 275 | int console_set_on_cmdline; |
276 | EXPORT_SYMBOL(console_set_on_cmdline); | 276 | EXPORT_SYMBOL(console_set_on_cmdline); |
@@ -1002,7 +1002,7 @@ const struct file_operations kmsg_fops = { | |||
1002 | .release = devkmsg_release, | 1002 | .release = devkmsg_release, |
1003 | }; | 1003 | }; |
1004 | 1004 | ||
1005 | #ifdef CONFIG_KEXEC_CORE | 1005 | #ifdef CONFIG_CRASH_CORE |
1006 | /* | 1006 | /* |
1007 | * This appends the listed symbols to /proc/vmcore | 1007 | * This appends the listed symbols to /proc/vmcore |
1008 | * | 1008 | * |
@@ -1011,7 +1011,7 @@ const struct file_operations kmsg_fops = { | |||
1011 | * symbols are specifically used so that utilities can access and extract the | 1011 | * symbols are specifically used so that utilities can access and extract the |
1012 | * dmesg log from a vmcore file after a crash. | 1012 | * dmesg log from a vmcore file after a crash. |
1013 | */ | 1013 | */ |
1014 | void log_buf_kexec_setup(void) | 1014 | void log_buf_vmcoreinfo_setup(void) |
1015 | { | 1015 | { |
1016 | VMCOREINFO_SYMBOL(log_buf); | 1016 | VMCOREINFO_SYMBOL(log_buf); |
1017 | VMCOREINFO_SYMBOL(log_buf_len); | 1017 | VMCOREINFO_SYMBOL(log_buf_len); |
@@ -1906,24 +1906,38 @@ static int __add_preferred_console(char *name, int idx, char *options, | |||
1906 | * See if this tty is not yet registered, and | 1906 | * See if this tty is not yet registered, and |
1907 | * if we have a slot free. | 1907 | * if we have a slot free. |
1908 | */ | 1908 | */ |
1909 | for (i = 0, c = console_cmdline; | 1909 | for (i = 0, c = console_cmdline; i < console_cmdline_cnt; i++, c++) { |
1910 | i < MAX_CMDLINECONSOLES && c->name[0]; | ||
1911 | i++, c++) { | ||
1912 | if (strcmp(c->name, name) == 0 && c->index == idx) { | 1910 | if (strcmp(c->name, name) == 0 && c->index == idx) { |
1913 | if (!brl_options) | 1911 | if (brl_options) |
1914 | selected_console = i; | 1912 | return 0; |
1913 | |||
1914 | /* | ||
1915 | * Maintain an invariant that will help to find if | ||
1916 | * the matching console is preferred, see | ||
1917 | * register_console(): | ||
1918 | * | ||
1919 | * The last non-braille console is always | ||
1920 | * the preferred one. | ||
1921 | */ | ||
1922 | if (i != console_cmdline_cnt - 1) | ||
1923 | swap(console_cmdline[i], | ||
1924 | console_cmdline[console_cmdline_cnt - 1]); | ||
1925 | |||
1926 | preferred_console = console_cmdline_cnt - 1; | ||
1927 | |||
1915 | return 0; | 1928 | return 0; |
1916 | } | 1929 | } |
1917 | } | 1930 | } |
1918 | if (i == MAX_CMDLINECONSOLES) | 1931 | if (i == MAX_CMDLINECONSOLES) |
1919 | return -E2BIG; | 1932 | return -E2BIG; |
1920 | if (!brl_options) | 1933 | if (!brl_options) |
1921 | selected_console = i; | 1934 | preferred_console = i; |
1922 | strlcpy(c->name, name, sizeof(c->name)); | 1935 | strlcpy(c->name, name, sizeof(c->name)); |
1923 | c->options = options; | 1936 | c->options = options; |
1924 | braille_set_options(c, brl_options); | 1937 | braille_set_options(c, brl_options); |
1925 | 1938 | ||
1926 | c->index = idx; | 1939 | c->index = idx; |
1940 | console_cmdline_cnt++; | ||
1927 | return 0; | 1941 | return 0; |
1928 | } | 1942 | } |
1929 | /* | 1943 | /* |
@@ -2031,15 +2045,16 @@ void resume_console(void) | |||
2031 | * @cpu: unused | 2045 | * @cpu: unused |
2032 | * | 2046 | * |
2033 | * If printk() is called from a CPU that is not online yet, the messages | 2047 | * If printk() is called from a CPU that is not online yet, the messages |
2034 | * will be spooled but will not show up on the console. This function is | 2048 | * will be printed on the console only if there are CON_ANYTIME consoles. |
2035 | * called when a new CPU comes online (or fails to come up), and ensures | 2049 | * This function is called when a new CPU comes online (or fails to come |
2036 | * that any such output gets printed. | 2050 | * up) or goes offline. |
2037 | */ | 2051 | */ |
2038 | static int console_cpu_notify(unsigned int cpu) | 2052 | static int console_cpu_notify(unsigned int cpu) |
2039 | { | 2053 | { |
2040 | if (!cpuhp_tasks_frozen) { | 2054 | if (!cpuhp_tasks_frozen) { |
2041 | console_lock(); | 2055 | /* If trylock fails, someone else is doing the printing */ |
2042 | console_unlock(); | 2056 | if (console_trylock()) |
2057 | console_unlock(); | ||
2043 | } | 2058 | } |
2044 | return 0; | 2059 | return 0; |
2045 | } | 2060 | } |
@@ -2161,7 +2176,7 @@ void console_unlock(void) | |||
2161 | } | 2176 | } |
2162 | 2177 | ||
2163 | /* | 2178 | /* |
2164 | * Console drivers are called under logbuf_lock, so | 2179 | * Console drivers are called with interrupts disabled, so |
2165 | * @console_may_schedule should be cleared before; however, we may | 2180 | * @console_may_schedule should be cleared before; however, we may |
2166 | * end up dumping a lot of lines, for example, if called from | 2181 | * end up dumping a lot of lines, for example, if called from |
2167 | * console registration path, and should invoke cond_resched() | 2182 | * console registration path, and should invoke cond_resched() |
@@ -2169,11 +2184,15 @@ void console_unlock(void) | |||
2169 | * scheduling stall on a slow console leading to RCU stall and | 2184 | * scheduling stall on a slow console leading to RCU stall and |
2170 | * softlockup warnings which exacerbate the issue with more | 2185 | * softlockup warnings which exacerbate the issue with more |
2171 | * messages practically incapacitating the system. | 2186 | * messages practically incapacitating the system. |
2187 | * | ||
2188 | * console_trylock() is not able to detect the preemptive | ||
2189 | * context reliably. Therefore the value must be stored before | ||
2190 | * and cleared after the the "again" goto label. | ||
2172 | */ | 2191 | */ |
2173 | do_cond_resched = console_may_schedule; | 2192 | do_cond_resched = console_may_schedule; |
2193 | again: | ||
2174 | console_may_schedule = 0; | 2194 | console_may_schedule = 0; |
2175 | 2195 | ||
2176 | again: | ||
2177 | /* | 2196 | /* |
2178 | * We released the console_sem lock, so we need to recheck if | 2197 | * We released the console_sem lock, so we need to recheck if |
2179 | * cpu is online and (if not) is there at least one CON_ANYTIME | 2198 | * cpu is online and (if not) is there at least one CON_ANYTIME |
@@ -2409,6 +2428,7 @@ void register_console(struct console *newcon) | |||
2409 | unsigned long flags; | 2428 | unsigned long flags; |
2410 | struct console *bcon = NULL; | 2429 | struct console *bcon = NULL; |
2411 | struct console_cmdline *c; | 2430 | struct console_cmdline *c; |
2431 | static bool has_preferred; | ||
2412 | 2432 | ||
2413 | if (console_drivers) | 2433 | if (console_drivers) |
2414 | for_each_console(bcon) | 2434 | for_each_console(bcon) |
@@ -2435,15 +2455,15 @@ void register_console(struct console *newcon) | |||
2435 | if (console_drivers && console_drivers->flags & CON_BOOT) | 2455 | if (console_drivers && console_drivers->flags & CON_BOOT) |
2436 | bcon = console_drivers; | 2456 | bcon = console_drivers; |
2437 | 2457 | ||
2438 | if (preferred_console < 0 || bcon || !console_drivers) | 2458 | if (!has_preferred || bcon || !console_drivers) |
2439 | preferred_console = selected_console; | 2459 | has_preferred = preferred_console >= 0; |
2440 | 2460 | ||
2441 | /* | 2461 | /* |
2442 | * See if we want to use this console driver. If we | 2462 | * See if we want to use this console driver. If we |
2443 | * didn't select a console we take the first one | 2463 | * didn't select a console we take the first one |
2444 | * that registers here. | 2464 | * that registers here. |
2445 | */ | 2465 | */ |
2446 | if (preferred_console < 0) { | 2466 | if (!has_preferred) { |
2447 | if (newcon->index < 0) | 2467 | if (newcon->index < 0) |
2448 | newcon->index = 0; | 2468 | newcon->index = 0; |
2449 | if (newcon->setup == NULL || | 2469 | if (newcon->setup == NULL || |
@@ -2451,18 +2471,29 @@ void register_console(struct console *newcon) | |||
2451 | newcon->flags |= CON_ENABLED; | 2471 | newcon->flags |= CON_ENABLED; |
2452 | if (newcon->device) { | 2472 | if (newcon->device) { |
2453 | newcon->flags |= CON_CONSDEV; | 2473 | newcon->flags |= CON_CONSDEV; |
2454 | preferred_console = 0; | 2474 | has_preferred = true; |
2455 | } | 2475 | } |
2456 | } | 2476 | } |
2457 | } | 2477 | } |
2458 | 2478 | ||
2459 | /* | 2479 | /* |
2460 | * See if this console matches one we selected on | 2480 | * See if this console matches one we selected on the command line. |
2461 | * the command line. | 2481 | * |
2482 | * There may be several entries in the console_cmdline array matching | ||
2483 | * with the same console, one with newcon->match(), another by | ||
2484 | * name/index: | ||
2485 | * | ||
2486 | * pl011,mmio,0x87e024000000,115200 -- added from SPCR | ||
2487 | * ttyAMA0 -- added from command line | ||
2488 | * | ||
2489 | * Traverse the console_cmdline array in reverse order to be | ||
2490 | * sure that if this console is preferred then it will be the first | ||
2491 | * matching entry. We use the invariant that is maintained in | ||
2492 | * __add_preferred_console(). | ||
2462 | */ | 2493 | */ |
2463 | for (i = 0, c = console_cmdline; | 2494 | for (i = console_cmdline_cnt - 1; i >= 0; i--) { |
2464 | i < MAX_CMDLINECONSOLES && c->name[0]; | 2495 | c = console_cmdline + i; |
2465 | i++, c++) { | 2496 | |
2466 | if (!newcon->match || | 2497 | if (!newcon->match || |
2467 | newcon->match(newcon, c->name, c->index, c->options) != 0) { | 2498 | newcon->match(newcon, c->name, c->index, c->options) != 0) { |
2468 | /* default matching */ | 2499 | /* default matching */ |
@@ -2484,9 +2515,9 @@ void register_console(struct console *newcon) | |||
2484 | } | 2515 | } |
2485 | 2516 | ||
2486 | newcon->flags |= CON_ENABLED; | 2517 | newcon->flags |= CON_ENABLED; |
2487 | if (i == selected_console) { | 2518 | if (i == preferred_console) { |
2488 | newcon->flags |= CON_CONSDEV; | 2519 | newcon->flags |= CON_CONSDEV; |
2489 | preferred_console = selected_console; | 2520 | has_preferred = true; |
2490 | } | 2521 | } |
2491 | break; | 2522 | break; |
2492 | } | 2523 | } |
@@ -2611,6 +2642,30 @@ int unregister_console(struct console *console) | |||
2611 | EXPORT_SYMBOL(unregister_console); | 2642 | EXPORT_SYMBOL(unregister_console); |
2612 | 2643 | ||
2613 | /* | 2644 | /* |
2645 | * Initialize the console device. This is called *early*, so | ||
2646 | * we can't necessarily depend on lots of kernel help here. | ||
2647 | * Just do some early initializations, and do the complex setup | ||
2648 | * later. | ||
2649 | */ | ||
2650 | void __init console_init(void) | ||
2651 | { | ||
2652 | initcall_t *call; | ||
2653 | |||
2654 | /* Setup the default TTY line discipline. */ | ||
2655 | n_tty_init(); | ||
2656 | |||
2657 | /* | ||
2658 | * set up the console device so that later boot sequences can | ||
2659 | * inform about problems etc.. | ||
2660 | */ | ||
2661 | call = __con_initcall_start; | ||
2662 | while (call < __con_initcall_end) { | ||
2663 | (*call)(); | ||
2664 | call++; | ||
2665 | } | ||
2666 | } | ||
2667 | |||
2668 | /* | ||
2614 | * Some boot consoles access data that is in the init section and which will | 2669 | * Some boot consoles access data that is in the init section and which will |
2615 | * be discarded after the initcalls have been run. To make sure that no code | 2670 | * be discarded after the initcalls have been run. To make sure that no code |
2616 | * will access this data, unregister the boot consoles in a late initcall. | 2671 | * will access this data, unregister the boot consoles in a late initcall. |
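The console_cmdline[] changes above depend on a single invariant: when a console is added again, it is swapped to the end of the array, so the last non-braille entry is always the preferred one and the reverse scan in register_console() meets it first. A standalone user-space model of just that invariant (names, sizes and the matching rule are simplified stand-ins, not the kernel code):

#include <stdio.h>
#include <string.h>

#define MAX_CONS 8

struct cmdline_model {
	char name[16];
	int idx;
};

static struct cmdline_model cons[MAX_CONS];
static int cons_cnt;
static int preferred = -1;

/* Mirrors the swap-to-the-end behaviour of __add_preferred_console(). */
static void model_add(const char *name, int idx)
{
	int i;

	for (i = 0; i < cons_cnt; i++) {
		if (!strcmp(cons[i].name, name) && cons[i].idx == idx) {
			struct cmdline_model tmp = cons[i];

			/* Re-added console: move it to the end, keep it preferred. */
			cons[i] = cons[cons_cnt - 1];
			cons[cons_cnt - 1] = tmp;
			preferred = cons_cnt - 1;
			return;
		}
	}
	if (cons_cnt == MAX_CONS)
		return;
	strcpy(cons[cons_cnt].name, name);
	cons[cons_cnt].idx = idx;
	preferred = cons_cnt++;
}

int main(void)
{
	int i;

	model_add("pl011", 0);		/* e.g. added from SPCR */
	model_add("ttyAMA", 0);		/* added from the command line */
	model_add("pl011", 0);		/* re-added: swapped to the end */

	/* Reverse scan, as register_console() now does. */
	for (i = cons_cnt - 1; i >= 0; i--)
		printf("%s%d%s\n", cons[i].name, cons[i].idx,
		       i == preferred ? " (preferred)" : "");
	return 0;
}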
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 18dfc485225c..23803c7d5180 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile | |||
@@ -3,10 +3,13 @@ | |||
3 | KCOV_INSTRUMENT := n | 3 | KCOV_INSTRUMENT := n |
4 | 4 | ||
5 | obj-y += update.o sync.o | 5 | obj-y += update.o sync.o |
6 | obj-$(CONFIG_SRCU) += srcu.o | 6 | obj-$(CONFIG_CLASSIC_SRCU) += srcu.o |
7 | obj-$(CONFIG_TREE_SRCU) += srcutree.o | ||
8 | obj-$(CONFIG_TINY_SRCU) += srcutiny.o | ||
7 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 9 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
8 | obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o | 10 | obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o |
9 | obj-$(CONFIG_TREE_RCU) += tree.o | 11 | obj-$(CONFIG_TREE_RCU) += tree.o |
10 | obj-$(CONFIG_PREEMPT_RCU) += tree.o | 12 | obj-$(CONFIG_PREEMPT_RCU) += tree.o |
11 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o | 13 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o |
12 | obj-$(CONFIG_TINY_RCU) += tiny.o | 14 | obj-$(CONFIG_TINY_RCU) += tiny.o |
15 | obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o | ||
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0d6ff3e471be..73e16ec4054b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
@@ -56,6 +56,83 @@ | |||
56 | #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ | 56 | #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ |
57 | DYNTICK_TASK_FLAG) | 57 | DYNTICK_TASK_FLAG) |
58 | 58 | ||
59 | |||
60 | /* | ||
61 | * Grace-period counter management. | ||
62 | */ | ||
63 | |||
64 | #define RCU_SEQ_CTR_SHIFT 2 | ||
65 | #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) | ||
66 | |||
67 | /* | ||
68 | * Return the counter portion of a sequence number previously returned | ||
69 | * by rcu_seq_snap() or rcu_seq_current(). | ||
70 | */ | ||
71 | static inline unsigned long rcu_seq_ctr(unsigned long s) | ||
72 | { | ||
73 | return s >> RCU_SEQ_CTR_SHIFT; | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * Return the state portion of a sequence number previously returned | ||
78 | * by rcu_seq_snap() or rcu_seq_current(). | ||
79 | */ | ||
80 | static inline int rcu_seq_state(unsigned long s) | ||
81 | { | ||
82 | return s & RCU_SEQ_STATE_MASK; | ||
83 | } | ||
84 | |||
85 | /* | ||
86 | * Set the state portion of the pointed-to sequence number. | ||
87 | * The caller is responsible for preventing conflicting updates. | ||
88 | */ | ||
89 | static inline void rcu_seq_set_state(unsigned long *sp, int newstate) | ||
90 | { | ||
91 | WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK); | ||
92 | WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate); | ||
93 | } | ||
94 | |||
95 | /* Adjust sequence number for start of update-side operation. */ | ||
96 | static inline void rcu_seq_start(unsigned long *sp) | ||
97 | { | ||
98 | WRITE_ONCE(*sp, *sp + 1); | ||
99 | smp_mb(); /* Ensure update-side operation after counter increment. */ | ||
100 | WARN_ON_ONCE(rcu_seq_state(*sp) != 1); | ||
101 | } | ||
102 | |||
103 | /* Adjust sequence number for end of update-side operation. */ | ||
104 | static inline void rcu_seq_end(unsigned long *sp) | ||
105 | { | ||
106 | smp_mb(); /* Ensure update-side operation before counter increment. */ | ||
107 | WARN_ON_ONCE(!rcu_seq_state(*sp)); | ||
108 | WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); | ||
109 | } | ||
110 | |||
111 | /* Take a snapshot of the update side's sequence number. */ | ||
112 | static inline unsigned long rcu_seq_snap(unsigned long *sp) | ||
113 | { | ||
114 | unsigned long s; | ||
115 | |||
116 | s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK; | ||
117 | smp_mb(); /* Above access must not bleed into critical section. */ | ||
118 | return s; | ||
119 | } | ||
120 | |||
121 | /* Return the current value of the update side's sequence number, no ordering. */ | ||
122 | static inline unsigned long rcu_seq_current(unsigned long *sp) | ||
123 | { | ||
124 | return READ_ONCE(*sp); | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * Given a snapshot from rcu_seq_snap(), determine whether or not a | ||
129 | * full update-side operation has occurred. | ||
130 | */ | ||
131 | static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) | ||
132 | { | ||
133 | return ULONG_CMP_GE(READ_ONCE(*sp), s); | ||
134 | } | ||
135 | |||
59 | /* | 136 | /* |
60 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally | 137 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally |
61 | * by call_rcu() and rcu callback execution, and are therefore not part of the | 138 | * by call_rcu() and rcu callback execution, and are therefore not part of the |
@@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) | |||
109 | 186 | ||
110 | rcu_lock_acquire(&rcu_callback_map); | 187 | rcu_lock_acquire(&rcu_callback_map); |
111 | if (__is_kfree_rcu_offset(offset)) { | 188 | if (__is_kfree_rcu_offset(offset)) { |
112 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); | 189 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) |
113 | kfree((void *)head - offset); | 190 | kfree((void *)head - offset); |
114 | rcu_lock_release(&rcu_callback_map); | 191 | rcu_lock_release(&rcu_callback_map); |
115 | return true; | 192 | return true; |
116 | } else { | 193 | } else { |
117 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); | 194 | RCU_TRACE(trace_rcu_invoke_callback(rn, head);) |
118 | head->func(head); | 195 | head->func(head); |
119 | rcu_lock_release(&rcu_callback_map); | 196 | rcu_lock_release(&rcu_callback_map); |
120 | return false; | 197 | return false; |
@@ -144,4 +221,76 @@ void rcu_test_sync_prims(void); | |||
144 | */ | 221 | */ |
145 | extern void resched_cpu(int cpu); | 222 | extern void resched_cpu(int cpu); |
146 | 223 | ||
224 | #if defined(SRCU) || !defined(TINY_RCU) | ||
225 | |||
226 | #include <linux/rcu_node_tree.h> | ||
227 | |||
228 | extern int rcu_num_lvls; | ||
229 | extern int num_rcu_lvl[]; | ||
230 | extern int rcu_num_nodes; | ||
231 | static bool rcu_fanout_exact; | ||
232 | static int rcu_fanout_leaf; | ||
233 | |||
234 | /* | ||
235 | * Compute the per-level fanout, either using the exact fanout specified | ||
236 | * or balancing the tree, depending on the rcu_fanout_exact boot parameter. | ||
237 | */ | ||
238 | static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) | ||
239 | { | ||
240 | int i; | ||
241 | |||
242 | if (rcu_fanout_exact) { | ||
243 | levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | ||
244 | for (i = rcu_num_lvls - 2; i >= 0; i--) | ||
245 | levelspread[i] = RCU_FANOUT; | ||
246 | } else { | ||
247 | int ccur; | ||
248 | int cprv; | ||
249 | |||
250 | cprv = nr_cpu_ids; | ||
251 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | ||
252 | ccur = levelcnt[i]; | ||
253 | levelspread[i] = (cprv + ccur - 1) / ccur; | ||
254 | cprv = ccur; | ||
255 | } | ||
256 | } | ||
257 | } | ||
258 | |||
259 | /* | ||
260 | * Do a full breadth-first scan of the rcu_node structures for the | ||
261 | * specified rcu_state structure. | ||
262 | */ | ||
263 | #define rcu_for_each_node_breadth_first(rsp, rnp) \ | ||
264 | for ((rnp) = &(rsp)->node[0]; \ | ||
265 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
266 | |||
267 | /* | ||
268 | * Do a breadth-first scan of the non-leaf rcu_node structures for the | ||
269 | * specified rcu_state structure. Note that if there is a singleton | ||
270 | * rcu_node tree with but one rcu_node structure, this loop is a no-op. | ||
271 | */ | ||
272 | #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ | ||
273 | for ((rnp) = &(rsp)->node[0]; \ | ||
274 | (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) | ||
275 | |||
276 | /* | ||
277 | * Scan the leaves of the rcu_node hierarchy for the specified rcu_state | ||
278 | * structure. Note that if there is a singleton rcu_node tree with but | ||
279 | * one rcu_node structure, this loop -will- visit the rcu_node structure. | ||
280 | * It is still a leaf node, even if it is also the root node. | ||
281 | */ | ||
282 | #define rcu_for_each_leaf_node(rsp, rnp) \ | ||
283 | for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ | ||
284 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
285 | |||
286 | /* | ||
287 | * Iterate over all possible CPUs in a leaf RCU node. | ||
288 | */ | ||
289 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ | ||
290 | for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ | ||
291 | cpu <= rnp->grphi; \ | ||
292 | cpu = cpumask_next((cpu), cpu_possible_mask)) | ||
293 | |||
294 | #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ | ||
295 | |||
147 | #endif /* __LINUX_RCU_H */ | 296 | #endif /* __LINUX_RCU_H */ |
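The rcu_seq_*() helpers above pack a grace-period phase into the low RCU_SEQ_CTR_SHIFT bits and a counter into the remaining bits. A standalone user-space model of the arithmetic only; the memory barriers and READ_ONCE()/WRITE_ONCE() of the real helpers are deliberately left out, so this illustrates the encoding, not the ordering guarantees:

#include <stdio.h>

#define RCU_SEQ_CTR_SHIFT	2
#define RCU_SEQ_STATE_MASK	((1UL << RCU_SEQ_CTR_SHIFT) - 1)

/* Round up to the end of the next full grace period (rcu_seq_snap()). */
static unsigned long seq_snap(unsigned long s)
{
	return (s + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
}

/* Has a full grace period elapsed since the snapshot? (rcu_seq_done()). */
static int seq_done(unsigned long s, unsigned long snap)
{
	return s >= snap;	/* the kernel uses ULONG_CMP_GE() to tolerate wrap */
}

int main(void)
{
	unsigned long gp_seq = 0;	/* idle: state bits are zero */
	unsigned long snap = seq_snap(gp_seq);

	printf("snap=%lu done=%d\n", snap, seq_done(gp_seq, snap));

	gp_seq++;			/* rcu_seq_start(): state becomes 1 */
	printf("ctr=%lu state=%lu\n",
	       gp_seq >> RCU_SEQ_CTR_SHIFT, gp_seq & RCU_SEQ_STATE_MASK);

	gp_seq = (gp_seq | RCU_SEQ_STATE_MASK) + 1;	/* rcu_seq_end() */
	printf("ctr=%lu state=%lu done=%d\n",
	       gp_seq >> RCU_SEQ_CTR_SHIFT, gp_seq & RCU_SEQ_STATE_MASK,
	       seq_done(gp_seq, snap));
	return 0;
}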
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c new file mode 100644 index 000000000000..2b62a38b080f --- /dev/null +++ b/kernel/rcu/rcu_segcblist.c | |||
@@ -0,0 +1,505 @@ | |||
1 | /* | ||
2 | * RCU segmented callback lists, function definitions | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, you can access it online at | ||
16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2017 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
21 | */ | ||
22 | |||
23 | #include <linux/types.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | |||
27 | #include "rcu_segcblist.h" | ||
28 | |||
29 | /* Initialize simple callback list. */ | ||
30 | void rcu_cblist_init(struct rcu_cblist *rclp) | ||
31 | { | ||
32 | rclp->head = NULL; | ||
33 | rclp->tail = &rclp->head; | ||
34 | rclp->len = 0; | ||
35 | rclp->len_lazy = 0; | ||
36 | } | ||
37 | |||
38 | /* | ||
39 | * Debug function to actually count the number of callbacks. | ||
40 | * If the number exceeds the limit specified, return -1. | ||
41 | */ | ||
42 | long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) | ||
43 | { | ||
44 | int cnt = 0; | ||
45 | struct rcu_head **rhpp = &rclp->head; | ||
46 | |||
47 | for (;;) { | ||
48 | if (!*rhpp) | ||
49 | return cnt; | ||
50 | if (++cnt > lim) | ||
51 | return -1; | ||
52 | rhpp = &(*rhpp)->next; | ||
53 | } | ||
54 | } | ||
55 | |||
56 | /* | ||
57 | * Dequeue the oldest rcu_head structure from the specified callback | ||
58 | * list. This function assumes that the callback is non-lazy, but | ||
59 | * the caller can later invoke rcu_cblist_dequeued_lazy() if it | ||
60 | * finds otherwise (and if it cares about laziness). This allows | ||
61 | * different users to have different ways of determining laziness. | ||
62 | */ | ||
63 | struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) | ||
64 | { | ||
65 | struct rcu_head *rhp; | ||
66 | |||
67 | rhp = rclp->head; | ||
68 | if (!rhp) | ||
69 | return NULL; | ||
70 | rclp->len--; | ||
71 | rclp->head = rhp->next; | ||
72 | if (!rclp->head) | ||
73 | rclp->tail = &rclp->head; | ||
74 | return rhp; | ||
75 | } | ||
76 | |||
77 | /* | ||
78 | * Initialize an rcu_segcblist structure. | ||
79 | */ | ||
80 | void rcu_segcblist_init(struct rcu_segcblist *rsclp) | ||
81 | { | ||
82 | int i; | ||
83 | |||
84 | BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); | ||
85 | BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); | ||
86 | rsclp->head = NULL; | ||
87 | for (i = 0; i < RCU_CBLIST_NSEGS; i++) | ||
88 | rsclp->tails[i] = &rsclp->head; | ||
89 | rsclp->len = 0; | ||
90 | rsclp->len_lazy = 0; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * Disable the specified rcu_segcblist structure, so that callbacks can | ||
95 | * no longer be posted to it. This structure must be empty. | ||
96 | */ | ||
97 | void rcu_segcblist_disable(struct rcu_segcblist *rsclp) | ||
98 | { | ||
99 | WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); | ||
100 | WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); | ||
101 | WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); | ||
102 | rsclp->tails[RCU_NEXT_TAIL] = NULL; | ||
103 | } | ||
104 | |||
105 | /* | ||
106 | * Is the specified segment of the specified rcu_segcblist structure | ||
107 | * empty of callbacks? | ||
108 | */ | ||
109 | bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) | ||
110 | { | ||
111 | if (seg == RCU_DONE_TAIL) | ||
112 | return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; | ||
113 | return rsclp->tails[seg - 1] == rsclp->tails[seg]; | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * Does the specified rcu_segcblist structure contain callbacks that | ||
118 | * are ready to be invoked? | ||
119 | */ | ||
120 | bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) | ||
121 | { | ||
122 | return rcu_segcblist_is_enabled(rsclp) && | ||
123 | &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * Does the specified rcu_segcblist structure contain callbacks that | ||
128 | * are still pending, that is, not yet ready to be invoked? | ||
129 | */ | ||
130 | bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) | ||
131 | { | ||
132 | return rcu_segcblist_is_enabled(rsclp) && | ||
133 | !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); | ||
134 | } | ||
135 | |||
136 | /* | ||
137 | * Dequeue and return the first ready-to-invoke callback. If there | ||
138 | * are no ready-to-invoke callbacks, return NULL. Disables interrupts | ||
139 | * to avoid interference. Does not protect from interference from other | ||
140 | * CPUs or tasks. | ||
141 | */ | ||
142 | struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) | ||
143 | { | ||
144 | unsigned long flags; | ||
145 | int i; | ||
146 | struct rcu_head *rhp; | ||
147 | |||
148 | local_irq_save(flags); | ||
149 | if (!rcu_segcblist_ready_cbs(rsclp)) { | ||
150 | local_irq_restore(flags); | ||
151 | return NULL; | ||
152 | } | ||
153 | rhp = rsclp->head; | ||
154 | BUG_ON(!rhp); | ||
155 | rsclp->head = rhp->next; | ||
156 | for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { | ||
157 | if (rsclp->tails[i] != &rhp->next) | ||
158 | break; | ||
159 | rsclp->tails[i] = &rsclp->head; | ||
160 | } | ||
161 | smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ | ||
162 | WRITE_ONCE(rsclp->len, rsclp->len - 1); | ||
163 | local_irq_restore(flags); | ||
164 | return rhp; | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * Account for the fact that a previously dequeued callback turned out | ||
169 | * to be marked as lazy. | ||
170 | */ | ||
171 | void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) | ||
172 | { | ||
173 | unsigned long flags; | ||
174 | |||
175 | local_irq_save(flags); | ||
176 | rsclp->len_lazy--; | ||
177 | local_irq_restore(flags); | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * Return a pointer to the first callback in the specified rcu_segcblist | ||
182 | * structure. This is useful for diagnostics. | ||
183 | */ | ||
184 | struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) | ||
185 | { | ||
186 | if (rcu_segcblist_is_enabled(rsclp)) | ||
187 | return rsclp->head; | ||
188 | return NULL; | ||
189 | } | ||
190 | |||
191 | /* | ||
192 | * Return a pointer to the first pending callback in the specified | ||
193 | * rcu_segcblist structure. This is useful just after posting a given | ||
194 | * callback -- if that callback is the first pending callback, then | ||
195 | * you cannot rely on someone else having already started up the required | ||
196 | * grace period. | ||
197 | */ | ||
198 | struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) | ||
199 | { | ||
200 | if (rcu_segcblist_is_enabled(rsclp)) | ||
201 | return *rsclp->tails[RCU_DONE_TAIL]; | ||
202 | return NULL; | ||
203 | } | ||
204 | |||
205 | /* | ||
206 | * Does the specified rcu_segcblist structure contain callbacks that | ||
207 | * have not yet been processed beyond having been posted, that is, | ||
208 | * does it contain callbacks in its last segment? | ||
209 | */ | ||
210 | bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) | ||
211 | { | ||
212 | return rcu_segcblist_is_enabled(rsclp) && | ||
213 | !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); | ||
214 | } | ||
215 | |||
216 | /* | ||
217 | * Enqueue the specified callback onto the specified rcu_segcblist | ||
218 | * structure, updating accounting as needed. Note that the ->len | ||
219 | * field may be accessed locklessly, hence the WRITE_ONCE(). | ||
220 | * The ->len field is used by rcu_barrier() and friends to determine | ||
221 | * if it must post a callback on this structure, and it is OK | ||
222 | * for rcu_barrier() to sometimes post callbacks needlessly, but | ||
223 | * absolutely not OK for it to ever miss posting a callback. | ||
224 | */ | ||
225 | void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, | ||
226 | struct rcu_head *rhp, bool lazy) | ||
227 | { | ||
228 | WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ | ||
229 | if (lazy) | ||
230 | rsclp->len_lazy++; | ||
231 | smp_mb(); /* Ensure counts are updated before callback is enqueued. */ | ||
232 | rhp->next = NULL; | ||
233 | *rsclp->tails[RCU_NEXT_TAIL] = rhp; | ||
234 | rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * Entrain the specified callback onto the specified rcu_segcblist at | ||
239 | * the end of the last non-empty segment. If the entire rcu_segcblist | ||
240 | * is empty, make no change, but return false. | ||
241 | * | ||
242 | * This is intended for use by rcu_barrier()-like primitives, -not- | ||
243 | * for normal grace-period use. IMPORTANT: The callback you enqueue | ||
244 | * will wait for all prior callbacks, NOT necessarily for a grace | ||
245 | * period. You have been warned. | ||
246 | */ | ||
247 | bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, | ||
248 | struct rcu_head *rhp, bool lazy) | ||
249 | { | ||
250 | int i; | ||
251 | |||
252 | if (rcu_segcblist_n_cbs(rsclp) == 0) | ||
253 | return false; | ||
254 | WRITE_ONCE(rsclp->len, rsclp->len + 1); | ||
255 | if (lazy) | ||
256 | rsclp->len_lazy++; | ||
257 | smp_mb(); /* Ensure counts are updated before callback is entrained. */ | ||
258 | rhp->next = NULL; | ||
259 | for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) | ||
260 | if (rsclp->tails[i] != rsclp->tails[i - 1]) | ||
261 | break; | ||
262 | *rsclp->tails[i] = rhp; | ||
263 | for (; i <= RCU_NEXT_TAIL; i++) | ||
264 | rsclp->tails[i] = &rhp->next; | ||
265 | return true; | ||
266 | } | ||
267 | |||
268 | /* | ||
269 | * Extract only the counts from the specified rcu_segcblist structure, | ||
270 | * and place them in the specified rcu_cblist structure. This function | ||
271 | * supports both callback orphaning and invocation, hence the separation | ||
272 | * of counts and callbacks. (Callbacks ready for invocation must be | ||
273 | * orphaned and adopted separately from pending callbacks, but counts | ||
274 | * apply to all callbacks. Locking must be used to make sure that | ||
275 | * both orphaned-callbacks lists are consistent.) | ||
276 | */ | ||
277 | void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, | ||
278 | struct rcu_cblist *rclp) | ||
279 | { | ||
280 | rclp->len_lazy += rsclp->len_lazy; | ||
281 | rclp->len += rsclp->len; | ||
282 | rsclp->len_lazy = 0; | ||
283 | WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Extract only those callbacks ready to be invoked from the specified | ||
288 | * rcu_segcblist structure and place them in the specified rcu_cblist | ||
289 | * structure. | ||
290 | */ | ||
291 | void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, | ||
292 | struct rcu_cblist *rclp) | ||
293 | { | ||
294 | int i; | ||
295 | |||
296 | if (!rcu_segcblist_ready_cbs(rsclp)) | ||
297 | return; /* Nothing to do. */ | ||
298 | *rclp->tail = rsclp->head; | ||
299 | rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; | ||
300 | *rsclp->tails[RCU_DONE_TAIL] = NULL; | ||
301 | rclp->tail = rsclp->tails[RCU_DONE_TAIL]; | ||
302 | for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) | ||
303 | if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) | ||
304 | rsclp->tails[i] = &rsclp->head; | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * Extract only those callbacks still pending (not yet ready to be | ||
309 | * invoked) from the specified rcu_segcblist structure and place them in | ||
310 | * the specified rcu_cblist structure. Note that this loses information | ||
311 | * about any callbacks that might have been partway done waiting for | ||
312 | * their grace period. Too bad! They will have to start over. | ||
313 | */ | ||
314 | void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, | ||
315 | struct rcu_cblist *rclp) | ||
316 | { | ||
317 | int i; | ||
318 | |||
319 | if (!rcu_segcblist_pend_cbs(rsclp)) | ||
320 | return; /* Nothing to do. */ | ||
321 | *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; | ||
322 | rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; | ||
323 | *rsclp->tails[RCU_DONE_TAIL] = NULL; | ||
324 | for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) | ||
325 | rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; | ||
326 | } | ||
327 | |||
328 | /* | ||
329 | * Insert counts from the specified rcu_cblist structure in the | ||
330 | * specified rcu_segcblist structure. | ||
331 | */ | ||
332 | void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, | ||
333 | struct rcu_cblist *rclp) | ||
334 | { | ||
335 | rsclp->len_lazy += rclp->len_lazy; | ||
336 | /* ->len sampled locklessly. */ | ||
337 | WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); | ||
338 | rclp->len_lazy = 0; | ||
339 | rclp->len = 0; | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * Move callbacks from the specified rcu_cblist to the beginning of the | ||
344 | * done-callbacks segment of the specified rcu_segcblist. | ||
345 | */ | ||
346 | void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, | ||
347 | struct rcu_cblist *rclp) | ||
348 | { | ||
349 | int i; | ||
350 | |||
351 | if (!rclp->head) | ||
352 | return; /* No callbacks to move. */ | ||
353 | *rclp->tail = rsclp->head; | ||
354 | rsclp->head = rclp->head; | ||
355 | for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) | ||
356 | if (&rsclp->head == rsclp->tails[i]) | ||
357 | rsclp->tails[i] = rclp->tail; | ||
358 | else | ||
359 | break; | ||
360 | rclp->head = NULL; | ||
361 | rclp->tail = &rclp->head; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Move callbacks from the specified rcu_cblist to the end of the | ||
366 | * new-callbacks segment of the specified rcu_segcblist. | ||
367 | */ | ||
368 | void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, | ||
369 | struct rcu_cblist *rclp) | ||
370 | { | ||
371 | if (!rclp->head) | ||
372 | return; /* Nothing to do. */ | ||
373 | *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; | ||
374 | rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; | ||
375 | rclp->head = NULL; | ||
376 | rclp->tail = &rclp->head; | ||
377 | } | ||
378 | |||
379 | /* | ||
380 | * Advance the callbacks in the specified rcu_segcblist structure based | ||
381 | * on the current value passed in for the grace-period counter. | ||
382 | */ | ||
383 | void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) | ||
384 | { | ||
385 | int i, j; | ||
386 | |||
387 | WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); | ||
388 | if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) | ||
389 | return; | ||
390 | |||
391 | /* | ||
392 | * Find all callbacks whose ->gp_seq numbers indicate that they | ||
393 | * are ready to invoke, and put them into the RCU_DONE_TAIL segment. | ||
394 | */ | ||
395 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { | ||
396 | if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) | ||
397 | break; | ||
398 | rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; | ||
399 | } | ||
400 | |||
401 | /* If no callbacks moved, nothing more need be done. */ | ||
402 | if (i == RCU_WAIT_TAIL) | ||
403 | return; | ||
404 | |||
405 | /* Clean up tail pointers that might have been misordered above. */ | ||
406 | for (j = RCU_WAIT_TAIL; j < i; j++) | ||
407 | rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; | ||
408 | |||
409 | /* | ||
410 | * Callbacks moved, so clean up the misordered ->tails[] pointers | ||
411 | * that now point into the middle of the list of ready-to-invoke | ||
412 | * callbacks. The overall effect is to copy down the later pointers | ||
413 | * into the gap that was created by the now-ready segments. | ||
414 | */ | ||
415 | for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { | ||
416 | if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) | ||
417 | break; /* No more callbacks. */ | ||
418 | rsclp->tails[j] = rsclp->tails[i]; | ||
419 | rsclp->gp_seq[j] = rsclp->gp_seq[i]; | ||
420 | } | ||
421 | } | ||
422 | |||
423 | /* | ||
424 | * "Accelerate" callbacks based on more-accurate grace-period information. | ||
425 | * The reason for this is that RCU does not synchronize the beginnings and | ||
426 | * ends of grace periods, and that callbacks are posted locally. This in | ||
427 | * turn means that the callbacks must be labelled conservatively early | ||
428 | * on, as getting exact information would degrade both performance and | ||
429 | * scalability. When more accurate grace-period information becomes | ||
430 | * available, previously posted callbacks can be "accelerated", marking | ||
431 | * them to complete at the end of the earlier grace period. | ||
432 | * | ||
433 | * This function operates on an rcu_segcblist structure, and also the | ||
434 | * grace-period sequence number seq at which new callbacks would become | ||
435 | * ready to invoke. Returns true if there are callbacks that won't be | ||
436 | * ready to invoke until seq, false otherwise. | ||
437 | */ | ||
438 | bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) | ||
439 | { | ||
440 | int i; | ||
441 | |||
442 | WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); | ||
443 | if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) | ||
444 | return false; | ||
445 | |||
446 | /* | ||
447 | * Find the segment preceding the oldest segment of callbacks | ||
448 | * whose ->gp_seq[] completion is at or after that passed in via | ||
449 | * "seq", skipping any empty segments. This oldest segment, along | ||
450 | * with any later segments, can be merged in with any newly arrived | ||
451 | * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" | ||
452 | * as their ->gp_seq[] grace-period completion sequence number. | ||
453 | */ | ||
454 | for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) | ||
455 | if (rsclp->tails[i] != rsclp->tails[i - 1] && | ||
456 | ULONG_CMP_LT(rsclp->gp_seq[i], seq)) | ||
457 | break; | ||
458 | |||
459 | /* | ||
460 | * If all the segments contain callbacks that correspond to | ||
461 | * earlier grace-period sequence numbers than "seq", leave. | ||
462 | * Assuming that the rcu_segcblist structure has enough | ||
463 | * segments in its arrays, this can only happen if some of | ||
464 | * the non-done segments contain callbacks that really are | ||
465 | * ready to invoke. This situation will get straightened | ||
466 | * out by the next call to rcu_segcblist_advance(). | ||
467 | * | ||
468 | * Also advance to the oldest segment of callbacks whose | ||
469 | * ->gp_seq[] completion is at or after that passed in via "seq", | ||
470 | * skipping any empty segments. | ||
471 | */ | ||
472 | if (++i >= RCU_NEXT_TAIL) | ||
473 | return false; | ||
474 | |||
475 | /* | ||
476 | * Merge all later callbacks, including newly arrived callbacks, | ||
477 | * into the segment located by the for-loop above. Assign "seq" | ||
478 | * as the ->gp_seq[] value in order to correctly handle the case | ||
479 | * where there were no pending callbacks in the rcu_segcblist | ||
480 | * structure other than in the RCU_NEXT_TAIL segment. | ||
481 | */ | ||
482 | for (; i < RCU_NEXT_TAIL; i++) { | ||
483 | rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; | ||
484 | rsclp->gp_seq[i] = seq; | ||
485 | } | ||
486 | return true; | ||
487 | } | ||
488 | |||
489 | /* | ||
490 | * Scan the specified rcu_segcblist structure for callbacks that need | ||
491 | * a grace period later than the one specified by "seq". We don't look | ||
492 | * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't | ||
493 | * have a grace-period sequence number. | ||
494 | */ | ||
495 | bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, | ||
496 | unsigned long seq) | ||
497 | { | ||
498 | int i; | ||
499 | |||
500 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) | ||
501 | if (rsclp->tails[i - 1] != rsclp->tails[i] && | ||
502 | ULONG_CMP_LT(seq, rsclp->gp_seq[i])) | ||
503 | return true; | ||
504 | return false; | ||
505 | } | ||
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h new file mode 100644 index 000000000000..6e36e36478cd --- /dev/null +++ b/kernel/rcu/rcu_segcblist.h | |||
@@ -0,0 +1,164 @@ | |||
1 | /* | ||
2 | * RCU segmented callback lists, internal-to-rcu header file | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, you can access it online at | ||
16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2017 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
21 | */ | ||
22 | |||
23 | #include <linux/rcu_segcblist.h> | ||
24 | |||
25 | /* | ||
26 | * Account for the fact that a previously dequeued callback turned out | ||
27 | * to be marked as lazy. | ||
28 | */ | ||
29 | static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) | ||
30 | { | ||
31 | rclp->len_lazy--; | ||
32 | } | ||
33 | |||
34 | /* | ||
35 | * Interim function to return rcu_cblist head pointer. Longer term, the | ||
36 | * rcu_cblist will be used more pervasively, removing the need for this | ||
37 | * function. | ||
38 | */ | ||
39 | static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) | ||
40 | { | ||
41 | return rclp->head; | ||
42 | } | ||
43 | |||
44 | /* | ||
45 | * Interim function to return rcu_cblist tail pointer. Longer term, the | ||
46 | * rcu_cblist will be used more pervasively, removing the need for this | ||
47 | * function. | ||
48 | */ | ||
49 | static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) | ||
50 | { | ||
51 | WARN_ON_ONCE(!rclp->head); | ||
52 | return rclp->tail; | ||
53 | } | ||
54 | |||
55 | void rcu_cblist_init(struct rcu_cblist *rclp); | ||
56 | long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim); | ||
57 | struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); | ||
58 | |||
59 | /* | ||
60 | * Is the specified rcu_segcblist structure empty? | ||
61 | * | ||
62 | * But careful! The fact that the ->head field is NULL does not | ||
63 | * necessarily imply that there are no callbacks associated with | ||
64 | * this structure. When callbacks are being invoked, they are | ||
65 | * removed as a group. If callback invocation must be preempted, | ||
66 | * the remaining callbacks will be added back to the list. Either | ||
67 | * way, the counts are updated later. | ||
68 | * | ||
69 | * So it is often the case that rcu_segcblist_n_cbs() should be used | ||
70 | * instead. | ||
71 | */ | ||
72 | static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) | ||
73 | { | ||
74 | return !rsclp->head; | ||
75 | } | ||
76 | |||
77 | /* Return number of callbacks in segmented callback list. */ | ||
78 | static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) | ||
79 | { | ||
80 | return READ_ONCE(rsclp->len); | ||
81 | } | ||
82 | |||
83 | /* Return number of lazy callbacks in segmented callback list. */ | ||
84 | static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) | ||
85 | { | ||
86 | return rsclp->len_lazy; | ||
87 | } | ||
88 | |||
89 | /* Return number of non-lazy callbacks in segmented callback list. */ | ||
90 | static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) | ||
91 | { | ||
92 | return rsclp->len - rsclp->len_lazy; | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * Is the specified rcu_segcblist enabled, for example, not corresponding | ||
97 | * to an offline or callback-offloaded CPU? | ||
98 | */ | ||
99 | static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) | ||
100 | { | ||
101 | return !!rsclp->tails[RCU_NEXT_TAIL]; | ||
102 | } | ||
103 | |||
104 | /* | ||
105 | * Are all segments following the specified segment of the specified | ||
106 | * rcu_segcblist structure empty of callbacks? (The specified | ||
107 | * segment might well contain callbacks.) | ||
108 | */ | ||
109 | static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) | ||
110 | { | ||
111 | return !*rsclp->tails[seg]; | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * Interim function to return rcu_segcblist head pointer. Longer term, the | ||
116 | * rcu_segcblist will be used more pervasively, removing the need for this | ||
117 | * function. | ||
118 | */ | ||
119 | static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) | ||
120 | { | ||
121 | return rsclp->head; | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Interim function to return rcu_segcblist tail pointer. Longer term, the | ||
126 | * rcu_segcblist will be used more pervasively, removing the need for this | ||
127 | * function. | ||
128 | */ | ||
129 | static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) | ||
130 | { | ||
131 | WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); | ||
132 | return rsclp->tails[RCU_NEXT_TAIL]; | ||
133 | } | ||
134 | |||
135 | void rcu_segcblist_init(struct rcu_segcblist *rsclp); | ||
136 | void rcu_segcblist_disable(struct rcu_segcblist *rsclp); | ||
137 | bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg); | ||
138 | bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); | ||
139 | bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); | ||
140 | struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp); | ||
141 | void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp); | ||
142 | struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); | ||
143 | struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); | ||
144 | bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp); | ||
145 | void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, | ||
146 | struct rcu_head *rhp, bool lazy); | ||
147 | bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, | ||
148 | struct rcu_head *rhp, bool lazy); | ||
149 | void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, | ||
150 | struct rcu_cblist *rclp); | ||
151 | void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, | ||
152 | struct rcu_cblist *rclp); | ||
153 | void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, | ||
154 | struct rcu_cblist *rclp); | ||
155 | void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, | ||
156 | struct rcu_cblist *rclp); | ||
157 | void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, | ||
158 | struct rcu_cblist *rclp); | ||
159 | void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, | ||
160 | struct rcu_cblist *rclp); | ||
161 | void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); | ||
162 | bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); | ||
163 | bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, | ||
164 | unsigned long seq); | ||
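A kernel-context sketch of the intended lifecycle of an rcu_segcblist, using only the functions declared above. segcblist_lifecycle_sketch() and its sequence-number parameters are illustrative stand-ins rather than a real call site; real callers take the grace-period numbers from the rcu_node machinery and hold the appropriate locks around these operations:

#include <linux/types.h>

#include "rcu_segcblist.h"

static void segcblist_lifecycle_sketch(struct rcu_segcblist *rsclp,
				       struct rcu_head *rhp,
				       unsigned long cur_gp_seq,
				       unsigned long next_gp_seq)
{
	struct rcu_head *next;

	rcu_segcblist_init(rsclp);

	/* New callbacks always land in the RCU_NEXT_TAIL segment. */
	rcu_segcblist_enqueue(rsclp, rhp, false);

	/* Once a future grace period is known, label the new callbacks. */
	rcu_segcblist_accelerate(rsclp, next_gp_seq);

	/* As grace periods complete, segments slide toward RCU_DONE_TAIL. */
	rcu_segcblist_advance(rsclp, cur_gp_seq);

	/* Invoke whatever has become ready. */
	while (rcu_segcblist_ready_cbs(rsclp)) {
		next = rcu_segcblist_dequeue(rsclp);
		if (!next)
			break;
		next->func(next);
	}
}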
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cccc417a8135..ae6e574d4cf5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -559,19 +559,34 @@ static void srcu_torture_barrier(void) | |||
559 | 559 | ||
560 | static void srcu_torture_stats(void) | 560 | static void srcu_torture_stats(void) |
561 | { | 561 | { |
562 | int cpu; | 562 | int __maybe_unused cpu; |
563 | int idx = srcu_ctlp->completed & 0x1; | 563 | int idx; |
564 | 564 | ||
565 | pr_alert("%s%s per-CPU(idx=%d):", | 565 | #if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) |
566 | #ifdef CONFIG_TREE_SRCU | ||
567 | idx = srcu_ctlp->srcu_idx & 0x1; | ||
568 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
569 | idx = srcu_ctlp->completed & 0x1; | ||
570 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
571 | pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", | ||
566 | torture_type, TORTURE_FLAG, idx); | 572 | torture_type, TORTURE_FLAG, idx); |
567 | for_each_possible_cpu(cpu) { | 573 | for_each_possible_cpu(cpu) { |
568 | unsigned long l0, l1; | 574 | unsigned long l0, l1; |
569 | unsigned long u0, u1; | 575 | unsigned long u0, u1; |
570 | long c0, c1; | 576 | long c0, c1; |
571 | struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); | 577 | #ifdef CONFIG_TREE_SRCU |
578 | struct srcu_data *counts; | ||
572 | 579 | ||
580 | counts = per_cpu_ptr(srcu_ctlp->sda, cpu); | ||
581 | u0 = counts->srcu_unlock_count[!idx]; | ||
582 | u1 = counts->srcu_unlock_count[idx]; | ||
583 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
584 | struct srcu_array *counts; | ||
585 | |||
586 | counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); | ||
573 | u0 = counts->unlock_count[!idx]; | 587 | u0 = counts->unlock_count[!idx]; |
574 | u1 = counts->unlock_count[idx]; | 588 | u1 = counts->unlock_count[idx]; |
589 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
575 | 590 | ||
576 | /* | 591 | /* |
577 | * Make sure that a lock is always counted if the corresponding | 592 | * Make sure that a lock is always counted if the corresponding |
@@ -579,14 +594,26 @@ static void srcu_torture_stats(void) | |||
579 | */ | 594 | */ |
580 | smp_rmb(); | 595 | smp_rmb(); |
581 | 596 | ||
597 | #ifdef CONFIG_TREE_SRCU | ||
598 | l0 = counts->srcu_lock_count[!idx]; | ||
599 | l1 = counts->srcu_lock_count[idx]; | ||
600 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
582 | l0 = counts->lock_count[!idx]; | 601 | l0 = counts->lock_count[!idx]; |
583 | l1 = counts->lock_count[idx]; | 602 | l1 = counts->lock_count[idx]; |
603 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
584 | 604 | ||
585 | c0 = l0 - u0; | 605 | c0 = l0 - u0; |
586 | c1 = l1 - u1; | 606 | c1 = l1 - u1; |
587 | pr_cont(" %d(%ld,%ld)", cpu, c0, c1); | 607 | pr_cont(" %d(%ld,%ld)", cpu, c0, c1); |
588 | } | 608 | } |
589 | pr_cont("\n"); | 609 | pr_cont("\n"); |
610 | #elif defined(CONFIG_TINY_SRCU) | ||
611 | idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; | ||
612 | pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", | ||
613 | torture_type, TORTURE_FLAG, idx, | ||
614 | READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), | ||
615 | READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); | ||
616 | #endif | ||
590 | } | 617 | } |
591 | 618 | ||
592 | static void srcu_torture_synchronize_expedited(void) | 619 | static void srcu_torture_synchronize_expedited(void) |
@@ -1333,12 +1360,14 @@ rcu_torture_stats_print(void) | |||
1333 | cur_ops->stats(); | 1360 | cur_ops->stats(); |
1334 | if (rtcv_snap == rcu_torture_current_version && | 1361 | if (rtcv_snap == rcu_torture_current_version && |
1335 | rcu_torture_current != NULL) { | 1362 | rcu_torture_current != NULL) { |
1336 | int __maybe_unused flags; | 1363 | int __maybe_unused flags = 0; |
1337 | unsigned long __maybe_unused gpnum; | 1364 | unsigned long __maybe_unused gpnum = 0; |
1338 | unsigned long __maybe_unused completed; | 1365 | unsigned long __maybe_unused completed = 0; |
1339 | 1366 | ||
1340 | rcutorture_get_gp_data(cur_ops->ttype, | 1367 | rcutorture_get_gp_data(cur_ops->ttype, |
1341 | &flags, &gpnum, &completed); | 1368 | &flags, &gpnum, &completed); |
1369 | srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, | ||
1370 | &flags, &gpnum, &completed); | ||
1342 | wtp = READ_ONCE(writer_task); | 1371 | wtp = READ_ONCE(writer_task); |
1343 | pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", | 1372 | pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", |
1344 | rcu_torture_writer_state_getname(), | 1373 | rcu_torture_writer_state_getname(), |
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index ef3bcfb15b39..584d8a983883 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
@@ -22,7 +22,7 @@ | |||
22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | 22 | * Lai Jiangshan <laijs@cn.fujitsu.com> |
23 | * | 23 | * |
24 | * For detailed explanation of Read-Copy Update mechanism see - | 24 | * For detailed explanation of Read-Copy Update mechanism see - |
25 | * Documentation/RCU/ *.txt | 25 | * Documentation/RCU/ *.txt |
26 | * | 26 | * |
27 | */ | 27 | */ |
28 | 28 | ||
@@ -243,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp) | |||
243 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | 243 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure |
244 | * @sp: structure to clean up. | 244 | * @sp: structure to clean up. |
245 | * | 245 | * |
246 | * Must invoke this after you are finished using a given srcu_struct that | 246 | * Must invoke this only after you are finished using a given srcu_struct |
247 | * was initialized via init_srcu_struct(), else you leak memory. | 247 | * that was initialized via init_srcu_struct(). This code does some |
248 | * probabilistic checking, spotting late uses of srcu_read_lock(), | ||
249 | * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). | ||
250 | * If any such late uses are detected, the per-CPU memory associated with | ||
251 | * the srcu_struct is simply leaked and WARN_ON() is invoked. If the | ||
252 | * caller frees the srcu_struct itself, a use-after-free crash will likely | ||
253 | * ensue, but at least there will be a warning printed. | ||
248 | */ | 254 | */ |
249 | void cleanup_srcu_struct(struct srcu_struct *sp) | 255 | void cleanup_srcu_struct(struct srcu_struct *sp) |
250 | { | 256 | { |
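The expanded comment spells out the contract: all SRCU activity on a given srcu_struct must have completed before cleanup_srcu_struct() is invoked. A minimal teardown sketch under that contract follows; the my_srcu name and the init/exit pairing are illustrative, not part of this patch.

#include <linux/srcu.h>

static struct srcu_struct my_srcu;		/* illustrative */

static int my_subsys_init(void)
{
	return init_srcu_struct(&my_srcu);
}

static void my_subsys_exit(void)
{
	/* No further call_srcu()/synchronize_srcu() may be issued here. */
	srcu_barrier(&my_srcu);		/* wait for in-flight callbacks */
	cleanup_srcu_struct(&my_srcu);	/* now the late-use warnings cannot fire */
}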
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c new file mode 100644 index 000000000000..36e1f82faed1 --- /dev/null +++ b/kernel/rcu/srcutiny.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion, | ||
3 | * tiny version for non-preemptible single-CPU use. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, you can access it online at | ||
17 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2017 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | */ | ||
23 | |||
24 | #include <linux/export.h> | ||
25 | #include <linux/mutex.h> | ||
26 | #include <linux/preempt.h> | ||
27 | #include <linux/rcupdate_wait.h> | ||
28 | #include <linux/sched.h> | ||
29 | #include <linux/delay.h> | ||
30 | #include <linux/srcu.h> | ||
31 | |||
32 | #include <linux/rcu_node_tree.h> | ||
33 | #include "rcu_segcblist.h" | ||
34 | #include "rcu.h" | ||
35 | |||
36 | static int init_srcu_struct_fields(struct srcu_struct *sp) | ||
37 | { | ||
38 | sp->srcu_lock_nesting[0] = 0; | ||
39 | sp->srcu_lock_nesting[1] = 0; | ||
40 | init_swait_queue_head(&sp->srcu_wq); | ||
41 | sp->srcu_gp_seq = 0; | ||
42 | rcu_segcblist_init(&sp->srcu_cblist); | ||
43 | sp->srcu_gp_running = false; | ||
44 | sp->srcu_gp_waiting = false; | ||
45 | sp->srcu_idx = 0; | ||
46 | INIT_WORK(&sp->srcu_work, srcu_drive_gp); | ||
47 | return 0; | ||
48 | } | ||
49 | |||
50 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
51 | |||
52 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | ||
53 | struct lock_class_key *key) | ||
54 | { | ||
55 | /* Don't re-initialize a lock while it is held. */ | ||
56 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | ||
57 | lockdep_init_map(&sp->dep_map, name, key, 0); | ||
58 | return init_srcu_struct_fields(sp); | ||
59 | } | ||
60 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | ||
61 | |||
62 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
63 | |||
64 | /* | ||
65 | * init_srcu_struct - initialize a sleep-RCU structure | ||
66 | * @sp: structure to initialize. | ||
67 | * | ||
68 | * Must invoke this on a given srcu_struct before passing that srcu_struct | ||
69 | * to any other function. Each srcu_struct represents a separate domain | ||
70 | * of SRCU protection. | ||
71 | */ | ||
72 | int init_srcu_struct(struct srcu_struct *sp) | ||
73 | { | ||
74 | return init_srcu_struct_fields(sp); | ||
75 | } | ||
76 | EXPORT_SYMBOL_GPL(init_srcu_struct); | ||
77 | |||
78 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
79 | |||
80 | /* | ||
81 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | ||
82 | * @sp: structure to clean up. | ||
83 | * | ||
84 | * Must invoke this after you are finished using a given srcu_struct that | ||
85 | * was initialized via init_srcu_struct(), else you leak memory. | ||
86 | */ | ||
87 | void cleanup_srcu_struct(struct srcu_struct *sp) | ||
88 | { | ||
89 | WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); | ||
90 | flush_work(&sp->srcu_work); | ||
91 | WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); | ||
92 | WARN_ON(sp->srcu_gp_running); | ||
93 | WARN_ON(sp->srcu_gp_waiting); | ||
94 | WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); | ||
95 | } | ||
96 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | ||
97 | |||
98 | /* | ||
99 | * Counts the new reader in the appropriate per-CPU element of the | ||
100 | * srcu_struct. Must be called from process context. | ||
101 | * Returns an index that must be passed to the matching srcu_read_unlock(). | ||
102 | */ | ||
103 | int __srcu_read_lock(struct srcu_struct *sp) | ||
104 | { | ||
105 | int idx; | ||
106 | |||
107 | idx = READ_ONCE(sp->srcu_idx); | ||
108 | WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); | ||
109 | return idx; | ||
110 | } | ||
111 | EXPORT_SYMBOL_GPL(__srcu_read_lock); | ||
112 | |||
113 | /* | ||
114 | * Removes the count for the old reader from the appropriate element of | ||
115 | * the srcu_struct. Must be called from process context. | ||
116 | */ | ||
117 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | ||
118 | { | ||
119 | int newval = sp->srcu_lock_nesting[idx] - 1; | ||
120 | |||
121 | WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); | ||
122 | if (!newval && READ_ONCE(sp->srcu_gp_waiting)) | ||
123 | swake_up(&sp->srcu_wq); | ||
124 | } | ||
125 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | ||
126 | |||
127 | /* | ||
128 | * Workqueue handler to drive one grace period and invoke any callbacks | ||
129 | * that become ready as a result. Single-CPU and !PREEMPT operation | ||
130 | * means that we get away with murder on synchronization. ;-) | ||
131 | */ | ||
132 | void srcu_drive_gp(struct work_struct *wp) | ||
133 | { | ||
134 | int idx; | ||
135 | struct rcu_cblist ready_cbs; | ||
136 | struct srcu_struct *sp; | ||
137 | struct rcu_head *rhp; | ||
138 | |||
139 | sp = container_of(wp, struct srcu_struct, srcu_work); | ||
140 | if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) | ||
141 | return; /* Already running or nothing to do. */ | ||
142 | |||
143 | /* Tag recently arrived callbacks and wait for readers. */ | ||
144 | WRITE_ONCE(sp->srcu_gp_running, true); | ||
145 | rcu_segcblist_accelerate(&sp->srcu_cblist, | ||
146 | rcu_seq_snap(&sp->srcu_gp_seq)); | ||
147 | rcu_seq_start(&sp->srcu_gp_seq); | ||
148 | idx = sp->srcu_idx; | ||
149 | WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); | ||
150 | WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ | ||
151 | swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); | ||
152 | WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ | ||
153 | rcu_seq_end(&sp->srcu_gp_seq); | ||
154 | |||
155 | /* Update callback list based on GP, and invoke ready callbacks. */ | ||
156 | rcu_segcblist_advance(&sp->srcu_cblist, | ||
157 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
158 | if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { | ||
159 | rcu_cblist_init(&ready_cbs); | ||
160 | local_irq_disable(); | ||
161 | rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); | ||
162 | local_irq_enable(); | ||
163 | rhp = rcu_cblist_dequeue(&ready_cbs); | ||
164 | for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { | ||
165 | local_bh_disable(); | ||
166 | rhp->func(rhp); | ||
167 | local_bh_enable(); | ||
168 | } | ||
169 | local_irq_disable(); | ||
170 | rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); | ||
171 | local_irq_enable(); | ||
172 | } | ||
173 | WRITE_ONCE(sp->srcu_gp_running, false); | ||
174 | |||
175 | /* | ||
176 | * If more callbacks, reschedule ourselves. This can race with | ||
177 | * a call_srcu() at interrupt level, but the ->srcu_gp_running | ||
178 | * checks will straighten that out. | ||
179 | */ | ||
180 | if (!rcu_segcblist_empty(&sp->srcu_cblist)) | ||
181 | schedule_work(&sp->srcu_work); | ||
182 | } | ||
183 | EXPORT_SYMBOL_GPL(srcu_drive_gp); | ||
184 | |||
185 | /* | ||
186 | * Enqueue an SRCU callback on the specified srcu_struct structure, | ||
187 | * initiating grace-period processing if it is not already running. | ||
188 | */ | ||
189 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
190 | rcu_callback_t func) | ||
191 | { | ||
192 | unsigned long flags; | ||
193 | |||
194 | head->func = func; | ||
195 | local_irq_save(flags); | ||
196 | rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); | ||
197 | local_irq_restore(flags); | ||
198 | if (!READ_ONCE(sp->srcu_gp_running)) | ||
199 | schedule_work(&sp->srcu_work); | ||
200 | } | ||
201 | EXPORT_SYMBOL_GPL(call_srcu); | ||
202 | |||
203 | /* | ||
204 | * synchronize_srcu - wait for prior SRCU read-side critical-section completion | ||
205 | */ | ||
206 | void synchronize_srcu(struct srcu_struct *sp) | ||
207 | { | ||
208 | struct rcu_synchronize rs; | ||
209 | |||
210 | init_rcu_head_on_stack(&rs.head); | ||
211 | init_completion(&rs.completion); | ||
212 | call_srcu(sp, &rs.head, wakeme_after_rcu); | ||
213 | wait_for_completion(&rs.completion); | ||
214 | destroy_rcu_head_on_stack(&rs.head); | ||
215 | } | ||
216 | EXPORT_SYMBOL_GPL(synchronize_srcu); | ||
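Tiny SRCU keeps the caller-visible API identical to the other flavors; only the machinery above shrinks to a pair of nesting counters and a single workqueue handler. A minimal reader/updater sketch against that API follows; the cfg_srcu domain and struct cfg are illustrative assumptions, and updates are assumed to be serialized by the caller.

#include <linux/slab.h>
#include <linux/srcu.h>

struct cfg { int value; };			/* illustrative payload */

DEFINE_STATIC_SRCU(cfg_srcu);			/* illustrative domain */
static struct cfg __rcu *cur_cfg;

static int read_cfg_value(void)
{
	struct cfg *c;
	int idx, v = 0;

	idx = srcu_read_lock(&cfg_srcu);
	c = srcu_dereference(cur_cfg, &cfg_srcu);
	if (c)
		v = c->value;
	srcu_read_unlock(&cfg_srcu, idx);
	return v;
}

static void replace_cfg(struct cfg *newc)	/* caller serializes updates */
{
	struct cfg *old = rcu_dereference_protected(cur_cfg, 1);

	rcu_assign_pointer(cur_cfg, newc);
	synchronize_srcu(&cfg_srcu);		/* wait out pre-existing readers */
	kfree(old);
}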
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c new file mode 100644 index 000000000000..3ae8474557df --- /dev/null +++ b/kernel/rcu/srcutree.c | |||
@@ -0,0 +1,1155 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, you can access it online at | ||
16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2006 | ||
19 | * Copyright (C) Fujitsu, 2012 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | ||
23 | * | ||
24 | * For detailed explanation of Read-Copy Update mechanism see - | ||
25 | * Documentation/RCU/ *.txt | ||
26 | * | ||
27 | */ | ||
28 | |||
29 | #include <linux/export.h> | ||
30 | #include <linux/mutex.h> | ||
31 | #include <linux/percpu.h> | ||
32 | #include <linux/preempt.h> | ||
33 | #include <linux/rcupdate_wait.h> | ||
34 | #include <linux/sched.h> | ||
35 | #include <linux/smp.h> | ||
36 | #include <linux/delay.h> | ||
37 | #include <linux/module.h> | ||
38 | #include <linux/srcu.h> | ||
39 | |||
40 | #include "rcu.h" | ||
41 | #include "rcu_segcblist.h" | ||
42 | |||
43 | ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ | ||
44 | module_param(exp_holdoff, ulong, 0444); | ||
45 | |||
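Since this file is built into the kernel rather than modular, exp_holdoff would normally be adjusted on the kernel command line with the usual built-in prefix, e.g. srcutree.exp_holdoff=50000 for a 50-microsecond holdoff (the value is in nanoseconds and, given the 0444 mode, read-only at runtime). The exact prefix is an assumption based on standard module_param handling rather than anything stated in this patch.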
46 | static void srcu_invoke_callbacks(struct work_struct *work); | ||
47 | static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); | ||
48 | |||
49 | /* | ||
50 | * Initialize SRCU combining tree. Note that statically allocated | ||
51 | * srcu_struct structures might already have srcu_read_lock() and | ||
52 | * srcu_read_unlock() running against them. So if the is_static parameter | ||
53 | * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. | ||
54 | */ | ||
55 | static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) | ||
56 | { | ||
57 | int cpu; | ||
58 | int i; | ||
59 | int level = 0; | ||
60 | int levelspread[RCU_NUM_LVLS]; | ||
61 | struct srcu_data *sdp; | ||
62 | struct srcu_node *snp; | ||
63 | struct srcu_node *snp_first; | ||
64 | |||
65 | /* Work out the overall tree geometry. */ | ||
66 | sp->level[0] = &sp->node[0]; | ||
67 | for (i = 1; i < rcu_num_lvls; i++) | ||
68 | sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; | ||
69 | rcu_init_levelspread(levelspread, num_rcu_lvl); | ||
70 | |||
71 | /* Each pass through this loop initializes one srcu_node structure. */ | ||
72 | rcu_for_each_node_breadth_first(sp, snp) { | ||
73 | spin_lock_init(&snp->lock); | ||
74 | WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != | ||
75 | ARRAY_SIZE(snp->srcu_data_have_cbs)); | ||
76 | for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { | ||
77 | snp->srcu_have_cbs[i] = 0; | ||
78 | snp->srcu_data_have_cbs[i] = 0; | ||
79 | } | ||
80 | snp->srcu_gp_seq_needed_exp = 0; | ||
81 | snp->grplo = -1; | ||
82 | snp->grphi = -1; | ||
83 | if (snp == &sp->node[0]) { | ||
84 | /* Root node, special case. */ | ||
85 | snp->srcu_parent = NULL; | ||
86 | continue; | ||
87 | } | ||
88 | |||
89 | /* Non-root node. */ | ||
90 | if (snp == sp->level[level + 1]) | ||
91 | level++; | ||
92 | snp->srcu_parent = sp->level[level - 1] + | ||
93 | (snp - sp->level[level]) / | ||
94 | levelspread[level - 1]; | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * Initialize the per-CPU srcu_data array, which feeds into the | ||
99 | * leaves of the srcu_node tree. | ||
100 | */ | ||
101 | WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != | ||
102 | ARRAY_SIZE(sdp->srcu_unlock_count)); | ||
103 | level = rcu_num_lvls - 1; | ||
104 | snp_first = sp->level[level]; | ||
105 | for_each_possible_cpu(cpu) { | ||
106 | sdp = per_cpu_ptr(sp->sda, cpu); | ||
107 | spin_lock_init(&sdp->lock); | ||
108 | rcu_segcblist_init(&sdp->srcu_cblist); | ||
109 | sdp->srcu_cblist_invoking = false; | ||
110 | sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; | ||
111 | sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq; | ||
112 | sdp->mynode = &snp_first[cpu / levelspread[level]]; | ||
113 | for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { | ||
114 | if (snp->grplo < 0) | ||
115 | snp->grplo = cpu; | ||
116 | snp->grphi = cpu; | ||
117 | } | ||
118 | sdp->cpu = cpu; | ||
119 | INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); | ||
120 | sdp->sp = sp; | ||
121 | sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); | ||
122 | if (is_static) | ||
123 | continue; | ||
124 | |||
125 | /* Dynamically allocated, better be no srcu_read_locks()! */ | ||
126 | for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) { | ||
127 | sdp->srcu_lock_count[i] = 0; | ||
128 | sdp->srcu_unlock_count[i] = 0; | ||
129 | } | ||
130 | } | ||
131 | } | ||
132 | |||
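To make the mapping above concrete, assume a two-level tree with a leaf fanout (levelspread at the leaf level) of 16: CPU 21 lands on leaf snp_first[21 / 16] == snp_first[1]; the walk up from that leaf stretches its grplo..grphi range to include CPU 21; and the CPU's bit in that leaf is sdp->grpmask == 1 << (21 - grplo), i.e. bit 5 once grplo has settled at 16. These grpmask bits are exactly what later accumulates in ->srcu_data_have_cbs[] and is handed to srcu_schedule_cbs_snp() when a grace period ends. (The fanout of 16 is only an assumed example; the real value depends on the RCU_FANOUT_LEAF configuration.)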
133 | /* | ||
134 | * Initialize non-compile-time initialized fields, including the | ||
135 | * associated srcu_node and srcu_data structures. The is_static | ||
136 | * parameter is passed through to init_srcu_struct_nodes(), and | ||
137 | * also tells us that ->sda has already been wired up to srcu_data. | ||
138 | */ | ||
139 | static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) | ||
140 | { | ||
141 | mutex_init(&sp->srcu_cb_mutex); | ||
142 | mutex_init(&sp->srcu_gp_mutex); | ||
143 | sp->srcu_idx = 0; | ||
144 | sp->srcu_gp_seq = 0; | ||
145 | sp->srcu_barrier_seq = 0; | ||
146 | mutex_init(&sp->srcu_barrier_mutex); | ||
147 | atomic_set(&sp->srcu_barrier_cpu_cnt, 0); | ||
148 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
149 | if (!is_static) | ||
150 | sp->sda = alloc_percpu(struct srcu_data); | ||
151 | init_srcu_struct_nodes(sp, is_static); | ||
152 | sp->srcu_gp_seq_needed_exp = 0; | ||
153 | sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); | ||
154 | smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ | ||
155 | return sp->sda ? 0 : -ENOMEM; | ||
156 | } | ||
157 | |||
158 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
159 | |||
160 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | ||
161 | struct lock_class_key *key) | ||
162 | { | ||
163 | /* Don't re-initialize a lock while it is held. */ | ||
164 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | ||
165 | lockdep_init_map(&sp->dep_map, name, key, 0); | ||
166 | spin_lock_init(&sp->gp_lock); | ||
167 | return init_srcu_struct_fields(sp, false); | ||
168 | } | ||
169 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | ||
170 | |||
171 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
172 | |||
173 | /** | ||
174 | * init_srcu_struct - initialize a sleep-RCU structure | ||
175 | * @sp: structure to initialize. | ||
176 | * | ||
177 | * Must invoke this on a given srcu_struct before passing that srcu_struct | ||
178 | * to any other function. Each srcu_struct represents a separate domain | ||
179 | * of SRCU protection. | ||
180 | */ | ||
181 | int init_srcu_struct(struct srcu_struct *sp) | ||
182 | { | ||
183 | spin_lock_init(&sp->gp_lock); | ||
184 | return init_srcu_struct_fields(sp, false); | ||
185 | } | ||
186 | EXPORT_SYMBOL_GPL(init_srcu_struct); | ||
187 | |||
188 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
189 | |||
190 | /* | ||
191 | * First-use initialization of statically allocated srcu_struct | ||
192 | * structure. Wiring up the combining tree is more than can be | ||
193 | * done with compile-time initialization, so this check is added | ||
194 | * to each update-side SRCU primitive. Use ->gp_lock, which -is- | ||
195 | * compile-time initialized, to resolve races involving multiple | ||
196 | * CPUs trying to garner first-use privileges. | ||
197 | */ | ||
198 | static void check_init_srcu_struct(struct srcu_struct *sp) | ||
199 | { | ||
200 | unsigned long flags; | ||
201 | |||
202 | WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT); | ||
203 | /* The smp_load_acquire() pairs with the smp_store_release(). */ | ||
204 | if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ | ||
205 | return; /* Already initialized. */ | ||
206 | spin_lock_irqsave(&sp->gp_lock, flags); | ||
207 | if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { | ||
208 | spin_unlock_irqrestore(&sp->gp_lock, flags); | ||
209 | return; | ||
210 | } | ||
211 | init_srcu_struct_fields(sp, true); | ||
212 | spin_unlock_irqrestore(&sp->gp_lock, flags); | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * Returns approximate total of the readers' ->srcu_lock_count[] values | ||
217 | * for the rank of per-CPU counters specified by idx. | ||
218 | */ | ||
219 | static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) | ||
220 | { | ||
221 | int cpu; | ||
222 | unsigned long sum = 0; | ||
223 | |||
224 | for_each_possible_cpu(cpu) { | ||
225 | struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); | ||
226 | |||
227 | sum += READ_ONCE(cpuc->srcu_lock_count[idx]); | ||
228 | } | ||
229 | return sum; | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * Returns approximate total of the readers' ->srcu_unlock_count[] values | ||
234 | * for the rank of per-CPU counters specified by idx. | ||
235 | */ | ||
236 | static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) | ||
237 | { | ||
238 | int cpu; | ||
239 | unsigned long sum = 0; | ||
240 | |||
241 | for_each_possible_cpu(cpu) { | ||
242 | struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); | ||
243 | |||
244 | sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); | ||
245 | } | ||
246 | return sum; | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * Return true if the number of pre-existing readers is determined to | ||
251 | * be zero. | ||
252 | */ | ||
253 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
254 | { | ||
255 | unsigned long unlocks; | ||
256 | |||
257 | unlocks = srcu_readers_unlock_idx(sp, idx); | ||
258 | |||
259 | /* | ||
260 | * Make sure that a lock is always counted if the corresponding | ||
261 | * unlock is counted. Needs to be a smp_mb() as the read side may | ||
262 | * contain a read from a variable that is written to before the | ||
263 | * synchronize_srcu() in the write side. In this case smp_mb()s | ||
264 | * A and B act like the store buffering pattern. | ||
265 | * | ||
266 | * This smp_mb() also pairs with smp_mb() C to prevent accesses | ||
267 | * after the synchronize_srcu() from being executed before the | ||
268 | * grace period ends. | ||
269 | */ | ||
270 | smp_mb(); /* A */ | ||
271 | |||
272 | /* | ||
273 | * If the locks are the same as the unlocks, then there must have | ||
274 | * been no readers on this index at some time in between. This does | ||
275 | * not mean that there are no more readers, as one could have read | ||
276 | * the current index but not have incremented the lock counter yet. | ||
277 | * | ||
278 | * Possible bug: There is no guarantee that there haven't been | ||
279 | * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were | ||
280 | * counted, meaning that this could return true even if there are | ||
281 | * still active readers. Since there are no memory barriers around | ||
282 | * srcu_flip(), the CPU is not required to increment ->srcu_idx | ||
283 | * before running srcu_readers_unlock_idx(), which means that there | ||
284 | * could be an arbitrarily large number of critical sections that | ||
285 | * execute after srcu_readers_unlock_idx() but use the old value | ||
286 | * of ->srcu_idx. | ||
287 | */ | ||
288 | return srcu_readers_lock_idx(sp, idx) == unlocks; | ||
289 | } | ||
290 | |||
291 | /** | ||
292 | * srcu_readers_active - returns true if there are readers. and false | ||
293 | * otherwise | ||
294 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | ||
295 | * | ||
296 | * Note that this is not an atomic primitive, and can therefore suffer | ||
297 | * severe errors when invoked on an active srcu_struct. That said, it | ||
298 | * can be useful as an error check at cleanup time. | ||
299 | */ | ||
300 | static bool srcu_readers_active(struct srcu_struct *sp) | ||
301 | { | ||
302 | int cpu; | ||
303 | unsigned long sum = 0; | ||
304 | |||
305 | for_each_possible_cpu(cpu) { | ||
306 | struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); | ||
307 | |||
308 | sum += READ_ONCE(cpuc->srcu_lock_count[0]); | ||
309 | sum += READ_ONCE(cpuc->srcu_lock_count[1]); | ||
310 | sum -= READ_ONCE(cpuc->srcu_unlock_count[0]); | ||
311 | sum -= READ_ONCE(cpuc->srcu_unlock_count[1]); | ||
312 | } | ||
313 | return sum; | ||
314 | } | ||
315 | |||
316 | #define SRCU_INTERVAL 1 | ||
317 | |||
318 | /* | ||
319 | * Return grace-period delay, zero if there are expedited grace | ||
320 | * periods pending, SRCU_INTERVAL otherwise. | ||
321 | */ | ||
322 | static unsigned long srcu_get_delay(struct srcu_struct *sp) | ||
323 | { | ||
324 | if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq), | ||
325 | READ_ONCE(sp->srcu_gp_seq_needed_exp))) | ||
326 | return 0; | ||
327 | return SRCU_INTERVAL; | ||
328 | } | ||
329 | |||
330 | /** | ||
331 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | ||
332 | * @sp: structure to clean up. | ||
333 | * | ||
334 | * Must invoke this after you are finished using a given srcu_struct that | ||
335 | * was initialized via init_srcu_struct(), else you leak memory. | ||
336 | */ | ||
337 | void cleanup_srcu_struct(struct srcu_struct *sp) | ||
338 | { | ||
339 | int cpu; | ||
340 | |||
341 | if (WARN_ON(!srcu_get_delay(sp))) | ||
342 | return; /* Leakage unless caller handles error. */ | ||
343 | if (WARN_ON(srcu_readers_active(sp))) | ||
344 | return; /* Leakage unless caller handles error. */ | ||
345 | flush_delayed_work(&sp->work); | ||
346 | for_each_possible_cpu(cpu) | ||
347 | flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); | ||
348 | if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || | ||
349 | WARN_ON(srcu_readers_active(sp))) { | ||
350 | pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); | ||
351 | return; /* Caller forgot to stop doing call_srcu()? */ | ||
352 | } | ||
353 | free_percpu(sp->sda); | ||
354 | sp->sda = NULL; | ||
355 | } | ||
356 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | ||
357 | |||
358 | /* | ||
359 | * Counts the new reader in the appropriate per-CPU element of the | ||
360 | * srcu_struct. Must be called from process context. | ||
361 | * Returns an index that must be passed to the matching srcu_read_unlock(). | ||
362 | */ | ||
363 | int __srcu_read_lock(struct srcu_struct *sp) | ||
364 | { | ||
365 | int idx; | ||
366 | |||
367 | idx = READ_ONCE(sp->srcu_idx) & 0x1; | ||
368 | __this_cpu_inc(sp->sda->srcu_lock_count[idx]); | ||
369 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ | ||
370 | return idx; | ||
371 | } | ||
372 | EXPORT_SYMBOL_GPL(__srcu_read_lock); | ||
373 | |||
374 | /* | ||
375 | * Removes the count for the old reader from the appropriate per-CPU | ||
376 | * element of the srcu_struct. Note that this may well be a different | ||
377 | * CPU than that which was incremented by the corresponding srcu_read_lock(). | ||
378 | * Must be called from process context. | ||
379 | */ | ||
380 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | ||
381 | { | ||
382 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ | ||
383 | this_cpu_inc(sp->sda->srcu_unlock_count[idx]); | ||
384 | } | ||
385 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | ||
386 | |||
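Because the read-side state is nothing but these two per-CPU counter increments plus the returned index, a reader may block, and even migrate, between lock and unlock; the unlock increment simply lands on whichever CPU the task wakes up on, which is why the comment above allows a different CPU. A short sketch of such a sleeping reader (dev_srcu and dev_mutex are illustrative names):

#include <linux/mutex.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(dev_srcu);		/* illustrative */
static DEFINE_MUTEX(dev_mutex);		/* illustrative */

static void slow_reader(void)
{
	int idx;

	idx = srcu_read_lock(&dev_srcu);
	/*
	 * Unlike rcu_read_lock(), blocking is legal here; the grace
	 * period simply waits for the matching srcu_read_unlock(),
	 * wherever the task ends up running.
	 */
	mutex_lock(&dev_mutex);
	mutex_unlock(&dev_mutex);
	srcu_read_unlock(&dev_srcu, idx);
}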
387 | /* | ||
388 | * We use an adaptive strategy for synchronize_srcu() and especially for | ||
389 | * synchronize_srcu_expedited(). We spin for a fixed time period | ||
390 | * (defined below) to allow SRCU readers to exit their read-side critical | ||
391 | * sections. If there are still some readers after a few microseconds, | ||
392 | * we repeatedly block for 1-millisecond time periods. | ||
393 | */ | ||
394 | #define SRCU_RETRY_CHECK_DELAY 5 | ||
395 | |||
396 | /* | ||
397 | * Start an SRCU grace period. | ||
398 | */ | ||
399 | static void srcu_gp_start(struct srcu_struct *sp) | ||
400 | { | ||
401 | struct srcu_data *sdp = this_cpu_ptr(sp->sda); | ||
402 | int state; | ||
403 | |||
404 | RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), | ||
405 | "Invoked srcu_gp_start() without ->gp_lock!"); | ||
406 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); | ||
407 | rcu_segcblist_advance(&sdp->srcu_cblist, | ||
408 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
409 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, | ||
410 | rcu_seq_snap(&sp->srcu_gp_seq)); | ||
411 | smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ | ||
412 | rcu_seq_start(&sp->srcu_gp_seq); | ||
413 | state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); | ||
414 | WARN_ON_ONCE(state != SRCU_STATE_SCAN1); | ||
415 | } | ||
416 | |||
417 | /* | ||
418 | * Track online CPUs to guide callback workqueue placement. | ||
419 | */ | ||
420 | DEFINE_PER_CPU(bool, srcu_online); | ||
421 | |||
422 | void srcu_online_cpu(unsigned int cpu) | ||
423 | { | ||
424 | WRITE_ONCE(per_cpu(srcu_online, cpu), true); | ||
425 | } | ||
426 | |||
427 | void srcu_offline_cpu(unsigned int cpu) | ||
428 | { | ||
429 | WRITE_ONCE(per_cpu(srcu_online, cpu), false); | ||
430 | } | ||
431 | |||
432 | /* | ||
433 | * Place the workqueue handler on the specified CPU if online, otherwise | ||
434 | * just run it wherever. This is useful for placing workqueue handlers | ||
435 | * that are to invoke the specified CPU's callbacks. | ||
436 | */ | ||
437 | static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | ||
438 | struct delayed_work *dwork, | ||
439 | unsigned long delay) | ||
440 | { | ||
441 | bool ret; | ||
442 | |||
443 | preempt_disable(); | ||
444 | if (READ_ONCE(per_cpu(srcu_online, cpu))) | ||
445 | ret = queue_delayed_work_on(cpu, wq, dwork, delay); | ||
446 | else | ||
447 | ret = queue_delayed_work(wq, dwork, delay); | ||
448 | preempt_enable(); | ||
449 | return ret; | ||
450 | } | ||
451 | |||
452 | /* | ||
453 | * Schedule callback invocation for the specified srcu_data structure, | ||
454 | * if possible, on the corresponding CPU. | ||
455 | */ | ||
456 | static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) | ||
457 | { | ||
458 | srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, | ||
459 | &sdp->work, delay); | ||
460 | } | ||
461 | |||
462 | /* | ||
463 | * Schedule callback invocation for all srcu_data structures associated | ||
464 | * with the specified srcu_node structure that have callbacks for the | ||
465 | * just-completed grace period, the one corresponding to idx. If possible, | ||
466 | * schedule this invocation on the corresponding CPUs. | ||
467 | */ | ||
468 | static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, | ||
469 | unsigned long mask, unsigned long delay) | ||
470 | { | ||
471 | int cpu; | ||
472 | |||
473 | for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { | ||
474 | if (!(mask & (1 << (cpu - snp->grplo)))) | ||
475 | continue; | ||
476 | srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay); | ||
477 | } | ||
478 | } | ||
479 | |||
480 | /* | ||
481 | * Note the end of an SRCU grace period. Initiates callback invocation | ||
482 | * and starts a new grace period if needed. | ||
483 | * | ||
484 | * The ->srcu_cb_mutex acquisition does not protect any data, but | ||
485 | * instead prevents more than one grace period from starting while we | ||
486 | * are initiating callback invocation. This allows the ->srcu_have_cbs[] | ||
487 | * array to have a finite number of elements. | ||
488 | */ | ||
489 | static void srcu_gp_end(struct srcu_struct *sp) | ||
490 | { | ||
491 | unsigned long cbdelay; | ||
492 | bool cbs; | ||
493 | unsigned long gpseq; | ||
494 | int idx; | ||
495 | int idxnext; | ||
496 | unsigned long mask; | ||
497 | struct srcu_node *snp; | ||
498 | |||
499 | /* Prevent more than one additional grace period. */ | ||
500 | mutex_lock(&sp->srcu_cb_mutex); | ||
501 | |||
502 | /* End the current grace period. */ | ||
503 | spin_lock_irq(&sp->gp_lock); | ||
504 | idx = rcu_seq_state(sp->srcu_gp_seq); | ||
505 | WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); | ||
506 | cbdelay = srcu_get_delay(sp); | ||
507 | sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); | ||
508 | rcu_seq_end(&sp->srcu_gp_seq); | ||
509 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); | ||
510 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) | ||
511 | sp->srcu_gp_seq_needed_exp = gpseq; | ||
512 | spin_unlock_irq(&sp->gp_lock); | ||
513 | mutex_unlock(&sp->srcu_gp_mutex); | ||
514 | /* A new grace period can start at this point. But only one. */ | ||
515 | |||
516 | /* Initiate callback invocation as needed. */ | ||
517 | idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); | ||
518 | idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); | ||
519 | rcu_for_each_node_breadth_first(sp, snp) { | ||
520 | spin_lock_irq(&snp->lock); | ||
521 | cbs = false; | ||
522 | if (snp >= sp->level[rcu_num_lvls - 1]) | ||
523 | cbs = snp->srcu_have_cbs[idx] == gpseq; | ||
524 | snp->srcu_have_cbs[idx] = gpseq; | ||
525 | rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); | ||
526 | if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) | ||
527 | snp->srcu_gp_seq_needed_exp = gpseq; | ||
528 | mask = snp->srcu_data_have_cbs[idx]; | ||
529 | snp->srcu_data_have_cbs[idx] = 0; | ||
530 | spin_unlock_irq(&snp->lock); | ||
531 | if (cbs) { | ||
532 | smp_mb(); /* GP end before CB invocation. */ | ||
533 | srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); | ||
534 | } | ||
535 | } | ||
536 | |||
537 | /* Callback initiation done, allow grace periods after next. */ | ||
538 | mutex_unlock(&sp->srcu_cb_mutex); | ||
539 | |||
540 | /* Start a new grace period if needed. */ | ||
541 | spin_lock_irq(&sp->gp_lock); | ||
542 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); | ||
543 | if (!rcu_seq_state(gpseq) && | ||
544 | ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { | ||
545 | srcu_gp_start(sp); | ||
546 | spin_unlock_irq(&sp->gp_lock); | ||
547 | /* Throttle expedited grace periods: Should be rare! */ | ||
548 | srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff | ||
549 | ? 0 : SRCU_INTERVAL); | ||
550 | } else { | ||
551 | spin_unlock_irq(&sp->gp_lock); | ||
552 | } | ||
553 | } | ||
554 | |||
555 | /* | ||
556 | * Funnel-locking scheme to scalably mediate many concurrent expedited | ||
557 | * grace-period requests. This function is invoked for the first known | ||
558 | * expedited request for a grace period that has already been requested, | ||
559 | * but without expediting. To start a completely new grace period, | ||
560 | * whether expedited or not, use srcu_funnel_gp_start() instead. | ||
561 | */ | ||
562 | static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, | ||
563 | unsigned long s) | ||
564 | { | ||
565 | unsigned long flags; | ||
566 | |||
567 | for (; snp != NULL; snp = snp->srcu_parent) { | ||
568 | if (rcu_seq_done(&sp->srcu_gp_seq, s) || | ||
569 | ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) | ||
570 | return; | ||
571 | spin_lock_irqsave(&snp->lock, flags); | ||
572 | if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { | ||
573 | spin_unlock_irqrestore(&snp->lock, flags); | ||
574 | return; | ||
575 | } | ||
576 | WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); | ||
577 | spin_unlock_irqrestore(&snp->lock, flags); | ||
578 | } | ||
579 | spin_lock_irqsave(&sp->gp_lock, flags); | ||
580 | if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) | ||
581 | sp->srcu_gp_seq_needed_exp = s; | ||
582 | spin_unlock_irqrestore(&sp->gp_lock, flags); | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * Funnel-locking scheme to scalably mediate many concurrent grace-period | ||
587 | * requests. The winner has to do the work of actually starting grace | ||
588 | * period s. Losers must either ensure that their desired grace-period | ||
589 | * number is recorded on at least their leaf srcu_node structure, or they | ||
590 | * must take steps to invoke their own callbacks. | ||
591 | */ | ||
592 | static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, | ||
593 | unsigned long s, bool do_norm) | ||
594 | { | ||
595 | unsigned long flags; | ||
596 | int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); | ||
597 | struct srcu_node *snp = sdp->mynode; | ||
598 | unsigned long snp_seq; | ||
599 | |||
600 | /* Each pass through the loop does one level of the srcu_node tree. */ | ||
601 | for (; snp != NULL; snp = snp->srcu_parent) { | ||
602 | if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) | ||
603 | return; /* GP already done and CBs recorded. */ | ||
604 | spin_lock_irqsave(&snp->lock, flags); | ||
605 | if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { | ||
606 | snp_seq = snp->srcu_have_cbs[idx]; | ||
607 | if (snp == sdp->mynode && snp_seq == s) | ||
608 | snp->srcu_data_have_cbs[idx] |= sdp->grpmask; | ||
609 | spin_unlock_irqrestore(&snp->lock, flags); | ||
610 | if (snp == sdp->mynode && snp_seq != s) { | ||
611 | smp_mb(); /* CBs after GP! */ | ||
612 | srcu_schedule_cbs_sdp(sdp, do_norm | ||
613 | ? SRCU_INTERVAL | ||
614 | : 0); | ||
615 | return; | ||
616 | } | ||
617 | if (!do_norm) | ||
618 | srcu_funnel_exp_start(sp, snp, s); | ||
619 | return; | ||
620 | } | ||
621 | snp->srcu_have_cbs[idx] = s; | ||
622 | if (snp == sdp->mynode) | ||
623 | snp->srcu_data_have_cbs[idx] |= sdp->grpmask; | ||
624 | if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) | ||
625 | snp->srcu_gp_seq_needed_exp = s; | ||
626 | spin_unlock_irqrestore(&snp->lock, flags); | ||
627 | } | ||
628 | |||
629 | /* Top of tree, must ensure the grace period will be started. */ | ||
630 | spin_lock_irqsave(&sp->gp_lock, flags); | ||
631 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { | ||
632 | /* | ||
633 | * Record need for grace period s. Pair with load | ||
634 | * acquire setting up for initialization. | ||
635 | */ | ||
636 | smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ | ||
637 | } | ||
638 | if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) | ||
639 | sp->srcu_gp_seq_needed_exp = s; | ||
640 | |||
641 | /* If grace period not already done and none in progress, start it. */ | ||
642 | if (!rcu_seq_done(&sp->srcu_gp_seq, s) && | ||
643 | rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { | ||
644 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); | ||
645 | srcu_gp_start(sp); | ||
646 | queue_delayed_work(system_power_efficient_wq, &sp->work, | ||
647 | srcu_get_delay(sp)); | ||
648 | } | ||
649 | spin_unlock_irqrestore(&sp->gp_lock, flags); | ||
650 | } | ||
651 | |||
652 | /* | ||
653 | * Wait until all readers counted by array index idx complete, but | ||
654 | * loop an additional time if there is an expedited grace period pending. | ||
655 | * The caller must ensure that ->srcu_idx is not changed while checking. | ||
656 | */ | ||
657 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) | ||
658 | { | ||
659 | for (;;) { | ||
660 | if (srcu_readers_active_idx_check(sp, idx)) | ||
661 | return true; | ||
662 | if (--trycount + !srcu_get_delay(sp) <= 0) | ||
663 | return false; | ||
664 | udelay(SRCU_RETRY_CHECK_DELAY); | ||
665 | } | ||
666 | } | ||
667 | |||
668 | /* | ||
669 | * Increment the ->srcu_idx counter so that future SRCU readers will | ||
670 | * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows | ||
671 | * us to wait for pre-existing readers in a starvation-free manner. | ||
672 | */ | ||
673 | static void srcu_flip(struct srcu_struct *sp) | ||
674 | { | ||
675 | WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); | ||
676 | |||
677 | /* | ||
678 | * Ensure that if the updater misses an __srcu_read_unlock() | ||
679 | * increment, that task's next __srcu_read_lock() will see the | ||
680 | * above counter update. Note that both this memory barrier | ||
681 | * and the one in srcu_readers_active_idx_check() provide the | ||
682 | * guarantee for __srcu_read_lock(). | ||
683 | */ | ||
684 | smp_mb(); /* D */ /* Pairs with C. */ | ||
685 | } | ||
686 | |||
687 | /* | ||
688 | * If SRCU is likely idle, return true, otherwise return false. | ||
689 | * | ||
690 | * Note that it is OK for several concurrent from-idle requests for a new | ||
691 | * grace period to specify expediting because they will all end | ||
692 | * up requesting the same grace period anyhow. So no loss. | ||
693 | * | ||
694 | * Note also that if any CPU (including the current one) is still invoking | ||
695 | * callbacks, this function will nevertheless say "idle". This is not | ||
696 | * ideal, but the overhead of checking all CPUs' callback lists is even | ||
697 | * less ideal, especially on large systems. Furthermore, the wakeup | ||
698 | * can happen before the callback is fully removed, so we have no choice | ||
699 | * but to accept this type of error. | ||
700 | * | ||
701 | * This function is also subject to counter-wrap errors, but let's face | ||
702 | * it, if this function was preempted for enough time for the counters | ||
703 | * to wrap, it really doesn't matter whether or not we expedite the grace | ||
704 | * period. The extra overhead of a needlessly expedited grace period is | ||
705 | * negligible when amortized over that time period, and the extra latency | ||
706 | * of a needlessly non-expedited grace period is similarly negligible. | ||
707 | */ | ||
708 | static bool srcu_might_be_idle(struct srcu_struct *sp) | ||
709 | { | ||
710 | unsigned long curseq; | ||
711 | unsigned long flags; | ||
712 | struct srcu_data *sdp; | ||
713 | unsigned long t; | ||
714 | |||
715 | /* If the local srcu_data structure has callbacks, not idle. */ | ||
716 | local_irq_save(flags); | ||
717 | sdp = this_cpu_ptr(sp->sda); | ||
718 | if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { | ||
719 | local_irq_restore(flags); | ||
720 | return false; /* Callbacks already present, so not idle. */ | ||
721 | } | ||
722 | local_irq_restore(flags); | ||
723 | |||
724 | /* | ||
725 | * No local callbacks, so probabilistically probe global state. | ||
726 | * Exact information would require acquiring locks, which would | ||
727 | * kill scalability, hence the probabilistic nature of the probe. | ||
728 | */ | ||
729 | |||
730 | /* First, see if enough time has passed since the last GP. */ | ||
731 | t = ktime_get_mono_fast_ns(); | ||
732 | if (exp_holdoff == 0 || | ||
733 | time_in_range_open(t, sp->srcu_last_gp_end, | ||
734 | sp->srcu_last_gp_end + exp_holdoff)) | ||
735 | return false; /* Too soon after last GP. */ | ||
736 | |||
737 | /* Next, check for probable idleness. */ | ||
738 | curseq = rcu_seq_current(&sp->srcu_gp_seq); | ||
739 | smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ | ||
740 | if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed))) | ||
741 | return false; /* Grace period in progress, so not idle. */ | ||
742 | smp_mb(); /* Order ->srcu_gp_seq with prior access. */ | ||
743 | if (curseq != rcu_seq_current(&sp->srcu_gp_seq)) | ||
744 | return false; /* GP # changed, so not idle. */ | ||
745 | return true; /* With reasonable probability, idle! */ | ||
746 | } | ||
747 | |||
748 | /* | ||
749 | * Enqueue an SRCU callback on the srcu_data structure associated with | ||
750 | * the current CPU and the specified srcu_struct structure, initiating | ||
751 | * grace-period processing if it is not already running. | ||
752 | * | ||
753 | * Note that all CPUs must agree that the grace period extended beyond | ||
754 | * all pre-existing SRCU read-side critical sections. On systems with | ||
755 | * more than one CPU, this means that when "func()" is invoked, each CPU | ||
756 | * is guaranteed to have executed a full memory barrier since the end of | ||
757 | * its last corresponding SRCU read-side critical section whose beginning | ||
758 | * preceded the call to call_srcu(). It also means that each CPU executing | ||
759 | * an SRCU read-side critical section that continues beyond the start of | ||
760 | * "func()" must have executed a memory barrier after the call_rcu() | ||
761 | * but before the beginning of that SRCU read-side critical section. | ||
762 | * Note that these guarantees include CPUs that are offline, idle, or | ||
763 | * executing in user mode, as well as CPUs that are executing in the kernel. | ||
764 | * | ||
765 | * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the | ||
766 | * resulting SRCU callback function "func()", then both CPU A and CPU | ||
767 | * B are guaranteed to execute a full memory barrier during the time | ||
768 | * interval between the call to call_srcu() and the invocation of "func()". | ||
769 | * This guarantee applies even if CPU A and CPU B are the same CPU (but | ||
770 | * again only if the system has more than one CPU). | ||
771 | * | ||
772 | * Of course, these guarantees apply only for invocations of call_srcu(), | ||
773 | * srcu_read_lock(), and srcu_read_unlock() that are all passed the same | ||
774 | * srcu_struct structure. | ||
775 | */ | ||
776 | void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, | ||
777 | rcu_callback_t func, bool do_norm) | ||
778 | { | ||
779 | unsigned long flags; | ||
780 | bool needexp = false; | ||
781 | bool needgp = false; | ||
782 | unsigned long s; | ||
783 | struct srcu_data *sdp; | ||
784 | |||
785 | check_init_srcu_struct(sp); | ||
786 | rhp->func = func; | ||
787 | local_irq_save(flags); | ||
788 | sdp = this_cpu_ptr(sp->sda); | ||
789 | spin_lock(&sdp->lock); | ||
790 | rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); | ||
791 | rcu_segcblist_advance(&sdp->srcu_cblist, | ||
792 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
793 | s = rcu_seq_snap(&sp->srcu_gp_seq); | ||
794 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); | ||
795 | if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { | ||
796 | sdp->srcu_gp_seq_needed = s; | ||
797 | needgp = true; | ||
798 | } | ||
799 | if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { | ||
800 | sdp->srcu_gp_seq_needed_exp = s; | ||
801 | needexp = true; | ||
802 | } | ||
803 | spin_unlock_irqrestore(&sdp->lock, flags); | ||
804 | if (needgp) | ||
805 | srcu_funnel_gp_start(sp, sdp, s, do_norm); | ||
806 | else if (needexp) | ||
807 | srcu_funnel_exp_start(sp, sdp->mynode, s); | ||
808 | } | ||
809 | |||
810 | void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, | ||
811 | rcu_callback_t func) | ||
812 | { | ||
813 | __call_srcu(sp, rhp, func, true); | ||
814 | } | ||
815 | EXPORT_SYMBOL_GPL(call_srcu); | ||
816 | |||
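The exported call_srcu() above is the asynchronous interface: the callback is queued on the invoking CPU's srcu_data, and the funnel decides whether a new grace period must be started. The classic pattern it enables is deferred freeing, sketched below; struct foo and its field names are illustrative, not from this patch.

#include <linux/slab.h>
#include <linux/srcu.h>

struct foo {
	struct rcu_head rh;
	/* ... payload protected by some srcu_struct ... */
};

static void foo_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct foo, rh));
}

static void foo_retire(struct srcu_struct *sp, struct foo *fp)
{
	/* Readers that found fp via srcu_dereference() may still hold it. */
	call_srcu(sp, &fp->rh, foo_free_cb);
	/* fp is freed only after a full SRCU grace period has elapsed. */
}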
817 | /* | ||
818 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | ||
819 | */ | ||
820 | static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) | ||
821 | { | ||
822 | struct rcu_synchronize rcu; | ||
823 | |||
824 | RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || | ||
825 | lock_is_held(&rcu_bh_lock_map) || | ||
826 | lock_is_held(&rcu_lock_map) || | ||
827 | lock_is_held(&rcu_sched_lock_map), | ||
828 | "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); | ||
829 | |||
830 | if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) | ||
831 | return; | ||
832 | might_sleep(); | ||
833 | check_init_srcu_struct(sp); | ||
834 | init_completion(&rcu.completion); | ||
835 | init_rcu_head_on_stack(&rcu.head); | ||
836 | __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); | ||
837 | wait_for_completion(&rcu.completion); | ||
838 | destroy_rcu_head_on_stack(&rcu.head); | ||
839 | } | ||
840 | |||
841 | /** | ||
842 | * synchronize_srcu_expedited - Brute-force SRCU grace period | ||
843 | * @sp: srcu_struct with which to synchronize. | ||
844 | * | ||
845 | * Wait for an SRCU grace period to elapse, but be more aggressive about | ||
846 | * spinning rather than blocking when waiting. | ||
847 | * | ||
848 | * Note that synchronize_srcu_expedited() has the same deadlock and | ||
849 | * memory-ordering properties as does synchronize_srcu(). | ||
850 | */ | ||
851 | void synchronize_srcu_expedited(struct srcu_struct *sp) | ||
852 | { | ||
853 | __synchronize_srcu(sp, rcu_gp_is_normal()); | ||
854 | } | ||
855 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | ||
856 | |||
857 | /** | ||
858 | * synchronize_srcu - wait for prior SRCU read-side critical-section completion | ||
859 | * @sp: srcu_struct with which to synchronize. | ||
860 | * | ||
861 | * Wait for the counts of both indexes to drain to zero. To avoid | ||
862 | * possible starvation of synchronize_srcu(), it first waits for the | ||
863 | * count of the index=((->srcu_idx & 1) ^ 1) to drain to zero, | ||
864 | * and then flips ->srcu_idx and waits for the count of the other index. | ||
865 | * | ||
866 | * Can block; must be called from process context. | ||
867 | * | ||
868 | * Note that it is illegal to call synchronize_srcu() from the corresponding | ||
869 | * SRCU read-side critical section; doing so will result in deadlock. | ||
870 | * However, it is perfectly legal to call synchronize_srcu() on one | ||
871 | * srcu_struct from some other srcu_struct's read-side critical section, | ||
872 | * as long as the resulting graph of srcu_structs is acyclic. | ||
873 | * | ||
874 | * There are memory-ordering constraints implied by synchronize_srcu(). | ||
875 | * On systems with more than one CPU, when synchronize_srcu() returns, | ||
876 | * each CPU is guaranteed to have executed a full memory barrier since | ||
877 | * the end of its last corresponding SRCU read-side critical section | ||
878 | * whose beginning preceded the call to synchronize_srcu(). In addition, | ||
879 | * each CPU having an SRCU read-side critical section that extends beyond | ||
880 | * the return from synchronize_srcu() is guaranteed to have executed a | ||
881 | * full memory barrier after the beginning of synchronize_srcu() and before | ||
882 | * the beginning of that SRCU read-side critical section. Note that these | ||
883 | * guarantees include CPUs that are offline, idle, or executing in user mode, | ||
884 | * as well as CPUs that are executing in the kernel. | ||
885 | * | ||
886 | * Furthermore, if CPU A invoked synchronize_srcu(), which returned | ||
887 | * to its caller on CPU B, then both CPU A and CPU B are guaranteed | ||
888 | * to have executed a full memory barrier during the execution of | ||
889 | * synchronize_srcu(). This guarantee applies even if CPU A and CPU B | ||
890 | * are the same CPU, but again only if the system has more than one CPU. | ||
891 | * | ||
892 | * Of course, these memory-ordering guarantees apply only when | ||
893 | * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are | ||
894 | * passed the same srcu_struct structure. | ||
895 | * | ||
896 | * If SRCU is likely idle, expedite the first request. This semantic | ||
897 | * was provided by Classic SRCU, and is relied upon by its users, so TREE | ||
898 | * SRCU must also provide it. Note that detecting idleness is heuristic | ||
899 | * and subject to both false positives and negatives. | ||
900 | */ | ||
901 | void synchronize_srcu(struct srcu_struct *sp) | ||
902 | { | ||
903 | if (srcu_might_be_idle(sp) || rcu_gp_is_expedited()) | ||
904 | synchronize_srcu_expedited(sp); | ||
905 | else | ||
906 | __synchronize_srcu(sp, true); | ||
907 | } | ||
908 | EXPORT_SYMBOL_GPL(synchronize_srcu); | ||
909 | |||
910 | /* | ||
911 | * Callback function for srcu_barrier() use. | ||
912 | */ | ||
913 | static void srcu_barrier_cb(struct rcu_head *rhp) | ||
914 | { | ||
915 | struct srcu_data *sdp; | ||
916 | struct srcu_struct *sp; | ||
917 | |||
918 | sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); | ||
919 | sp = sdp->sp; | ||
920 | if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) | ||
921 | complete(&sp->srcu_barrier_completion); | ||
922 | } | ||
923 | |||
924 | /** | ||
925 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
926 | * @sp: srcu_struct on which to wait for in-flight callbacks. | ||
927 | */ | ||
928 | void srcu_barrier(struct srcu_struct *sp) | ||
929 | { | ||
930 | int cpu; | ||
931 | struct srcu_data *sdp; | ||
932 | unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); | ||
933 | |||
934 | check_init_srcu_struct(sp); | ||
935 | mutex_lock(&sp->srcu_barrier_mutex); | ||
936 | if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { | ||
937 | smp_mb(); /* Force ordering following return. */ | ||
938 | mutex_unlock(&sp->srcu_barrier_mutex); | ||
939 | return; /* Someone else did our work for us. */ | ||
940 | } | ||
941 | rcu_seq_start(&sp->srcu_barrier_seq); | ||
942 | init_completion(&sp->srcu_barrier_completion); | ||
943 | |||
944 | /* Initial count prevents reaching zero until all CBs are posted. */ | ||
945 | atomic_set(&sp->srcu_barrier_cpu_cnt, 1); | ||
946 | |||
947 | /* | ||
948 | * Each pass through this loop enqueues a callback, but only | ||
949 | * on CPUs already having callbacks enqueued. Note that if | ||
950 | * a CPU already has callbacks enqueued, it must have already | ||
951 | * registered the need for a future grace period, so all we | ||
952 | * need do is enqueue a callback that will use the same | ||
953 | * grace period as the last callback already in the queue. | ||
954 | */ | ||
955 | for_each_possible_cpu(cpu) { | ||
956 | sdp = per_cpu_ptr(sp->sda, cpu); | ||
957 | spin_lock_irq(&sdp->lock); | ||
958 | atomic_inc(&sp->srcu_barrier_cpu_cnt); | ||
959 | sdp->srcu_barrier_head.func = srcu_barrier_cb; | ||
960 | if (!rcu_segcblist_entrain(&sdp->srcu_cblist, | ||
961 | &sdp->srcu_barrier_head, 0)) | ||
962 | atomic_dec(&sp->srcu_barrier_cpu_cnt); | ||
963 | spin_unlock_irq(&sdp->lock); | ||
964 | } | ||
965 | |||
966 | /* Remove the initial count, at which point reaching zero can happen. */ | ||
967 | if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) | ||
968 | complete(&sp->srcu_barrier_completion); | ||
969 | wait_for_completion(&sp->srcu_barrier_completion); | ||
970 | |||
971 | rcu_seq_end(&sp->srcu_barrier_seq); | ||
972 | mutex_unlock(&sp->srcu_barrier_mutex); | ||
973 | } | ||
974 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
975 | |||
976 | /** | ||
977 | * srcu_batches_completed - return batches completed. | ||
978 | * @sp: srcu_struct on which to report batch completion. | ||
979 | * | ||
980 | * Report the number of batches, correlated with, but not necessarily | ||
981 | * precisely the same as, the number of grace periods that have elapsed. | ||
982 | */ | ||
983 | unsigned long srcu_batches_completed(struct srcu_struct *sp) | ||
984 | { | ||
985 | return sp->srcu_idx; | ||
986 | } | ||
987 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | ||
988 | |||
989 | /* | ||
990 | * Core SRCU state machine. Push state bits of ->srcu_gp_seq | ||
991 | * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has | ||
992 | * completed in that state. | ||
993 | */ | ||
994 | static void srcu_advance_state(struct srcu_struct *sp) | ||
995 | { | ||
996 | int idx; | ||
997 | |||
998 | mutex_lock(&sp->srcu_gp_mutex); | ||
999 | |||
1000 | /* | ||
1001 | * Because readers might be delayed for an extended period after | ||
1002 | * fetching ->srcu_idx for their index, at any point in time there | ||
1003 | * might well be readers using both idx=0 and idx=1. We therefore | ||
1004 | * need to wait for readers to clear from both index values before | ||
1005 | * invoking a callback. | ||
1006 | * | ||
1007 | * The load-acquire ensures that we see the accesses performed | ||
1008 | * by the prior grace period. | ||
1009 | */ | ||
1010 | idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ | ||
1011 | if (idx == SRCU_STATE_IDLE) { | ||
1012 | spin_lock_irq(&sp->gp_lock); | ||
1013 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { | ||
1014 | WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); | ||
1015 | spin_unlock_irq(&sp->gp_lock); | ||
1016 | mutex_unlock(&sp->srcu_gp_mutex); | ||
1017 | return; | ||
1018 | } | ||
1019 | idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); | ||
1020 | if (idx == SRCU_STATE_IDLE) | ||
1021 | srcu_gp_start(sp); | ||
1022 | spin_unlock_irq(&sp->gp_lock); | ||
1023 | if (idx != SRCU_STATE_IDLE) { | ||
1024 | mutex_unlock(&sp->srcu_gp_mutex); | ||
1025 | return; /* Someone else started the grace period. */ | ||
1026 | } | ||
1027 | } | ||
1028 | |||
1029 | if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { | ||
1030 | idx = 1 ^ (sp->srcu_idx & 1); | ||
1031 | if (!try_check_zero(sp, idx, 1)) { | ||
1032 | mutex_unlock(&sp->srcu_gp_mutex); | ||
1033 | return; /* readers present, retry later. */ | ||
1034 | } | ||
1035 | srcu_flip(sp); | ||
1036 | rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); | ||
1037 | } | ||
1038 | |||
1039 | if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { | ||
1040 | |||
1041 | /* | ||
1042 | * SRCU read-side critical sections are normally short, | ||
1043 | * so check at least twice in quick succession after a flip. | ||
1044 | */ | ||
1045 | idx = 1 ^ (sp->srcu_idx & 1); | ||
1046 | if (!try_check_zero(sp, idx, 2)) { | ||
1047 | mutex_unlock(&sp->srcu_gp_mutex); | ||
1048 | return; /* readers present, retry later. */ | ||
1049 | } | ||
1050 | srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */ | ||
1051 | } | ||
1052 | } | ||
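For readers following the SCAN1/SCAN2 logic above, here is a rough single-threaded model of why the index is flipped between the two scans: readers that sampled the pre-flip index and readers that sample the post-flip index each get drained exactly once. This is an assumption-laden sketch of the idea only, not the kernel algorithm, and it ignores the counter/ordering details.

#include <stdio.h>

enum gp_state { STATE_IDLE, STATE_SCAN1, STATE_SCAN2 };

struct srcu_model {
        enum gp_state state;
        int idx;                /* index new readers currently use */
        int readers[2];         /* outstanding readers per index */
};

static int try_check_zero(struct srcu_model *m, int idx)
{
        return m->readers[idx] == 0;
}

static void advance(struct srcu_model *m)
{
        if (m->state == STATE_SCAN1) {
                if (!try_check_zero(m, !m->idx))
                        return;                 /* old-index readers remain */
                m->idx = !m->idx;               /* srcu_flip() analogue */
                m->state = STATE_SCAN2;
        }
        if (m->state == STATE_SCAN2) {
                if (!try_check_zero(m, !m->idx))
                        return;                 /* pre-flip readers remain */
                m->state = STATE_IDLE;          /* srcu_gp_end() analogue */
        }
}

int main(void)
{
        struct srcu_model m = { STATE_SCAN1, 0, { 0, 1 } };

        advance(&m);                    /* blocked: readers[1] != 0 */
        m.readers[1] = 0;               /* last old-index reader departs */
        advance(&m);                    /* flips, scans again, goes idle */
        printf("state=%d idx=%d\n", m.state, m.idx);
        return 0;
}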
1053 | |||
1054 | /* | ||
1055 | * Invoke a limited number of SRCU callbacks that have passed through | ||
1056 | * their grace period. If there are more to do, SRCU will reschedule | ||
1057 | * the workqueue. Note that needed memory barriers have been executed | ||
1058 | * in this task's context by srcu_readers_active_idx_check(). | ||
1059 | */ | ||
1060 | static void srcu_invoke_callbacks(struct work_struct *work) | ||
1061 | { | ||
1062 | bool more; | ||
1063 | struct rcu_cblist ready_cbs; | ||
1064 | struct rcu_head *rhp; | ||
1065 | struct srcu_data *sdp; | ||
1066 | struct srcu_struct *sp; | ||
1067 | |||
1068 | sdp = container_of(work, struct srcu_data, work.work); | ||
1069 | sp = sdp->sp; | ||
1070 | rcu_cblist_init(&ready_cbs); | ||
1071 | spin_lock_irq(&sdp->lock); | ||
1072 | smp_mb(); /* Old grace periods before callback invocation! */ | ||
1073 | rcu_segcblist_advance(&sdp->srcu_cblist, | ||
1074 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
1075 | if (sdp->srcu_cblist_invoking || | ||
1076 | !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { | ||
1077 | spin_unlock_irq(&sdp->lock); | ||
1078 | return; /* Someone else on the job or nothing to do. */ | ||
1079 | } | ||
1080 | |||
1081 | /* We are on the job! Extract and invoke ready callbacks. */ | ||
1082 | sdp->srcu_cblist_invoking = true; | ||
1083 | rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); | ||
1084 | spin_unlock_irq(&sdp->lock); | ||
1085 | rhp = rcu_cblist_dequeue(&ready_cbs); | ||
1086 | for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { | ||
1087 | local_bh_disable(); | ||
1088 | rhp->func(rhp); | ||
1089 | local_bh_enable(); | ||
1090 | } | ||
1091 | |||
1092 | /* | ||
1093 | * Update counts, accelerate new callbacks, and if needed, | ||
1094 | * schedule another round of callback invocation. | ||
1095 | */ | ||
1096 | spin_lock_irq(&sdp->lock); | ||
1097 | rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); | ||
1098 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, | ||
1099 | rcu_seq_snap(&sp->srcu_gp_seq)); | ||
1100 | sdp->srcu_cblist_invoking = false; | ||
1101 | more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); | ||
1102 | spin_unlock_irq(&sdp->lock); | ||
1103 | if (more) | ||
1104 | srcu_schedule_cbs_sdp(sdp, 0); | ||
1105 | } | ||
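The extract-under-lock / invoke-unlocked / recheck shape of srcu_invoke_callbacks() generalizes well beyond SRCU. The following is a self-contained userspace sketch of the same pattern built on pthreads; the types and helpers are the sketch's own, and it deliberately omits the segmented-list and acceleration machinery.

#include <pthread.h>
#include <stddef.h>

struct cb {
        struct cb *next;
        void (*func)(struct cb *);
};

struct cb_queue {
        pthread_mutex_t lock;
        struct cb *ready;               /* grace period already elapsed */
        int invoking;
};

static int invoke_ready(struct cb_queue *q)
{
        struct cb *list;
        int more;

        pthread_mutex_lock(&q->lock);
        if (q->invoking || !q->ready) {
                pthread_mutex_unlock(&q->lock);
                return 0;               /* someone else on the job, or idle */
        }
        q->invoking = 1;
        list = q->ready;                /* extract under the lock */
        q->ready = NULL;
        pthread_mutex_unlock(&q->lock);

        while (list) {                  /* invoke with the lock dropped */
                struct cb *next = list->next;

                list->func(list);
                list = next;
        }

        pthread_mutex_lock(&q->lock);
        q->invoking = 0;
        more = q->ready != NULL;        /* arrivals while we were busy? */
        pthread_mutex_unlock(&q->lock);
        return more;                    /* nonzero: caller should requeue */
}

static void say_done(struct cb *unused) { (void)unused; }

int main(void)
{
        struct cb one = { NULL, say_done };
        struct cb_queue q = { PTHREAD_MUTEX_INITIALIZER, &one, 0 };

        return invoke_ready(&q);        /* 0: nothing left to requeue */
}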
1106 | |||
1107 | /* | ||
1108 | * Finished one round of SRCU grace period. Start another if there are | ||
1109 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
1110 | */ | ||
1111 | static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) | ||
1112 | { | ||
1113 | bool pushgp = true; | ||
1114 | |||
1115 | spin_lock_irq(&sp->gp_lock); | ||
1116 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { | ||
1117 | if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { | ||
1118 | /* All requests fulfilled, time to go idle. */ | ||
1119 | pushgp = false; | ||
1120 | } | ||
1121 | } else if (!rcu_seq_state(sp->srcu_gp_seq)) { | ||
1122 | /* Outstanding request and no GP. Start one. */ | ||
1123 | srcu_gp_start(sp); | ||
1124 | } | ||
1125 | spin_unlock_irq(&sp->gp_lock); | ||
1126 | |||
1127 | if (pushgp) | ||
1128 | queue_delayed_work(system_power_efficient_wq, &sp->work, delay); | ||
1129 | } | ||
1130 | |||
1131 | /* | ||
1132 | * This is the work-queue function that handles SRCU grace periods. | ||
1133 | */ | ||
1134 | void process_srcu(struct work_struct *work) | ||
1135 | { | ||
1136 | struct srcu_struct *sp; | ||
1137 | |||
1138 | sp = container_of(work, struct srcu_struct, work.work); | ||
1139 | |||
1140 | srcu_advance_state(sp); | ||
1141 | srcu_reschedule(sp, srcu_get_delay(sp)); | ||
1142 | } | ||
1143 | EXPORT_SYMBOL_GPL(process_srcu); | ||
1144 | |||
1145 | void srcutorture_get_gp_data(enum rcutorture_type test_type, | ||
1146 | struct srcu_struct *sp, int *flags, | ||
1147 | unsigned long *gpnum, unsigned long *completed) | ||
1148 | { | ||
1149 | if (test_type != SRCU_FLAVOR) | ||
1150 | return; | ||
1151 | *flags = 0; | ||
1152 | *completed = rcu_seq_ctr(sp->srcu_gp_seq); | ||
1153 | *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); | ||
1154 | } | ||
1155 | EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); | ||
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 6ad330dbbae2..e5385731e391 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching); | |||
79 | */ | 79 | */ |
80 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 80 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) |
81 | { | 81 | { |
82 | RCU_TRACE(reset_cpu_stall_ticks(rcp)); | 82 | RCU_TRACE(reset_cpu_stall_ticks(rcp);) |
83 | if (rcp->donetail != rcp->curtail) { | 83 | if (rcp->donetail != rcp->curtail) { |
84 | rcp->donetail = rcp->curtail; | 84 | rcp->donetail = rcp->curtail; |
85 | return 1; | 85 | return 1; |
@@ -125,7 +125,7 @@ void rcu_bh_qs(void) | |||
125 | */ | 125 | */ |
126 | void rcu_check_callbacks(int user) | 126 | void rcu_check_callbacks(int user) |
127 | { | 127 | { |
128 | RCU_TRACE(check_cpu_stalls()); | 128 | RCU_TRACE(check_cpu_stalls();) |
129 | if (user) | 129 | if (user) |
130 | rcu_sched_qs(); | 130 | rcu_sched_qs(); |
131 | else if (!in_softirq()) | 131 | else if (!in_softirq()) |
@@ -143,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
143 | const char *rn = NULL; | 143 | const char *rn = NULL; |
144 | struct rcu_head *next, *list; | 144 | struct rcu_head *next, *list; |
145 | unsigned long flags; | 145 | unsigned long flags; |
146 | RCU_TRACE(int cb_count = 0); | 146 | RCU_TRACE(int cb_count = 0;) |
147 | 147 | ||
148 | /* Move the ready-to-invoke callbacks to a local list. */ | 148 | /* Move the ready-to-invoke callbacks to a local list. */ |
149 | local_irq_save(flags); | 149 | local_irq_save(flags); |
@@ -152,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
152 | local_irq_restore(flags); | 152 | local_irq_restore(flags); |
153 | return; | 153 | return; |
154 | } | 154 | } |
155 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); | 155 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) |
156 | list = rcp->rcucblist; | 156 | list = rcp->rcucblist; |
157 | rcp->rcucblist = *rcp->donetail; | 157 | rcp->rcucblist = *rcp->donetail; |
158 | *rcp->donetail = NULL; | 158 | *rcp->donetail = NULL; |
@@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
162 | local_irq_restore(flags); | 162 | local_irq_restore(flags); |
163 | 163 | ||
164 | /* Invoke the callbacks on the local list. */ | 164 | /* Invoke the callbacks on the local list. */ |
165 | RCU_TRACE(rn = rcp->name); | 165 | RCU_TRACE(rn = rcp->name;) |
166 | while (list) { | 166 | while (list) { |
167 | next = list->next; | 167 | next = list->next; |
168 | prefetch(next); | 168 | prefetch(next); |
@@ -171,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
171 | __rcu_reclaim(rn, list); | 171 | __rcu_reclaim(rn, list); |
172 | local_bh_enable(); | 172 | local_bh_enable(); |
173 | list = next; | 173 | list = next; |
174 | RCU_TRACE(cb_count++); | 174 | RCU_TRACE(cb_count++;) |
175 | } | 175 | } |
176 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | 176 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) |
177 | RCU_TRACE(trace_rcu_batch_end(rcp->name, | 177 | RCU_TRACE(trace_rcu_batch_end(rcp->name, |
178 | cb_count, 0, need_resched(), | 178 | cb_count, 0, need_resched(), |
179 | is_idle_task(current), | 179 | is_idle_task(current), |
@@ -221,7 +221,7 @@ static void __call_rcu(struct rcu_head *head, | |||
221 | local_irq_save(flags); | 221 | local_irq_save(flags); |
222 | *rcp->curtail = head; | 222 | *rcp->curtail = head; |
223 | rcp->curtail = &head->next; | 223 | rcp->curtail = &head->next; |
224 | RCU_TRACE(rcp->qlen++); | 224 | RCU_TRACE(rcp->qlen++;) |
225 | local_irq_restore(flags); | 225 | local_irq_restore(flags); |
226 | 226 | ||
227 | if (unlikely(is_idle_task(current))) { | 227 | if (unlikely(is_idle_task(current))) { |
@@ -254,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
254 | void __init rcu_init(void) | 254 | void __init rcu_init(void) |
255 | { | 255 | { |
256 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 256 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
257 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); | 257 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) |
258 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); | 258 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) |
259 | 259 | ||
260 | rcu_early_boot_tests(); | 260 | rcu_early_boot_tests(); |
261 | } | 261 | } |
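The repeated RCU_TRACE(x); to RCU_TRACE(x;) conversion in tiny.c is easier to read with the macro's likely shape in view. The sketch below assumes RCU_TRACE() expands to its argument when CONFIG_RCU_TRACE=y and to nothing otherwise; with the semicolon inside the parentheses, an entire traced statement or declaration — semicolon included — vanishes cleanly when tracing is compiled out.

/* Assumed shape of the macro; see the kernel's rcu headers for the
 * real definition. */
#ifdef CONFIG_RCU_TRACE
#define RCU_TRACE(stmt) stmt
#else
#define RCU_TRACE(stmt)
#endif

void traced_example(void)
{
        RCU_TRACE(int cb_count = 0;)    /* whole declaration can disappear */
        RCU_TRACE(cb_count++;)          /* as can each traced statement */
}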
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index c64b827ecbca..371034e77f87 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h | |||
@@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { | |||
52 | RCU_TRACE(.name = "rcu_bh") | 52 | RCU_TRACE(.name = "rcu_bh") |
53 | }; | 53 | }; |
54 | 54 | ||
55 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 55 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) |
56 | #include <linux/kernel_stat.h> | 56 | #include <linux/kernel_stat.h> |
57 | 57 | ||
58 | int rcu_scheduler_active __read_mostly; | 58 | int rcu_scheduler_active __read_mostly; |
@@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); | |||
65 | * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. | 65 | * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. |
66 | * The reason for this is that Tiny RCU does not need kthreads, so does | 66 | * The reason for this is that Tiny RCU does not need kthreads, so does |
67 | * not have to care about the fact that the scheduler is half-initialized | 67 | * not have to care about the fact that the scheduler is half-initialized |
68 | * at a certain phase of the boot process. | 68 | * at a certain phase of the boot process. Unless SRCU is in the mix. |
69 | */ | 69 | */ |
70 | void __init rcu_scheduler_starting(void) | 70 | void __init rcu_scheduler_starting(void) |
71 | { | 71 | { |
72 | WARN_ON(nr_context_switches() > 0); | 72 | WARN_ON(nr_context_switches() > 0); |
73 | rcu_scheduler_active = RCU_SCHEDULER_RUNNING; | 73 | rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) |
74 | ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; | ||
74 | } | 75 | } |
75 | 76 | ||
76 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 77 | #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ |
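A compressed view of the boot-path change above, as a sketch only: with SRCU built in, Tiny RCU now passes through the INIT scheduler state instead of jumping straight to RUNNING. The stated comment ("Unless SRCU is in the mix") suggests early SRCU users need to see that full scheduling is not yet available; that reasoning, and the names below, are assumptions of this model.

enum scheduler_state { SCHEDULER_INACTIVE, SCHEDULER_INIT, SCHEDULER_RUNNING };

static enum scheduler_state scheduler_starting(int srcu_builtin)
{
        return srcu_builtin ? SCHEDULER_INIT : SCHEDULER_RUNNING;
}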
77 | 78 | ||
78 | #ifdef CONFIG_RCU_TRACE | 79 | #ifdef CONFIG_RCU_TRACE |
79 | 80 | ||
@@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | |||
162 | 163 | ||
163 | static void check_cpu_stalls(void) | 164 | static void check_cpu_stalls(void) |
164 | { | 165 | { |
165 | RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); | 166 | RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);) |
166 | RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); | 167 | RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);) |
167 | } | 168 | } |
168 | 169 | ||
169 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 170 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 50fee7689e71..e354e475e645 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/random.h> | 57 | #include <linux/random.h> |
58 | #include <linux/trace_events.h> | 58 | #include <linux/trace_events.h> |
59 | #include <linux/suspend.h> | 59 | #include <linux/suspend.h> |
60 | #include <linux/ftrace.h> | ||
60 | 61 | ||
61 | #include "tree.h" | 62 | #include "tree.h" |
62 | #include "rcu.h" | 63 | #include "rcu.h" |
@@ -97,8 +98,8 @@ struct rcu_state sname##_state = { \ | |||
97 | .gpnum = 0UL - 300UL, \ | 98 | .gpnum = 0UL - 300UL, \ |
98 | .completed = 0UL - 300UL, \ | 99 | .completed = 0UL - 300UL, \ |
99 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ | 100 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ |
100 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 101 | .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ |
101 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 102 | .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ |
102 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 103 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
103 | .name = RCU_STATE_NAME(sname), \ | 104 | .name = RCU_STATE_NAME(sname), \ |
104 | .abbr = sabbr, \ | 105 | .abbr = sabbr, \ |
@@ -123,7 +124,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF; | |||
123 | module_param(rcu_fanout_leaf, int, 0444); | 124 | module_param(rcu_fanout_leaf, int, 0444); |
124 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | 125 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; |
125 | /* Number of rcu_nodes at specified level. */ | 126 | /* Number of rcu_nodes at specified level. */ |
126 | static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; | 127 | int num_rcu_lvl[] = NUM_RCU_LVL_INIT; |
127 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ | 128 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ |
128 | /* panic() on RCU Stall sysctl. */ | 129 | /* panic() on RCU Stall sysctl. */ |
129 | int sysctl_panic_on_rcu_stall __read_mostly; | 130 | int sysctl_panic_on_rcu_stall __read_mostly; |
@@ -199,7 +200,7 @@ static const int gp_cleanup_delay; | |||
199 | 200 | ||
200 | /* | 201 | /* |
201 | * Number of grace periods between delays, normalized by the duration of | 202 | * Number of grace periods between delays, normalized by the duration of |
202 | * the delay. The longer the the delay, the more the grace periods between | 203 | * the delay. The longer the delay, the more the grace periods between |
203 | * each delay. The reason for this normalization is that it means that, | 204 | * each delay. The reason for this normalization is that it means that, |
204 | * for non-zero delays, the overall slowdown of grace periods is constant | 205 | * for non-zero delays, the overall slowdown of grace periods is constant |
205 | * regardless of the duration of the delay. This arrangement balances | 206 | * regardless of the duration of the delay. This arrangement balances |
@@ -272,11 +273,19 @@ void rcu_bh_qs(void) | |||
272 | } | 273 | } |
273 | } | 274 | } |
274 | 275 | ||
275 | static DEFINE_PER_CPU(int, rcu_sched_qs_mask); | 276 | /* |
277 | * Steal a bit from the bottom of ->dynticks for idle entry/exit | ||
278 | * control. Initially this is for TLB flushing. | ||
279 | */ | ||
280 | #define RCU_DYNTICK_CTRL_MASK 0x1 | ||
281 | #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) | ||
282 | #ifndef rcu_eqs_special_exit | ||
283 | #define rcu_eqs_special_exit() do { } while (0) | ||
284 | #endif | ||
276 | 285 | ||
277 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 286 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
278 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 287 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
279 | .dynticks = ATOMIC_INIT(1), | 288 | .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), |
280 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 289 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
281 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | 290 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, |
282 | .dynticks_idle = ATOMIC_INIT(1), | 291 | .dynticks_idle = ATOMIC_INIT(1), |
@@ -284,21 +293,40 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | |||
284 | }; | 293 | }; |
285 | 294 | ||
286 | /* | 295 | /* |
296 | * There are a few places, currently just in the tracing infrastructure, | ||
297 | * that use rcu_irq_enter() to make sure RCU is watching. But there's | ||

298 | * a small location where that will not even work. In those cases | ||
299 | * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter() | ||
300 | * can be called. | ||
301 | */ | ||
302 | static DEFINE_PER_CPU(bool, disable_rcu_irq_enter); | ||
303 | |||
304 | bool rcu_irq_enter_disabled(void) | ||
305 | { | ||
306 | return this_cpu_read(disable_rcu_irq_enter); | ||
307 | } | ||
308 | |||
309 | /* | ||
287 | * Record entry into an extended quiescent state. This is only to be | 310 | * Record entry into an extended quiescent state. This is only to be |
288 | * called when not already in an extended quiescent state. | 311 | * called when not already in an extended quiescent state. |
289 | */ | 312 | */ |
290 | static void rcu_dynticks_eqs_enter(void) | 313 | static void rcu_dynticks_eqs_enter(void) |
291 | { | 314 | { |
292 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 315 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
293 | int special; | 316 | int seq; |
294 | 317 | ||
295 | /* | 318 | /* |
296 | * CPUs seeing atomic_inc_return() must see prior RCU read-side | 319 | * CPUs seeing atomic_add_return() must see prior RCU read-side |
297 | * critical sections, and we also must force ordering with the | 320 | * critical sections, and we also must force ordering with the |
298 | * next idle sojourn. | 321 | * next idle sojourn. |
299 | */ | 322 | */ |
300 | special = atomic_inc_return(&rdtp->dynticks); | 323 | seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); |
301 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); | 324 | /* Better be in an extended quiescent state! */ |
325 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | ||
326 | (seq & RCU_DYNTICK_CTRL_CTR)); | ||
327 | /* Better not have special action (TLB flush) pending! */ | ||
328 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | ||
329 | (seq & RCU_DYNTICK_CTRL_MASK)); | ||
302 | } | 330 | } |
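The RCU_DYNTICK_CTRL_MASK / RCU_DYNTICK_CTRL_CTR arithmetic is easy to model in userspace. The sketch below (C11 atomics, not the kernel code) shows how stepping the counter by 2 keeps bit 0 free as a "special action pending" flag while bit 1 still distinguishes idle from non-idle, and how the flag is consumed on exit from the extended quiescent state.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CTRL_MASK 0x1                   /* special-action-pending flag */
#define CTRL_CTR  (CTRL_MASK + 1)       /* counter increment: 2 */

static atomic_int dynticks = CTRL_CTR;  /* start out non-idle */

static bool in_eqs(int snap)
{
        return !(snap & CTRL_CTR);      /* counter bit clear => idle */
}

static void eqs_enter(void)
{
        atomic_fetch_add(&dynticks, CTRL_CTR);
}

static void eqs_exit(void)
{
        int seq = atomic_fetch_add(&dynticks, CTRL_CTR) + CTRL_CTR;

        if (seq & CTRL_MASK) {
                /* Someone asked for work while we were idle. */
                atomic_fetch_and(&dynticks, ~CTRL_MASK);
                /* ...perform the deferred action, e.g. a TLB flush... */
        }
}

int main(void)
{
        printf("idle? %d\n", in_eqs(atomic_load(&dynticks)));  /* 0 */
        eqs_enter();
        printf("idle? %d\n", in_eqs(atomic_load(&dynticks)));  /* 1 */
        eqs_exit();
        printf("idle? %d\n", in_eqs(atomic_load(&dynticks)));  /* 0 */
        return 0;
}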
303 | 331 | ||
304 | /* | 332 | /* |
@@ -308,15 +336,22 @@ static void rcu_dynticks_eqs_enter(void) | |||
308 | static void rcu_dynticks_eqs_exit(void) | 336 | static void rcu_dynticks_eqs_exit(void) |
309 | { | 337 | { |
310 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 338 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
311 | int special; | 339 | int seq; |
312 | 340 | ||
313 | /* | 341 | /* |
314 | * CPUs seeing atomic_inc_return() must see prior idle sojourns, | 342 | * CPUs seeing atomic_add_return() must see prior idle sojourns, |
315 | * and we also must force ordering with the next RCU read-side | 343 | * and we also must force ordering with the next RCU read-side |
316 | * critical section. | 344 | * critical section. |
317 | */ | 345 | */ |
318 | special = atomic_inc_return(&rdtp->dynticks); | 346 | seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); |
319 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); | 347 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && |
348 | !(seq & RCU_DYNTICK_CTRL_CTR)); | ||
349 | if (seq & RCU_DYNTICK_CTRL_MASK) { | ||
350 | atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); | ||
351 | smp_mb__after_atomic(); /* _exit after clearing mask. */ | ||
352 | /* Prefer duplicate flushes to losing a flush. */ | ||
353 | rcu_eqs_special_exit(); | ||
354 | } | ||
320 | } | 355 | } |
321 | 356 | ||
322 | /* | 357 | /* |
@@ -333,9 +368,9 @@ static void rcu_dynticks_eqs_online(void) | |||
333 | { | 368 | { |
334 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 369 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
335 | 370 | ||
336 | if (atomic_read(&rdtp->dynticks) & 0x1) | 371 | if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) |
337 | return; | 372 | return; |
338 | atomic_add(0x1, &rdtp->dynticks); | 373 | atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); |
339 | } | 374 | } |
340 | 375 | ||
341 | /* | 376 | /* |
@@ -347,7 +382,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) | |||
347 | { | 382 | { |
348 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 383 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
349 | 384 | ||
350 | return !(atomic_read(&rdtp->dynticks) & 0x1); | 385 | return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); |
351 | } | 386 | } |
352 | 387 | ||
353 | /* | 388 | /* |
@@ -358,7 +393,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) | |||
358 | { | 393 | { |
359 | int snap = atomic_add_return(0, &rdtp->dynticks); | 394 | int snap = atomic_add_return(0, &rdtp->dynticks); |
360 | 395 | ||
361 | return snap; | 396 | return snap & ~RCU_DYNTICK_CTRL_MASK; |
362 | } | 397 | } |
363 | 398 | ||
364 | /* | 399 | /* |
@@ -367,7 +402,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) | |||
367 | */ | 402 | */ |
368 | static bool rcu_dynticks_in_eqs(int snap) | 403 | static bool rcu_dynticks_in_eqs(int snap) |
369 | { | 404 | { |
370 | return !(snap & 0x1); | 405 | return !(snap & RCU_DYNTICK_CTRL_CTR); |
371 | } | 406 | } |
372 | 407 | ||
373 | /* | 408 | /* |
@@ -387,14 +422,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) | |||
387 | static void rcu_dynticks_momentary_idle(void) | 422 | static void rcu_dynticks_momentary_idle(void) |
388 | { | 423 | { |
389 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 424 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
390 | int special = atomic_add_return(2, &rdtp->dynticks); | 425 | int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, |
426 | &rdtp->dynticks); | ||
391 | 427 | ||
392 | /* It is illegal to call this from idle state. */ | 428 | /* It is illegal to call this from idle state. */ |
393 | WARN_ON_ONCE(!(special & 0x1)); | 429 | WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); |
394 | } | 430 | } |
395 | 431 | ||
396 | DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); | 432 | /* |
397 | EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); | 433 | * Set the special (bottom) bit of the specified CPU so that it |
434 | * will take special action (such as flushing its TLB) on the | ||
435 | * next exit from an extended quiescent state. Returns true if | ||
436 | * the bit was successfully set, or false if the CPU was not in | ||
437 | * an extended quiescent state. | ||
438 | */ | ||
439 | bool rcu_eqs_special_set(int cpu) | ||
440 | { | ||
441 | int old; | ||
442 | int new; | ||
443 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
444 | |||
445 | do { | ||
446 | old = atomic_read(&rdtp->dynticks); | ||
447 | if (old & RCU_DYNTICK_CTRL_CTR) | ||
448 | return false; | ||
449 | new = old | RCU_DYNTICK_CTRL_MASK; | ||
450 | } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); | ||
451 | return true; | ||
452 | } | ||
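The compare-and-exchange loop in rcu_eqs_special_set() is a standard "set a flag only while a condition still holds" pattern. Here is a self-contained userspace sketch of the same idea with C11 atomics; the macro values mirror the bit layout above, and the function name is the sketch's own.

#include <stdatomic.h>
#include <stdbool.h>

#define CTRL_MASK 0x1                   /* special-action-pending flag */
#define CTRL_CTR  (CTRL_MASK + 1)       /* idle/non-idle counter bit */

/* Set the flag bit only while the target still looks idle. */
static bool request_special(atomic_int *dynticks)
{
        int old = atomic_load(dynticks);

        do {
                if (old & CTRL_CTR)
                        return false;   /* target went non-idle: give up */
        } while (!atomic_compare_exchange_weak(dynticks, &old,
                                               old | CTRL_MASK));
        return true;
}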
398 | 453 | ||
399 | /* | 454 | /* |
400 | * Let the RCU core know that this CPU has gone through the scheduler, | 455 | * Let the RCU core know that this CPU has gone through the scheduler, |
@@ -403,44 +458,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); | |||
403 | * memory barriers to let the RCU core know about it, regardless of what | 458 | * memory barriers to let the RCU core know about it, regardless of what |
404 | * this CPU might (or might not) do in the near future. | 459 | * this CPU might (or might not) do in the near future. |
405 | * | 460 | * |
406 | * We inform the RCU core by emulating a zero-duration dyntick-idle | 461 | * We inform the RCU core by emulating a zero-duration dyntick-idle period. |
407 | * period, which we in turn do by incrementing the ->dynticks counter | ||
408 | * by two. | ||
409 | * | 462 | * |
410 | * The caller must have disabled interrupts. | 463 | * The caller must have disabled interrupts. |
411 | */ | 464 | */ |
412 | static void rcu_momentary_dyntick_idle(void) | 465 | static void rcu_momentary_dyntick_idle(void) |
413 | { | 466 | { |
414 | struct rcu_data *rdp; | 467 | raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); |
415 | int resched_mask; | 468 | rcu_dynticks_momentary_idle(); |
416 | struct rcu_state *rsp; | ||
417 | |||
418 | /* | ||
419 | * Yes, we can lose flag-setting operations. This is OK, because | ||
420 | * the flag will be set again after some delay. | ||
421 | */ | ||
422 | resched_mask = raw_cpu_read(rcu_sched_qs_mask); | ||
423 | raw_cpu_write(rcu_sched_qs_mask, 0); | ||
424 | |||
425 | /* Find the flavor that needs a quiescent state. */ | ||
426 | for_each_rcu_flavor(rsp) { | ||
427 | rdp = raw_cpu_ptr(rsp->rda); | ||
428 | if (!(resched_mask & rsp->flavor_mask)) | ||
429 | continue; | ||
430 | smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ | ||
431 | if (READ_ONCE(rdp->mynode->completed) != | ||
432 | READ_ONCE(rdp->cond_resched_completed)) | ||
433 | continue; | ||
434 | |||
435 | /* | ||
436 | * Pretend to be momentarily idle for the quiescent state. | ||
437 | * This allows the grace-period kthread to record the | ||
438 | * quiescent state, with no need for this CPU to do anything | ||
439 | * further. | ||
440 | */ | ||
441 | rcu_dynticks_momentary_idle(); | ||
442 | break; | ||
443 | } | ||
444 | } | 469 | } |
445 | 470 | ||
446 | /* | 471 | /* |
@@ -448,14 +473,22 @@ static void rcu_momentary_dyntick_idle(void) | |||
448 | * and requires special handling for preemptible RCU. | 473 | * and requires special handling for preemptible RCU. |
449 | * The caller must have disabled interrupts. | 474 | * The caller must have disabled interrupts. |
450 | */ | 475 | */ |
451 | void rcu_note_context_switch(void) | 476 | void rcu_note_context_switch(bool preempt) |
452 | { | 477 | { |
453 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | 478 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ |
454 | trace_rcu_utilization(TPS("Start context switch")); | 479 | trace_rcu_utilization(TPS("Start context switch")); |
455 | rcu_sched_qs(); | 480 | rcu_sched_qs(); |
456 | rcu_preempt_note_context_switch(); | 481 | rcu_preempt_note_context_switch(); |
457 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | 482 | /* Load rcu_urgent_qs before other flags. */ |
483 | if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) | ||
484 | goto out; | ||
485 | this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); | ||
486 | if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) | ||
458 | rcu_momentary_dyntick_idle(); | 487 | rcu_momentary_dyntick_idle(); |
488 | this_cpu_inc(rcu_dynticks.rcu_qs_ctr); | ||
489 | if (!preempt) | ||
490 | rcu_note_voluntary_context_switch_lite(current); | ||
491 | out: | ||
459 | trace_rcu_utilization(TPS("End context switch")); | 492 | trace_rcu_utilization(TPS("End context switch")); |
460 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | 493 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ |
461 | } | 494 | } |
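The new rcu_urgent_qs / rcu_need_heavy_qs handshake pairs a release store on the requesting side with an acquire load in the context-switch fast path, so the common case costs one plain load. The userspace sketch below shows both halves; it is a simplified model with its own names, not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>

struct cpu_hints {
        atomic_bool urgent_qs;          /* "please report a QS soon" */
        atomic_bool need_heavy_qs;      /* "emulate dyntick-idle for me" */
};

/* Requester side: publish the hint with release semantics. */
static void request_urgent_qs(struct cpu_hints *h, bool heavy)
{
        if (heavy)
                atomic_store_explicit(&h->need_heavy_qs, true,
                                      memory_order_relaxed);
        /* Store need_heavy_qs before urgent_qs. */
        atomic_store_explicit(&h->urgent_qs, true, memory_order_release);
}

static void momentary_idle(void)
{
        /* stand-in for rcu_momentary_dyntick_idle() */
}

/* Context-switch side: cheap check first, heavy work only on demand. */
static void note_context_switch(struct cpu_hints *h)
{
        if (!atomic_load_explicit(&h->urgent_qs, memory_order_acquire))
                return;                 /* common case: nothing requested */
        atomic_store_explicit(&h->urgent_qs, false, memory_order_relaxed);
        if (atomic_load_explicit(&h->need_heavy_qs, memory_order_relaxed))
                momentary_idle();
        /* ...then record the ordinary quiescent state... */
}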
@@ -478,29 +511,26 @@ void rcu_all_qs(void) | |||
478 | { | 511 | { |
479 | unsigned long flags; | 512 | unsigned long flags; |
480 | 513 | ||
514 | if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) | ||
515 | return; | ||
516 | preempt_disable(); | ||
517 | /* Load rcu_urgent_qs before other flags. */ | ||
518 | if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { | ||
519 | preempt_enable(); | ||
520 | return; | ||
521 | } | ||
522 | this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); | ||
481 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | 523 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ |
482 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { | 524 | if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { |
483 | local_irq_save(flags); | 525 | local_irq_save(flags); |
484 | rcu_momentary_dyntick_idle(); | 526 | rcu_momentary_dyntick_idle(); |
485 | local_irq_restore(flags); | 527 | local_irq_restore(flags); |
486 | } | 528 | } |
487 | if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { | 529 | if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) |
488 | /* | ||
489 | * Yes, we just checked a per-CPU variable with preemption | ||
490 | * enabled, so we might be migrated to some other CPU at | ||
491 | * this point. That is OK because in that case, the | ||
492 | * migration will supply the needed quiescent state. | ||
493 | * We might end up needlessly disabling preemption and | ||
494 | * invoking rcu_sched_qs() on the destination CPU, but | ||
495 | * the probability and cost are both quite low, so this | ||
496 | * should not be a problem in practice. | ||
497 | */ | ||
498 | preempt_disable(); | ||
499 | rcu_sched_qs(); | 530 | rcu_sched_qs(); |
500 | preempt_enable(); | 531 | this_cpu_inc(rcu_dynticks.rcu_qs_ctr); |
501 | } | ||
502 | this_cpu_inc(rcu_qs_ctr); | ||
503 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | 532 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ |
533 | preempt_enable(); | ||
504 | } | 534 | } |
505 | EXPORT_SYMBOL_GPL(rcu_all_qs); | 535 | EXPORT_SYMBOL_GPL(rcu_all_qs); |
506 | 536 | ||
@@ -689,15 +719,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, | |||
689 | default: | 719 | default: |
690 | break; | 720 | break; |
691 | } | 721 | } |
692 | if (rsp != NULL) { | 722 | if (rsp == NULL) |
693 | *flags = READ_ONCE(rsp->gp_flags); | ||
694 | *gpnum = READ_ONCE(rsp->gpnum); | ||
695 | *completed = READ_ONCE(rsp->completed); | ||
696 | return; | 723 | return; |
697 | } | 724 | *flags = READ_ONCE(rsp->gp_flags); |
698 | *flags = 0; | 725 | *gpnum = READ_ONCE(rsp->gpnum); |
699 | *gpnum = 0; | 726 | *completed = READ_ONCE(rsp->completed); |
700 | *completed = 0; | ||
701 | } | 727 | } |
702 | EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); | 728 | EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); |
703 | 729 | ||
@@ -713,16 +739,6 @@ void rcutorture_record_progress(unsigned long vernum) | |||
713 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | 739 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); |
714 | 740 | ||
715 | /* | 741 | /* |
716 | * Does the CPU have callbacks ready to be invoked? | ||
717 | */ | ||
718 | static int | ||
719 | cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | ||
720 | { | ||
721 | return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && | ||
722 | rdp->nxttail[RCU_NEXT_TAIL] != NULL; | ||
723 | } | ||
724 | |||
725 | /* | ||
726 | * Return the root node of the specified rcu_state structure. | 742 | * Return the root node of the specified rcu_state structure. |
727 | */ | 743 | */ |
728 | static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | 744 | static struct rcu_node *rcu_get_root(struct rcu_state *rsp) |
@@ -752,44 +768,39 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) | |||
752 | static bool | 768 | static bool |
753 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | 769 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) |
754 | { | 770 | { |
755 | int i; | ||
756 | |||
757 | if (rcu_gp_in_progress(rsp)) | 771 | if (rcu_gp_in_progress(rsp)) |
758 | return false; /* No, a grace period is already in progress. */ | 772 | return false; /* No, a grace period is already in progress. */ |
759 | if (rcu_future_needs_gp(rsp)) | 773 | if (rcu_future_needs_gp(rsp)) |
760 | return true; /* Yes, a no-CBs CPU needs one. */ | 774 | return true; /* Yes, a no-CBs CPU needs one. */ |
761 | if (!rdp->nxttail[RCU_NEXT_TAIL]) | 775 | if (!rcu_segcblist_is_enabled(&rdp->cblist)) |
762 | return false; /* No, this is a no-CBs (or offline) CPU. */ | 776 | return false; /* No, this is a no-CBs (or offline) CPU. */ |
763 | if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) | 777 | if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) |
764 | return true; /* Yes, CPU has newly registered callbacks. */ | 778 | return true; /* Yes, CPU has newly registered callbacks. */ |
765 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) | 779 | if (rcu_segcblist_future_gp_needed(&rdp->cblist, |
766 | if (rdp->nxttail[i - 1] != rdp->nxttail[i] && | 780 | READ_ONCE(rsp->completed))) |
767 | ULONG_CMP_LT(READ_ONCE(rsp->completed), | 781 | return true; /* Yes, CBs for future grace period. */ |
768 | rdp->nxtcompleted[i])) | ||
769 | return true; /* Yes, CBs for future grace period. */ | ||
770 | return false; /* No grace period needed. */ | 782 | return false; /* No grace period needed. */ |
771 | } | 783 | } |
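The rewritten cpu_needs_another_gp() reduces to a short chain of tests on the per-CPU segmented callback list. The sketch below restates that decision order as a plain boolean model; the struct and field names are the model's own, and wraparound-safe sequence comparison is omitted.

#include <stdbool.h>

struct gp_view {
        bool gp_in_progress;            /* a grace period is already running */
        bool nocb_future_gp_needed;     /* a no-CBs CPU needs one */
        bool cblist_enabled;            /* this CPU handles its own callbacks */
        bool has_new_callbacks;         /* newly registered, not yet assigned */
        bool cbs_wait_future_gp;        /* assigned to a future grace period */
};

static bool needs_another_gp(const struct gp_view *v)
{
        if (v->gp_in_progress)
                return false;
        if (v->nocb_future_gp_needed)
                return true;
        if (!v->cblist_enabled)
                return false;
        return v->has_new_callbacks || v->cbs_wait_future_gp;
}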
772 | 784 | ||
773 | /* | 785 | /* |
774 | * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state | 786 | * rcu_eqs_enter_common - current CPU is entering an extended quiescent state |
775 | * | 787 | * |
776 | * If the new value of the ->dynticks_nesting counter now is zero, | 788 | * Enter idle, doing appropriate accounting. The caller must have |
777 | * we really have entered idle, and must do the appropriate accounting. | 789 | * disabled interrupts. |
778 | * The caller must have disabled interrupts. | ||
779 | */ | 790 | */ |
780 | static void rcu_eqs_enter_common(long long oldval, bool user) | 791 | static void rcu_eqs_enter_common(bool user) |
781 | { | 792 | { |
782 | struct rcu_state *rsp; | 793 | struct rcu_state *rsp; |
783 | struct rcu_data *rdp; | 794 | struct rcu_data *rdp; |
784 | RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) | 795 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
785 | 796 | ||
786 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); | 797 | trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); |
787 | if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | 798 | if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && |
788 | !user && !is_idle_task(current)) { | 799 | !user && !is_idle_task(current)) { |
789 | struct task_struct *idle __maybe_unused = | 800 | struct task_struct *idle __maybe_unused = |
790 | idle_task(smp_processor_id()); | 801 | idle_task(smp_processor_id()); |
791 | 802 | ||
792 | trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); | 803 | trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0); |
793 | rcu_ftrace_dump(DUMP_ORIG); | 804 | rcu_ftrace_dump(DUMP_ORIG); |
794 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 805 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
795 | current->pid, current->comm, | 806 | current->pid, current->comm, |
@@ -800,7 +811,10 @@ static void rcu_eqs_enter_common(long long oldval, bool user) | |||
800 | do_nocb_deferred_wakeup(rdp); | 811 | do_nocb_deferred_wakeup(rdp); |
801 | } | 812 | } |
802 | rcu_prepare_for_idle(); | 813 | rcu_prepare_for_idle(); |
803 | rcu_dynticks_eqs_enter(); | 814 | __this_cpu_inc(disable_rcu_irq_enter); |
815 | rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */ | ||
816 | rcu_dynticks_eqs_enter(); /* After this, tracing works again. */ | ||
817 | __this_cpu_dec(disable_rcu_irq_enter); | ||
804 | rcu_dynticks_task_enter(); | 818 | rcu_dynticks_task_enter(); |
805 | 819 | ||
806 | /* | 820 | /* |
@@ -821,19 +835,15 @@ static void rcu_eqs_enter_common(long long oldval, bool user) | |||
821 | */ | 835 | */ |
822 | static void rcu_eqs_enter(bool user) | 836 | static void rcu_eqs_enter(bool user) |
823 | { | 837 | { |
824 | long long oldval; | ||
825 | struct rcu_dynticks *rdtp; | 838 | struct rcu_dynticks *rdtp; |
826 | 839 | ||
827 | rdtp = this_cpu_ptr(&rcu_dynticks); | 840 | rdtp = this_cpu_ptr(&rcu_dynticks); |
828 | oldval = rdtp->dynticks_nesting; | ||
829 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | 841 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && |
830 | (oldval & DYNTICK_TASK_NEST_MASK) == 0); | 842 | (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); |
831 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { | 843 | if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) |
832 | rdtp->dynticks_nesting = 0; | 844 | rcu_eqs_enter_common(user); |
833 | rcu_eqs_enter_common(oldval, user); | 845 | else |
834 | } else { | ||
835 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | 846 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; |
836 | } | ||
837 | } | 847 | } |
838 | 848 | ||
839 | /** | 849 | /** |
@@ -892,19 +902,18 @@ void rcu_user_enter(void) | |||
892 | */ | 902 | */ |
893 | void rcu_irq_exit(void) | 903 | void rcu_irq_exit(void) |
894 | { | 904 | { |
895 | long long oldval; | ||
896 | struct rcu_dynticks *rdtp; | 905 | struct rcu_dynticks *rdtp; |
897 | 906 | ||
898 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); | 907 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); |
899 | rdtp = this_cpu_ptr(&rcu_dynticks); | 908 | rdtp = this_cpu_ptr(&rcu_dynticks); |
900 | oldval = rdtp->dynticks_nesting; | ||
901 | rdtp->dynticks_nesting--; | ||
902 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | 909 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && |
903 | rdtp->dynticks_nesting < 0); | 910 | rdtp->dynticks_nesting < 1); |
904 | if (rdtp->dynticks_nesting) | 911 | if (rdtp->dynticks_nesting <= 1) { |
905 | trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); | 912 | rcu_eqs_enter_common(true); |
906 | else | 913 | } else { |
907 | rcu_eqs_enter_common(oldval, true); | 914 | trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); |
915 | rdtp->dynticks_nesting--; | ||
916 | } | ||
908 | rcu_sysidle_enter(1); | 917 | rcu_sysidle_enter(1); |
909 | } | 918 | } |
910 | 919 | ||
@@ -1150,6 +1159,24 @@ bool notrace rcu_is_watching(void) | |||
1150 | } | 1159 | } |
1151 | EXPORT_SYMBOL_GPL(rcu_is_watching); | 1160 | EXPORT_SYMBOL_GPL(rcu_is_watching); |
1152 | 1161 | ||
1162 | /* | ||
1163 | * If a holdout task is actually running, request an urgent quiescent | ||
1164 | * state from its CPU. This is unsynchronized, so migrations can cause | ||
1165 | * the request to go to the wrong CPU. Which is OK, all that will happen | ||
1166 | * is that the CPU's next context switch will be a bit slower and next | ||
1167 | * time around this task will generate another request. | ||
1168 | */ | ||
1169 | void rcu_request_urgent_qs_task(struct task_struct *t) | ||
1170 | { | ||
1171 | int cpu; | ||
1172 | |||
1173 | barrier(); | ||
1174 | cpu = task_cpu(t); | ||
1175 | if (!task_curr(t)) | ||
1176 | return; /* This task is not running on that CPU. */ | ||
1177 | smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true); | ||
1178 | } | ||
1179 | |||
1153 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) | 1180 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) |
1154 | 1181 | ||
1155 | /* | 1182 | /* |
@@ -1235,7 +1262,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1235 | bool *isidle, unsigned long *maxj) | 1262 | bool *isidle, unsigned long *maxj) |
1236 | { | 1263 | { |
1237 | unsigned long jtsq; | 1264 | unsigned long jtsq; |
1238 | int *rcrmp; | 1265 | bool *rnhqp; |
1266 | bool *ruqp; | ||
1239 | unsigned long rjtsc; | 1267 | unsigned long rjtsc; |
1240 | struct rcu_node *rnp; | 1268 | struct rcu_node *rnp; |
1241 | 1269 | ||
@@ -1271,11 +1299,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1271 | * might not be the case for nohz_full CPUs looping in the kernel. | 1299 | * might not be the case for nohz_full CPUs looping in the kernel. |
1272 | */ | 1300 | */ |
1273 | rnp = rdp->mynode; | 1301 | rnp = rdp->mynode; |
1302 | ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); | ||
1274 | if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && | 1303 | if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && |
1275 | READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && | 1304 | READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && |
1276 | READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { | 1305 | READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { |
1277 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); | 1306 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); |
1278 | return 1; | 1307 | return 1; |
1308 | } else { | ||
1309 | /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ | ||
1310 | smp_store_release(ruqp, true); | ||
1279 | } | 1311 | } |
1280 | 1312 | ||
1281 | /* Check for the CPU being offline. */ | 1313 | /* Check for the CPU being offline. */ |
@@ -1292,7 +1324,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1292 | * in-kernel CPU-bound tasks cannot advance grace periods. | 1324 | * in-kernel CPU-bound tasks cannot advance grace periods. |
1293 | * So if the grace period is old enough, make the CPU pay attention. | 1325 | * So if the grace period is old enough, make the CPU pay attention. |
1294 | * Note that the unsynchronized assignments to the per-CPU | 1326 | * Note that the unsynchronized assignments to the per-CPU |
1295 | * rcu_sched_qs_mask variable are safe. Yes, setting of | 1327 | * rcu_need_heavy_qs variable are safe. Yes, setting of |
1296 | * bits can be lost, but they will be set again on the next | 1328 | * bits can be lost, but they will be set again on the next |
1297 | * force-quiescent-state pass. So lost bit sets do not result | 1329 | * force-quiescent-state pass. So lost bit sets do not result |
1298 | * in incorrect behavior, merely in a grace period lasting | 1330 | * in incorrect behavior, merely in a grace period lasting |
@@ -1306,16 +1338,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1306 | * is set too high, we override with half of the RCU CPU stall | 1338 | * is set too high, we override with half of the RCU CPU stall |
1307 | * warning delay. | 1339 | * warning delay. |
1308 | */ | 1340 | */ |
1309 | rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); | 1341 | rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); |
1310 | if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || | 1342 | if (!READ_ONCE(*rnhqp) && |
1311 | time_after(jiffies, rdp->rsp->jiffies_resched)) { | 1343 | (time_after(jiffies, rdp->rsp->gp_start + jtsq) || |
1312 | if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { | 1344 | time_after(jiffies, rdp->rsp->jiffies_resched))) { |
1313 | WRITE_ONCE(rdp->cond_resched_completed, | 1345 | WRITE_ONCE(*rnhqp, true); |
1314 | READ_ONCE(rdp->mynode->completed)); | 1346 | /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ |
1315 | smp_mb(); /* ->cond_resched_completed before *rcrmp. */ | 1347 | smp_store_release(ruqp, true); |
1316 | WRITE_ONCE(*rcrmp, | ||
1317 | READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); | ||
1318 | } | ||
1319 | rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ | 1348 | rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ |
1320 | } | 1349 | } |
1321 | 1350 | ||
@@ -1475,7 +1504,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
1475 | 1504 | ||
1476 | print_cpu_stall_info_end(); | 1505 | print_cpu_stall_info_end(); |
1477 | for_each_possible_cpu(cpu) | 1506 | for_each_possible_cpu(cpu) |
1478 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1507 | totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, |
1508 | cpu)->cblist); | ||
1479 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", | 1509 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", |
1480 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | 1510 | smp_processor_id(), (long)(jiffies - rsp->gp_start), |
1481 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1511 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
@@ -1529,7 +1559,8 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
1529 | print_cpu_stall_info(rsp, smp_processor_id()); | 1559 | print_cpu_stall_info(rsp, smp_processor_id()); |
1530 | print_cpu_stall_info_end(); | 1560 | print_cpu_stall_info_end(); |
1531 | for_each_possible_cpu(cpu) | 1561 | for_each_possible_cpu(cpu) |
1532 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1562 | totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, |
1563 | cpu)->cblist); | ||
1533 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", | 1564 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", |
1534 | jiffies - rsp->gp_start, | 1565 | jiffies - rsp->gp_start, |
1535 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1566 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
@@ -1632,30 +1663,6 @@ void rcu_cpu_stall_reset(void) | |||
1632 | } | 1663 | } |
1633 | 1664 | ||
1634 | /* | 1665 | /* |
1635 | * Initialize the specified rcu_data structure's default callback list | ||
1636 | * to empty. The default callback list is the one that is not used by | ||
1637 | * no-callbacks CPUs. | ||
1638 | */ | ||
1639 | static void init_default_callback_list(struct rcu_data *rdp) | ||
1640 | { | ||
1641 | int i; | ||
1642 | |||
1643 | rdp->nxtlist = NULL; | ||
1644 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1645 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1646 | } | ||
1647 | |||
1648 | /* | ||
1649 | * Initialize the specified rcu_data structure's callback list to empty. | ||
1650 | */ | ||
1651 | static void init_callback_list(struct rcu_data *rdp) | ||
1652 | { | ||
1653 | if (init_nocb_callback_list(rdp)) | ||
1654 | return; | ||
1655 | init_default_callback_list(rdp); | ||
1656 | } | ||
1657 | |||
1658 | /* | ||
1659 | * Determine the value that ->completed will have at the end of the | 1666 | * Determine the value that ->completed will have at the end of the |
1660 | * next subsequent grace period. This is used to tag callbacks so that | 1667 | * next subsequent grace period. This is used to tag callbacks so that |
1661 | * a CPU can invoke callbacks in a timely fashion even if that CPU has | 1668 | * a CPU can invoke callbacks in a timely fashion even if that CPU has |
@@ -1709,7 +1716,6 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
1709 | unsigned long *c_out) | 1716 | unsigned long *c_out) |
1710 | { | 1717 | { |
1711 | unsigned long c; | 1718 | unsigned long c; |
1712 | int i; | ||
1713 | bool ret = false; | 1719 | bool ret = false; |
1714 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); | 1720 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); |
1715 | 1721 | ||
@@ -1755,13 +1761,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
1755 | /* | 1761 | /* |
1756 | * Get a new grace-period number. If there really is no grace | 1762 | * Get a new grace-period number. If there really is no grace |
1757 | * period in progress, it will be smaller than the one we obtained | 1763 | * period in progress, it will be smaller than the one we obtained |
1758 | * earlier. Adjust callbacks as needed. Note that even no-CBs | 1764 | * earlier. Adjust callbacks as needed. |
1759 | * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. | ||
1760 | */ | 1765 | */ |
1761 | c = rcu_cbs_completed(rdp->rsp, rnp_root); | 1766 | c = rcu_cbs_completed(rdp->rsp, rnp_root); |
1762 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) | 1767 | if (!rcu_is_nocb_cpu(rdp->cpu)) |
1763 | if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) | 1768 | (void)rcu_segcblist_accelerate(&rdp->cblist, c); |
1764 | rdp->nxtcompleted[i] = c; | ||
1765 | 1769 | ||
1766 | /* | 1770 | /* |
1766 | * If the need for the required grace period is already | 1770 | * If the need for the required grace period is already |
@@ -1793,9 +1797,7 @@ out: | |||
1793 | 1797 | ||
1794 | /* | 1798 | /* |
1795 | * Clean up any old requests for the just-ended grace period. Also return | 1799 | * Clean up any old requests for the just-ended grace period. Also return |
1796 | * whether any additional grace periods have been requested. Also invoke | 1800 | * whether any additional grace periods have been requested. |
1797 | * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads | ||
1798 | * waiting for this grace period to complete. | ||
1799 | */ | 1801 | */ |
1800 | static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | 1802 | static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) |
1801 | { | 1803 | { |
@@ -1841,57 +1843,27 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) | |||
1841 | static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1843 | static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
1842 | struct rcu_data *rdp) | 1844 | struct rcu_data *rdp) |
1843 | { | 1845 | { |
1844 | unsigned long c; | 1846 | bool ret = false; |
1845 | int i; | ||
1846 | bool ret; | ||
1847 | |||
1848 | /* If the CPU has no callbacks, nothing to do. */ | ||
1849 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | ||
1850 | return false; | ||
1851 | |||
1852 | /* | ||
1853 | * Starting from the sublist containing the callbacks most | ||
1854 | * recently assigned a ->completed number and working down, find the | ||
1855 | * first sublist that is not assignable to an upcoming grace period. | ||
1856 | * Such a sublist has something in it (first two tests) and has | ||
1857 | * a ->completed number assigned that will complete sooner than | ||
1858 | * the ->completed number for newly arrived callbacks (last test). | ||
1859 | * | ||
1860 | * The key point is that any later sublist can be assigned the | ||
1861 | * same ->completed number as the newly arrived callbacks, which | ||
1862 | * means that the callbacks in any of these later sublist can be | ||
1863 | * grouped into a single sublist, whether or not they have already | ||
1864 | * been assigned a ->completed number. | ||
1865 | */ | ||
1866 | c = rcu_cbs_completed(rsp, rnp); | ||
1867 | for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) | ||
1868 | if (rdp->nxttail[i] != rdp->nxttail[i - 1] && | ||
1869 | !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) | ||
1870 | break; | ||
1871 | 1847 | ||
1872 | /* | 1848 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
1873 | * If there are no sublist for unassigned callbacks, leave. | 1849 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) |
1874 | * At the same time, advance "i" one sublist, so that "i" will | ||
1875 | * index into the sublist where all the remaining callbacks should | ||
1876 | * be grouped into. | ||
1877 | */ | ||
1878 | if (++i >= RCU_NEXT_TAIL) | ||
1879 | return false; | 1850 | return false; |
1880 | 1851 | ||
1881 | /* | 1852 | /* |
1882 | * Assign all subsequent callbacks' ->completed number to the next | 1853 | * Callbacks are often registered with incomplete grace-period |
1883 | * full grace period and group them all in the sublist initially | 1854 | * information. Something about the fact that getting exact |
1884 | * indexed by "i". | 1855 | * information requires acquiring a global lock... RCU therefore |
1856 | * makes a conservative estimate of the grace period number at which | ||
1857 | * a given callback will become ready to invoke. The following | ||
1858 | * code checks this estimate and improves it when possible, thus | ||
1859 | * accelerating callback invocation to an earlier grace-period | ||
1860 | * number. | ||
1885 | */ | 1861 | */ |
1886 | for (; i <= RCU_NEXT_TAIL; i++) { | 1862 | if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) |
1887 | rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; | 1863 | ret = rcu_start_future_gp(rnp, rdp, NULL); |
1888 | rdp->nxtcompleted[i] = c; | ||
1889 | } | ||
1890 | /* Record any needed additional grace periods. */ | ||
1891 | ret = rcu_start_future_gp(rnp, rdp, NULL); | ||
1892 | 1864 | ||
1893 | /* Trace depending on how much we were able to accelerate. */ | 1865 | /* Trace depending on how much we were able to accelerate. */ |
1894 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | 1866 | if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) |
1895 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); | 1867 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); |
1896 | else | 1868 | else |
1897 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); | 1869 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); |
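What "accelerating" callbacks means can be modeled without the segmented-list machinery: each not-yet-ready callback carries a conservative grace-period number, and acceleration re-tags it with the earliest legitimate number once that number is known exactly. The real code manages sublists inside an rcu_segcblist rather than tagging individual callbacks, so the sketch below is an assumption-level simplification that also ignores wraparound-safe comparisons.

#include <stdbool.h>

struct pending_cb {
        struct pending_cb *next;
        unsigned long gp_ready;         /* GP number at which it is ready */
};

/* Re-tag callbacks whose conservative estimate can now be improved. */
static bool accelerate(struct pending_cb *pending, unsigned long exact_gp)
{
        bool changed = false;
        struct pending_cb *p;

        for (p = pending; p; p = p->next) {
                if (p->gp_ready > exact_gp) {
                        p->gp_ready = exact_gp;
                        changed = true;
                }
        }
        return changed;
}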
@@ -1911,32 +1883,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1911 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1883 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
1912 | struct rcu_data *rdp) | 1884 | struct rcu_data *rdp) |
1913 | { | 1885 | { |
1914 | int i, j; | 1886 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
1915 | 1887 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) | |
1916 | /* If the CPU has no callbacks, nothing to do. */ | ||
1917 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | ||
1918 | return false; | 1888 | return false; |
1919 | 1889 | ||
1920 | /* | 1890 | /* |
1921 | * Find all callbacks whose ->completed numbers indicate that they | 1891 | * Find all callbacks whose ->completed numbers indicate that they |
1922 | * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. | 1892 | * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. |
1923 | */ | 1893 | */ |
1924 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { | 1894 | rcu_segcblist_advance(&rdp->cblist, rnp->completed); |
1925 | if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) | ||
1926 | break; | ||
1927 | rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; | ||
1928 | } | ||
1929 | /* Clean up any sublist tail pointers that were misordered above. */ | ||
1930 | for (j = RCU_WAIT_TAIL; j < i; j++) | ||
1931 | rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; | ||
1932 | |||
1933 | /* Copy down callbacks to fill in empty sublists. */ | ||
1934 | for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { | ||
1935 | if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) | ||
1936 | break; | ||
1937 | rdp->nxttail[j] = rdp->nxttail[i]; | ||
1938 | rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; | ||
1939 | } | ||
1940 | 1895 | ||
1941 | /* Classify any remaining callbacks. */ | 1896 | /* Classify any remaining callbacks. */ |
1942 | return rcu_accelerate_cbs(rsp, rnp, rdp); | 1897 | return rcu_accelerate_cbs(rsp, rnp, rdp); |
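As a companion to the acceleration sketch above, "advancing" callbacks moves every callback whose tagged grace period has already completed onto the ready-to-invoke list. Again this is a per-callback model of what rcu_segcblist_advance() achieves with sublist pointers, with the sketch's own names and no wraparound handling.

#include <stddef.h>

struct tagged_cb {
        struct tagged_cb *next;
        unsigned long gp_ready;         /* GP number at which it is ready */
};

/* Move every callback whose grace period has completed to a done list. */
static struct tagged_cb *advance_cbs(struct tagged_cb **pending,
                                     unsigned long completed)
{
        struct tagged_cb *done = NULL, **donetail = &done;

        while (*pending) {
                struct tagged_cb *p = *pending;

                if (p->gp_ready > completed) {
                        pending = &p->next;     /* not ready yet: keep it */
                        continue;
                }
                *pending = p->next;             /* unlink from pending */
                p->next = NULL;
                *donetail = p;                  /* append to done list */
                donetail = &p->next;
        }
        return done;
}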
@@ -1981,7 +1936,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1981 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); | 1936 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
1982 | need_gp = !!(rnp->qsmask & rdp->grpmask); | 1937 | need_gp = !!(rnp->qsmask & rdp->grpmask); |
1983 | rdp->cpu_no_qs.b.norm = need_gp; | 1938 | rdp->cpu_no_qs.b.norm = need_gp; |
1984 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 1939 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); |
1985 | rdp->core_needs_qs = need_gp; | 1940 | rdp->core_needs_qs = need_gp; |
1986 | zero_cpu_stall_ticks(rdp); | 1941 | zero_cpu_stall_ticks(rdp); |
1987 | WRITE_ONCE(rdp->gpwrap, false); | 1942 | WRITE_ONCE(rdp->gpwrap, false); |
@@ -2579,7 +2534,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
2579 | * within the current grace period. | 2534 | * within the current grace period. |
2580 | */ | 2535 | */ |
2581 | rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ | 2536 | rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ |
2582 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 2537 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); |
2583 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 2538 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
2584 | return; | 2539 | return; |
2585 | } | 2540 | } |
@@ -2653,13 +2608,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
2653 | * because _rcu_barrier() excludes CPU-hotplug operations, so it | 2608 | * because _rcu_barrier() excludes CPU-hotplug operations, so it |
2654 | * cannot be running now. Thus no memory barrier is required. | 2609 | * cannot be running now. Thus no memory barrier is required. |
2655 | */ | 2610 | */ |
2656 | if (rdp->nxtlist != NULL) { | 2611 | rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); |
2657 | rsp->qlen_lazy += rdp->qlen_lazy; | 2612 | rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); |
2658 | rsp->qlen += rdp->qlen; | ||
2659 | rdp->n_cbs_orphaned += rdp->qlen; | ||
2660 | rdp->qlen_lazy = 0; | ||
2661 | WRITE_ONCE(rdp->qlen, 0); | ||
2662 | } | ||
2663 | 2613 | ||
2664 | /* | 2614 | /* |
2665 | * Next, move those callbacks still needing a grace period to | 2615 | * Next, move those callbacks still needing a grace period to |
@@ -2667,31 +2617,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
2667 | * Some of the callbacks might have gone partway through a grace | 2617 | * Some of the callbacks might have gone partway through a grace |
2668 | * period, but that is too bad. They get to start over because we | 2618 | * period, but that is too bad. They get to start over because we |
2669 | * cannot assume that grace periods are synchronized across CPUs. | 2619 | * cannot assume that grace periods are synchronized across CPUs. |
2670 | * We don't bother updating the ->nxttail[] array yet, instead | ||
2671 | * we just reset the whole thing later on. | ||
2672 | */ | 2620 | */ |
2673 | if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { | 2621 | rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); |
2674 | *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
2675 | rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; | ||
2676 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | ||
2677 | } | ||
2678 | 2622 | ||
2679 | /* | 2623 | /* |
2680 | * Then move the ready-to-invoke callbacks to the orphanage, | 2624 | * Then move the ready-to-invoke callbacks to the orphanage, |
2681 | * where some other CPU will pick them up. These will not be | 2625 | * where some other CPU will pick them up. These will not be |
2682 | * required to pass though another grace period: They are done. | 2626 | * required to pass though another grace period: They are done. |
2683 | */ | 2627 | */ |
2684 | if (rdp->nxtlist != NULL) { | 2628 | rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); |
2685 | *rsp->orphan_donetail = rdp->nxtlist; | ||
2686 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; | ||
2687 | } | ||
2688 | 2629 | ||
2689 | /* | 2630 | /* Finally, disallow further callbacks on this CPU. */ |
2690 | * Finally, initialize the rcu_data structure's list to empty and | 2631 | rcu_segcblist_disable(&rdp->cblist); |
2691 | * disallow further callbacks on this CPU. | ||
2692 | */ | ||
2693 | init_callback_list(rdp); | ||
2694 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
2695 | } | 2632 | } |
2696 | 2633 | ||
2697 | /* | 2634 | /* |
@@ -2700,7 +2637,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
2700 | */ | 2637 | */ |
2701 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | 2638 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) |
2702 | { | 2639 | { |
2703 | int i; | ||
2704 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); | 2640 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
2705 | 2641 | ||
2706 | /* No-CBs CPUs are handled specially. */ | 2642 | /* No-CBs CPUs are handled specially. */ |
@@ -2709,13 +2645,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | |||
2709 | return; | 2645 | return; |
2710 | 2646 | ||
2711 | /* Do the accounting first. */ | 2647 | /* Do the accounting first. */ |
2712 | rdp->qlen_lazy += rsp->qlen_lazy; | 2648 | rdp->n_cbs_adopted += rsp->orphan_done.len; |
2713 | rdp->qlen += rsp->qlen; | 2649 | if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) |
2714 | rdp->n_cbs_adopted += rsp->qlen; | ||
2715 | if (rsp->qlen_lazy != rsp->qlen) | ||
2716 | rcu_idle_count_callbacks_posted(); | 2650 | rcu_idle_count_callbacks_posted(); |
2717 | rsp->qlen_lazy = 0; | 2651 | rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); |
2718 | rsp->qlen = 0; | ||
2719 | 2652 | ||
2720 | /* | 2653 | /* |
2721 | * We do not need a memory barrier here because the only way we | 2654 | * We do not need a memory barrier here because the only way we |
@@ -2723,24 +2656,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | |||
2723 | * we are the task doing the rcu_barrier(). | 2656 | * we are the task doing the rcu_barrier(). |
2724 | */ | 2657 | */ |
2725 | 2658 | ||
2726 | /* First adopt the ready-to-invoke callbacks. */ | 2659 | /* First adopt the ready-to-invoke callbacks, then the done ones. */ |
2727 | if (rsp->orphan_donelist != NULL) { | 2660 | rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); |
2728 | *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; | 2661 | WARN_ON_ONCE(rsp->orphan_done.head); |
2729 | *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; | 2662 | rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); |
2730 | for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) | 2663 | WARN_ON_ONCE(rsp->orphan_pend.head); |
2731 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | 2664 | WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != |
2732 | rdp->nxttail[i] = rsp->orphan_donetail; | 2665 | !rcu_segcblist_n_cbs(&rdp->cblist)); |
2733 | rsp->orphan_donelist = NULL; | ||
2734 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
2735 | } | ||
2736 | |||
2737 | /* And then adopt the callbacks that still need a grace period. */ | ||
2738 | if (rsp->orphan_nxtlist != NULL) { | ||
2739 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; | ||
2740 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; | ||
2741 | rsp->orphan_nxtlist = NULL; | ||
2742 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
2743 | } | ||
2744 | } | 2666 | } |
2745 | 2667 | ||
2746 | /* | 2668 | /* |
@@ -2748,14 +2670,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | |||
2748 | */ | 2670 | */ |
2749 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 2671 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
2750 | { | 2672 | { |
2751 | RCU_TRACE(unsigned long mask); | 2673 | RCU_TRACE(unsigned long mask;) |
2752 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); | 2674 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) |
2753 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); | 2675 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) |
2754 | 2676 | ||
2755 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) | 2677 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) |
2756 | return; | 2678 | return; |
2757 | 2679 | ||
2758 | RCU_TRACE(mask = rdp->grpmask); | 2680 | RCU_TRACE(mask = rdp->grpmask;) |
2759 | trace_rcu_grace_period(rsp->name, | 2681 | trace_rcu_grace_period(rsp->name, |
2760 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 2682 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
2761 | TPS("cpuofl")); | 2683 | TPS("cpuofl")); |
@@ -2828,9 +2750,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
2828 | rcu_adopt_orphan_cbs(rsp, flags); | 2750 | rcu_adopt_orphan_cbs(rsp, flags); |
2829 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); | 2751 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); |
2830 | 2752 | ||
2831 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 2753 | WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || |
2832 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 2754 | !rcu_segcblist_empty(&rdp->cblist), |
2833 | cpu, rdp->qlen, rdp->nxtlist); | 2755 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", |
2756 | cpu, rcu_segcblist_n_cbs(&rdp->cblist), | ||
2757 | rcu_segcblist_first_cb(&rdp->cblist)); | ||
2834 | } | 2758 | } |
2835 | 2759 | ||
2836 | /* | 2760 | /* |
@@ -2840,14 +2764,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
2840 | static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | 2764 | static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) |
2841 | { | 2765 | { |
2842 | unsigned long flags; | 2766 | unsigned long flags; |
2843 | struct rcu_head *next, *list, **tail; | 2767 | struct rcu_head *rhp; |
2844 | long bl, count, count_lazy; | 2768 | struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); |
2845 | int i; | 2769 | long bl, count; |
2846 | 2770 | ||
2847 | /* If no callbacks are ready, just return. */ | 2771 | /* If no callbacks are ready, just return. */ |
2848 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 2772 | if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { |
2849 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); | 2773 | trace_rcu_batch_start(rsp->name, |
2850 | trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), | 2774 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), |
2775 | rcu_segcblist_n_cbs(&rdp->cblist), 0); | ||
2776 | trace_rcu_batch_end(rsp->name, 0, | ||
2777 | !rcu_segcblist_empty(&rdp->cblist), | ||
2851 | need_resched(), is_idle_task(current), | 2778 | need_resched(), is_idle_task(current), |
2852 | rcu_is_callbacks_kthread()); | 2779 | rcu_is_callbacks_kthread()); |
2853 | return; | 2780 | return; |
@@ -2855,73 +2782,61 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2855 | 2782 | ||
2856 | /* | 2783 | /* |
2857 | * Extract the list of ready callbacks, disabling to prevent | 2784 | * Extract the list of ready callbacks, disabling to prevent |
2858 | * races with call_rcu() from interrupt handlers. | 2785 | * races with call_rcu() from interrupt handlers. Leave the |
2786 | * callback counts, as rcu_barrier() needs to be conservative. | ||
2859 | */ | 2787 | */ |
2860 | local_irq_save(flags); | 2788 | local_irq_save(flags); |
2861 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | 2789 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); |
2862 | bl = rdp->blimit; | 2790 | bl = rdp->blimit; |
2863 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); | 2791 | trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), |
2864 | list = rdp->nxtlist; | 2792 | rcu_segcblist_n_cbs(&rdp->cblist), bl); |
2865 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | 2793 | rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); |
2866 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | ||
2867 | tail = rdp->nxttail[RCU_DONE_TAIL]; | ||
2868 | for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) | ||
2869 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
2870 | rdp->nxttail[i] = &rdp->nxtlist; | ||
2871 | local_irq_restore(flags); | 2794 | local_irq_restore(flags); |
2872 | 2795 | ||
2873 | /* Invoke callbacks. */ | 2796 | /* Invoke callbacks. */ |
2874 | count = count_lazy = 0; | 2797 | rhp = rcu_cblist_dequeue(&rcl); |
2875 | while (list) { | 2798 | for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { |
2876 | next = list->next; | 2799 | debug_rcu_head_unqueue(rhp); |
2877 | prefetch(next); | 2800 | if (__rcu_reclaim(rsp->name, rhp)) |
2878 | debug_rcu_head_unqueue(list); | 2801 | rcu_cblist_dequeued_lazy(&rcl); |
2879 | if (__rcu_reclaim(rsp->name, list)) | 2802 | /* |
2880 | count_lazy++; | 2803 | * Stop only if limit reached and CPU has something to do. |
2881 | list = next; | 2804 | * Note: The rcl structure counts down from zero. |
2882 | /* Stop only if limit reached and CPU has something to do. */ | 2805 | */ |
2883 | if (++count >= bl && | 2806 | if (-rcl.len >= bl && |
2884 | (need_resched() || | 2807 | (need_resched() || |
2885 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) | 2808 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) |
2886 | break; | 2809 | break; |
2887 | } | 2810 | } |
2888 | 2811 | ||
2889 | local_irq_save(flags); | 2812 | local_irq_save(flags); |
2890 | trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), | 2813 | count = -rcl.len; |
2891 | is_idle_task(current), | 2814 | trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(), |
2892 | rcu_is_callbacks_kthread()); | 2815 | is_idle_task(current), rcu_is_callbacks_kthread()); |
2893 | 2816 | ||
2894 | /* Update count, and requeue any remaining callbacks. */ | 2817 | /* Update counts and requeue any remaining callbacks. */ |
2895 | if (list != NULL) { | 2818 | rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); |
2896 | *tail = rdp->nxtlist; | ||
2897 | rdp->nxtlist = list; | ||
2898 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
2899 | if (&rdp->nxtlist == rdp->nxttail[i]) | ||
2900 | rdp->nxttail[i] = tail; | ||
2901 | else | ||
2902 | break; | ||
2903 | } | ||
2904 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | 2819 | smp_mb(); /* List handling before counting for rcu_barrier(). */ |
2905 | rdp->qlen_lazy -= count_lazy; | ||
2906 | WRITE_ONCE(rdp->qlen, rdp->qlen - count); | ||
2907 | rdp->n_cbs_invoked += count; | 2820 | rdp->n_cbs_invoked += count; |
2821 | rcu_segcblist_insert_count(&rdp->cblist, &rcl); | ||
2908 | 2822 | ||
2909 | /* Reinstate batch limit if we have worked down the excess. */ | 2823 | /* Reinstate batch limit if we have worked down the excess. */ |
2910 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) | 2824 | count = rcu_segcblist_n_cbs(&rdp->cblist); |
2825 | if (rdp->blimit == LONG_MAX && count <= qlowmark) | ||
2911 | rdp->blimit = blimit; | 2826 | rdp->blimit = blimit; |
2912 | 2827 | ||
2913 | /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ | 2828 | /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ |
2914 | if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { | 2829 | if (count == 0 && rdp->qlen_last_fqs_check != 0) { |
2915 | rdp->qlen_last_fqs_check = 0; | 2830 | rdp->qlen_last_fqs_check = 0; |
2916 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2831 | rdp->n_force_qs_snap = rsp->n_force_qs; |
2917 | } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) | 2832 | } else if (count < rdp->qlen_last_fqs_check - qhimark) |
2918 | rdp->qlen_last_fqs_check = rdp->qlen; | 2833 | rdp->qlen_last_fqs_check = count; |
2919 | WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); | 2834 | WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); |
2920 | 2835 | ||
2921 | local_irq_restore(flags); | 2836 | local_irq_restore(flags); |
2922 | 2837 | ||
2923 | /* Re-invoke RCU core processing if there are callbacks remaining. */ | 2838 | /* Re-invoke RCU core processing if there are callbacks remaining. */ |
2924 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 2839 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) |
2925 | invoke_rcu_core(); | 2840 | invoke_rcu_core(); |
2926 | } | 2841 | } |
2927 | 2842 | ||
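The rewritten rcu_do_batch() above extracts the ready callbacks into a local rcu_cblist whose ->len starts at zero, because the per-CPU callback counts are deliberately left behind so that rcu_barrier() stays conservative; each dequeue then drives ->len negative, so -rcl.len is the number of callbacks invoked so far and "-rcl.len >= bl" is the batch-limit test. A minimal stand-alone sketch of that counting convention is below; it is illustrative only, not the kernel code, and the queue depth and limit values are made up.

#include <stdio.h>

int main(void)
{
	long len = 0;	/* local list count: starts at zero, counts down */
	long bl = 10;	/* batch limit, standing in for rdp->blimit */
	int i;

	for (i = 0; i < 25; i++) {	/* pretend 25 callbacks are queued */
		len--;			/* one callback dequeued and invoked */
		if (-len >= bl)		/* same form as "-rcl.len >= bl" */
			break;
	}
	printf("invoked %ld callbacks before hitting the limit\n", -len);
	return 0;
}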
@@ -3087,7 +3002,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
3087 | bool needwake; | 3002 | bool needwake; |
3088 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); | 3003 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
3089 | 3004 | ||
3090 | WARN_ON_ONCE(rdp->beenonline == 0); | 3005 | WARN_ON_ONCE(!rdp->beenonline); |
3091 | 3006 | ||
3092 | /* Update RCU state based on any recent quiescent states. */ | 3007 | /* Update RCU state based on any recent quiescent states. */ |
3093 | rcu_check_quiescent_state(rsp, rdp); | 3008 | rcu_check_quiescent_state(rsp, rdp); |
@@ -3105,7 +3020,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
3105 | } | 3020 | } |
3106 | 3021 | ||
3107 | /* If there are callbacks ready, invoke them. */ | 3022 | /* If there are callbacks ready, invoke them. */ |
3108 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 3023 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) |
3109 | invoke_rcu_callbacks(rsp, rdp); | 3024 | invoke_rcu_callbacks(rsp, rdp); |
3110 | 3025 | ||
3111 | /* Do any needed deferred wakeups of rcuo kthreads. */ | 3026 | /* Do any needed deferred wakeups of rcuo kthreads. */ |
@@ -3177,7 +3092,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
3177 | * invoking force_quiescent_state() if the newly enqueued callback | 3092 | * invoking force_quiescent_state() if the newly enqueued callback |
3178 | * is the only one waiting for a grace period to complete. | 3093 | * is the only one waiting for a grace period to complete. |
3179 | */ | 3094 | */ |
3180 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | 3095 | if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > |
3096 | rdp->qlen_last_fqs_check + qhimark)) { | ||
3181 | 3097 | ||
3182 | /* Are we ignoring a completed grace period? */ | 3098 | /* Are we ignoring a completed grace period? */ |
3183 | note_gp_changes(rsp, rdp); | 3099 | note_gp_changes(rsp, rdp); |
@@ -3195,10 +3111,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
3195 | /* Give the grace period a kick. */ | 3111 | /* Give the grace period a kick. */ |
3196 | rdp->blimit = LONG_MAX; | 3112 | rdp->blimit = LONG_MAX; |
3197 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | 3113 | if (rsp->n_force_qs == rdp->n_force_qs_snap && |
3198 | *rdp->nxttail[RCU_DONE_TAIL] != head) | 3114 | rcu_segcblist_first_pend_cb(&rdp->cblist) != head) |
3199 | force_quiescent_state(rsp); | 3115 | force_quiescent_state(rsp); |
3200 | rdp->n_force_qs_snap = rsp->n_force_qs; | 3116 | rdp->n_force_qs_snap = rsp->n_force_qs; |
3201 | rdp->qlen_last_fqs_check = rdp->qlen; | 3117 | rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); |
3202 | } | 3118 | } |
3203 | } | 3119 | } |
3204 | } | 3120 | } |
@@ -3238,7 +3154,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, | |||
3238 | rdp = this_cpu_ptr(rsp->rda); | 3154 | rdp = this_cpu_ptr(rsp->rda); |
3239 | 3155 | ||
3240 | /* Add the callback to our list. */ | 3156 | /* Add the callback to our list. */ |
3241 | if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { | 3157 | if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { |
3242 | int offline; | 3158 | int offline; |
3243 | 3159 | ||
3244 | if (cpu != -1) | 3160 | if (cpu != -1) |
@@ -3257,23 +3173,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, | |||
3257 | */ | 3173 | */ |
3258 | BUG_ON(cpu != -1); | 3174 | BUG_ON(cpu != -1); |
3259 | WARN_ON_ONCE(!rcu_is_watching()); | 3175 | WARN_ON_ONCE(!rcu_is_watching()); |
3260 | if (!likely(rdp->nxtlist)) | 3176 | if (rcu_segcblist_empty(&rdp->cblist)) |
3261 | init_default_callback_list(rdp); | 3177 | rcu_segcblist_init(&rdp->cblist); |
3262 | } | 3178 | } |
3263 | WRITE_ONCE(rdp->qlen, rdp->qlen + 1); | 3179 | rcu_segcblist_enqueue(&rdp->cblist, head, lazy); |
3264 | if (lazy) | 3180 | if (!lazy) |
3265 | rdp->qlen_lazy++; | ||
3266 | else | ||
3267 | rcu_idle_count_callbacks_posted(); | 3181 | rcu_idle_count_callbacks_posted(); |
3268 | smp_mb(); /* Count before adding callback for rcu_barrier(). */ | ||
3269 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
3270 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
3271 | 3182 | ||
3272 | if (__is_kfree_rcu_offset((unsigned long)func)) | 3183 | if (__is_kfree_rcu_offset((unsigned long)func)) |
3273 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | 3184 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, |
3274 | rdp->qlen_lazy, rdp->qlen); | 3185 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), |
3186 | rcu_segcblist_n_cbs(&rdp->cblist)); | ||
3275 | else | 3187 | else |
3276 | trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); | 3188 | trace_rcu_callback(rsp->name, head, |
3189 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), | ||
3190 | rcu_segcblist_n_cbs(&rdp->cblist)); | ||
3277 | 3191 | ||
3278 | /* Go handle any RCU core processing required. */ | 3192 | /* Go handle any RCU core processing required. */ |
3279 | __call_rcu_core(rsp, rdp, head, flags); | 3193 | __call_rcu_core(rsp, rdp, head, flags); |
@@ -3519,41 +3433,6 @@ void cond_synchronize_sched(unsigned long oldstate) | |||
3519 | } | 3433 | } |
3520 | EXPORT_SYMBOL_GPL(cond_synchronize_sched); | 3434 | EXPORT_SYMBOL_GPL(cond_synchronize_sched); |
3521 | 3435 | ||
3522 | /* Adjust sequence number for start of update-side operation. */ | ||
3523 | static void rcu_seq_start(unsigned long *sp) | ||
3524 | { | ||
3525 | WRITE_ONCE(*sp, *sp + 1); | ||
3526 | smp_mb(); /* Ensure update-side operation after counter increment. */ | ||
3527 | WARN_ON_ONCE(!(*sp & 0x1)); | ||
3528 | } | ||
3529 | |||
3530 | /* Adjust sequence number for end of update-side operation. */ | ||
3531 | static void rcu_seq_end(unsigned long *sp) | ||
3532 | { | ||
3533 | smp_mb(); /* Ensure update-side operation before counter increment. */ | ||
3534 | WRITE_ONCE(*sp, *sp + 1); | ||
3535 | WARN_ON_ONCE(*sp & 0x1); | ||
3536 | } | ||
3537 | |||
3538 | /* Take a snapshot of the update side's sequence number. */ | ||
3539 | static unsigned long rcu_seq_snap(unsigned long *sp) | ||
3540 | { | ||
3541 | unsigned long s; | ||
3542 | |||
3543 | s = (READ_ONCE(*sp) + 3) & ~0x1; | ||
3544 | smp_mb(); /* Above access must not bleed into critical section. */ | ||
3545 | return s; | ||
3546 | } | ||
3547 | |||
3548 | /* | ||
3549 | * Given a snapshot from rcu_seq_snap(), determine whether or not a | ||
3550 | * full update-side operation has occurred. | ||
3551 | */ | ||
3552 | static bool rcu_seq_done(unsigned long *sp, unsigned long s) | ||
3553 | { | ||
3554 | return ULONG_CMP_GE(READ_ONCE(*sp), s); | ||
3555 | } | ||
3556 | |||
3557 | /* | 3436 | /* |
3558 | * Check to see if there is any immediate RCU-related work to be done | 3437 | * Check to see if there is any immediate RCU-related work to be done |
3559 | * by the current CPU, for the specified type of RCU, returning 1 if so. | 3438 | * by the current CPU, for the specified type of RCU, returning 1 if so. |
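The rcu_seq_start/end/snap/done helpers deleted in the hunk above (presumably consolidated into common RCU code elsewhere in this series) implement a low-order-bit sequence counter: an odd value means an update is in progress, and the upper bits count completed update-side operations. The sketch below reproduces the same arithmetic as stand-alone C so the snap/done semantics can be checked in isolation; the kernel's memory barriers, READ_ONCE/WRITE_ONCE wrappers and wrap-tolerant ULONG_CMP_GE() are reduced to comments, and the main() driver is purely illustrative.

#include <stdio.h>
#include <stdbool.h>

static void rcu_seq_start(unsigned long *sp)
{
	*sp += 1;	/* Now odd: update in progress. */
	/* Kernel: smp_mb() orders the update-side work after the increment. */
}

static void rcu_seq_end(unsigned long *sp)
{
	/* Kernel: smp_mb() orders the update-side work before the increment. */
	*sp += 1;	/* Now even: update complete. */
}

/* Snapshot: the smallest even value whose arrival guarantees that a full
 * update-side operation has elapsed since the snapshot was taken. */
static unsigned long rcu_seq_snap(unsigned long *sp)
{
	return (*sp + 3) & ~0x1UL;
}

/* Has a full update-side operation completed since snapshot s? */
static bool rcu_seq_done(unsigned long *sp, unsigned long s)
{
	return *sp >= s;	/* Kernel uses ULONG_CMP_GE() to tolerate wrap. */
}

int main(void)
{
	unsigned long seq = 0;
	unsigned long snap = rcu_seq_snap(&seq);	/* snap == 2 */

	printf("done before any update? %d\n", rcu_seq_done(&seq, snap)); /* 0 */
	rcu_seq_start(&seq);	/* seq == 1 */
	rcu_seq_end(&seq);	/* seq == 2 */
	printf("done after one full update? %d\n", rcu_seq_done(&seq, snap)); /* 1 */
	return 0;
}

The expedited-grace-period hunks later in this patch replace the open-coded (s >> 1) & 0x3 wait-queue index with rcu_seq_ctr(s) & 0x3, that is, the same counter with the in-progress bit shifted away.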
@@ -3577,7 +3456,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
3577 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 3456 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
3578 | if (rcu_scheduler_fully_active && | 3457 | if (rcu_scheduler_fully_active && |
3579 | rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && | 3458 | rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && |
3580 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { | 3459 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { |
3581 | rdp->n_rp_core_needs_qs++; | 3460 | rdp->n_rp_core_needs_qs++; |
3582 | } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { | 3461 | } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { |
3583 | rdp->n_rp_report_qs++; | 3462 | rdp->n_rp_report_qs++; |
@@ -3585,7 +3464,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
3585 | } | 3464 | } |
3586 | 3465 | ||
3587 | /* Does this CPU have callbacks ready to invoke? */ | 3466 | /* Does this CPU have callbacks ready to invoke? */ |
3588 | if (cpu_has_callbacks_ready_to_invoke(rdp)) { | 3467 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) { |
3589 | rdp->n_rp_cb_ready++; | 3468 | rdp->n_rp_cb_ready++; |
3590 | return 1; | 3469 | return 1; |
3591 | } | 3470 | } |
@@ -3649,10 +3528,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) | |||
3649 | 3528 | ||
3650 | for_each_rcu_flavor(rsp) { | 3529 | for_each_rcu_flavor(rsp) { |
3651 | rdp = this_cpu_ptr(rsp->rda); | 3530 | rdp = this_cpu_ptr(rsp->rda); |
3652 | if (!rdp->nxtlist) | 3531 | if (rcu_segcblist_empty(&rdp->cblist)) |
3653 | continue; | 3532 | continue; |
3654 | hc = true; | 3533 | hc = true; |
3655 | if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { | 3534 | if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) { |
3656 | al = false; | 3535 | al = false; |
3657 | break; | 3536 | break; |
3658 | } | 3537 | } |
@@ -3761,7 +3640,7 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
3761 | __call_rcu(&rdp->barrier_head, | 3640 | __call_rcu(&rdp->barrier_head, |
3762 | rcu_barrier_callback, rsp, cpu, 0); | 3641 | rcu_barrier_callback, rsp, cpu, 0); |
3763 | } | 3642 | } |
3764 | } else if (READ_ONCE(rdp->qlen)) { | 3643 | } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { |
3765 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, | 3644 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
3766 | rsp->barrier_sequence); | 3645 | rsp->barrier_sequence); |
3767 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); | 3646 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); |
@@ -3870,8 +3749,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3870 | rdp->qlen_last_fqs_check = 0; | 3749 | rdp->qlen_last_fqs_check = 0; |
3871 | rdp->n_force_qs_snap = rsp->n_force_qs; | 3750 | rdp->n_force_qs_snap = rsp->n_force_qs; |
3872 | rdp->blimit = blimit; | 3751 | rdp->blimit = blimit; |
3873 | if (!rdp->nxtlist) | 3752 | if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ |
3874 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | 3753 | !init_nocb_callback_list(rdp)) |
3754 | rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ | ||
3875 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 3755 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
3876 | rcu_sysidle_init_percpu_data(rdp->dynticks); | 3756 | rcu_sysidle_init_percpu_data(rdp->dynticks); |
3877 | rcu_dynticks_eqs_online(); | 3757 | rcu_dynticks_eqs_online(); |
@@ -3890,12 +3770,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3890 | rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ | 3770 | rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ |
3891 | rdp->completed = rnp->completed; | 3771 | rdp->completed = rnp->completed; |
3892 | rdp->cpu_no_qs.b.norm = true; | 3772 | rdp->cpu_no_qs.b.norm = true; |
3893 | rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); | 3773 | rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); |
3894 | rdp->core_needs_qs = false; | 3774 | rdp->core_needs_qs = false; |
3895 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | 3775 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); |
3896 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 3776 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
3897 | } | 3777 | } |
3898 | 3778 | ||
3779 | /* | ||
3780 | * Invoked early in the CPU-online process, when pretty much all | ||
3781 | * services are available. The incoming CPU is not present. | ||
3782 | */ | ||
3899 | int rcutree_prepare_cpu(unsigned int cpu) | 3783 | int rcutree_prepare_cpu(unsigned int cpu) |
3900 | { | 3784 | { |
3901 | struct rcu_state *rsp; | 3785 | struct rcu_state *rsp; |
@@ -3909,6 +3793,9 @@ int rcutree_prepare_cpu(unsigned int cpu) | |||
3909 | return 0; | 3793 | return 0; |
3910 | } | 3794 | } |
3911 | 3795 | ||
3796 | /* | ||
3797 | * Update RCU priority boot kthread affinity for CPU-hotplug changes. | ||
3798 | */ | ||
3912 | static void rcutree_affinity_setting(unsigned int cpu, int outgoing) | 3799 | static void rcutree_affinity_setting(unsigned int cpu, int outgoing) |
3913 | { | 3800 | { |
3914 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | 3801 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); |
@@ -3916,20 +3803,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) | |||
3916 | rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); | 3803 | rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); |
3917 | } | 3804 | } |
3918 | 3805 | ||
3806 | /* | ||
3807 | * Near the end of the CPU-online process. Pretty much all services | ||
3808 | * enabled, and the CPU is now very much alive. | ||
3809 | */ | ||
3919 | int rcutree_online_cpu(unsigned int cpu) | 3810 | int rcutree_online_cpu(unsigned int cpu) |
3920 | { | 3811 | { |
3921 | sync_sched_exp_online_cleanup(cpu); | 3812 | sync_sched_exp_online_cleanup(cpu); |
3922 | rcutree_affinity_setting(cpu, -1); | 3813 | rcutree_affinity_setting(cpu, -1); |
3814 | if (IS_ENABLED(CONFIG_TREE_SRCU)) | ||
3815 | srcu_online_cpu(cpu); | ||
3923 | return 0; | 3816 | return 0; |
3924 | } | 3817 | } |
3925 | 3818 | ||
3819 | /* | ||
3820 | * Near the beginning of the process. The CPU is still very much alive | ||
3821 | * with pretty much all services enabled. | ||
3822 | */ | ||
3926 | int rcutree_offline_cpu(unsigned int cpu) | 3823 | int rcutree_offline_cpu(unsigned int cpu) |
3927 | { | 3824 | { |
3928 | rcutree_affinity_setting(cpu, cpu); | 3825 | rcutree_affinity_setting(cpu, cpu); |
3826 | if (IS_ENABLED(CONFIG_TREE_SRCU)) | ||
3827 | srcu_offline_cpu(cpu); | ||
3929 | return 0; | 3828 | return 0; |
3930 | } | 3829 | } |
3931 | 3830 | ||
3932 | 3831 | /* | |
3832 | * Near the end of the offline process. We do only tracing here. | ||
3833 | */ | ||
3933 | int rcutree_dying_cpu(unsigned int cpu) | 3834 | int rcutree_dying_cpu(unsigned int cpu) |
3934 | { | 3835 | { |
3935 | struct rcu_state *rsp; | 3836 | struct rcu_state *rsp; |
@@ -3939,6 +3840,9 @@ int rcutree_dying_cpu(unsigned int cpu) | |||
3939 | return 0; | 3840 | return 0; |
3940 | } | 3841 | } |
3941 | 3842 | ||
3843 | /* | ||
3844 | * The outgoing CPU is gone and we are running elsewhere. | ||
3845 | */ | ||
3942 | int rcutree_dead_cpu(unsigned int cpu) | 3846 | int rcutree_dead_cpu(unsigned int cpu) |
3943 | { | 3847 | { |
3944 | struct rcu_state *rsp; | 3848 | struct rcu_state *rsp; |
@@ -3956,6 +3860,10 @@ int rcutree_dead_cpu(unsigned int cpu) | |||
3956 | * incoming CPUs are not allowed to use RCU read-side critical sections | 3860 | * incoming CPUs are not allowed to use RCU read-side critical sections |
3957 | * until this function is called. Failing to observe this restriction | 3861 | * until this function is called. Failing to observe this restriction |
3958 | * will result in lockdep splats. | 3862 | * will result in lockdep splats. |
3863 | * | ||
3864 | * Note that this function is special in that it is invoked directly | ||
3865 | * from the incoming CPU rather than from the cpuhp_step mechanism. | ||
3866 | * This is because this function must be invoked at a precise location. | ||
3959 | */ | 3867 | */ |
3960 | void rcu_cpu_starting(unsigned int cpu) | 3868 | void rcu_cpu_starting(unsigned int cpu) |
3961 | { | 3869 | { |
@@ -3981,9 +3889,6 @@ void rcu_cpu_starting(unsigned int cpu) | |||
3981 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | 3889 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() |
3982 | * function. We now remove it from the rcu_node tree's ->qsmaskinit | 3890 | * function. We now remove it from the rcu_node tree's ->qsmaskinit |
3983 | * bit masks. | 3891 | * bit masks. |
3984 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | ||
3985 | * function. We now remove it from the rcu_node tree's ->qsmaskinit | ||
3986 | * bit masks. | ||
3987 | */ | 3892 | */ |
3988 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | 3893 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) |
3989 | { | 3894 | { |
@@ -3999,6 +3904,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | |||
3999 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 3904 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
4000 | } | 3905 | } |
4001 | 3906 | ||
3907 | /* | ||
3908 | * The outgoing function has no further need of RCU, so remove it from | ||
3909 | * the list of CPUs that RCU must track. | ||
3910 | * | ||
3911 | * Note that this function is special in that it is invoked directly | ||
3912 | * from the outgoing CPU rather than from the cpuhp_step mechanism. | ||
3913 | * This is because this function must be invoked at a precise location. | ||
3914 | */ | ||
4002 | void rcu_report_dead(unsigned int cpu) | 3915 | void rcu_report_dead(unsigned int cpu) |
4003 | { | 3916 | { |
4004 | struct rcu_state *rsp; | 3917 | struct rcu_state *rsp; |
@@ -4013,6 +3926,10 @@ void rcu_report_dead(unsigned int cpu) | |||
4013 | } | 3926 | } |
4014 | #endif | 3927 | #endif |
4015 | 3928 | ||
3929 | /* | ||
3930 | * On non-huge systems, use expedited RCU grace periods to make suspend | ||
3931 | * and hibernation run faster. | ||
3932 | */ | ||
4016 | static int rcu_pm_notify(struct notifier_block *self, | 3933 | static int rcu_pm_notify(struct notifier_block *self, |
4017 | unsigned long action, void *hcpu) | 3934 | unsigned long action, void *hcpu) |
4018 | { | 3935 | { |
@@ -4083,7 +4000,7 @@ early_initcall(rcu_spawn_gp_kthread); | |||
4083 | * task is booting the system, and such primitives are no-ops). After this | 4000 | * task is booting the system, and such primitives are no-ops). After this |
4084 | * function is called, any synchronous grace-period primitives are run as | 4001 | * function is called, any synchronous grace-period primitives are run as |
4085 | * expedited, with the requesting task driving the grace period forward. | 4002 | * expedited, with the requesting task driving the grace period forward. |
4086 | * A later core_initcall() rcu_exp_runtime_mode() will switch to full | 4003 | * A later core_initcall() rcu_set_runtime_mode() will switch to full |
4087 | * runtime RCU functionality. | 4004 | * runtime RCU functionality. |
4088 | */ | 4005 | */ |
4089 | void rcu_scheduler_starting(void) | 4006 | void rcu_scheduler_starting(void) |
@@ -4096,31 +4013,6 @@ void rcu_scheduler_starting(void) | |||
4096 | } | 4013 | } |
4097 | 4014 | ||
4098 | /* | 4015 | /* |
4099 | * Compute the per-level fanout, either using the exact fanout specified | ||
4100 | * or balancing the tree, depending on the rcu_fanout_exact boot parameter. | ||
4101 | */ | ||
4102 | static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) | ||
4103 | { | ||
4104 | int i; | ||
4105 | |||
4106 | if (rcu_fanout_exact) { | ||
4107 | levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | ||
4108 | for (i = rcu_num_lvls - 2; i >= 0; i--) | ||
4109 | levelspread[i] = RCU_FANOUT; | ||
4110 | } else { | ||
4111 | int ccur; | ||
4112 | int cprv; | ||
4113 | |||
4114 | cprv = nr_cpu_ids; | ||
4115 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | ||
4116 | ccur = levelcnt[i]; | ||
4117 | levelspread[i] = (cprv + ccur - 1) / ccur; | ||
4118 | cprv = ccur; | ||
4119 | } | ||
4120 | } | ||
4121 | } | ||
4122 | |||
4123 | /* | ||
4124 | * Helper function for rcu_init() that initializes one rcu_state structure. | 4016 | * Helper function for rcu_init() that initializes one rcu_state structure. |
4125 | */ | 4017 | */ |
4126 | static void __init rcu_init_one(struct rcu_state *rsp) | 4018 | static void __init rcu_init_one(struct rcu_state *rsp) |
@@ -4129,9 +4021,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
4129 | static const char * const fqs[] = RCU_FQS_NAME_INIT; | 4021 | static const char * const fqs[] = RCU_FQS_NAME_INIT; |
4130 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 4022 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
4131 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | 4023 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; |
4132 | static u8 fl_mask = 0x1; | ||
4133 | 4024 | ||
4134 | int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ | ||
4135 | int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ | 4025 | int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ |
4136 | int cpustride = 1; | 4026 | int cpustride = 1; |
4137 | int i; | 4027 | int i; |
@@ -4146,20 +4036,16 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
4146 | 4036 | ||
4147 | /* Initialize the level-tracking arrays. */ | 4037 | /* Initialize the level-tracking arrays. */ |
4148 | 4038 | ||
4149 | for (i = 0; i < rcu_num_lvls; i++) | ||
4150 | levelcnt[i] = num_rcu_lvl[i]; | ||
4151 | for (i = 1; i < rcu_num_lvls; i++) | 4039 | for (i = 1; i < rcu_num_lvls; i++) |
4152 | rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; | 4040 | rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1]; |
4153 | rcu_init_levelspread(levelspread, levelcnt); | 4041 | rcu_init_levelspread(levelspread, num_rcu_lvl); |
4154 | rsp->flavor_mask = fl_mask; | ||
4155 | fl_mask <<= 1; | ||
4156 | 4042 | ||
4157 | /* Initialize the elements themselves, starting from the leaves. */ | 4043 | /* Initialize the elements themselves, starting from the leaves. */ |
4158 | 4044 | ||
4159 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | 4045 | for (i = rcu_num_lvls - 1; i >= 0; i--) { |
4160 | cpustride *= levelspread[i]; | 4046 | cpustride *= levelspread[i]; |
4161 | rnp = rsp->level[i]; | 4047 | rnp = rsp->level[i]; |
4162 | for (j = 0; j < levelcnt[i]; j++, rnp++) { | 4048 | for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) { |
4163 | raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); | 4049 | raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); |
4164 | lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), | 4050 | lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), |
4165 | &rcu_node_class[i], buf[i]); | 4051 | &rcu_node_class[i], buf[i]); |
@@ -4332,6 +4218,8 @@ void __init rcu_init(void) | |||
4332 | for_each_online_cpu(cpu) { | 4218 | for_each_online_cpu(cpu) { |
4333 | rcutree_prepare_cpu(cpu); | 4219 | rcutree_prepare_cpu(cpu); |
4334 | rcu_cpu_starting(cpu); | 4220 | rcu_cpu_starting(cpu); |
4221 | if (IS_ENABLED(CONFIG_TREE_SRCU)) | ||
4222 | srcu_online_cpu(cpu); | ||
4335 | } | 4223 | } |
4336 | } | 4224 | } |
4337 | 4225 | ||
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ec62a05bfdb3..ba38262c3554 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -30,80 +30,9 @@ | |||
30 | #include <linux/seqlock.h> | 30 | #include <linux/seqlock.h> |
31 | #include <linux/swait.h> | 31 | #include <linux/swait.h> |
32 | #include <linux/stop_machine.h> | 32 | #include <linux/stop_machine.h> |
33 | #include <linux/rcu_node_tree.h> | ||
33 | 34 | ||
34 | /* | 35 | #include "rcu_segcblist.h" |
35 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and | ||
36 | * CONFIG_RCU_FANOUT_LEAF. | ||
37 | * In theory, it should be possible to add more levels straightforwardly. | ||
38 | * In practice, this did work well going from three levels to four. | ||
39 | * Of course, your mileage may vary. | ||
40 | */ | ||
41 | |||
42 | #ifdef CONFIG_RCU_FANOUT | ||
43 | #define RCU_FANOUT CONFIG_RCU_FANOUT | ||
44 | #else /* #ifdef CONFIG_RCU_FANOUT */ | ||
45 | # ifdef CONFIG_64BIT | ||
46 | # define RCU_FANOUT 64 | ||
47 | # else | ||
48 | # define RCU_FANOUT 32 | ||
49 | # endif | ||
50 | #endif /* #else #ifdef CONFIG_RCU_FANOUT */ | ||
51 | |||
52 | #ifdef CONFIG_RCU_FANOUT_LEAF | ||
53 | #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF | ||
54 | #else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ | ||
55 | # ifdef CONFIG_64BIT | ||
56 | # define RCU_FANOUT_LEAF 64 | ||
57 | # else | ||
58 | # define RCU_FANOUT_LEAF 32 | ||
59 | # endif | ||
60 | #endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ | ||
61 | |||
62 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | ||
63 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) | ||
64 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) | ||
65 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) | ||
66 | |||
67 | #if NR_CPUS <= RCU_FANOUT_1 | ||
68 | # define RCU_NUM_LVLS 1 | ||
69 | # define NUM_RCU_LVL_0 1 | ||
70 | # define NUM_RCU_NODES NUM_RCU_LVL_0 | ||
71 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } | ||
72 | # define RCU_NODE_NAME_INIT { "rcu_node_0" } | ||
73 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } | ||
74 | #elif NR_CPUS <= RCU_FANOUT_2 | ||
75 | # define RCU_NUM_LVLS 2 | ||
76 | # define NUM_RCU_LVL_0 1 | ||
77 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
78 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) | ||
79 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } | ||
80 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } | ||
81 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } | ||
82 | #elif NR_CPUS <= RCU_FANOUT_3 | ||
83 | # define RCU_NUM_LVLS 3 | ||
84 | # define NUM_RCU_LVL_0 1 | ||
85 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
86 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
87 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) | ||
88 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } | ||
89 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } | ||
90 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } | ||
91 | #elif NR_CPUS <= RCU_FANOUT_4 | ||
92 | # define RCU_NUM_LVLS 4 | ||
93 | # define NUM_RCU_LVL_0 1 | ||
94 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) | ||
95 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
96 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
97 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) | ||
98 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } | ||
99 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } | ||
100 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } | ||
101 | #else | ||
102 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | ||
103 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ | ||
104 | |||
105 | extern int rcu_num_lvls; | ||
106 | extern int rcu_num_nodes; | ||
107 | 36 | ||
108 | /* | 37 | /* |
109 | * Dynticks per-CPU state. | 38 | * Dynticks per-CPU state. |
@@ -113,6 +42,9 @@ struct rcu_dynticks { | |||
113 | /* Process level is worth LLONG_MAX/2. */ | 42 | /* Process level is worth LLONG_MAX/2. */ |
114 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 43 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
115 | atomic_t dynticks; /* Even value for idle, else odd. */ | 44 | atomic_t dynticks; /* Even value for idle, else odd. */ |
45 | bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ | ||
46 | unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ | ||
47 | bool rcu_urgent_qs; /* GP old need light quiescent state. */ | ||
116 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 48 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
117 | long long dynticks_idle_nesting; | 49 | long long dynticks_idle_nesting; |
118 | /* irq/process nesting level from idle. */ | 50 | /* irq/process nesting level from idle. */ |
@@ -262,41 +194,6 @@ struct rcu_node { | |||
262 | #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) | 194 | #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) |
263 | 195 | ||
264 | /* | 196 | /* |
265 | * Do a full breadth-first scan of the rcu_node structures for the | ||
266 | * specified rcu_state structure. | ||
267 | */ | ||
268 | #define rcu_for_each_node_breadth_first(rsp, rnp) \ | ||
269 | for ((rnp) = &(rsp)->node[0]; \ | ||
270 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
271 | |||
272 | /* | ||
273 | * Do a breadth-first scan of the non-leaf rcu_node structures for the | ||
274 | * specified rcu_state structure. Note that if there is a singleton | ||
275 | * rcu_node tree with but one rcu_node structure, this loop is a no-op. | ||
276 | */ | ||
277 | #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ | ||
278 | for ((rnp) = &(rsp)->node[0]; \ | ||
279 | (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) | ||
280 | |||
281 | /* | ||
282 | * Scan the leaves of the rcu_node hierarchy for the specified rcu_state | ||
283 | * structure. Note that if there is a singleton rcu_node tree with but | ||
284 | * one rcu_node structure, this loop -will- visit the rcu_node structure. | ||
285 | * It is still a leaf node, even if it is also the root node. | ||
286 | */ | ||
287 | #define rcu_for_each_leaf_node(rsp, rnp) \ | ||
288 | for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ | ||
289 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
290 | |||
291 | /* | ||
292 | * Iterate over all possible CPUs in a leaf RCU node. | ||
293 | */ | ||
294 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ | ||
295 | for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ | ||
296 | cpu <= rnp->grphi; \ | ||
297 | cpu = cpumask_next((cpu), cpu_possible_mask)) | ||
298 | |||
299 | /* | ||
300 | * Union to allow "aggregate OR" operation on the need for a quiescent | 197 | * Union to allow "aggregate OR" operation on the need for a quiescent |
301 | * state by the normal and expedited grace periods. | 198 | * state by the normal and expedited grace periods. |
302 | */ | 199 | */ |
@@ -336,34 +233,9 @@ struct rcu_data { | |||
336 | /* period it is aware of. */ | 233 | /* period it is aware of. */ |
337 | 234 | ||
338 | /* 2) batch handling */ | 235 | /* 2) batch handling */ |
339 | /* | 236 | struct rcu_segcblist cblist; /* Segmented callback list, with */ |
340 | * If nxtlist is not NULL, it is partitioned as follows. | 237 | /* different callbacks waiting for */ |
341 | * Any of the partitions might be empty, in which case the | 238 | /* different grace periods. */ |
342 | * pointer to that partition will be equal to the pointer for | ||
343 | * the following partition. When the list is empty, all of | ||
344 | * the nxttail elements point to the ->nxtlist pointer itself, | ||
345 | * which in that case is NULL. | ||
346 | * | ||
347 | * [nxtlist, *nxttail[RCU_DONE_TAIL]): | ||
348 | * Entries that batch # <= ->completed | ||
349 | * The grace period for these entries has completed, and | ||
350 | * the other grace-period-completed entries may be moved | ||
351 | * here temporarily in rcu_process_callbacks(). | ||
352 | * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): | ||
353 | * Entries that batch # <= ->completed - 1: waiting for current GP | ||
354 | * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): | ||
355 | * Entries known to have arrived before current GP ended | ||
356 | * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): | ||
357 | * Entries that might have arrived after current GP ended | ||
358 | * Note that the value of *nxttail[RCU_NEXT_TAIL] will | ||
359 | * always be NULL, as this is the end of the list. | ||
360 | */ | ||
361 | struct rcu_head *nxtlist; | ||
362 | struct rcu_head **nxttail[RCU_NEXT_SIZE]; | ||
363 | unsigned long nxtcompleted[RCU_NEXT_SIZE]; | ||
364 | /* grace periods for sublists. */ | ||
365 | long qlen_lazy; /* # of lazy queued callbacks */ | ||
366 | long qlen; /* # of queued callbacks, incl lazy */ | ||
367 | long qlen_last_fqs_check; | 239 | long qlen_last_fqs_check; |
368 | /* qlen at last check for QS forcing */ | 240 | /* qlen at last check for QS forcing */ |
369 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | 241 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ |
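The block comment removed above described how ->nxtlist was partitioned by the ->nxttail[] pointer array into DONE, WAIT, NEXT_READY and NEXT sublists; struct rcu_segcblist now encapsulates that layout behind the rcu_segcblist_*() accessors used throughout this patch. The stand-alone sketch below models the idea, a singly linked list whose empty segments share tail pointers; all names in it are illustrative and it is not the kernel's rcu_segcblist implementation.

#include <stdio.h>
#include <stddef.h>

/* Segment indices mirroring the partition described in the removed comment:
 * DONE (ready to invoke), WAIT (waiting for the current grace period),
 * NEXT_READY (arrived before the current GP ended), NEXT (just arrived). */
enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, SEG_COUNT };

struct cb {
	struct cb *next;
	void (*func)(struct cb *cbp);
};

/* One singly linked list, four segments: tails[i] points at the ->next
 * pointer terminating segment i, so an empty segment simply shares the
 * tail pointer of the segment before it. */
struct seg_list {
	struct cb *head;
	struct cb **tails[SEG_COUNT];
	long len;
};

static void seg_list_init(struct seg_list *sl)
{
	int i;

	sl->head = NULL;
	sl->len = 0;
	for (i = 0; i < SEG_COUNT; i++)
		sl->tails[i] = &sl->head;
}

/* Newly posted callbacks always enter the NEXT segment. */
static void seg_list_enqueue(struct seg_list *sl, struct cb *cbp)
{
	cbp->next = NULL;
	*sl->tails[SEG_NEXT] = cbp;
	sl->tails[SEG_NEXT] = &cbp->next;
	sl->len++;
}

/* Detach the DONE segment for invocation; like the kernel helper, this
 * sketch leaves ->len alone so the count stays conservative. */
static struct cb *seg_list_extract_done(struct seg_list *sl)
{
	struct cb *done = sl->head;
	struct cb **done_tail = sl->tails[SEG_DONE];
	int i;

	if (done_tail == &sl->head)
		return NULL;			/* DONE segment is empty. */
	sl->head = *done_tail;			/* Later segments keep their CBs. */
	*done_tail = NULL;			/* Terminate the detached list. */
	for (i = 0; i < SEG_COUNT; i++)		/* Segments that shared DONE's  */
		if (sl->tails[i] == done_tail)	/* tail now begin at ->head.    */
			sl->tails[i] = &sl->head;
	return done;
}

static void print_cb(struct cb *cbp)
{
	printf("invoking callback %p\n", (void *)cbp);
}

int main(void)
{
	struct seg_list sl;
	struct cb a = { .func = print_cb }, b = { .func = print_cb };
	struct cb *rhp;
	int i;

	seg_list_init(&sl);
	seg_list_enqueue(&sl, &a);
	seg_list_enqueue(&sl, &b);

	/* Pretend a grace period completed: advance every boundary so both
	 * callbacks land in the DONE segment. */
	for (i = SEG_DONE; i < SEG_NEXT; i++)
		sl.tails[i] = sl.tails[SEG_NEXT];

	for (rhp = seg_list_extract_done(&sl); rhp; rhp = rhp->next)
		rhp->func(rhp);
	return 0;
}

The kernel structure also tracks per-segment grace-period numbers and lazy callback counts, which is why the patch can drop ->qlen, ->qlen_lazy and ->nxtcompleted[] in favor of rcu_segcblist_n_cbs(), rcu_segcblist_n_lazy_cbs() and the segment accessors.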
@@ -482,7 +354,6 @@ struct rcu_state { | |||
482 | struct rcu_node *level[RCU_NUM_LVLS + 1]; | 354 | struct rcu_node *level[RCU_NUM_LVLS + 1]; |
483 | /* Hierarchy levels (+1 to */ | 355 | /* Hierarchy levels (+1 to */ |
484 | /* shut bogus gcc warning) */ | 356 | /* shut bogus gcc warning) */ |
485 | u8 flavor_mask; /* bit in flavor mask. */ | ||
486 | struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ | 357 | struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ |
487 | call_rcu_func_t call; /* call_rcu() flavor. */ | 358 | call_rcu_func_t call; /* call_rcu() flavor. */ |
488 | int ncpus; /* # CPUs seen so far. */ | 359 | int ncpus; /* # CPUs seen so far. */ |
@@ -502,14 +373,11 @@ struct rcu_state { | |||
502 | 373 | ||
503 | raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; | 374 | raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; |
504 | /* Protect following fields. */ | 375 | /* Protect following fields. */ |
505 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | 376 | struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ |
506 | /* need a grace period. */ | 377 | /* need a grace period. */ |
507 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | 378 | struct rcu_cblist orphan_done; /* Orphaned callbacks that */ |
508 | struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ | ||
509 | /* are ready to invoke. */ | 379 | /* are ready to invoke. */ |
510 | struct rcu_head **orphan_donetail; /* Tail of above. */ | 380 | /* (Contains counts.) */ |
511 | long qlen_lazy; /* Number of lazy callbacks. */ | ||
512 | long qlen; /* Total number of callbacks. */ | ||
513 | /* End of fields guarded by orphan_lock. */ | 381 | /* End of fields guarded by orphan_lock. */ |
514 | 382 | ||
515 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 383 | struct mutex barrier_mutex; /* Guards barrier fields. */ |
@@ -596,6 +464,7 @@ extern struct rcu_state rcu_preempt_state; | |||
596 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | 464 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ |
597 | 465 | ||
598 | int rcu_dynticks_snap(struct rcu_dynticks *rdtp); | 466 | int rcu_dynticks_snap(struct rcu_dynticks *rdtp); |
467 | bool rcu_eqs_special_set(int cpu); | ||
599 | 468 | ||
600 | #ifdef CONFIG_RCU_BOOST | 469 | #ifdef CONFIG_RCU_BOOST |
601 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | 470 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); |
@@ -673,6 +542,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); | |||
673 | static void rcu_dynticks_task_enter(void); | 542 | static void rcu_dynticks_task_enter(void); |
674 | static void rcu_dynticks_task_exit(void); | 543 | static void rcu_dynticks_task_exit(void); |
675 | 544 | ||
545 | #ifdef CONFIG_SRCU | ||
546 | void srcu_online_cpu(unsigned int cpu); | ||
547 | void srcu_offline_cpu(unsigned int cpu); | ||
548 | #else /* #ifdef CONFIG_SRCU */ | ||
549 | void srcu_online_cpu(unsigned int cpu) { } | ||
550 | void srcu_offline_cpu(unsigned int cpu) { } | ||
551 | #endif /* #else #ifdef CONFIG_SRCU */ | ||
552 | |||
676 | #endif /* #ifndef RCU_TREE_NONCORE */ | 553 | #endif /* #ifndef RCU_TREE_NONCORE */ |
677 | 554 | ||
678 | #ifdef CONFIG_RCU_TRACE | 555 | #ifdef CONFIG_RCU_TRACE |
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a7b639ccd46e..e513b4ab1197 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
@@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | |||
292 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, | 292 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, |
293 | rnp->grplo, rnp->grphi, | 293 | rnp->grplo, rnp->grphi, |
294 | TPS("wait")); | 294 | TPS("wait")); |
295 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | 295 | wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], |
296 | sync_exp_work_done(rsp, | 296 | sync_exp_work_done(rsp, |
297 | &rdp->exp_workdone2, s)); | 297 | &rdp->exp_workdone2, s)); |
298 | return true; | 298 | return true; |
@@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data) | |||
331 | return; | 331 | return; |
332 | } | 332 | } |
333 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); | 333 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); |
334 | /* Store .exp before .rcu_urgent_qs. */ | ||
335 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); | ||
334 | resched_cpu(smp_processor_id()); | 336 | resched_cpu(smp_processor_id()); |
335 | } | 337 | } |
336 | 338 | ||
@@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
531 | rnp->exp_seq_rq = s; | 533 | rnp->exp_seq_rq = s; |
532 | spin_unlock(&rnp->exp_lock); | 534 | spin_unlock(&rnp->exp_lock); |
533 | } | 535 | } |
534 | wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); | 536 | smp_mb(); /* All above changes before wakeup. */ |
537 | wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]); | ||
535 | } | 538 | } |
536 | trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); | 539 | trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); |
537 | mutex_unlock(&rsp->exp_wake_mutex); | 540 | mutex_unlock(&rsp->exp_wake_mutex); |
@@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, | |||
609 | /* Wait for expedited grace period to complete. */ | 612 | /* Wait for expedited grace period to complete. */ |
610 | rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | 613 | rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); |
611 | rnp = rcu_get_root(rsp); | 614 | rnp = rcu_get_root(rsp); |
612 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | 615 | wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], |
613 | sync_exp_work_done(rsp, | 616 | sync_exp_work_done(rsp, &rdp->exp_workdone0, s)); |
614 | &rdp->exp_workdone0, s)); | 617 | smp_mb(); /* Workqueue actions happen before return. */ |
615 | 618 | ||
616 | /* Let the next expedited grace period start. */ | 619 | /* Let the next expedited grace period start. */ |
617 | mutex_unlock(&rsp->exp_mutex); | 620 | mutex_unlock(&rsp->exp_mutex); |
@@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void) | |||
735 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 738 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
736 | 739 | ||
737 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | 740 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ |
738 | |||
739 | /* | ||
740 | * Switch to run-time mode once Tree RCU has fully initialized. | ||
741 | */ | ||
742 | static int __init rcu_exp_runtime_mode(void) | ||
743 | { | ||
744 | rcu_test_sync_prims(); | ||
745 | rcu_scheduler_active = RCU_SCHEDULER_RUNNING; | ||
746 | rcu_test_sync_prims(); | ||
747 | return 0; | ||
748 | } | ||
749 | core_initcall(rcu_exp_runtime_mode); | ||
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a62a8f1caac..c9a48657512a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -1350,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) | |||
1350 | */ | 1350 | */ |
1351 | if ((rdp->completed != rnp->completed || | 1351 | if ((rdp->completed != rnp->completed || |
1352 | unlikely(READ_ONCE(rdp->gpwrap))) && | 1352 | unlikely(READ_ONCE(rdp->gpwrap))) && |
1353 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) | 1353 | rcu_segcblist_pend_cbs(&rdp->cblist)) |
1354 | note_gp_changes(rsp, rdp); | 1354 | note_gp_changes(rsp, rdp); |
1355 | 1355 | ||
1356 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 1356 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) |
1357 | cbs_ready = true; | 1357 | cbs_ready = true; |
1358 | } | 1358 | } |
1359 | return cbs_ready; | 1359 | return cbs_ready; |
@@ -1461,7 +1461,7 @@ static void rcu_prepare_for_idle(void) | |||
1461 | rdtp->last_accelerate = jiffies; | 1461 | rdtp->last_accelerate = jiffies; |
1462 | for_each_rcu_flavor(rsp) { | 1462 | for_each_rcu_flavor(rsp) { |
1463 | rdp = this_cpu_ptr(rsp->rda); | 1463 | rdp = this_cpu_ptr(rsp->rda); |
1464 | if (!*rdp->nxttail[RCU_DONE_TAIL]) | 1464 | if (rcu_segcblist_pend_cbs(&rdp->cblist)) |
1465 | continue; | 1465 | continue; |
1466 | rnp = rdp->mynode; | 1466 | rnp = rdp->mynode; |
1467 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ | 1467 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ |
@@ -1529,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused) | |||
1529 | 1529 | ||
1530 | for_each_rcu_flavor(rsp) { | 1530 | for_each_rcu_flavor(rsp) { |
1531 | rdp = raw_cpu_ptr(rsp->rda); | 1531 | rdp = raw_cpu_ptr(rsp->rda); |
1532 | if (rdp->qlen_lazy != 0) { | 1532 | if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) { |
1533 | atomic_inc(&oom_callback_count); | 1533 | atomic_inc(&oom_callback_count); |
1534 | rsp->call(&rdp->oom_head, rcu_oom_callback); | 1534 | rsp->call(&rdp->oom_head, rcu_oom_callback); |
1535 | } | 1535 | } |
@@ -1709,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup); | |||
1709 | 1709 | ||
1710 | static int __init parse_rcu_nocb_poll(char *arg) | 1710 | static int __init parse_rcu_nocb_poll(char *arg) |
1711 | { | 1711 | { |
1712 | rcu_nocb_poll = 1; | 1712 | rcu_nocb_poll = true; |
1713 | return 0; | 1713 | return 0; |
1714 | } | 1714 | } |
1715 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | 1715 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); |
@@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
1860 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1860 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1861 | TPS("WakeEmpty")); | 1861 | TPS("WakeEmpty")); |
1862 | } else { | 1862 | } else { |
1863 | rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; | 1863 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); |
1864 | /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ | ||
1865 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); | ||
1864 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1866 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1865 | TPS("WakeEmptyIsDeferred")); | 1867 | TPS("WakeEmptyIsDeferred")); |
1866 | } | 1868 | } |
@@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
1872 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1874 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1873 | TPS("WakeOvf")); | 1875 | TPS("WakeOvf")); |
1874 | } else { | 1876 | } else { |
1875 | rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; | 1877 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); |
1878 | /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ | ||
1879 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); | ||
1876 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1880 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1877 | TPS("WakeOvfIsDeferred")); | 1881 | TPS("WakeOvfIsDeferred")); |
1878 | } | 1882 | } |
@@ -1930,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
1930 | struct rcu_data *rdp, | 1934 | struct rcu_data *rdp, |
1931 | unsigned long flags) | 1935 | unsigned long flags) |
1932 | { | 1936 | { |
1933 | long ql = rsp->qlen; | 1937 | long ql = rsp->orphan_done.len; |
1934 | long qll = rsp->qlen_lazy; | 1938 | long qll = rsp->orphan_done.len_lazy; |
1935 | 1939 | ||
1936 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | 1940 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ |
1937 | if (!rcu_is_nocb_cpu(smp_processor_id())) | 1941 | if (!rcu_is_nocb_cpu(smp_processor_id())) |
1938 | return false; | 1942 | return false; |
1939 | rsp->qlen = 0; | ||
1940 | rsp->qlen_lazy = 0; | ||
1941 | 1943 | ||
1942 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ | 1944 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ |
1943 | if (rsp->orphan_donelist != NULL) { | 1945 | if (rsp->orphan_done.head) { |
1944 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, | 1946 | __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), |
1945 | rsp->orphan_donetail, ql, qll, flags); | 1947 | rcu_cblist_tail(&rsp->orphan_done), |
1946 | ql = qll = 0; | 1948 | ql, qll, flags); |
1947 | rsp->orphan_donelist = NULL; | ||
1948 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
1949 | } | 1949 | } |
1950 | if (rsp->orphan_nxtlist != NULL) { | 1950 | if (rsp->orphan_pend.head) { |
1951 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, | 1951 | __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), |
1952 | rsp->orphan_nxttail, ql, qll, flags); | 1952 | rcu_cblist_tail(&rsp->orphan_pend), |
1953 | ql = qll = 0; | 1953 | ql, qll, flags); |
1954 | rsp->orphan_nxtlist = NULL; | ||
1955 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
1956 | } | 1954 | } |
1955 | rcu_cblist_init(&rsp->orphan_done); | ||
1956 | rcu_cblist_init(&rsp->orphan_pend); | ||
1957 | return true; | 1957 | return true; |
1958 | } | 1958 | } |
1959 | 1959 | ||
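The orphan-callback handling above now works on rsp->orphan_done and rsp->orphan_pend, plain callback lists that carry a head pointer, a tail pointer-to-pointer and the length counts, instead of the four open-coded orphan_*list/orphan_*tail fields. A minimal userspace sketch of that data-structure shape (illustrative names, not the kernel's rcu_cblist API):

#include <stddef.h>

struct cb {                     /* stands in for struct rcu_head */
    struct cb *next;
};

struct cblist {                 /* stands in for struct rcu_cblist */
    struct cb *head;
    struct cb **tail;           /* points at the last ->next pointer */
    long len;
    long len_lazy;
};

static void cblist_init(struct cblist *clp)
{
    clp->head = NULL;
    clp->tail = &clp->head;
    clp->len = 0;
    clp->len_lazy = 0;
}

static void cblist_enqueue(struct cblist *clp, struct cb *cbp, int lazy)
{
    cbp->next = NULL;
    *clp->tail = cbp;           /* append at the tail... */
    clp->tail = &cbp->next;     /* ...and advance the tail pointer */
    clp->len++;
    if (lazy)
        clp->len_lazy++;
}

int main(void)
{
    struct cblist done;
    struct cb a, b;

    cblist_init(&done);
    cblist_enqueue(&done, &a, 0);
    cblist_enqueue(&done, &b, 1);
    return (done.len == 2 && done.len_lazy == 1) ? 0 : 1;
}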
@@ -2395,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) | |||
2395 | return false; | 2395 | return false; |
2396 | 2396 | ||
2397 | /* If there are early-boot callbacks, move them to nocb lists. */ | 2397 | /* If there are early-boot callbacks, move them to nocb lists. */ |
2398 | if (rdp->nxtlist) { | 2398 | if (!rcu_segcblist_empty(&rdp->cblist)) { |
2399 | rdp->nocb_head = rdp->nxtlist; | 2399 | rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); |
2400 | rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; | 2400 | rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); |
2401 | atomic_long_set(&rdp->nocb_q_count, rdp->qlen); | 2401 | atomic_long_set(&rdp->nocb_q_count, |
2402 | atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); | 2402 | rcu_segcblist_n_cbs(&rdp->cblist)); |
2403 | rdp->nxtlist = NULL; | 2403 | atomic_long_set(&rdp->nocb_q_count_lazy, |
2404 | rdp->qlen = 0; | 2404 | rcu_segcblist_n_lazy_cbs(&rdp->cblist)); |
2405 | rdp->qlen_lazy = 0; | 2405 | rcu_segcblist_init(&rdp->cblist); |
2406 | } | 2406 | } |
2407 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 2407 | rcu_segcblist_disable(&rdp->cblist); |
2408 | return true; | 2408 | return true; |
2409 | } | 2409 | } |
2410 | 2410 | ||
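init_nocb_callback_list() and the earlier tree_plugin.h hunks complete the move from the old ->nxtlist/->nxttail[] arrays to a segmented callback list: one chain of callbacks with per-segment tail pointers for the DONE, WAIT, NEXT_READY and NEXT groups, which is what rcu_segcblist_ready_cbs() and rcu_segcblist_segempty() query. A compressed sketch of the idea (the struct, enum and helpers below are illustrative, not the kernel's rcu_segcblist implementation):

#include <stdbool.h>
#include <stddef.h>

struct cb { struct cb *next; };

enum seg { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, SEG_MAX };

struct segcblist {
    struct cb *head;
    struct cb **tails[SEG_MAX];     /* tails[i] ends segment i */
    long len;
};

/* A segment is empty when its tail equals the previous segment's tail. */
static bool seg_empty(struct segcblist *sclp, enum seg s)
{
    if (s == SEG_DONE)
        return sclp->tails[SEG_DONE] == &sclp->head;
    return sclp->tails[s] == sclp->tails[s - 1];
}

/* Callbacks are ready to invoke iff the DONE segment is non-empty. */
static bool ready_cbs(struct segcblist *sclp)
{
    return !seg_empty(sclp, SEG_DONE);
}

int main(void)
{
    struct cb first = { NULL };
    struct segcblist l = { .head = &first, .len = 1 };
    int i;

    /* One callback, already in the DONE segment; the rest are empty. */
    for (i = 0; i < SEG_MAX; i++)
        l.tails[i] = &first.next;

    return ready_cbs(&l) ? 0 : 1;   /* exits 0: callbacks are ready */
}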
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8751a748499a..6cea17a1ea30 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
@@ -41,11 +41,11 @@ | |||
41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <linux/debugfs.h> | 42 | #include <linux/debugfs.h> |
43 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
44 | #include <linux/prefetch.h> | ||
44 | 45 | ||
45 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
46 | #include "tree.h" | 47 | #include "tree.h" |
47 | 48 | #include "rcu.h" | |
48 | DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); | ||
49 | 49 | ||
50 | static int r_open(struct inode *inode, struct file *file, | 50 | static int r_open(struct inode *inode, struct file *file, |
51 | const struct seq_operations *op) | 51 | const struct seq_operations *op) |
@@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
121 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 121 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
122 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), | 122 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), |
123 | rdp->cpu_no_qs.b.norm, | 123 | rdp->cpu_no_qs.b.norm, |
124 | rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), | 124 | rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), |
125 | rdp->core_needs_qs); | 125 | rdp->core_needs_qs); |
126 | seq_printf(m, " dt=%d/%llx/%d df=%lu", | 126 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
127 | rcu_dynticks_snap(rdp->dynticks), | 127 | rcu_dynticks_snap(rdp->dynticks), |
@@ -130,17 +130,15 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
130 | rdp->dynticks_fqs); | 130 | rdp->dynticks_fqs); |
131 | seq_printf(m, " of=%lu", rdp->offline_fqs); | 131 | seq_printf(m, " of=%lu", rdp->offline_fqs); |
132 | rcu_nocb_q_lengths(rdp, &ql, &qll); | 132 | rcu_nocb_q_lengths(rdp, &ql, &qll); |
133 | qll += rdp->qlen_lazy; | 133 | qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist); |
134 | ql += rdp->qlen; | 134 | ql += rcu_segcblist_n_cbs(&rdp->cblist); |
135 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", | 135 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", |
136 | qll, ql, | 136 | qll, ql, |
137 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | 137 | ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)], |
138 | rdp->nxttail[RCU_NEXT_TAIL]], | 138 | ".R"[!rcu_segcblist_segempty(&rdp->cblist, |
139 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | 139 | RCU_NEXT_READY_TAIL)], |
140 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | 140 | ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)], |
141 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | 141 | ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]); |
142 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
143 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
144 | #ifdef CONFIG_RCU_BOOST | 142 | #ifdef CONFIG_RCU_BOOST |
145 | seq_printf(m, " kt=%d/%c ktl=%x", | 143 | seq_printf(m, " kt=%d/%c ktl=%x", |
146 | per_cpu(rcu_cpu_has_work, rdp->cpu), | 144 | per_cpu(rcu_cpu_has_work, rdp->cpu), |
@@ -278,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
278 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | 276 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
279 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 277 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
280 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
281 | READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); | 279 | READ_ONCE(rsp->n_force_qs_lh), |
280 | rsp->orphan_done.len_lazy, | ||
281 | rsp->orphan_done.len); | ||
282 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { | 282 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { |
283 | if (rnp->level != level) { | 283 | if (rnp->level != level) { |
284 | seq_puts(m, "\n"); | 284 | seq_puts(m, "\n"); |
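The qs=%c%c%c%c output above leans on a compact C idiom: indexing a two-character string literal with a boolean, so a false condition prints '.' and a true one prints the segment letter. A standalone illustration (the flag values are made up):

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
    bool next = true, ready = false, wait = true, done = false;

    /* ".X"[flag] is 'X' when flag is 1 and '.' when it is 0. */
    printf("qs=%c%c%c%c\n",
           ".N"[next], ".R"[ready], ".W"[wait], ".D"[done]);
    return 0;   /* prints: qs=N.W. */
}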
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 55c8530316c7..273e869ca21d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); | |||
124 | * non-expedited counterparts? Intended for use within RCU. Note | 124 | * non-expedited counterparts? Intended for use within RCU. Note |
125 | * that if the user specifies both rcu_expedited and rcu_normal, then | 125 | * that if the user specifies both rcu_expedited and rcu_normal, then |
126 | * rcu_normal wins. (Except during the time period during boot from | 126 | * rcu_normal wins. (Except during the time period during boot from |
127 | * when the first task is spawned until the rcu_exp_runtime_mode() | 127 | * when the first task is spawned until the rcu_set_runtime_mode() |
128 | * core_initcall() is invoked, at which point everything is expedited.) | 128 | * core_initcall() is invoked, at which point everything is expedited.) |
129 | */ | 129 | */ |
130 | bool rcu_gp_is_normal(void) | 130 | bool rcu_gp_is_normal(void) |
@@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void) | |||
190 | 190 | ||
191 | #endif /* #ifndef CONFIG_TINY_RCU */ | 191 | #endif /* #ifndef CONFIG_TINY_RCU */ |
192 | 192 | ||
193 | /* | ||
194 | * Test each non-SRCU synchronous grace-period wait API. This is | ||
195 | * useful just after a change in mode for these primitives, and | ||
196 | * during early boot. | ||
197 | */ | ||
198 | void rcu_test_sync_prims(void) | ||
199 | { | ||
200 | if (!IS_ENABLED(CONFIG_PROVE_RCU)) | ||
201 | return; | ||
202 | synchronize_rcu(); | ||
203 | synchronize_rcu_bh(); | ||
204 | synchronize_sched(); | ||
205 | synchronize_rcu_expedited(); | ||
206 | synchronize_rcu_bh_expedited(); | ||
207 | synchronize_sched_expedited(); | ||
208 | } | ||
209 | |||
210 | #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) | ||
211 | |||
212 | /* | ||
213 | * Switch to run-time mode once RCU has fully initialized. | ||
214 | */ | ||
215 | static int __init rcu_set_runtime_mode(void) | ||
216 | { | ||
217 | rcu_test_sync_prims(); | ||
218 | rcu_scheduler_active = RCU_SCHEDULER_RUNNING; | ||
219 | rcu_test_sync_prims(); | ||
220 | return 0; | ||
221 | } | ||
222 | core_initcall(rcu_set_runtime_mode); | ||
223 | |||
224 | #endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ | ||
225 | |||
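rcu_set_runtime_mode() runs the synchronous grace-period self-tests once while still in the early-boot (expedited) mode, flips rcu_scheduler_active to RCU_SCHEDULER_RUNNING, and runs them again, so both sides of the transition get exercised. Outside the kernel, the same test/flip/test pattern looks roughly like this (all names below are illustrative; printf stands in for the real synchronize_rcu() family):

#include <stdbool.h>
#include <stdio.h>

static bool runtime_mode;               /* stands in for rcu_scheduler_active */

static void test_sync_prims(void)
{
    /* In the kernel this calls synchronize_rcu() and friends. */
    printf("self-test in %s mode\n", runtime_mode ? "runtime" : "boot");
}

static int set_runtime_mode(void)       /* analogous to a core_initcall() */
{
    test_sync_prims();                  /* exercise the boot-time paths */
    runtime_mode = true;                /* switch over */
    test_sync_prims();                  /* exercise the runtime paths */
    return 0;
}

int main(void)
{
    return set_runtime_mode();
}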
193 | #ifdef CONFIG_PREEMPT_RCU | 226 | #ifdef CONFIG_PREEMPT_RCU |
194 | 227 | ||
195 | /* | 228 | /* |
@@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t, | |||
632 | put_task_struct(t); | 665 | put_task_struct(t); |
633 | return; | 666 | return; |
634 | } | 667 | } |
668 | rcu_request_urgent_qs_task(t); | ||
635 | if (!needreport) | 669 | if (!needreport) |
636 | return; | 670 | return; |
637 | if (*firstreport) { | 671 | if (*firstreport) { |
@@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void) | |||
817 | 851 | ||
818 | #endif /* #ifdef CONFIG_TASKS_RCU */ | 852 | #endif /* #ifdef CONFIG_TASKS_RCU */ |
819 | 853 | ||
820 | /* | ||
821 | * Test each non-SRCU synchronous grace-period wait API. This is | ||
822 | * useful just after a change in mode for these primitives, and | ||
823 | * during early boot. | ||
824 | */ | ||
825 | void rcu_test_sync_prims(void) | ||
826 | { | ||
827 | if (!IS_ENABLED(CONFIG_PROVE_RCU)) | ||
828 | return; | ||
829 | synchronize_rcu(); | ||
830 | synchronize_rcu_bh(); | ||
831 | synchronize_sched(); | ||
832 | synchronize_rcu_expedited(); | ||
833 | synchronize_rcu_bh_expedited(); | ||
834 | synchronize_sched_expedited(); | ||
835 | } | ||
836 | |||
837 | #ifdef CONFIG_PROVE_RCU | 854 | #ifdef CONFIG_PROVE_RCU |
838 | 855 | ||
839 | /* | 856 | /* |
diff --git a/kernel/relay.c b/kernel/relay.c index 0e413d9eec8a..39a9dfc69486 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -1212,7 +1212,6 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1212 | .nr_pages = 0, | 1212 | .nr_pages = 0, |
1213 | .nr_pages_max = PIPE_DEF_BUFFERS, | 1213 | .nr_pages_max = PIPE_DEF_BUFFERS, |
1214 | .partial = partial, | 1214 | .partial = partial, |
1215 | .flags = flags, | ||
1216 | .ops = &relay_pipe_buf_ops, | 1215 | .ops = &relay_pipe_buf_ops, |
1217 | .spd_release = relay_page_release, | 1216 | .spd_release = relay_page_release, |
1218 | }; | 1217 | }; |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b31fc05a0f1..803c3bc274c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -86,21 +86,6 @@ int sysctl_sched_rt_runtime = 950000; | |||
86 | cpumask_var_t cpu_isolated_map; | 86 | cpumask_var_t cpu_isolated_map; |
87 | 87 | ||
88 | /* | 88 | /* |
89 | * this_rq_lock - lock this runqueue and disable interrupts. | ||
90 | */ | ||
91 | static struct rq *this_rq_lock(void) | ||
92 | __acquires(rq->lock) | ||
93 | { | ||
94 | struct rq *rq; | ||
95 | |||
96 | local_irq_disable(); | ||
97 | rq = this_rq(); | ||
98 | raw_spin_lock(&rq->lock); | ||
99 | |||
100 | return rq; | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * __task_rq_lock - lock the rq @p resides on. | 89 | * __task_rq_lock - lock the rq @p resides on. |
105 | */ | 90 | */ |
106 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) | 91 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) |
@@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq) | |||
233 | return; | 218 | return; |
234 | 219 | ||
235 | #ifdef CONFIG_SCHED_DEBUG | 220 | #ifdef CONFIG_SCHED_DEBUG |
221 | if (sched_feat(WARN_DOUBLE_CLOCK)) | ||
222 | SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); | ||
236 | rq->clock_update_flags |= RQCF_UPDATED; | 223 | rq->clock_update_flags |= RQCF_UPDATED; |
237 | #endif | 224 | #endif |
225 | |||
238 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | 226 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
239 | if (delta < 0) | 227 | if (delta < 0) |
240 | return; | 228 | return; |
@@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq) | |||
261 | static enum hrtimer_restart hrtick(struct hrtimer *timer) | 249 | static enum hrtimer_restart hrtick(struct hrtimer *timer) |
262 | { | 250 | { |
263 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); | 251 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); |
252 | struct rq_flags rf; | ||
264 | 253 | ||
265 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | 254 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); |
266 | 255 | ||
267 | raw_spin_lock(&rq->lock); | 256 | rq_lock(rq, &rf); |
268 | update_rq_clock(rq); | 257 | update_rq_clock(rq); |
269 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); | 258 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); |
270 | raw_spin_unlock(&rq->lock); | 259 | rq_unlock(rq, &rf); |
271 | 260 | ||
272 | return HRTIMER_NORESTART; | 261 | return HRTIMER_NORESTART; |
273 | } | 262 | } |
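The hrtick()/__hrtick_start() hunks replace open-coded raw_spin_lock()/rq_pin_lock() pairs with rq_lock()/rq_unlock() helpers that bundle the locking and the lockdep pinning bookkeeping behind one call and one struct rq_flags cookie. A loose userspace analogy of that shape (a sketch only, not the kernel's rq_lock() implementation; a pthread mutex stands in for the raw spinlock):

#include <pthread.h>

struct rq_flags { unsigned int pinned; };   /* saved state for the unlock side */

struct rq {
    pthread_mutex_t lock;                   /* stands in for the raw rq spinlock */
};

/* Roughly what a combined lock+pin helper does: take the lock and record
 * bookkeeping that the matching unlock helper undoes, so call sites need a
 * single pair of calls instead of separate lock/pin and unpin/unlock pairs. */
static void rq_lock_sketch(struct rq *rq, struct rq_flags *rf)
{
    pthread_mutex_lock(&rq->lock);
    rf->pinned = 1;
}

static void rq_unlock_sketch(struct rq *rq, struct rq_flags *rf)
{
    rf->pinned = 0;
    pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
    struct rq rq = { PTHREAD_MUTEX_INITIALIZER };
    struct rq_flags rf;

    rq_lock_sketch(&rq, &rf);
    /* ... critical section: update the clock, pick a task, etc. ... */
    rq_unlock_sketch(&rq, &rf);
    return 0;
}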
@@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq) | |||
287 | static void __hrtick_start(void *arg) | 276 | static void __hrtick_start(void *arg) |
288 | { | 277 | { |
289 | struct rq *rq = arg; | 278 | struct rq *rq = arg; |
279 | struct rq_flags rf; | ||
290 | 280 | ||
291 | raw_spin_lock(&rq->lock); | 281 | rq_lock(rq, &rf); |
292 | __hrtick_restart(rq); | 282 | __hrtick_restart(rq); |
293 | rq->hrtick_csd_pending = 0; | 283 | rq->hrtick_csd_pending = 0; |
294 | raw_spin_unlock(&rq->lock); | 284 | rq_unlock(rq, &rf); |
295 | } | 285 | } |
296 | 286 | ||
297 | /* | 287 | /* |
@@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p) | |||
762 | 752 | ||
763 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 753 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
764 | { | 754 | { |
765 | update_rq_clock(rq); | 755 | if (!(flags & ENQUEUE_NOCLOCK)) |
756 | update_rq_clock(rq); | ||
757 | |||
766 | if (!(flags & ENQUEUE_RESTORE)) | 758 | if (!(flags & ENQUEUE_RESTORE)) |
767 | sched_info_queued(rq, p); | 759 | sched_info_queued(rq, p); |
760 | |||
768 | p->sched_class->enqueue_task(rq, p, flags); | 761 | p->sched_class->enqueue_task(rq, p, flags); |
769 | } | 762 | } |
770 | 763 | ||
771 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 764 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
772 | { | 765 | { |
773 | update_rq_clock(rq); | 766 | if (!(flags & DEQUEUE_NOCLOCK)) |
767 | update_rq_clock(rq); | ||
768 | |||
774 | if (!(flags & DEQUEUE_SAVE)) | 769 | if (!(flags & DEQUEUE_SAVE)) |
775 | sched_info_dequeued(rq, p); | 770 | sched_info_dequeued(rq, p); |
771 | |||
776 | p->sched_class->dequeue_task(rq, p, flags); | 772 | p->sched_class->dequeue_task(rq, p, flags); |
777 | } | 773 | } |
778 | 774 | ||
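enqueue_task() and dequeue_task() now skip update_rq_clock() whenever the caller passes ENQUEUE_NOCLOCK or DEQUEUE_NOCLOCK, so a path that already updated the clock under the same rq->lock (as the converted callers later in this patch do) neither pays for a second update nor trips the WARN_DOUBLE_CLOCK check added above. The guard pattern in isolation (simplified, with made-up names):

#include <stdio.h>

#define QUEUE_NOCLOCK   0x01    /* caller already updated the clock */

struct runqueue { unsigned long long clock; int clock_updated; };

static void update_clock(struct runqueue *rq)
{
    if (rq->clock_updated)      /* analogue of the WARN_DOUBLE_CLOCK check */
        fprintf(stderr, "double clock update\n");
    rq->clock++;                /* pretend to read a real clocksource */
    rq->clock_updated = 1;
}

static void enqueue(struct runqueue *rq, int flags)
{
    if (!(flags & QUEUE_NOCLOCK))
        update_clock(rq);
    /* ... actual enqueue work ... */
}

int main(void)
{
    struct runqueue rq = { 0, 0 };

    update_clock(&rq);              /* the caller updates once ... */
    enqueue(&rq, QUEUE_NOCLOCK);    /* ... and tells enqueue to skip it */
    return 0;
}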
@@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
946 | * | 942 | * |
947 | * Returns (locked) new rq. Old rq's lock is released. | 943 | * Returns (locked) new rq. Old rq's lock is released. |
948 | */ | 944 | */ |
949 | static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) | 945 | static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, |
946 | struct task_struct *p, int new_cpu) | ||
950 | { | 947 | { |
951 | lockdep_assert_held(&rq->lock); | 948 | lockdep_assert_held(&rq->lock); |
952 | 949 | ||
953 | p->on_rq = TASK_ON_RQ_MIGRATING; | 950 | p->on_rq = TASK_ON_RQ_MIGRATING; |
954 | dequeue_task(rq, p, 0); | 951 | dequeue_task(rq, p, DEQUEUE_NOCLOCK); |
955 | set_task_cpu(p, new_cpu); | 952 | set_task_cpu(p, new_cpu); |
956 | raw_spin_unlock(&rq->lock); | 953 | rq_unlock(rq, rf); |
957 | 954 | ||
958 | rq = cpu_rq(new_cpu); | 955 | rq = cpu_rq(new_cpu); |
959 | 956 | ||
960 | raw_spin_lock(&rq->lock); | 957 | rq_lock(rq, rf); |
961 | BUG_ON(task_cpu(p) != new_cpu); | 958 | BUG_ON(task_cpu(p) != new_cpu); |
962 | enqueue_task(rq, p, 0); | 959 | enqueue_task(rq, p, 0); |
963 | p->on_rq = TASK_ON_RQ_QUEUED; | 960 | p->on_rq = TASK_ON_RQ_QUEUED; |
@@ -980,7 +977,8 @@ struct migration_arg { | |||
980 | * So we race with normal scheduler movements, but that's OK, as long | 977 | * So we race with normal scheduler movements, but that's OK, as long |
981 | * as the task is no longer on this CPU. | 978 | * as the task is no longer on this CPU. |
982 | */ | 979 | */ |
983 | static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) | 980 | static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, |
981 | struct task_struct *p, int dest_cpu) | ||
984 | { | 982 | { |
985 | if (unlikely(!cpu_active(dest_cpu))) | 983 | if (unlikely(!cpu_active(dest_cpu))) |
986 | return rq; | 984 | return rq; |
@@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ | |||
989 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 987 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) |
990 | return rq; | 988 | return rq; |
991 | 989 | ||
992 | rq = move_queued_task(rq, p, dest_cpu); | 990 | update_rq_clock(rq); |
991 | rq = move_queued_task(rq, rf, p, dest_cpu); | ||
993 | 992 | ||
994 | return rq; | 993 | return rq; |
995 | } | 994 | } |
@@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data) | |||
1004 | struct migration_arg *arg = data; | 1003 | struct migration_arg *arg = data; |
1005 | struct task_struct *p = arg->task; | 1004 | struct task_struct *p = arg->task; |
1006 | struct rq *rq = this_rq(); | 1005 | struct rq *rq = this_rq(); |
1006 | struct rq_flags rf; | ||
1007 | 1007 | ||
1008 | /* | 1008 | /* |
1009 | * The original target CPU might have gone down and we might | 1009 | * The original target CPU might have gone down and we might |
@@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data) | |||
1018 | sched_ttwu_pending(); | 1018 | sched_ttwu_pending(); |
1019 | 1019 | ||
1020 | raw_spin_lock(&p->pi_lock); | 1020 | raw_spin_lock(&p->pi_lock); |
1021 | raw_spin_lock(&rq->lock); | 1021 | rq_lock(rq, &rf); |
1022 | /* | 1022 | /* |
1023 | * If task_rq(p) != rq, it cannot be migrated here, because we're | 1023 | * If task_rq(p) != rq, it cannot be migrated here, because we're |
1024 | * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because | 1024 | * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because |
@@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data) | |||
1026 | */ | 1026 | */ |
1027 | if (task_rq(p) == rq) { | 1027 | if (task_rq(p) == rq) { |
1028 | if (task_on_rq_queued(p)) | 1028 | if (task_on_rq_queued(p)) |
1029 | rq = __migrate_task(rq, p, arg->dest_cpu); | 1029 | rq = __migrate_task(rq, &rf, p, arg->dest_cpu); |
1030 | else | 1030 | else |
1031 | p->wake_cpu = arg->dest_cpu; | 1031 | p->wake_cpu = arg->dest_cpu; |
1032 | } | 1032 | } |
1033 | raw_spin_unlock(&rq->lock); | 1033 | rq_unlock(rq, &rf); |
1034 | raw_spin_unlock(&p->pi_lock); | 1034 | raw_spin_unlock(&p->pi_lock); |
1035 | 1035 | ||
1036 | local_irq_enable(); | 1036 | local_irq_enable(); |
@@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
1063 | * holding rq->lock. | 1063 | * holding rq->lock. |
1064 | */ | 1064 | */ |
1065 | lockdep_assert_held(&rq->lock); | 1065 | lockdep_assert_held(&rq->lock); |
1066 | dequeue_task(rq, p, DEQUEUE_SAVE); | 1066 | dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); |
1067 | } | 1067 | } |
1068 | if (running) | 1068 | if (running) |
1069 | put_prev_task(rq, p); | 1069 | put_prev_task(rq, p); |
@@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
1071 | p->sched_class->set_cpus_allowed(p, new_mask); | 1071 | p->sched_class->set_cpus_allowed(p, new_mask); |
1072 | 1072 | ||
1073 | if (queued) | 1073 | if (queued) |
1074 | enqueue_task(rq, p, ENQUEUE_RESTORE); | 1074 | enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); |
1075 | if (running) | 1075 | if (running) |
1076 | set_curr_task(rq, p); | 1076 | set_curr_task(rq, p); |
1077 | } | 1077 | } |
@@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1150 | * OK, since we're going to drop the lock immediately | 1150 | * OK, since we're going to drop the lock immediately |
1151 | * afterwards anyway. | 1151 | * afterwards anyway. |
1152 | */ | 1152 | */ |
1153 | rq_unpin_lock(rq, &rf); | 1153 | rq = move_queued_task(rq, &rf, p, dest_cpu); |
1154 | rq = move_queued_task(rq, p, dest_cpu); | ||
1155 | rq_repin_lock(rq, &rf); | ||
1156 | } | 1154 | } |
1157 | out: | 1155 | out: |
1158 | task_rq_unlock(rq, p, &rf); | 1156 | task_rq_unlock(rq, p, &rf); |
@@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) | |||
1217 | { | 1215 | { |
1218 | if (task_on_rq_queued(p)) { | 1216 | if (task_on_rq_queued(p)) { |
1219 | struct rq *src_rq, *dst_rq; | 1217 | struct rq *src_rq, *dst_rq; |
1218 | struct rq_flags srf, drf; | ||
1220 | 1219 | ||
1221 | src_rq = task_rq(p); | 1220 | src_rq = task_rq(p); |
1222 | dst_rq = cpu_rq(cpu); | 1221 | dst_rq = cpu_rq(cpu); |
1223 | 1222 | ||
1223 | rq_pin_lock(src_rq, &srf); | ||
1224 | rq_pin_lock(dst_rq, &drf); | ||
1225 | |||
1224 | p->on_rq = TASK_ON_RQ_MIGRATING; | 1226 | p->on_rq = TASK_ON_RQ_MIGRATING; |
1225 | deactivate_task(src_rq, p, 0); | 1227 | deactivate_task(src_rq, p, 0); |
1226 | set_task_cpu(p, cpu); | 1228 | set_task_cpu(p, cpu); |
1227 | activate_task(dst_rq, p, 0); | 1229 | activate_task(dst_rq, p, 0); |
1228 | p->on_rq = TASK_ON_RQ_QUEUED; | 1230 | p->on_rq = TASK_ON_RQ_QUEUED; |
1229 | check_preempt_curr(dst_rq, p, 0); | 1231 | check_preempt_curr(dst_rq, p, 0); |
1232 | |||
1233 | rq_unpin_lock(dst_rq, &drf); | ||
1234 | rq_unpin_lock(src_rq, &srf); | ||
1235 | |||
1230 | } else { | 1236 | } else { |
1231 | /* | 1237 | /* |
1232 | * Task isn't running anymore; make it appear like we migrated | 1238 | * Task isn't running anymore; make it appear like we migrated |
@@ -1680,7 +1686,7 @@ static void | |||
1680 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, | 1686 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, |
1681 | struct rq_flags *rf) | 1687 | struct rq_flags *rf) |
1682 | { | 1688 | { |
1683 | int en_flags = ENQUEUE_WAKEUP; | 1689 | int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; |
1684 | 1690 | ||
1685 | lockdep_assert_held(&rq->lock); | 1691 | lockdep_assert_held(&rq->lock); |
1686 | 1692 | ||
@@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void) | |||
1726 | struct rq *rq = this_rq(); | 1732 | struct rq *rq = this_rq(); |
1727 | struct llist_node *llist = llist_del_all(&rq->wake_list); | 1733 | struct llist_node *llist = llist_del_all(&rq->wake_list); |
1728 | struct task_struct *p; | 1734 | struct task_struct *p; |
1729 | unsigned long flags; | ||
1730 | struct rq_flags rf; | 1735 | struct rq_flags rf; |
1731 | 1736 | ||
1732 | if (!llist) | 1737 | if (!llist) |
1733 | return; | 1738 | return; |
1734 | 1739 | ||
1735 | raw_spin_lock_irqsave(&rq->lock, flags); | 1740 | rq_lock_irqsave(rq, &rf); |
1736 | rq_pin_lock(rq, &rf); | 1741 | update_rq_clock(rq); |
1737 | 1742 | ||
1738 | while (llist) { | 1743 | while (llist) { |
1739 | int wake_flags = 0; | 1744 | int wake_flags = 0; |
@@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void) | |||
1747 | ttwu_do_activate(rq, p, wake_flags, &rf); | 1752 | ttwu_do_activate(rq, p, wake_flags, &rf); |
1748 | } | 1753 | } |
1749 | 1754 | ||
1750 | rq_unpin_lock(rq, &rf); | 1755 | rq_unlock_irqrestore(rq, &rf); |
1751 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
1752 | } | 1756 | } |
1753 | 1757 | ||
1754 | void scheduler_ipi(void) | 1758 | void scheduler_ipi(void) |
@@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) | |||
1806 | void wake_up_if_idle(int cpu) | 1810 | void wake_up_if_idle(int cpu) |
1807 | { | 1811 | { |
1808 | struct rq *rq = cpu_rq(cpu); | 1812 | struct rq *rq = cpu_rq(cpu); |
1809 | unsigned long flags; | 1813 | struct rq_flags rf; |
1810 | 1814 | ||
1811 | rcu_read_lock(); | 1815 | rcu_read_lock(); |
1812 | 1816 | ||
@@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu) | |||
1816 | if (set_nr_if_polling(rq->idle)) { | 1820 | if (set_nr_if_polling(rq->idle)) { |
1817 | trace_sched_wake_idle_without_ipi(cpu); | 1821 | trace_sched_wake_idle_without_ipi(cpu); |
1818 | } else { | 1822 | } else { |
1819 | raw_spin_lock_irqsave(&rq->lock, flags); | 1823 | rq_lock_irqsave(rq, &rf); |
1820 | if (is_idle_task(rq->curr)) | 1824 | if (is_idle_task(rq->curr)) |
1821 | smp_send_reschedule(cpu); | 1825 | smp_send_reschedule(cpu); |
1822 | /* Else CPU is not idle, do nothing here: */ | 1826 | /* Else CPU is not idle, do nothing here: */ |
1823 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 1827 | rq_unlock_irqrestore(rq, &rf); |
1824 | } | 1828 | } |
1825 | 1829 | ||
1826 | out: | 1830 | out: |
@@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
1846 | } | 1850 | } |
1847 | #endif | 1851 | #endif |
1848 | 1852 | ||
1849 | raw_spin_lock(&rq->lock); | 1853 | rq_lock(rq, &rf); |
1850 | rq_pin_lock(rq, &rf); | 1854 | update_rq_clock(rq); |
1851 | ttwu_do_activate(rq, p, wake_flags, &rf); | 1855 | ttwu_do_activate(rq, p, wake_flags, &rf); |
1852 | rq_unpin_lock(rq, &rf); | 1856 | rq_unlock(rq, &rf); |
1853 | raw_spin_unlock(&rq->lock); | ||
1854 | } | 1857 | } |
1855 | 1858 | ||
1856 | /* | 1859 | /* |
@@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) | |||
2097 | * disabled avoiding further scheduler activity on it and we've | 2100 | * disabled avoiding further scheduler activity on it and we've |
2098 | * not yet picked a replacement task. | 2101 | * not yet picked a replacement task. |
2099 | */ | 2102 | */ |
2100 | rq_unpin_lock(rq, rf); | 2103 | rq_unlock(rq, rf); |
2101 | raw_spin_unlock(&rq->lock); | ||
2102 | raw_spin_lock(&p->pi_lock); | 2104 | raw_spin_lock(&p->pi_lock); |
2103 | raw_spin_lock(&rq->lock); | 2105 | rq_relock(rq, rf); |
2104 | rq_repin_lock(rq, rf); | ||
2105 | } | 2106 | } |
2106 | 2107 | ||
2107 | if (!(p->state & TASK_NORMAL)) | 2108 | if (!(p->state & TASK_NORMAL)) |
@@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) | |||
2114 | delayacct_blkio_end(); | 2115 | delayacct_blkio_end(); |
2115 | atomic_dec(&rq->nr_iowait); | 2116 | atomic_dec(&rq->nr_iowait); |
2116 | } | 2117 | } |
2117 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2118 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); |
2118 | } | 2119 | } |
2119 | 2120 | ||
2120 | ttwu_do_wakeup(rq, p, 0, rf); | 2121 | ttwu_do_wakeup(rq, p, 0, rf); |
@@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p) | |||
2555 | update_rq_clock(rq); | 2556 | update_rq_clock(rq); |
2556 | post_init_entity_util_avg(&p->se); | 2557 | post_init_entity_util_avg(&p->se); |
2557 | 2558 | ||
2558 | activate_task(rq, p, 0); | 2559 | activate_task(rq, p, ENQUEUE_NOCLOCK); |
2559 | p->on_rq = TASK_ON_RQ_QUEUED; | 2560 | p->on_rq = TASK_ON_RQ_QUEUED; |
2560 | trace_sched_wakeup_new(p); | 2561 | trace_sched_wakeup_new(p); |
2561 | check_preempt_curr(rq, p, WF_FORK); | 2562 | check_preempt_curr(rq, p, WF_FORK); |
@@ -3093,15 +3094,18 @@ void scheduler_tick(void) | |||
3093 | int cpu = smp_processor_id(); | 3094 | int cpu = smp_processor_id(); |
3094 | struct rq *rq = cpu_rq(cpu); | 3095 | struct rq *rq = cpu_rq(cpu); |
3095 | struct task_struct *curr = rq->curr; | 3096 | struct task_struct *curr = rq->curr; |
3097 | struct rq_flags rf; | ||
3096 | 3098 | ||
3097 | sched_clock_tick(); | 3099 | sched_clock_tick(); |
3098 | 3100 | ||
3099 | raw_spin_lock(&rq->lock); | 3101 | rq_lock(rq, &rf); |
3102 | |||
3100 | update_rq_clock(rq); | 3103 | update_rq_clock(rq); |
3101 | curr->sched_class->task_tick(rq, curr, 0); | 3104 | curr->sched_class->task_tick(rq, curr, 0); |
3102 | cpu_load_update_active(rq); | 3105 | cpu_load_update_active(rq); |
3103 | calc_global_load_tick(rq); | 3106 | calc_global_load_tick(rq); |
3104 | raw_spin_unlock(&rq->lock); | 3107 | |
3108 | rq_unlock(rq, &rf); | ||
3105 | 3109 | ||
3106 | perf_event_task_tick(); | 3110 | perf_event_task_tick(); |
3107 | 3111 | ||
@@ -3378,7 +3382,7 @@ static void __sched notrace __schedule(bool preempt) | |||
3378 | hrtick_clear(rq); | 3382 | hrtick_clear(rq); |
3379 | 3383 | ||
3380 | local_irq_disable(); | 3384 | local_irq_disable(); |
3381 | rcu_note_context_switch(); | 3385 | rcu_note_context_switch(preempt); |
3382 | 3386 | ||
3383 | /* | 3387 | /* |
3384 | * Make sure that signal_pending_state()->signal_pending() below | 3388 | * Make sure that signal_pending_state()->signal_pending() below |
@@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt) | |||
3386 | * done by the caller to avoid the race with signal_wake_up(). | 3390 | * done by the caller to avoid the race with signal_wake_up(). |
3387 | */ | 3391 | */ |
3388 | smp_mb__before_spinlock(); | 3392 | smp_mb__before_spinlock(); |
3389 | raw_spin_lock(&rq->lock); | 3393 | rq_lock(rq, &rf); |
3390 | rq_pin_lock(rq, &rf); | ||
3391 | 3394 | ||
3392 | /* Promote REQ to ACT */ | 3395 | /* Promote REQ to ACT */ |
3393 | rq->clock_update_flags <<= 1; | 3396 | rq->clock_update_flags <<= 1; |
3397 | update_rq_clock(rq); | ||
3394 | 3398 | ||
3395 | switch_count = &prev->nivcsw; | 3399 | switch_count = &prev->nivcsw; |
3396 | if (!preempt && prev->state) { | 3400 | if (!preempt && prev->state) { |
3397 | if (unlikely(signal_pending_state(prev->state, prev))) { | 3401 | if (unlikely(signal_pending_state(prev->state, prev))) { |
3398 | prev->state = TASK_RUNNING; | 3402 | prev->state = TASK_RUNNING; |
3399 | } else { | 3403 | } else { |
3400 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 3404 | deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); |
3401 | prev->on_rq = 0; | 3405 | prev->on_rq = 0; |
3402 | 3406 | ||
3403 | if (prev->in_iowait) { | 3407 | if (prev->in_iowait) { |
@@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt) | |||
3421 | switch_count = &prev->nvcsw; | 3425 | switch_count = &prev->nvcsw; |
3422 | } | 3426 | } |
3423 | 3427 | ||
3424 | if (task_on_rq_queued(prev)) | ||
3425 | update_rq_clock(rq); | ||
3426 | |||
3427 | next = pick_next_task(rq, prev, &rf); | 3428 | next = pick_next_task(rq, prev, &rf); |
3428 | clear_tsk_need_resched(prev); | 3429 | clear_tsk_need_resched(prev); |
3429 | clear_preempt_need_resched(); | 3430 | clear_preempt_need_resched(); |
@@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt) | |||
3439 | rq = context_switch(rq, prev, next, &rf); | 3440 | rq = context_switch(rq, prev, next, &rf); |
3440 | } else { | 3441 | } else { |
3441 | rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); | 3442 | rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); |
3442 | rq_unpin_lock(rq, &rf); | 3443 | rq_unlock_irq(rq, &rf); |
3443 | raw_spin_unlock_irq(&rq->lock); | ||
3444 | } | 3444 | } |
3445 | 3445 | ||
3446 | balance_callback(rq); | 3446 | balance_callback(rq); |
@@ -3502,6 +3502,31 @@ asmlinkage __visible void __sched schedule(void) | |||
3502 | } | 3502 | } |
3503 | EXPORT_SYMBOL(schedule); | 3503 | EXPORT_SYMBOL(schedule); |
3504 | 3504 | ||
3505 | /* | ||
3506 | * synchronize_rcu_tasks() makes sure that no task is stuck in preempted | ||
3507 | * state (have scheduled out non-voluntarily) by making sure that all | ||
3508 | * tasks have either left the run queue or have gone into user space. | ||
3509 | * As idle tasks do not do either, they must not ever be preempted | ||
3510 | * (schedule out non-voluntarily). | ||
3511 | * | ||
3512 | * schedule_idle() is similar to schedule_preempt_disable() except that it | ||
3513 | * never enables preemption because it does not call sched_submit_work(). | ||
3514 | */ | ||
3515 | void __sched schedule_idle(void) | ||
3516 | { | ||
3517 | /* | ||
3518 | * As this skips calling sched_submit_work(), which the idle task does | ||
3519 | * regardless because that function is a nop when the task is in a | ||
3520 | * TASK_RUNNING state, make sure this isn't used someplace that the | ||
3521 | * current task can be in any other state. Note, idle is always in the | ||
3522 | * TASK_RUNNING state. | ||
3523 | */ | ||
3524 | WARN_ON_ONCE(current->state); | ||
3525 | do { | ||
3526 | __schedule(false); | ||
3527 | } while (need_resched()); | ||
3528 | } | ||
3529 | |||
3505 | #ifdef CONFIG_CONTEXT_TRACKING | 3530 | #ifdef CONFIG_CONTEXT_TRACKING |
3506 | asmlinkage __visible void __sched schedule_user(void) | 3531 | asmlinkage __visible void __sched schedule_user(void) |
3507 | { | 3532 | { |
@@ -3671,10 +3696,25 @@ EXPORT_SYMBOL(default_wake_function); | |||
3671 | 3696 | ||
3672 | #ifdef CONFIG_RT_MUTEXES | 3697 | #ifdef CONFIG_RT_MUTEXES |
3673 | 3698 | ||
3699 | static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) | ||
3700 | { | ||
3701 | if (pi_task) | ||
3702 | prio = min(prio, pi_task->prio); | ||
3703 | |||
3704 | return prio; | ||
3705 | } | ||
3706 | |||
3707 | static inline int rt_effective_prio(struct task_struct *p, int prio) | ||
3708 | { | ||
3709 | struct task_struct *pi_task = rt_mutex_get_top_task(p); | ||
3710 | |||
3711 | return __rt_effective_prio(pi_task, prio); | ||
3712 | } | ||
3713 | |||
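__rt_effective_prio() reduces the priority-inheritance decision to taking the minimum (numerically lower, hence higher-priority) of the task's own priority and the top donor's priority, with no donor meaning no boost. Stripped down to that arithmetic (a standalone sketch, not the kernel helper):

#include <stdio.h>

/* Lower numeric value means higher priority, as in the kernel. */
static int effective_prio(int normal_prio, int donor_prio, int have_donor)
{
    if (have_donor && donor_prio < normal_prio)
        return donor_prio;      /* inherit the donor's priority */
    return normal_prio;
}

int main(void)
{
    /* A prio-120 task boosted by a prio-100 waiter runs at 100. */
    printf("%d\n", effective_prio(120, 100, 1));
    /* With no donor it keeps its own priority. */
    printf("%d\n", effective_prio(120, 0, 0));
    return 0;
}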
3674 | /* | 3714 | /* |
3675 | * rt_mutex_setprio - set the current priority of a task | 3715 | * rt_mutex_setprio - set the current priority of a task |
3676 | * @p: task | 3716 | * @p: task to boost |
3677 | * @prio: prio value (kernel-internal form) | 3717 | * @pi_task: donor task |
3678 | * | 3718 | * |
3679 | * This function changes the 'effective' priority of a task. It does | 3719 | * This function changes the 'effective' priority of a task. It does |
3680 | * not touch ->normal_prio like __setscheduler(). | 3720 | * not touch ->normal_prio like __setscheduler(). |
@@ -3682,17 +3722,42 @@ EXPORT_SYMBOL(default_wake_function); | |||
3682 | * Used by the rt_mutex code to implement priority inheritance | 3722 | * Used by the rt_mutex code to implement priority inheritance |
3683 | * logic. Call site only calls if the priority of the task changed. | 3723 | * logic. Call site only calls if the priority of the task changed. |
3684 | */ | 3724 | */ |
3685 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3725 | void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) |
3686 | { | 3726 | { |
3687 | int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; | 3727 | int prio, oldprio, queued, running, queue_flag = |
3728 | DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; | ||
3688 | const struct sched_class *prev_class; | 3729 | const struct sched_class *prev_class; |
3689 | struct rq_flags rf; | 3730 | struct rq_flags rf; |
3690 | struct rq *rq; | 3731 | struct rq *rq; |
3691 | 3732 | ||
3692 | BUG_ON(prio > MAX_PRIO); | 3733 | /* XXX used to be waiter->prio, not waiter->task->prio */ |
3734 | prio = __rt_effective_prio(pi_task, p->normal_prio); | ||
3735 | |||
3736 | /* | ||
3737 | * If nothing changed, bail early. | ||
3738 | */ | ||
3739 | if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) | ||
3740 | return; | ||
3693 | 3741 | ||
3694 | rq = __task_rq_lock(p, &rf); | 3742 | rq = __task_rq_lock(p, &rf); |
3695 | update_rq_clock(rq); | 3743 | update_rq_clock(rq); |
3744 | /* | ||
3745 | * Set under pi_lock && rq->lock, such that the value can be used under | ||
3746 | * either lock. | ||
3747 | * | ||
3748 | * Note that there is lots of trickiness needed to make this pointer cache work | ||
3749 | * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to | ||
3750 | * ensure a task is de-boosted (pi_task is set to NULL) before the | ||
3751 | * task is allowed to run again (and can exit). This ensures the pointer | ||
3752 | * points to a blocked task -- which guarantees the task is present. | ||
3753 | */ | ||
3754 | p->pi_top_task = pi_task; | ||
3755 | |||
3756 | /* | ||
3757 | * For FIFO/RR we only need to set prio, if that matches we're done. | ||
3758 | */ | ||
3759 | if (prio == p->prio && !dl_prio(prio)) | ||
3760 | goto out_unlock; | ||
3696 | 3761 | ||
3697 | /* | 3762 | /* |
3698 | * Idle task boosting is a nono in general. There is one | 3763 | * Idle task boosting is a nono in general. There is one |
@@ -3712,7 +3777,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3712 | goto out_unlock; | 3777 | goto out_unlock; |
3713 | } | 3778 | } |
3714 | 3779 | ||
3715 | trace_sched_pi_setprio(p, prio); | 3780 | trace_sched_pi_setprio(p, pi_task); |
3716 | oldprio = p->prio; | 3781 | oldprio = p->prio; |
3717 | 3782 | ||
3718 | if (oldprio == prio) | 3783 | if (oldprio == prio) |
@@ -3736,7 +3801,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3736 | * running task | 3801 | * running task |
3737 | */ | 3802 | */ |
3738 | if (dl_prio(prio)) { | 3803 | if (dl_prio(prio)) { |
3739 | struct task_struct *pi_task = rt_mutex_get_top_task(p); | ||
3740 | if (!dl_prio(p->normal_prio) || | 3804 | if (!dl_prio(p->normal_prio) || |
3741 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { | 3805 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { |
3742 | p->dl.dl_boosted = 1; | 3806 | p->dl.dl_boosted = 1; |
@@ -3774,6 +3838,11 @@ out_unlock: | |||
3774 | balance_callback(rq); | 3838 | balance_callback(rq); |
3775 | preempt_enable(); | 3839 | preempt_enable(); |
3776 | } | 3840 | } |
3841 | #else | ||
3842 | static inline int rt_effective_prio(struct task_struct *p, int prio) | ||
3843 | { | ||
3844 | return prio; | ||
3845 | } | ||
3777 | #endif | 3846 | #endif |
3778 | 3847 | ||
3779 | void set_user_nice(struct task_struct *p, long nice) | 3848 | void set_user_nice(struct task_struct *p, long nice) |
@@ -3805,7 +3874,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3805 | queued = task_on_rq_queued(p); | 3874 | queued = task_on_rq_queued(p); |
3806 | running = task_current(rq, p); | 3875 | running = task_current(rq, p); |
3807 | if (queued) | 3876 | if (queued) |
3808 | dequeue_task(rq, p, DEQUEUE_SAVE); | 3877 | dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); |
3809 | if (running) | 3878 | if (running) |
3810 | put_prev_task(rq, p); | 3879 | put_prev_task(rq, p); |
3811 | 3880 | ||
@@ -3816,7 +3885,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3816 | delta = p->prio - old_prio; | 3885 | delta = p->prio - old_prio; |
3817 | 3886 | ||
3818 | if (queued) { | 3887 | if (queued) { |
3819 | enqueue_task(rq, p, ENQUEUE_RESTORE); | 3888 | enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); |
3820 | /* | 3889 | /* |
3821 | * If the task increased its priority or is running and | 3890 | * If the task increased its priority or is running and |
3822 | * lowered its priority, then reschedule its CPU: | 3891 | * lowered its priority, then reschedule its CPU: |
@@ -4020,10 +4089,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, | |||
4020 | * Keep a potential priority boosting if called from | 4089 | * Keep a potential priority boosting if called from |
4021 | * sched_setscheduler(). | 4090 | * sched_setscheduler(). |
4022 | */ | 4091 | */ |
4092 | p->prio = normal_prio(p); | ||
4023 | if (keep_boost) | 4093 | if (keep_boost) |
4024 | p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); | 4094 | p->prio = rt_effective_prio(p, p->prio); |
4025 | else | ||
4026 | p->prio = normal_prio(p); | ||
4027 | 4095 | ||
4028 | if (dl_prio(p->prio)) | 4096 | if (dl_prio(p->prio)) |
4029 | p->sched_class = &dl_sched_class; | 4097 | p->sched_class = &dl_sched_class; |
@@ -4126,7 +4194,7 @@ static int __sched_setscheduler(struct task_struct *p, | |||
4126 | const struct sched_class *prev_class; | 4194 | const struct sched_class *prev_class; |
4127 | struct rq_flags rf; | 4195 | struct rq_flags rf; |
4128 | int reset_on_fork; | 4196 | int reset_on_fork; |
4129 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; | 4197 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; |
4130 | struct rq *rq; | 4198 | struct rq *rq; |
4131 | 4199 | ||
4132 | /* May grab non-irq protected spin_locks: */ | 4200 | /* May grab non-irq protected spin_locks: */ |
@@ -4310,7 +4378,7 @@ change: | |||
4310 | * the runqueue. This will be done when the task deboosts | 4378 | * the runqueue. This will be done when the task deboosts |
4311 | * itself. | 4379 | * itself. |
4312 | */ | 4380 | */ |
4313 | new_effective_prio = rt_mutex_get_effective_prio(p, newprio); | 4381 | new_effective_prio = rt_effective_prio(p, newprio); |
4314 | if (new_effective_prio == oldprio) | 4382 | if (new_effective_prio == oldprio) |
4315 | queue_flags &= ~DEQUEUE_MOVE; | 4383 | queue_flags &= ~DEQUEUE_MOVE; |
4316 | } | 4384 | } |
@@ -4923,7 +4991,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | |||
4923 | */ | 4991 | */ |
4924 | SYSCALL_DEFINE0(sched_yield) | 4992 | SYSCALL_DEFINE0(sched_yield) |
4925 | { | 4993 | { |
4926 | struct rq *rq = this_rq_lock(); | 4994 | struct rq_flags rf; |
4995 | struct rq *rq; | ||
4996 | |||
4997 | local_irq_disable(); | ||
4998 | rq = this_rq(); | ||
4999 | rq_lock(rq, &rf); | ||
4927 | 5000 | ||
4928 | schedstat_inc(rq->yld_count); | 5001 | schedstat_inc(rq->yld_count); |
4929 | current->sched_class->yield_task(rq); | 5002 | current->sched_class->yield_task(rq); |
@@ -4932,9 +5005,8 @@ SYSCALL_DEFINE0(sched_yield) | |||
4932 | * Since we are going to call schedule() anyway, there's | 5005 | * Since we are going to call schedule() anyway, there's |
4933 | * no need to preempt or enable interrupts: | 5006 | * no need to preempt or enable interrupts: |
4934 | */ | 5007 | */ |
4935 | __release(rq->lock); | 5008 | preempt_disable(); |
4936 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 5009 | rq_unlock(rq, &rf); |
4937 | do_raw_spin_unlock(&rq->lock); | ||
4938 | sched_preempt_enable_no_resched(); | 5010 | sched_preempt_enable_no_resched(); |
4939 | 5011 | ||
4940 | schedule(); | 5012 | schedule(); |
@@ -5514,7 +5586,7 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
5514 | p->numa_preferred_nid = nid; | 5586 | p->numa_preferred_nid = nid; |
5515 | 5587 | ||
5516 | if (queued) | 5588 | if (queued) |
5517 | enqueue_task(rq, p, ENQUEUE_RESTORE); | 5589 | enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); |
5518 | if (running) | 5590 | if (running) |
5519 | set_curr_task(rq, p); | 5591 | set_curr_task(rq, p); |
5520 | task_rq_unlock(rq, p, &rf); | 5592 | task_rq_unlock(rq, p, &rf); |
@@ -5579,11 +5651,11 @@ static struct task_struct fake_task = { | |||
5579 | * there's no concurrency possible, we hold the required locks anyway | 5651 | * there's no concurrency possible, we hold the required locks anyway |
5580 | * because of lock validation efforts. | 5652 | * because of lock validation efforts. |
5581 | */ | 5653 | */ |
5582 | static void migrate_tasks(struct rq *dead_rq) | 5654 | static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) |
5583 | { | 5655 | { |
5584 | struct rq *rq = dead_rq; | 5656 | struct rq *rq = dead_rq; |
5585 | struct task_struct *next, *stop = rq->stop; | 5657 | struct task_struct *next, *stop = rq->stop; |
5586 | struct rq_flags rf; | 5658 | struct rq_flags orf = *rf; |
5587 | int dest_cpu; | 5659 | int dest_cpu; |
5588 | 5660 | ||
5589 | /* | 5661 | /* |
@@ -5602,9 +5674,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5602 | * class method both need to have an up-to-date | 5674 | * class method both need to have an up-to-date |
5603 | * value of rq->clock[_task] | 5675 | * value of rq->clock[_task] |
5604 | */ | 5676 | */ |
5605 | rq_pin_lock(rq, &rf); | ||
5606 | update_rq_clock(rq); | 5677 | update_rq_clock(rq); |
5607 | rq_unpin_lock(rq, &rf); | ||
5608 | 5678 | ||
5609 | for (;;) { | 5679 | for (;;) { |
5610 | /* | 5680 | /* |
@@ -5617,8 +5687,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5617 | /* | 5687 | /* |
5618 | * pick_next_task() assumes pinned rq->lock: | 5688 | * pick_next_task() assumes pinned rq->lock: |
5619 | */ | 5689 | */ |
5620 | rq_repin_lock(rq, &rf); | 5690 | next = pick_next_task(rq, &fake_task, rf); |
5621 | next = pick_next_task(rq, &fake_task, &rf); | ||
5622 | BUG_ON(!next); | 5691 | BUG_ON(!next); |
5623 | next->sched_class->put_prev_task(rq, next); | 5692 | next->sched_class->put_prev_task(rq, next); |
5624 | 5693 | ||
@@ -5631,10 +5700,9 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5631 | * because !cpu_active at this point, which means load-balance | 5700 | * because !cpu_active at this point, which means load-balance |
5632 | * will not interfere. Also, stop-machine. | 5701 | * will not interfere. Also, stop-machine. |
5633 | */ | 5702 | */ |
5634 | rq_unpin_lock(rq, &rf); | 5703 | rq_unlock(rq, rf); |
5635 | raw_spin_unlock(&rq->lock); | ||
5636 | raw_spin_lock(&next->pi_lock); | 5704 | raw_spin_lock(&next->pi_lock); |
5637 | raw_spin_lock(&rq->lock); | 5705 | rq_relock(rq, rf); |
5638 | 5706 | ||
5639 | /* | 5707 | /* |
5640 | * Since we're inside stop-machine, _nothing_ should have | 5708 | * Since we're inside stop-machine, _nothing_ should have |
@@ -5648,12 +5716,12 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5648 | 5716 | ||
5649 | /* Find suitable destination for @next, with force if needed. */ | 5717 | /* Find suitable destination for @next, with force if needed. */ |
5650 | dest_cpu = select_fallback_rq(dead_rq->cpu, next); | 5718 | dest_cpu = select_fallback_rq(dead_rq->cpu, next); |
5651 | 5719 | rq = __migrate_task(rq, rf, next, dest_cpu); | |
5652 | rq = __migrate_task(rq, next, dest_cpu); | ||
5653 | if (rq != dead_rq) { | 5720 | if (rq != dead_rq) { |
5654 | raw_spin_unlock(&rq->lock); | 5721 | rq_unlock(rq, rf); |
5655 | rq = dead_rq; | 5722 | rq = dead_rq; |
5656 | raw_spin_lock(&rq->lock); | 5723 | *rf = orf; |
5724 | rq_relock(rq, rf); | ||
5657 | } | 5725 | } |
5658 | raw_spin_unlock(&next->pi_lock); | 5726 | raw_spin_unlock(&next->pi_lock); |
5659 | } | 5727 | } |
@@ -5732,7 +5800,7 @@ static void cpuset_cpu_active(void) | |||
5732 | * cpuset configurations. | 5800 | * cpuset configurations. |
5733 | */ | 5801 | */ |
5734 | } | 5802 | } |
5735 | cpuset_update_active_cpus(true); | 5803 | cpuset_update_active_cpus(); |
5736 | } | 5804 | } |
5737 | 5805 | ||
5738 | static int cpuset_cpu_inactive(unsigned int cpu) | 5806 | static int cpuset_cpu_inactive(unsigned int cpu) |
@@ -5755,7 +5823,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) | |||
5755 | 5823 | ||
5756 | if (overflow) | 5824 | if (overflow) |
5757 | return -EBUSY; | 5825 | return -EBUSY; |
5758 | cpuset_update_active_cpus(false); | 5826 | cpuset_update_active_cpus(); |
5759 | } else { | 5827 | } else { |
5760 | num_cpus_frozen++; | 5828 | num_cpus_frozen++; |
5761 | partition_sched_domains(1, NULL, NULL); | 5829 | partition_sched_domains(1, NULL, NULL); |
@@ -5766,7 +5834,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) | |||
5766 | int sched_cpu_activate(unsigned int cpu) | 5834 | int sched_cpu_activate(unsigned int cpu) |
5767 | { | 5835 | { |
5768 | struct rq *rq = cpu_rq(cpu); | 5836 | struct rq *rq = cpu_rq(cpu); |
5769 | unsigned long flags; | 5837 | struct rq_flags rf; |
5770 | 5838 | ||
5771 | set_cpu_active(cpu, true); | 5839 | set_cpu_active(cpu, true); |
5772 | 5840 | ||
@@ -5784,12 +5852,12 @@ int sched_cpu_activate(unsigned int cpu) | |||
5784 | * 2) At runtime, if cpuset_cpu_active() fails to rebuild the | 5852 | * 2) At runtime, if cpuset_cpu_active() fails to rebuild the |
5785 | * domains. | 5853 | * domains. |
5786 | */ | 5854 | */ |
5787 | raw_spin_lock_irqsave(&rq->lock, flags); | 5855 | rq_lock_irqsave(rq, &rf); |
5788 | if (rq->rd) { | 5856 | if (rq->rd) { |
5789 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 5857 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
5790 | set_rq_online(rq); | 5858 | set_rq_online(rq); |
5791 | } | 5859 | } |
5792 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5860 | rq_unlock_irqrestore(rq, &rf); |
5793 | 5861 | ||
5794 | update_max_interval(); | 5862 | update_max_interval(); |
5795 | 5863 | ||
@@ -5847,18 +5915,20 @@ int sched_cpu_starting(unsigned int cpu) | |||
5847 | int sched_cpu_dying(unsigned int cpu) | 5915 | int sched_cpu_dying(unsigned int cpu) |
5848 | { | 5916 | { |
5849 | struct rq *rq = cpu_rq(cpu); | 5917 | struct rq *rq = cpu_rq(cpu); |
5850 | unsigned long flags; | 5918 | struct rq_flags rf; |
5851 | 5919 | ||
5852 | /* Handle pending wakeups and then migrate everything off */ | 5920 | /* Handle pending wakeups and then migrate everything off */ |
5853 | sched_ttwu_pending(); | 5921 | sched_ttwu_pending(); |
5854 | raw_spin_lock_irqsave(&rq->lock, flags); | 5922 | |
5923 | rq_lock_irqsave(rq, &rf); | ||
5855 | if (rq->rd) { | 5924 | if (rq->rd) { |
5856 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 5925 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
5857 | set_rq_offline(rq); | 5926 | set_rq_offline(rq); |
5858 | } | 5927 | } |
5859 | migrate_tasks(rq); | 5928 | migrate_tasks(rq, &rf); |
5860 | BUG_ON(rq->nr_running != 1); | 5929 | BUG_ON(rq->nr_running != 1); |
5861 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5930 | rq_unlock_irqrestore(rq, &rf); |
5931 | |||
5862 | calc_load_migrate(rq); | 5932 | calc_load_migrate(rq); |
5863 | update_max_interval(); | 5933 | update_max_interval(); |
5864 | nohz_balance_exit_idle(cpu); | 5934 | nohz_balance_exit_idle(cpu); |
@@ -6412,7 +6482,8 @@ static void sched_change_group(struct task_struct *tsk, int type) | |||
6412 | */ | 6482 | */ |
6413 | void sched_move_task(struct task_struct *tsk) | 6483 | void sched_move_task(struct task_struct *tsk) |
6414 | { | 6484 | { |
6415 | int queued, running; | 6485 | int queued, running, queue_flags = |
6486 | DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; | ||
6416 | struct rq_flags rf; | 6487 | struct rq_flags rf; |
6417 | struct rq *rq; | 6488 | struct rq *rq; |
6418 | 6489 | ||
@@ -6423,14 +6494,14 @@ void sched_move_task(struct task_struct *tsk) | |||
6423 | queued = task_on_rq_queued(tsk); | 6494 | queued = task_on_rq_queued(tsk); |
6424 | 6495 | ||
6425 | if (queued) | 6496 | if (queued) |
6426 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); | 6497 | dequeue_task(rq, tsk, queue_flags); |
6427 | if (running) | 6498 | if (running) |
6428 | put_prev_task(rq, tsk); | 6499 | put_prev_task(rq, tsk); |
6429 | 6500 | ||
6430 | sched_change_group(tsk, TASK_MOVE_GROUP); | 6501 | sched_change_group(tsk, TASK_MOVE_GROUP); |
6431 | 6502 | ||
6432 | if (queued) | 6503 | if (queued) |
6433 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); | 6504 | enqueue_task(rq, tsk, queue_flags); |
6434 | if (running) | 6505 | if (running) |
6435 | set_curr_task(rq, tsk); | 6506 | set_curr_task(rq, tsk); |
6436 | 6507 | ||
@@ -7008,14 +7079,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
7008 | for_each_online_cpu(i) { | 7079 | for_each_online_cpu(i) { |
7009 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | 7080 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; |
7010 | struct rq *rq = cfs_rq->rq; | 7081 | struct rq *rq = cfs_rq->rq; |
7082 | struct rq_flags rf; | ||
7011 | 7083 | ||
7012 | raw_spin_lock_irq(&rq->lock); | 7084 | rq_lock_irq(rq, &rf); |
7013 | cfs_rq->runtime_enabled = runtime_enabled; | 7085 | cfs_rq->runtime_enabled = runtime_enabled; |
7014 | cfs_rq->runtime_remaining = 0; | 7086 | cfs_rq->runtime_remaining = 0; |
7015 | 7087 | ||
7016 | if (cfs_rq->throttled) | 7088 | if (cfs_rq->throttled) |
7017 | unthrottle_cfs_rq(cfs_rq); | 7089 | unthrottle_cfs_rq(cfs_rq); |
7018 | raw_spin_unlock_irq(&rq->lock); | 7090 | rq_unlock_irq(rq, &rf); |
7019 | } | 7091 | } |
7020 | if (runtime_was_enabled && !runtime_enabled) | 7092 | if (runtime_was_enabled && !runtime_enabled) |
7021 | cfs_bandwidth_usage_dec(); | 7093 | cfs_bandwidth_usage_dec(); |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 54c577578da6..622eed1b7658 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
@@ -61,6 +61,11 @@ struct sugov_cpu { | |||
61 | unsigned long util; | 61 | unsigned long util; |
62 | unsigned long max; | 62 | unsigned long max; |
63 | unsigned int flags; | 63 | unsigned int flags; |
64 | |||
65 | /* The field below is for single-CPU policies only. */ | ||
66 | #ifdef CONFIG_NO_HZ_COMMON | ||
67 | unsigned long saved_idle_calls; | ||
68 | #endif | ||
64 | }; | 69 | }; |
65 | 70 | ||
66 | static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); | 71 | static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); |
@@ -93,22 +98,23 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, | |||
93 | { | 98 | { |
94 | struct cpufreq_policy *policy = sg_policy->policy; | 99 | struct cpufreq_policy *policy = sg_policy->policy; |
95 | 100 | ||
101 | if (sg_policy->next_freq == next_freq) | ||
102 | return; | ||
103 | |||
104 | if (sg_policy->next_freq > next_freq) | ||
105 | next_freq = (sg_policy->next_freq + next_freq) >> 1; | ||
106 | |||
107 | sg_policy->next_freq = next_freq; | ||
96 | sg_policy->last_freq_update_time = time; | 108 | sg_policy->last_freq_update_time = time; |
97 | 109 | ||
98 | if (policy->fast_switch_enabled) { | 110 | if (policy->fast_switch_enabled) { |
99 | if (sg_policy->next_freq == next_freq) { | ||
100 | trace_cpu_frequency(policy->cur, smp_processor_id()); | ||
101 | return; | ||
102 | } | ||
103 | sg_policy->next_freq = next_freq; | ||
104 | next_freq = cpufreq_driver_fast_switch(policy, next_freq); | 111 | next_freq = cpufreq_driver_fast_switch(policy, next_freq); |
105 | if (next_freq == CPUFREQ_ENTRY_INVALID) | 112 | if (next_freq == CPUFREQ_ENTRY_INVALID) |
106 | return; | 113 | return; |
107 | 114 | ||
108 | policy->cur = next_freq; | 115 | policy->cur = next_freq; |
109 | trace_cpu_frequency(next_freq, smp_processor_id()); | 116 | trace_cpu_frequency(next_freq, smp_processor_id()); |
110 | } else if (sg_policy->next_freq != next_freq) { | 117 | } else { |
111 | sg_policy->next_freq = next_freq; | ||
112 | sg_policy->work_in_progress = true; | 118 | sg_policy->work_in_progress = true; |
113 | irq_work_queue(&sg_policy->irq_work); | 119 | irq_work_queue(&sg_policy->irq_work); |
114 | } | 120 | } |
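sugov_update_commit() now returns early when the requested frequency equals the current target and, for downward requests, steps to the midpoint between the current and requested frequencies instead of dropping straight down, which smooths ramp-down for both the fast-switch and the workqueue paths. The arithmetic in isolation (a standalone sketch with made-up values):

#include <stdio.h>

static unsigned int next_freq = 2000000;    /* current target, in kHz */

/* Returns the frequency actually committed, or 0 when nothing changes. */
static unsigned int commit_freq(unsigned int requested)
{
    if (next_freq == requested)
        return 0;                           /* nothing to do */

    if (next_freq > requested)              /* soften downward steps */
        requested = (next_freq + requested) >> 1;

    next_freq = requested;
    return requested;
}

int main(void)
{
    printf("%u\n", commit_freq(1000000));   /* -> 1500000, halfway down */
    printf("%u\n", commit_freq(2400000));   /* -> 2400000, full step up */
    return 0;
}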
@@ -192,6 +198,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, | |||
192 | sg_cpu->iowait_boost >>= 1; | 198 | sg_cpu->iowait_boost >>= 1; |
193 | } | 199 | } |
194 | 200 | ||
201 | #ifdef CONFIG_NO_HZ_COMMON | ||
202 | static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) | ||
203 | { | ||
204 | unsigned long idle_calls = tick_nohz_get_idle_calls(); | ||
205 | bool ret = idle_calls == sg_cpu->saved_idle_calls; | ||
206 | |||
207 | sg_cpu->saved_idle_calls = idle_calls; | ||
208 | return ret; | ||
209 | } | ||
210 | #else | ||
211 | static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } | ||
212 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
213 | |||
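sugov_cpu_is_busy() decides that a CPU is busy by sampling the NO_HZ idle-call counter: if the counter has not advanced since the previous frequency evaluation, the CPU never entered idle in between, and sugov_update_single() then refuses to pick a lower frequency. The same detection with the kernel plumbing removed (a plain counter stands in for tick_nohz_get_idle_calls()):

#include <stdbool.h>
#include <stdio.h>

static unsigned long idle_calls;    /* stands in for the NO_HZ idle-call count */
static unsigned long saved_idle_calls;

static bool cpu_is_busy(void)
{
    unsigned long calls = idle_calls;
    bool busy = (calls == saved_idle_calls);    /* no idle period since last check */

    saved_idle_calls = calls;
    return busy;
}

int main(void)
{
    cpu_is_busy();                  /* prime the saved counter */
    printf("%d\n", cpu_is_busy());  /* 1: counter unchanged, CPU stayed busy */
    idle_calls++;                   /* the CPU went idle once */
    printf("%d\n", cpu_is_busy());  /* 0: an idle period happened */
    return 0;
}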
195 | static void sugov_update_single(struct update_util_data *hook, u64 time, | 214 | static void sugov_update_single(struct update_util_data *hook, u64 time, |
196 | unsigned int flags) | 215 | unsigned int flags) |
197 | { | 216 | { |
@@ -200,6 +219,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, | |||
200 | struct cpufreq_policy *policy = sg_policy->policy; | 219 | struct cpufreq_policy *policy = sg_policy->policy; |
201 | unsigned long util, max; | 220 | unsigned long util, max; |
202 | unsigned int next_f; | 221 | unsigned int next_f; |
222 | bool busy; | ||
203 | 223 | ||
204 | sugov_set_iowait_boost(sg_cpu, time, flags); | 224 | sugov_set_iowait_boost(sg_cpu, time, flags); |
205 | sg_cpu->last_update = time; | 225 | sg_cpu->last_update = time; |
@@ -207,40 +227,36 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, | |||
207 | if (!sugov_should_update_freq(sg_policy, time)) | 227 | if (!sugov_should_update_freq(sg_policy, time)) |
208 | return; | 228 | return; |
209 | 229 | ||
230 | busy = sugov_cpu_is_busy(sg_cpu); | ||
231 | |||
210 | if (flags & SCHED_CPUFREQ_RT_DL) { | 232 | if (flags & SCHED_CPUFREQ_RT_DL) { |
211 | next_f = policy->cpuinfo.max_freq; | 233 | next_f = policy->cpuinfo.max_freq; |
212 | } else { | 234 | } else { |
213 | sugov_get_util(&util, &max); | 235 | sugov_get_util(&util, &max); |
214 | sugov_iowait_boost(sg_cpu, &util, &max); | 236 | sugov_iowait_boost(sg_cpu, &util, &max); |
215 | next_f = get_next_freq(sg_policy, util, max); | 237 | next_f = get_next_freq(sg_policy, util, max); |
238 | /* | ||
239 | * Do not reduce the frequency if the CPU has not been idle | ||
240 | * recently, as the reduction is likely to be premature then. | ||
241 | */ | ||
242 | if (busy && next_f < sg_policy->next_freq) | ||
243 | next_f = sg_policy->next_freq; | ||
216 | } | 244 | } |
217 | sugov_update_commit(sg_policy, time, next_f); | 245 | sugov_update_commit(sg_policy, time, next_f); |
218 | } | 246 | } |
219 | 247 | ||
220 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | 248 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) |
221 | unsigned long util, unsigned long max, | ||
222 | unsigned int flags) | ||
223 | { | 249 | { |
224 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 250 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
225 | struct cpufreq_policy *policy = sg_policy->policy; | 251 | struct cpufreq_policy *policy = sg_policy->policy; |
226 | unsigned int max_f = policy->cpuinfo.max_freq; | 252 | unsigned long util = 0, max = 1; |
227 | u64 last_freq_update_time = sg_policy->last_freq_update_time; | ||
228 | unsigned int j; | 253 | unsigned int j; |
229 | 254 | ||
230 | if (flags & SCHED_CPUFREQ_RT_DL) | ||
231 | return max_f; | ||
232 | |||
233 | sugov_iowait_boost(sg_cpu, &util, &max); | ||
234 | |||
235 | for_each_cpu(j, policy->cpus) { | 255 | for_each_cpu(j, policy->cpus) { |
236 | struct sugov_cpu *j_sg_cpu; | 256 | struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); |
237 | unsigned long j_util, j_max; | 257 | unsigned long j_util, j_max; |
238 | s64 delta_ns; | 258 | s64 delta_ns; |
239 | 259 | ||
240 | if (j == smp_processor_id()) | ||
241 | continue; | ||
242 | |||
243 | j_sg_cpu = &per_cpu(sugov_cpu, j); | ||
244 | /* | 260 | /* |
245 | * If the CPU utilization was last updated before the previous | 261 | * If the CPU utilization was last updated before the previous |
246 | * frequency update and the time elapsed between the last update | 262 | * frequency update and the time elapsed between the last update |
@@ -248,13 +264,13 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, | |||
248 | * enough, don't take the CPU into account as it probably is | 264 | * enough, don't take the CPU into account as it probably is |
249 | * idle now (and clear iowait_boost for it). | 265 | * idle now (and clear iowait_boost for it). |
250 | */ | 266 | */ |
251 | delta_ns = last_freq_update_time - j_sg_cpu->last_update; | 267 | delta_ns = time - j_sg_cpu->last_update; |
252 | if (delta_ns > TICK_NSEC) { | 268 | if (delta_ns > TICK_NSEC) { |
253 | j_sg_cpu->iowait_boost = 0; | 269 | j_sg_cpu->iowait_boost = 0; |
254 | continue; | 270 | continue; |
255 | } | 271 | } |
256 | if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) | 272 | if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) |
257 | return max_f; | 273 | return policy->cpuinfo.max_freq; |
258 | 274 | ||
259 | j_util = j_sg_cpu->util; | 275 | j_util = j_sg_cpu->util; |
260 | j_max = j_sg_cpu->max; | 276 | j_max = j_sg_cpu->max; |
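The reworked sugov_next_freq_shared() walks every CPU of the policy at the current time, discards samples older than one tick (that CPU is presumably idle) and keeps the CPU whose util/max ratio dominates; the ratio comparison itself sits just past this hunk, so the cross-multiplied test below is an assumption about that code rather than a copy of it. A rough user-space model with invented per-CPU samples:

#include <stdio.h>

#define NR_CPUS         4
#define TICK_NSEC       1000000ULL      /* 1 ms tick, for illustration */

struct sample {
        unsigned long long last_update; /* ns */
        unsigned long util;
        unsigned long max;
};

/* Keep (util, max) of the CPU with the largest util/max among fresh
 * samples, compared cross-multiplied to avoid a division. */
static void aggregate(const struct sample *s, unsigned long long now,
                      unsigned long *util, unsigned long *max)
{
        int j;

        *util = 0;
        *max = 1;
        for (j = 0; j < NR_CPUS; j++) {
                if (now - s[j].last_update > TICK_NSEC)
                        continue;       /* stale: CPU is probably idle */
                if (s[j].util * *max > *util * s[j].max) {
                        *util = s[j].util;
                        *max = s[j].max;
                }
        }
}

int main(void)
{
        struct sample s[NR_CPUS] = {
                { 10000000, 300, 1024 },        /* stale, ignored */
                { 19600000, 200, 1024 },
                { 19900000, 512, 1024 },        /* dominates */
                { 19700000,  50, 1024 },
        };
        unsigned long util, max;

        aggregate(s, 20000000ULL, &util, &max);
        printf("util=%lu max=%lu\n", util, max);        /* util=512 max=1024 */
        return 0;
}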
@@ -289,7 +305,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, | |||
289 | sg_cpu->last_update = time; | 305 | sg_cpu->last_update = time; |
290 | 306 | ||
291 | if (sugov_should_update_freq(sg_policy, time)) { | 307 | if (sugov_should_update_freq(sg_policy, time)) { |
292 | next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); | 308 | if (flags & SCHED_CPUFREQ_RT_DL) |
309 | next_f = sg_policy->policy->cpuinfo.max_freq; | ||
310 | else | ||
311 | next_f = sugov_next_freq_shared(sg_cpu, time); | ||
312 | |||
293 | sugov_update_commit(sg_policy, time, next_f); | 313 | sugov_update_commit(sg_policy, time, next_f); |
294 | } | 314 | } |
295 | 315 | ||
@@ -473,7 +493,6 @@ static int sugov_init(struct cpufreq_policy *policy) | |||
473 | { | 493 | { |
474 | struct sugov_policy *sg_policy; | 494 | struct sugov_policy *sg_policy; |
475 | struct sugov_tunables *tunables; | 495 | struct sugov_tunables *tunables; |
476 | unsigned int lat; | ||
477 | int ret = 0; | 496 | int ret = 0; |
478 | 497 | ||
479 | /* State should be equivalent to EXIT */ | 498 | /* State should be equivalent to EXIT */ |
@@ -512,10 +531,16 @@ static int sugov_init(struct cpufreq_policy *policy) | |||
512 | goto stop_kthread; | 531 | goto stop_kthread; |
513 | } | 532 | } |
514 | 533 | ||
515 | tunables->rate_limit_us = LATENCY_MULTIPLIER; | 534 | if (policy->transition_delay_us) { |
516 | lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; | 535 | tunables->rate_limit_us = policy->transition_delay_us; |
517 | if (lat) | 536 | } else { |
518 | tunables->rate_limit_us *= lat; | 537 | unsigned int lat; |
538 | |||
539 | tunables->rate_limit_us = LATENCY_MULTIPLIER; | ||
540 | lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; | ||
541 | if (lat) | ||
542 | tunables->rate_limit_us *= lat; | ||
543 | } | ||
519 | 544 | ||
520 | policy->governor_data = sg_policy; | 545 | policy->governor_data = sg_policy; |
521 | sg_policy->tunables = tunables; | 546 | sg_policy->tunables = tunables; |
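The sugov_init() hunk above now prefers a driver-supplied transition_delay_us for the initial rate limit and only falls back to LATENCY_MULTIPLIER scaled by the hardware transition latency. A quick sketch of that fallback arithmetic; the constants mirror the cpufreq definitions but are hard-coded here for illustration:

#include <stdio.h>

#define LATENCY_MULTIPLIER      1000    /* in µs, as in cpufreq */
#define NSEC_PER_USEC           1000ULL

static unsigned int rate_limit_us(unsigned int transition_delay_us,
                                  unsigned long long transition_latency_ns)
{
        unsigned int rate, lat;

        if (transition_delay_us)
                return transition_delay_us;     /* driver knows best */

        rate = LATENCY_MULTIPLIER;
        lat = transition_latency_ns / NSEC_PER_USEC;
        if (lat)
                rate *= lat;                    /* scale by latency in µs */
        return rate;
}

int main(void)
{
        printf("%u\n", rate_limit_us(500, 0));          /* 500: driver-provided */
        printf("%u\n", rate_limit_us(0, 20000));        /* 20000: 1000 * 20 µs */
        printf("%u\n", rate_limit_us(0, 0));            /* 1000: bare multiplier */
        return 0;
}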
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f3778e2b46c8..aea3135c5d90 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void) | |||
34 | sched_clock_irqtime = 0; | 34 | sched_clock_irqtime = 0; |
35 | } | 35 | } |
36 | 36 | ||
37 | static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, | ||
38 | enum cpu_usage_stat idx) | ||
39 | { | ||
40 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
41 | |||
42 | u64_stats_update_begin(&irqtime->sync); | ||
43 | cpustat[idx] += delta; | ||
44 | irqtime->total += delta; | ||
45 | irqtime->tick_delta += delta; | ||
46 | u64_stats_update_end(&irqtime->sync); | ||
47 | } | ||
48 | |||
37 | /* | 49 | /* |
38 | * Called before incrementing preempt_count on {soft,}irq_enter | 50 | * Called before incrementing preempt_count on {soft,}irq_enter |
39 | * and before decrementing preempt_count on {soft,}irq_exit. | 51 | * and before decrementing preempt_count on {soft,}irq_exit. |
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void) | |||
41 | void irqtime_account_irq(struct task_struct *curr) | 53 | void irqtime_account_irq(struct task_struct *curr) |
42 | { | 54 | { |
43 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); | 55 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); |
44 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
45 | s64 delta; | 56 | s64 delta; |
46 | int cpu; | 57 | int cpu; |
47 | 58 | ||
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr) | |||
52 | delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; | 63 | delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; |
53 | irqtime->irq_start_time += delta; | 64 | irqtime->irq_start_time += delta; |
54 | 65 | ||
55 | u64_stats_update_begin(&irqtime->sync); | ||
56 | /* | 66 | /* |
57 | * We do not account for softirq time from ksoftirqd here. | 67 | * We do not account for softirq time from ksoftirqd here. |
58 | * We want to continue accounting softirq time to ksoftirqd thread | 68 | * We want to continue accounting softirq time to ksoftirqd thread |
59 | * in that case, so as not to confuse scheduler with a special task | 69 | * in that case, so as not to confuse scheduler with a special task |
60 | * that do not consume any time, but still wants to run. | 70 | * that do not consume any time, but still wants to run. |
61 | */ | 71 | */ |
62 | if (hardirq_count()) { | 72 | if (hardirq_count()) |
63 | cpustat[CPUTIME_IRQ] += delta; | 73 | irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); |
64 | irqtime->tick_delta += delta; | 74 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) |
65 | } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) { | 75 | irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); |
66 | cpustat[CPUTIME_SOFTIRQ] += delta; | ||
67 | irqtime->tick_delta += delta; | ||
68 | } | ||
69 | |||
70 | u64_stats_update_end(&irqtime->sync); | ||
71 | } | 76 | } |
72 | EXPORT_SYMBOL_GPL(irqtime_account_irq); | 77 | EXPORT_SYMBOL_GPL(irqtime_account_irq); |
73 | 78 | ||
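The cputime.c change folds the duplicated "charge this delta to one bucket plus the running totals" sequence into irqtime_account_delta(). A plain C model of that bookkeeping, leaving out the u64_stats_update_begin()/end() seqcount that the real helper wraps around the updates:

#include <stdio.h>
#include <stdint.h>

enum usage_stat { STAT_IRQ, STAT_SOFTIRQ, STAT_MAX };

struct irqtime_model {
        uint64_t cpustat[STAT_MAX];     /* per-category time, ns */
        uint64_t total;                 /* all irq time ever accounted */
        uint64_t tick_delta;            /* time accounted since the last tick */
};

static void account_delta(struct irqtime_model *it, uint64_t delta,
                          enum usage_stat idx)
{
        it->cpustat[idx] += delta;
        it->total += delta;
        it->tick_delta += delta;
}

int main(void)
{
        struct irqtime_model it = { { 0 }, 0, 0 };

        account_delta(&it, 1500, STAT_IRQ);     /* a hard-irq slice */
        account_delta(&it, 700, STAT_SOFTIRQ);  /* a softirq slice */
        printf("irq=%llu softirq=%llu total=%llu\n",
               (unsigned long long)it.cpustat[STAT_IRQ],
               (unsigned long long)it.cpustat[STAT_SOFTIRQ],
               (unsigned long long)it.total);   /* irq=1500 softirq=700 total=2200 */
        return 0;
}

The total field added to struct irqtime (see the sched.h hunk near the end of this patch) carries the monotonically growing sum, while tick_delta is drained by the per-tick accounting.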
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dea138964b91..d71109321841 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
717 | } | 717 | } |
718 | 718 | ||
719 | #ifdef CONFIG_SMP | 719 | #ifdef CONFIG_SMP |
720 | |||
721 | #include "sched-pelt.h" | ||
722 | |||
720 | static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); | 723 | static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); |
721 | static unsigned long task_h_load(struct task_struct *p); | 724 | static unsigned long task_h_load(struct task_struct *p); |
722 | 725 | ||
723 | /* | ||
724 | * We choose a half-life close to 1 scheduling period. | ||
725 | * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are | ||
726 | * dependent on this value. | ||
727 | */ | ||
728 | #define LOAD_AVG_PERIOD 32 | ||
729 | #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ | ||
730 | #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ | ||
731 | |||
732 | /* Give new sched_entity start runnable values to heavy its load in infant time */ | 726 | /* Give new sched_entity start runnable values to heavy its load in infant time */ |
733 | void init_entity_runnable_average(struct sched_entity *se) | 727 | void init_entity_runnable_average(struct sched_entity *se) |
734 | { | 728 | { |
@@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct sched_entity *se) | |||
2733 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 2727 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
2734 | 2728 | ||
2735 | #ifdef CONFIG_SMP | 2729 | #ifdef CONFIG_SMP |
2736 | /* Precomputed fixed inverse multiplies for multiplication by y^n */ | ||
2737 | static const u32 runnable_avg_yN_inv[] = { | ||
2738 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, | ||
2739 | 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, | ||
2740 | 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, | ||
2741 | 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, | ||
2742 | 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, | ||
2743 | 0x85aac367, 0x82cd8698, | ||
2744 | }; | ||
2745 | |||
2746 | /* | ||
2747 | * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent | ||
2748 | * over-estimates when re-combining. | ||
2749 | */ | ||
2750 | static const u32 runnable_avg_yN_sum[] = { | ||
2751 | 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, | ||
2752 | 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, | ||
2753 | 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, | ||
2754 | }; | ||
2755 | |||
2756 | /* | ||
2757 | * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to | ||
2758 | * lower integers. See Documentation/scheduler/sched-avg.txt how these | ||
2759 | * were generated: | ||
2760 | */ | ||
2761 | static const u32 __accumulated_sum_N32[] = { | ||
2762 | 0, 23371, 35056, 40899, 43820, 45281, | ||
2763 | 46011, 46376, 46559, 46650, 46696, 46719, | ||
2764 | }; | ||
2765 | |||
2766 | /* | 2730 | /* |
2767 | * Approximate: | 2731 | * Approximate: |
2768 | * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) | 2732 | * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) |
2769 | */ | 2733 | */ |
2770 | static __always_inline u64 decay_load(u64 val, u64 n) | 2734 | static u64 decay_load(u64 val, u64 n) |
2771 | { | 2735 | { |
2772 | unsigned int local_n; | 2736 | unsigned int local_n; |
2773 | 2737 | ||
2774 | if (!n) | 2738 | if (unlikely(n > LOAD_AVG_PERIOD * 63)) |
2775 | return val; | ||
2776 | else if (unlikely(n > LOAD_AVG_PERIOD * 63)) | ||
2777 | return 0; | 2739 | return 0; |
2778 | 2740 | ||
2779 | /* after bounds checking we can collapse to 32-bit */ | 2741 | /* after bounds checking we can collapse to 32-bit */ |
@@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n) | |||
2795 | return val; | 2757 | return val; |
2796 | } | 2758 | } |
2797 | 2759 | ||
2760 | static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) | ||
2761 | { | ||
2762 | u32 c1, c2, c3 = d3; /* y^0 == 1 */ | ||
2763 | |||
2764 | /* | ||
2765 | * c1 = d1 y^p | ||
2766 | */ | ||
2767 | c1 = decay_load((u64)d1, periods); | ||
2768 | |||
2769 | /* | ||
2770 | * p-1 | ||
2771 | * c2 = 1024 \Sum y^n | ||
2772 | * n=1 | ||
2773 | * | ||
2774 | * inf inf | ||
2775 | * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) | ||
2776 | * n=0 n=p | ||
2777 | */ | ||
2778 | c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; | ||
2779 | |||
2780 | return c1 + c2 + c3; | ||
2781 | } | ||
2782 | |||
2783 | #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) | ||
2784 | |||
2798 | /* | 2785 | /* |
2799 | * For updates fully spanning n periods, the contribution to runnable | 2786 | * Accumulate the three separate parts of the sum; d1 the remainder |
2800 | * average will be: \Sum 1024*y^n | 2787 | * of the last (incomplete) period, d2 the span of full periods and d3 |
2788 | * the remainder of the (incomplete) current period. | ||
2789 | * | ||
2790 | * d1 d2 d3 | ||
2791 | * ^ ^ ^ | ||
2792 | * | | | | ||
2793 | * |<->|<----------------->|<--->| | ||
2794 | * ... |---x---|------| ... |------|-----x (now) | ||
2795 | * | ||
2796 | * p-1 | ||
2797 | * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 | ||
2798 | * n=1 | ||
2801 | * | 2799 | * |
2802 | * We can compute this reasonably efficiently by combining: | 2800 | * = u y^p + (Step 1) |
2803 | * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} | 2801 | * |
2802 | * p-1 | ||
2803 | * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) | ||
2804 | * n=1 | ||
2804 | */ | 2805 | */ |
2805 | static u32 __compute_runnable_contrib(u64 n) | 2806 | static __always_inline u32 |
2807 | accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, | ||
2808 | unsigned long weight, int running, struct cfs_rq *cfs_rq) | ||
2806 | { | 2809 | { |
2807 | u32 contrib = 0; | 2810 | unsigned long scale_freq, scale_cpu; |
2811 | u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ | ||
2812 | u64 periods; | ||
2808 | 2813 | ||
2809 | if (likely(n <= LOAD_AVG_PERIOD)) | 2814 | scale_freq = arch_scale_freq_capacity(NULL, cpu); |
2810 | return runnable_avg_yN_sum[n]; | 2815 | scale_cpu = arch_scale_cpu_capacity(NULL, cpu); |
2811 | else if (unlikely(n >= LOAD_AVG_MAX_N)) | ||
2812 | return LOAD_AVG_MAX; | ||
2813 | 2816 | ||
2814 | /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ | 2817 | delta += sa->period_contrib; |
2815 | contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; | 2818 | periods = delta / 1024; /* A period is 1024us (~1ms) */ |
2816 | n %= LOAD_AVG_PERIOD; | ||
2817 | contrib = decay_load(contrib, n); | ||
2818 | return contrib + runnable_avg_yN_sum[n]; | ||
2819 | } | ||
2820 | 2819 | ||
2821 | #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) | 2820 | /* |
2821 | * Step 1: decay old *_sum if we crossed period boundaries. | ||
2822 | */ | ||
2823 | if (periods) { | ||
2824 | sa->load_sum = decay_load(sa->load_sum, periods); | ||
2825 | if (cfs_rq) { | ||
2826 | cfs_rq->runnable_load_sum = | ||
2827 | decay_load(cfs_rq->runnable_load_sum, periods); | ||
2828 | } | ||
2829 | sa->util_sum = decay_load((u64)(sa->util_sum), periods); | ||
2830 | |||
2831 | /* | ||
2832 | * Step 2 | ||
2833 | */ | ||
2834 | delta %= 1024; | ||
2835 | contrib = __accumulate_pelt_segments(periods, | ||
2836 | 1024 - sa->period_contrib, delta); | ||
2837 | } | ||
2838 | sa->period_contrib = delta; | ||
2839 | |||
2840 | contrib = cap_scale(contrib, scale_freq); | ||
2841 | if (weight) { | ||
2842 | sa->load_sum += weight * contrib; | ||
2843 | if (cfs_rq) | ||
2844 | cfs_rq->runnable_load_sum += weight * contrib; | ||
2845 | } | ||
2846 | if (running) | ||
2847 | sa->util_sum += contrib * scale_cpu; | ||
2848 | |||
2849 | return periods; | ||
2850 | } | ||
2822 | 2851 | ||
2823 | /* | 2852 | /* |
2824 | * We can represent the historical contribution to runnable average as the | 2853 | * We can represent the historical contribution to runnable average as the |
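decay_load() and __accumulate_pelt_segments() above implement val * y^n, with y chosen so that y^32 = 1/2, using the precomputed 32-bit inverse table (now generated into sched-pelt.h) plus one right shift per 32 full periods, and the d1/d2/d3 split uses the closed form of the geometric series for the middle part. A compilable approximation of both; the table values are copied from sched-pelt.h and the __uint128_t multiply is a GCC/Clang extension standing in for the kernel's mul_u64_u32_shr():

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX    47742

static const uint32_t runnable_avg_yN_inv[] = {
        0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
        0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
        0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
        0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
        0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
        0x85aac367, 0x82cd8698,
};

/* val * y^n, with y^32 ~= 0.5: halve once per 32 full periods, then
 * apply the fixed-point inverse multiply for the remainder. */
static uint64_t decay_load(uint64_t val, uint64_t n)
{
        if (n > LOAD_AVG_PERIOD * 63)
                return 0;
        if (n >= LOAD_AVG_PERIOD) {
                val >>= n / LOAD_AVG_PERIOD;
                n %= LOAD_AVG_PERIOD;
        }
        return (uint64_t)(((__uint128_t)val * runnable_avg_yN_inv[n]) >> 32);
}

/* c1 + c2 + c3 as in the d1/d2/d3 picture above: d1 decayed over p
 * periods, 1024 * (y^1 + ... + y^(p-1)) via the geometric-series
 * identity, and the not-yet-decayed remainder d3. */
static uint32_t accumulate_segments(uint64_t periods, uint32_t d1, uint32_t d3)
{
        uint32_t c1 = decay_load(d1, periods);
        uint32_t c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

        return c1 + c2 + d3;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)decay_load(1024, 32));     /* 511 */
        printf("%u\n", accumulate_segments(4, 200, 300));
        return 0;
}

Running it shows a full 32-period decay roughly halving the input (1024 becomes 511 after rounding down) and a multi-period contribution assembled from the three segments.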
@@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n) | |||
2849 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 2878 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] |
2850 | */ | 2879 | */ |
2851 | static __always_inline int | 2880 | static __always_inline int |
2852 | __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | 2881 | ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, |
2853 | unsigned long weight, int running, struct cfs_rq *cfs_rq) | 2882 | unsigned long weight, int running, struct cfs_rq *cfs_rq) |
2854 | { | 2883 | { |
2855 | u64 delta, scaled_delta, periods; | 2884 | u64 delta; |
2856 | u32 contrib; | ||
2857 | unsigned int delta_w, scaled_delta_w, decayed = 0; | ||
2858 | unsigned long scale_freq, scale_cpu; | ||
2859 | 2885 | ||
2860 | delta = now - sa->last_update_time; | 2886 | delta = now - sa->last_update_time; |
2861 | /* | 2887 | /* |
@@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2874 | delta >>= 10; | 2900 | delta >>= 10; |
2875 | if (!delta) | 2901 | if (!delta) |
2876 | return 0; | 2902 | return 0; |
2877 | sa->last_update_time = now; | ||
2878 | |||
2879 | scale_freq = arch_scale_freq_capacity(NULL, cpu); | ||
2880 | scale_cpu = arch_scale_cpu_capacity(NULL, cpu); | ||
2881 | |||
2882 | /* delta_w is the amount already accumulated against our next period */ | ||
2883 | delta_w = sa->period_contrib; | ||
2884 | if (delta + delta_w >= 1024) { | ||
2885 | decayed = 1; | ||
2886 | 2903 | ||
2887 | /* how much left for next period will start over, we don't know yet */ | 2904 | sa->last_update_time += delta << 10; |
2888 | sa->period_contrib = 0; | ||
2889 | 2905 | ||
2890 | /* | 2906 | /* |
2891 | * Now that we know we're crossing a period boundary, figure | 2907 | * Now we know we crossed measurement unit boundaries. The *_avg |
2892 | * out how much from delta we need to complete the current | 2908 | * accrues by two steps: |
2893 | * period and accrue it. | 2909 | * |
2894 | */ | 2910 | * Step 1: accumulate *_sum since last_update_time. If we haven't |
2895 | delta_w = 1024 - delta_w; | 2911 | * crossed period boundaries, finish. |
2896 | scaled_delta_w = cap_scale(delta_w, scale_freq); | 2912 | */ |
2897 | if (weight) { | 2913 | if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) |
2898 | sa->load_sum += weight * scaled_delta_w; | 2914 | return 0; |
2899 | if (cfs_rq) { | ||
2900 | cfs_rq->runnable_load_sum += | ||
2901 | weight * scaled_delta_w; | ||
2902 | } | ||
2903 | } | ||
2904 | if (running) | ||
2905 | sa->util_sum += scaled_delta_w * scale_cpu; | ||
2906 | |||
2907 | delta -= delta_w; | ||
2908 | |||
2909 | /* Figure out how many additional periods this update spans */ | ||
2910 | periods = delta / 1024; | ||
2911 | delta %= 1024; | ||
2912 | 2915 | ||
2913 | sa->load_sum = decay_load(sa->load_sum, periods + 1); | 2916 | /* |
2914 | if (cfs_rq) { | 2917 | * Step 2: update *_avg. |
2915 | cfs_rq->runnable_load_sum = | 2918 | */ |
2916 | decay_load(cfs_rq->runnable_load_sum, periods + 1); | 2919 | sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); |
2917 | } | 2920 | if (cfs_rq) { |
2918 | sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); | 2921 | cfs_rq->runnable_load_avg = |
2919 | 2922 | div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); | |
2920 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | ||
2921 | contrib = __compute_runnable_contrib(periods); | ||
2922 | contrib = cap_scale(contrib, scale_freq); | ||
2923 | if (weight) { | ||
2924 | sa->load_sum += weight * contrib; | ||
2925 | if (cfs_rq) | ||
2926 | cfs_rq->runnable_load_sum += weight * contrib; | ||
2927 | } | ||
2928 | if (running) | ||
2929 | sa->util_sum += contrib * scale_cpu; | ||
2930 | } | 2923 | } |
2924 | sa->util_avg = sa->util_sum / LOAD_AVG_MAX; | ||
2931 | 2925 | ||
2932 | /* Remainder of delta accrued against u_0` */ | 2926 | return 1; |
2933 | scaled_delta = cap_scale(delta, scale_freq); | 2927 | } |
2934 | if (weight) { | ||
2935 | sa->load_sum += weight * scaled_delta; | ||
2936 | if (cfs_rq) | ||
2937 | cfs_rq->runnable_load_sum += weight * scaled_delta; | ||
2938 | } | ||
2939 | if (running) | ||
2940 | sa->util_sum += scaled_delta * scale_cpu; | ||
2941 | 2928 | ||
2942 | sa->period_contrib += delta; | 2929 | static int |
2930 | __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) | ||
2931 | { | ||
2932 | return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); | ||
2933 | } | ||
2943 | 2934 | ||
2944 | if (decayed) { | 2935 | static int |
2945 | sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); | 2936 | __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) |
2946 | if (cfs_rq) { | 2937 | { |
2947 | cfs_rq->runnable_load_avg = | 2938 | return ___update_load_avg(now, cpu, &se->avg, |
2948 | div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); | 2939 | se->on_rq * scale_load_down(se->load.weight), |
2949 | } | 2940 | cfs_rq->curr == se, NULL); |
2950 | sa->util_avg = sa->util_sum / LOAD_AVG_MAX; | 2941 | } |
2951 | } | ||
2952 | 2942 | ||
2953 | return decayed; | 2943 | static int |
2944 | __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) | ||
2945 | { | ||
2946 | return ___update_load_avg(now, cpu, &cfs_rq->avg, | ||
2947 | scale_load_down(cfs_rq->load.weight), | ||
2948 | cfs_rq->curr != NULL, cfs_rq); | ||
2954 | } | 2949 | } |
2955 | 2950 | ||
2956 | /* | 2951 | /* |
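___update_load_avg() now reads as two explicit steps: accumulate_sum() grows the *_sum counters and reports whether a 1024 µs period boundary was crossed, and only then are the *_avg values refreshed as sum / LOAD_AVG_MAX. A compressed sketch of that split, with the field names shortened, no cfs_rq handling, and a deliberately simplified accumulation placeholder:

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX    47742

struct avg_model {
        uint64_t last_update_time;      /* ns, kept at 1024 ns granularity below */
        uint64_t load_sum;
        unsigned long load_avg;
};

/* Placeholder for the real accumulation: just grow the sum and report
 * whether at least one full 1024 µs period elapsed. */
static int accumulate_sum(struct avg_model *a, uint64_t delta)
{
        a->load_sum += delta;           /* grossly simplified */
        return delta >= 1024;           /* "crossed a period boundary" */
}

static int update_load_avg(struct avg_model *a, uint64_t now)
{
        uint64_t delta = (now - a->last_update_time) >> 10;    /* ns -> ~µs */

        if (!delta)
                return 0;
        a->last_update_time += delta << 10;

        /* Step 1: accumulate; stop if no period boundary was crossed. */
        if (!accumulate_sum(a, delta))
                return 0;

        /* Step 2: fold the sum into the average. */
        a->load_avg = a->load_sum / LOAD_AVG_MAX;
        return 1;
}

int main(void)
{
        struct avg_model a = { 0, 140000, 0 };
        int decayed = update_load_avg(&a, 2 << 20);

        printf("decayed=%d avg=%lu\n", decayed, a.load_avg);    /* decayed=1 avg=2 */
        return 0;
}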
@@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | |||
3014 | void set_task_rq_fair(struct sched_entity *se, | 3009 | void set_task_rq_fair(struct sched_entity *se, |
3015 | struct cfs_rq *prev, struct cfs_rq *next) | 3010 | struct cfs_rq *prev, struct cfs_rq *next) |
3016 | { | 3011 | { |
3012 | u64 p_last_update_time; | ||
3013 | u64 n_last_update_time; | ||
3014 | |||
3017 | if (!sched_feat(ATTACH_AGE_LOAD)) | 3015 | if (!sched_feat(ATTACH_AGE_LOAD)) |
3018 | return; | 3016 | return; |
3019 | 3017 | ||
@@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se, | |||
3024 | * time. This will result in the wakee task is less decayed, but giving | 3022 | * time. This will result in the wakee task is less decayed, but giving |
3025 | * the wakee more load sounds not bad. | 3023 | * the wakee more load sounds not bad. |
3026 | */ | 3024 | */ |
3027 | if (se->avg.last_update_time && prev) { | 3025 | if (!(se->avg.last_update_time && prev)) |
3028 | u64 p_last_update_time; | 3026 | return; |
3029 | u64 n_last_update_time; | ||
3030 | 3027 | ||
3031 | #ifndef CONFIG_64BIT | 3028 | #ifndef CONFIG_64BIT |
3029 | { | ||
3032 | u64 p_last_update_time_copy; | 3030 | u64 p_last_update_time_copy; |
3033 | u64 n_last_update_time_copy; | 3031 | u64 n_last_update_time_copy; |
3034 | 3032 | ||
@@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se, | |||
3043 | 3041 | ||
3044 | } while (p_last_update_time != p_last_update_time_copy || | 3042 | } while (p_last_update_time != p_last_update_time_copy || |
3045 | n_last_update_time != n_last_update_time_copy); | 3043 | n_last_update_time != n_last_update_time_copy); |
3044 | } | ||
3046 | #else | 3045 | #else |
3047 | p_last_update_time = prev->avg.last_update_time; | 3046 | p_last_update_time = prev->avg.last_update_time; |
3048 | n_last_update_time = next->avg.last_update_time; | 3047 | n_last_update_time = next->avg.last_update_time; |
3049 | #endif | 3048 | #endif |
3050 | __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), | 3049 | __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); |
3051 | &se->avg, 0, 0, NULL); | 3050 | se->avg.last_update_time = n_last_update_time; |
3052 | se->avg.last_update_time = n_last_update_time; | ||
3053 | } | ||
3054 | } | 3051 | } |
3055 | 3052 | ||
3056 | /* Take into account change of utilization of a child task group */ | 3053 | /* Take into account change of utilization of a child task group */ |
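The !CONFIG_64BIT branch kept above reads the 64-bit last_update_time with a copy-and-retry loop because a 32-bit machine cannot load it atomically: the value and its shadow copy are reread until two consecutive reads agree. A user-space illustration of the same retry pattern (single-threaded here, so the loop trivially exits on the first pass; the kernel separates the two loads with smp_rmb()):

#include <stdio.h>
#include <stdint.h>

struct clock_pair {
        uint64_t value;
        uint64_t value_copy;    /* written after 'value', with a barrier, in the kernel */
};

/* Keep re-reading until the value and its copy match, i.e. no writer
 * was mid-update between the two loads. */
static uint64_t read_consistent(const volatile struct clock_pair *c)
{
        uint64_t v, v_copy;

        do {
                v_copy = c->value_copy;
                /* a read barrier sits here in the kernel */
                v = c->value;
        } while (v != v_copy);

        return v;
}

int main(void)
{
        struct clock_pair c = { 123456789ULL, 123456789ULL };

        printf("%llu\n", (unsigned long long)read_consistent(&c));
        return 0;
}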
@@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) | |||
3173 | return 1; | 3170 | return 1; |
3174 | } | 3171 | } |
3175 | 3172 | ||
3173 | /* | ||
3174 | * Check if we need to update the load and the utilization of a blocked | ||
3175 | * group_entity: | ||
3176 | */ | ||
3177 | static inline bool skip_blocked_update(struct sched_entity *se) | ||
3178 | { | ||
3179 | struct cfs_rq *gcfs_rq = group_cfs_rq(se); | ||
3180 | |||
3181 | /* | ||
3182 | * If the sched_entity still has a non-zero load or utilization, we have | ||
3183 | * to decay it: | ||
3184 | */ | ||
3185 | if (se->avg.load_avg || se->avg.util_avg) | ||
3186 | return false; | ||
3187 | |||
3188 | /* | ||
3189 | * If there is a pending propagation, we have to update the load and | ||
3190 | * the utilization of the sched_entity: | ||
3191 | */ | ||
3192 | if (gcfs_rq->propagate_avg) | ||
3193 | return false; | ||
3194 | |||
3195 | /* | ||
3196 | * Otherwise, the load and the utilization of the sched_entity are | ||
3197 | * already zero and there is no pending propagation, so it will be a | ||
3198 | * waste of time to try to decay it: | ||
3199 | */ | ||
3200 | return true; | ||
3201 | } | ||
3202 | |||
3176 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 3203 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
3177 | 3204 | ||
3178 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} | 3205 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} |
@@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
3265 | set_tg_cfs_propagate(cfs_rq); | 3292 | set_tg_cfs_propagate(cfs_rq); |
3266 | } | 3293 | } |
3267 | 3294 | ||
3268 | decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, | 3295 | decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); |
3269 | scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); | ||
3270 | 3296 | ||
3271 | #ifndef CONFIG_64BIT | 3297 | #ifndef CONFIG_64BIT |
3272 | smp_wmb(); | 3298 | smp_wmb(); |
@@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags) | |||
3298 | * Track task load average for carrying it to new CPU after migrated, and | 3324 | * Track task load average for carrying it to new CPU after migrated, and |
3299 | * track group sched_entity load average for task_h_load calc in migration | 3325 | * track group sched_entity load average for task_h_load calc in migration |
3300 | */ | 3326 | */ |
3301 | if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { | 3327 | if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) |
3302 | __update_load_avg(now, cpu, &se->avg, | 3328 | __update_load_avg_se(now, cpu, cfs_rq, se); |
3303 | se->on_rq * scale_load_down(se->load.weight), | ||
3304 | cfs_rq->curr == se, NULL); | ||
3305 | } | ||
3306 | 3329 | ||
3307 | decayed = update_cfs_rq_load_avg(now, cfs_rq, true); | 3330 | decayed = update_cfs_rq_load_avg(now, cfs_rq, true); |
3308 | decayed |= propagate_entity_load_avg(se); | 3331 | decayed |= propagate_entity_load_avg(se); |
@@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se) | |||
3407 | u64 last_update_time; | 3430 | u64 last_update_time; |
3408 | 3431 | ||
3409 | last_update_time = cfs_rq_last_update_time(cfs_rq); | 3432 | last_update_time = cfs_rq_last_update_time(cfs_rq); |
3410 | __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); | 3433 | __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); |
3411 | } | 3434 | } |
3412 | 3435 | ||
3413 | /* | 3436 | /* |
@@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | |||
4271 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, | 4294 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, |
4272 | throttled_list) { | 4295 | throttled_list) { |
4273 | struct rq *rq = rq_of(cfs_rq); | 4296 | struct rq *rq = rq_of(cfs_rq); |
4297 | struct rq_flags rf; | ||
4274 | 4298 | ||
4275 | raw_spin_lock(&rq->lock); | 4299 | rq_lock(rq, &rf); |
4276 | if (!cfs_rq_throttled(cfs_rq)) | 4300 | if (!cfs_rq_throttled(cfs_rq)) |
4277 | goto next; | 4301 | goto next; |
4278 | 4302 | ||
@@ -4289,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | |||
4289 | unthrottle_cfs_rq(cfs_rq); | 4313 | unthrottle_cfs_rq(cfs_rq); |
4290 | 4314 | ||
4291 | next: | 4315 | next: |
4292 | raw_spin_unlock(&rq->lock); | 4316 | rq_unlock(rq, &rf); |
4293 | 4317 | ||
4294 | if (!remaining) | 4318 | if (!remaining) |
4295 | break; | 4319 | break; |
@@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void) | |||
5097 | unsigned long curr_jiffies = READ_ONCE(jiffies); | 5121 | unsigned long curr_jiffies = READ_ONCE(jiffies); |
5098 | struct rq *this_rq = this_rq(); | 5122 | struct rq *this_rq = this_rq(); |
5099 | unsigned long load; | 5123 | unsigned long load; |
5124 | struct rq_flags rf; | ||
5100 | 5125 | ||
5101 | if (curr_jiffies == this_rq->last_load_update_tick) | 5126 | if (curr_jiffies == this_rq->last_load_update_tick) |
5102 | return; | 5127 | return; |
5103 | 5128 | ||
5104 | load = weighted_cpuload(cpu_of(this_rq)); | 5129 | load = weighted_cpuload(cpu_of(this_rq)); |
5105 | raw_spin_lock(&this_rq->lock); | 5130 | rq_lock(this_rq, &rf); |
5106 | update_rq_clock(this_rq); | 5131 | update_rq_clock(this_rq); |
5107 | cpu_load_update_nohz(this_rq, curr_jiffies, load); | 5132 | cpu_load_update_nohz(this_rq, curr_jiffies, load); |
5108 | raw_spin_unlock(&this_rq->lock); | 5133 | rq_unlock(this_rq, &rf); |
5109 | } | 5134 | } |
5110 | #else /* !CONFIG_NO_HZ_COMMON */ | 5135 | #else /* !CONFIG_NO_HZ_COMMON */ |
5111 | static inline void cpu_load_update_nohz(struct rq *this_rq, | 5136 | static inline void cpu_load_update_nohz(struct rq *this_rq, |
@@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env) | |||
6769 | lockdep_assert_held(&env->src_rq->lock); | 6794 | lockdep_assert_held(&env->src_rq->lock); |
6770 | 6795 | ||
6771 | p->on_rq = TASK_ON_RQ_MIGRATING; | 6796 | p->on_rq = TASK_ON_RQ_MIGRATING; |
6772 | deactivate_task(env->src_rq, p, 0); | 6797 | deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); |
6773 | set_task_cpu(p, env->dst_cpu); | 6798 | set_task_cpu(p, env->dst_cpu); |
6774 | } | 6799 | } |
6775 | 6800 | ||
@@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) | |||
6902 | lockdep_assert_held(&rq->lock); | 6927 | lockdep_assert_held(&rq->lock); |
6903 | 6928 | ||
6904 | BUG_ON(task_rq(p) != rq); | 6929 | BUG_ON(task_rq(p) != rq); |
6905 | activate_task(rq, p, 0); | 6930 | activate_task(rq, p, ENQUEUE_NOCLOCK); |
6906 | p->on_rq = TASK_ON_RQ_QUEUED; | 6931 | p->on_rq = TASK_ON_RQ_QUEUED; |
6907 | check_preempt_curr(rq, p, 0); | 6932 | check_preempt_curr(rq, p, 0); |
6908 | } | 6933 | } |
@@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p) | |||
6913 | */ | 6938 | */ |
6914 | static void attach_one_task(struct rq *rq, struct task_struct *p) | 6939 | static void attach_one_task(struct rq *rq, struct task_struct *p) |
6915 | { | 6940 | { |
6916 | raw_spin_lock(&rq->lock); | 6941 | struct rq_flags rf; |
6942 | |||
6943 | rq_lock(rq, &rf); | ||
6944 | update_rq_clock(rq); | ||
6917 | attach_task(rq, p); | 6945 | attach_task(rq, p); |
6918 | raw_spin_unlock(&rq->lock); | 6946 | rq_unlock(rq, &rf); |
6919 | } | 6947 | } |
6920 | 6948 | ||
6921 | /* | 6949 | /* |
@@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env) | |||
6926 | { | 6954 | { |
6927 | struct list_head *tasks = &env->tasks; | 6955 | struct list_head *tasks = &env->tasks; |
6928 | struct task_struct *p; | 6956 | struct task_struct *p; |
6957 | struct rq_flags rf; | ||
6929 | 6958 | ||
6930 | raw_spin_lock(&env->dst_rq->lock); | 6959 | rq_lock(env->dst_rq, &rf); |
6960 | update_rq_clock(env->dst_rq); | ||
6931 | 6961 | ||
6932 | while (!list_empty(tasks)) { | 6962 | while (!list_empty(tasks)) { |
6933 | p = list_first_entry(tasks, struct task_struct, se.group_node); | 6963 | p = list_first_entry(tasks, struct task_struct, se.group_node); |
@@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env) | |||
6936 | attach_task(env->dst_rq, p); | 6966 | attach_task(env->dst_rq, p); |
6937 | } | 6967 | } |
6938 | 6968 | ||
6939 | raw_spin_unlock(&env->dst_rq->lock); | 6969 | rq_unlock(env->dst_rq, &rf); |
6940 | } | 6970 | } |
6941 | 6971 | ||
6942 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6972 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu) | |||
6944 | { | 6974 | { |
6945 | struct rq *rq = cpu_rq(cpu); | 6975 | struct rq *rq = cpu_rq(cpu); |
6946 | struct cfs_rq *cfs_rq; | 6976 | struct cfs_rq *cfs_rq; |
6947 | unsigned long flags; | 6977 | struct rq_flags rf; |
6948 | 6978 | ||
6949 | raw_spin_lock_irqsave(&rq->lock, flags); | 6979 | rq_lock_irqsave(rq, &rf); |
6950 | update_rq_clock(rq); | 6980 | update_rq_clock(rq); |
6951 | 6981 | ||
6952 | /* | 6982 | /* |
@@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu) | |||
6954 | * list_add_leaf_cfs_rq() for details. | 6984 | * list_add_leaf_cfs_rq() for details. |
6955 | */ | 6985 | */ |
6956 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 6986 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
6987 | struct sched_entity *se; | ||
6988 | |||
6957 | /* throttled entities do not contribute to load */ | 6989 | /* throttled entities do not contribute to load */ |
6958 | if (throttled_hierarchy(cfs_rq)) | 6990 | if (throttled_hierarchy(cfs_rq)) |
6959 | continue; | 6991 | continue; |
@@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu) | |||
6961 | if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) | 6993 | if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) |
6962 | update_tg_load_avg(cfs_rq, 0); | 6994 | update_tg_load_avg(cfs_rq, 0); |
6963 | 6995 | ||
6964 | /* Propagate pending load changes to the parent */ | 6996 | /* Propagate pending load changes to the parent, if any: */ |
6965 | if (cfs_rq->tg->se[cpu]) | 6997 | se = cfs_rq->tg->se[cpu]; |
6966 | update_load_avg(cfs_rq->tg->se[cpu], 0); | 6998 | if (se && !skip_blocked_update(se)) |
6999 | update_load_avg(se, 0); | ||
6967 | } | 7000 | } |
6968 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 7001 | rq_unlock_irqrestore(rq, &rf); |
6969 | } | 7002 | } |
6970 | 7003 | ||
6971 | /* | 7004 | /* |
@@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu) | |||
7019 | { | 7052 | { |
7020 | struct rq *rq = cpu_rq(cpu); | 7053 | struct rq *rq = cpu_rq(cpu); |
7021 | struct cfs_rq *cfs_rq = &rq->cfs; | 7054 | struct cfs_rq *cfs_rq = &rq->cfs; |
7022 | unsigned long flags; | 7055 | struct rq_flags rf; |
7023 | 7056 | ||
7024 | raw_spin_lock_irqsave(&rq->lock, flags); | 7057 | rq_lock_irqsave(rq, &rf); |
7025 | update_rq_clock(rq); | 7058 | update_rq_clock(rq); |
7026 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); | 7059 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); |
7027 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 7060 | rq_unlock_irqrestore(rq, &rf); |
7028 | } | 7061 | } |
7029 | 7062 | ||
7030 | static unsigned long task_h_load(struct task_struct *p) | 7063 | static unsigned long task_h_load(struct task_struct *p) |
@@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
7525 | { | 7558 | { |
7526 | struct sched_domain *child = env->sd->child; | 7559 | struct sched_domain *child = env->sd->child; |
7527 | struct sched_group *sg = env->sd->groups; | 7560 | struct sched_group *sg = env->sd->groups; |
7561 | struct sg_lb_stats *local = &sds->local_stat; | ||
7528 | struct sg_lb_stats tmp_sgs; | 7562 | struct sg_lb_stats tmp_sgs; |
7529 | int load_idx, prefer_sibling = 0; | 7563 | int load_idx, prefer_sibling = 0; |
7530 | bool overload = false; | 7564 | bool overload = false; |
@@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
7541 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); | 7575 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
7542 | if (local_group) { | 7576 | if (local_group) { |
7543 | sds->local = sg; | 7577 | sds->local = sg; |
7544 | sgs = &sds->local_stat; | 7578 | sgs = local; |
7545 | 7579 | ||
7546 | if (env->idle != CPU_NEWLY_IDLE || | 7580 | if (env->idle != CPU_NEWLY_IDLE || |
7547 | time_after_eq(jiffies, sg->sgc->next_update)) | 7581 | time_after_eq(jiffies, sg->sgc->next_update)) |
@@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
7565 | * the tasks on the system). | 7599 | * the tasks on the system). |
7566 | */ | 7600 | */ |
7567 | if (prefer_sibling && sds->local && | 7601 | if (prefer_sibling && sds->local && |
7568 | group_has_capacity(env, &sds->local_stat) && | 7602 | group_has_capacity(env, local) && |
7569 | (sgs->sum_nr_running > 1)) { | 7603 | (sgs->sum_nr_running > local->sum_nr_running + 1)) { |
7570 | sgs->group_no_capacity = 1; | 7604 | sgs->group_no_capacity = 1; |
7571 | sgs->group_type = group_classify(sg, sgs); | 7605 | sgs->group_type = group_classify(sg, sgs); |
7572 | } | 7606 | } |
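As I read the new prefer_sibling condition above, a sibling group is only flagged as having no spare capacity when it runs at least two more tasks than the local group, so forcing a pull cannot simply shift the imbalance onto the destination. A toy comparison of the old and new predicates with made-up task counts:

#include <stdbool.h>
#include <stdio.h>

static bool old_rule(unsigned int group_nr, unsigned int local_nr)
{
        (void)local_nr;
        return group_nr > 1;                    /* pre-patch condition */
}

static bool new_rule(unsigned int group_nr, unsigned int local_nr)
{
        return group_nr > local_nr + 1;         /* post-patch condition */
}

int main(void)
{
        /* local group already runs 2 tasks, the sibling runs 2 as well:
         * the old rule would still flag the sibling, the new one does not. */
        printf("old=%d new=%d\n", old_rule(2, 2), new_rule(2, 2));      /* old=1 new=0 */
        /* sibling clearly overloaded relative to local: both agree */
        printf("old=%d new=%d\n", old_rule(4, 1), new_rule(4, 1));      /* old=1 new=1 */
        return 0;
}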
@@ -7597,7 +7631,7 @@ next_group: | |||
7597 | 7631 | ||
7598 | /** | 7632 | /** |
7599 | * check_asym_packing - Check to see if the group is packed into the | 7633 | * check_asym_packing - Check to see if the group is packed into the |
7600 | * sched doman. | 7634 | * sched domain. |
7601 | * | 7635 | * |
7602 | * This is primarily intended to used at the sibling level. Some | 7636 | * This is primarily intended to used at the sibling level. Some |
7603 | * cores like POWER7 prefer to use lower numbered SMT threads. In the | 7637 | * cores like POWER7 prefer to use lower numbered SMT threads. In the |
@@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
8042 | struct sched_domain *sd_parent = sd->parent; | 8076 | struct sched_domain *sd_parent = sd->parent; |
8043 | struct sched_group *group; | 8077 | struct sched_group *group; |
8044 | struct rq *busiest; | 8078 | struct rq *busiest; |
8045 | unsigned long flags; | 8079 | struct rq_flags rf; |
8046 | struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); | 8080 | struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); |
8047 | 8081 | ||
8048 | struct lb_env env = { | 8082 | struct lb_env env = { |
@@ -8105,7 +8139,7 @@ redo: | |||
8105 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 8139 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
8106 | 8140 | ||
8107 | more_balance: | 8141 | more_balance: |
8108 | raw_spin_lock_irqsave(&busiest->lock, flags); | 8142 | rq_lock_irqsave(busiest, &rf); |
8109 | update_rq_clock(busiest); | 8143 | update_rq_clock(busiest); |
8110 | 8144 | ||
8111 | /* | 8145 | /* |
@@ -8122,14 +8156,14 @@ more_balance: | |||
8122 | * See task_rq_lock() family for the details. | 8156 | * See task_rq_lock() family for the details. |
8123 | */ | 8157 | */ |
8124 | 8158 | ||
8125 | raw_spin_unlock(&busiest->lock); | 8159 | rq_unlock(busiest, &rf); |
8126 | 8160 | ||
8127 | if (cur_ld_moved) { | 8161 | if (cur_ld_moved) { |
8128 | attach_tasks(&env); | 8162 | attach_tasks(&env); |
8129 | ld_moved += cur_ld_moved; | 8163 | ld_moved += cur_ld_moved; |
8130 | } | 8164 | } |
8131 | 8165 | ||
8132 | local_irq_restore(flags); | 8166 | local_irq_restore(rf.flags); |
8133 | 8167 | ||
8134 | if (env.flags & LBF_NEED_BREAK) { | 8168 | if (env.flags & LBF_NEED_BREAK) { |
8135 | env.flags &= ~LBF_NEED_BREAK; | 8169 | env.flags &= ~LBF_NEED_BREAK; |
@@ -8207,6 +8241,8 @@ more_balance: | |||
8207 | sd->nr_balance_failed++; | 8241 | sd->nr_balance_failed++; |
8208 | 8242 | ||
8209 | if (need_active_balance(&env)) { | 8243 | if (need_active_balance(&env)) { |
8244 | unsigned long flags; | ||
8245 | |||
8210 | raw_spin_lock_irqsave(&busiest->lock, flags); | 8246 | raw_spin_lock_irqsave(&busiest->lock, flags); |
8211 | 8247 | ||
8212 | /* don't kick the active_load_balance_cpu_stop, | 8248 | /* don't kick the active_load_balance_cpu_stop, |
@@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data) | |||
8444 | struct rq *target_rq = cpu_rq(target_cpu); | 8480 | struct rq *target_rq = cpu_rq(target_cpu); |
8445 | struct sched_domain *sd; | 8481 | struct sched_domain *sd; |
8446 | struct task_struct *p = NULL; | 8482 | struct task_struct *p = NULL; |
8483 | struct rq_flags rf; | ||
8447 | 8484 | ||
8448 | raw_spin_lock_irq(&busiest_rq->lock); | 8485 | rq_lock_irq(busiest_rq, &rf); |
8449 | 8486 | ||
8450 | /* make sure the requested cpu hasn't gone down in the meantime */ | 8487 | /* make sure the requested cpu hasn't gone down in the meantime */ |
8451 | if (unlikely(busiest_cpu != smp_processor_id() || | 8488 | if (unlikely(busiest_cpu != smp_processor_id() || |
@@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
8496 | rcu_read_unlock(); | 8533 | rcu_read_unlock(); |
8497 | out_unlock: | 8534 | out_unlock: |
8498 | busiest_rq->active_balance = 0; | 8535 | busiest_rq->active_balance = 0; |
8499 | raw_spin_unlock(&busiest_rq->lock); | 8536 | rq_unlock(busiest_rq, &rf); |
8500 | 8537 | ||
8501 | if (p) | 8538 | if (p) |
8502 | attach_one_task(target_rq, p); | 8539 | attach_one_task(target_rq, p); |
@@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
8794 | * do the balance. | 8831 | * do the balance. |
8795 | */ | 8832 | */ |
8796 | if (time_after_eq(jiffies, rq->next_balance)) { | 8833 | if (time_after_eq(jiffies, rq->next_balance)) { |
8797 | raw_spin_lock_irq(&rq->lock); | 8834 | struct rq_flags rf; |
8835 | |||
8836 | rq_lock_irq(rq, &rf); | ||
8798 | update_rq_clock(rq); | 8837 | update_rq_clock(rq); |
8799 | cpu_load_update_idle(rq); | 8838 | cpu_load_update_idle(rq); |
8800 | raw_spin_unlock_irq(&rq->lock); | 8839 | rq_unlock_irq(rq, &rf); |
8840 | |||
8801 | rebalance_domains(rq, CPU_IDLE); | 8841 | rebalance_domains(rq, CPU_IDLE); |
8802 | } | 8842 | } |
8803 | 8843 | ||
@@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p) | |||
8988 | struct cfs_rq *cfs_rq; | 9028 | struct cfs_rq *cfs_rq; |
8989 | struct sched_entity *se = &p->se, *curr; | 9029 | struct sched_entity *se = &p->se, *curr; |
8990 | struct rq *rq = this_rq(); | 9030 | struct rq *rq = this_rq(); |
9031 | struct rq_flags rf; | ||
8991 | 9032 | ||
8992 | raw_spin_lock(&rq->lock); | 9033 | rq_lock(rq, &rf); |
8993 | update_rq_clock(rq); | 9034 | update_rq_clock(rq); |
8994 | 9035 | ||
8995 | cfs_rq = task_cfs_rq(current); | 9036 | cfs_rq = task_cfs_rq(current); |
@@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p) | |||
9010 | } | 9051 | } |
9011 | 9052 | ||
9012 | se->vruntime -= cfs_rq->min_vruntime; | 9053 | se->vruntime -= cfs_rq->min_vruntime; |
9013 | raw_spin_unlock(&rq->lock); | 9054 | rq_unlock(rq, &rf); |
9014 | } | 9055 | } |
9015 | 9056 | ||
9016 | /* | 9057 | /* |
@@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex); | |||
9372 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 9413 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
9373 | { | 9414 | { |
9374 | int i; | 9415 | int i; |
9375 | unsigned long flags; | ||
9376 | 9416 | ||
9377 | /* | 9417 | /* |
9378 | * We can't change the weight of the root cgroup. | 9418 | * We can't change the weight of the root cgroup. |
@@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
9389 | tg->shares = shares; | 9429 | tg->shares = shares; |
9390 | for_each_possible_cpu(i) { | 9430 | for_each_possible_cpu(i) { |
9391 | struct rq *rq = cpu_rq(i); | 9431 | struct rq *rq = cpu_rq(i); |
9392 | struct sched_entity *se; | 9432 | struct sched_entity *se = tg->se[i]; |
9433 | struct rq_flags rf; | ||
9393 | 9434 | ||
9394 | se = tg->se[i]; | ||
9395 | /* Propagate contribution to hierarchy */ | 9435 | /* Propagate contribution to hierarchy */ |
9396 | raw_spin_lock_irqsave(&rq->lock, flags); | 9436 | rq_lock_irqsave(rq, &rf); |
9397 | |||
9398 | /* Possible calls to update_curr() need rq clock */ | ||
9399 | update_rq_clock(rq); | 9437 | update_rq_clock(rq); |
9400 | for_each_sched_entity(se) { | 9438 | for_each_sched_entity(se) { |
9401 | update_load_avg(se, UPDATE_TG); | 9439 | update_load_avg(se, UPDATE_TG); |
9402 | update_cfs_shares(se); | 9440 | update_cfs_shares(se); |
9403 | } | 9441 | } |
9404 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 9442 | rq_unlock_irqrestore(rq, &rf); |
9405 | } | 9443 | } |
9406 | 9444 | ||
9407 | done: | 9445 | done: |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1b3c8189b286..11192e0cb122 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true) | |||
56 | */ | 56 | */ |
57 | SCHED_FEAT(SIS_AVG_CPU, false) | 57 | SCHED_FEAT(SIS_AVG_CPU, false) |
58 | 58 | ||
59 | /* | ||
60 | * Issue a WARN when we do multiple update_rq_clock() calls | ||
61 | * in a single rq->lock section. Default disabled because the | ||
62 | * annotations are not complete. | ||
63 | */ | ||
64 | SCHED_FEAT(WARN_DOUBLE_CLOCK, false) | ||
65 | |||
59 | #ifdef HAVE_RT_PUSH_IPI | 66 | #ifdef HAVE_RT_PUSH_IPI |
60 | /* | 67 | /* |
61 | * In order to avoid a thundering herd attack of CPUs that are | 68 | * In order to avoid a thundering herd attack of CPUs that are |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ac6d5176463d..ef63adce0c9c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
11 | #include <linux/stackprotector.h> | 11 | #include <linux/stackprotector.h> |
12 | #include <linux/suspend.h> | 12 | #include <linux/suspend.h> |
13 | #include <linux/livepatch.h> | ||
13 | 14 | ||
14 | #include <asm/tlb.h> | 15 | #include <asm/tlb.h> |
15 | 16 | ||
@@ -264,7 +265,10 @@ static void do_idle(void) | |||
264 | smp_mb__after_atomic(); | 265 | smp_mb__after_atomic(); |
265 | 266 | ||
266 | sched_ttwu_pending(); | 267 | sched_ttwu_pending(); |
267 | schedule_preempt_disabled(); | 268 | schedule_idle(); |
269 | |||
270 | if (unlikely(klp_patch_pending(current))) | ||
271 | klp_update_patch_state(current); | ||
268 | } | 272 | } |
269 | 273 | ||
270 | bool cpu_in_idle(unsigned long pc) | 274 | bool cpu_in_idle(unsigned long pc) |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9f3e40226dec..979b7341008a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq) | |||
1927 | #define RT_PUSH_IPI_EXECUTING 1 | 1927 | #define RT_PUSH_IPI_EXECUTING 1 |
1928 | #define RT_PUSH_IPI_RESTART 2 | 1928 | #define RT_PUSH_IPI_RESTART 2 |
1929 | 1929 | ||
1930 | /* | ||
1931 | * When a high priority task schedules out from a CPU and a lower priority | ||
1932 | * task is scheduled in, a check is made to see if there's any RT tasks | ||
1933 | * on other CPUs that are waiting to run because a higher priority RT task | ||
1934 | * is currently running on its CPU. In this case, the CPU with multiple RT | ||
1935 | * tasks queued on it (overloaded) needs to be notified that a CPU has opened | ||
1936 | * up that may be able to run one of its non-running queued RT tasks. | ||
1937 | * | ||
1938 | * On large CPU boxes, there's the case that several CPUs could schedule | ||
1939 | * a lower priority task at the same time, in which case it will look for | ||
1940 | * any overloaded CPUs that it could pull a task from. To do this, the runqueue | ||
1941 | * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting | ||
1942 | * for a single overloaded CPU's runqueue lock can produce a large latency. | ||
1943 | * (This has actually been observed on large boxes running cyclictest). | ||
1944 | * Instead of taking the runqueue lock of the overloaded CPU, each of the | ||
1945 | * CPUs that scheduled a lower priority task simply sends an IPI to the | ||
1946 | * overloaded CPU. An IPI is much cheaper than taking a runqueue lock with | ||
1947 | * lots of contention. The overloaded CPU will look to push its non-running | ||
1948 | * RT task off, and if it does, it can then ignore the other IPIs coming | ||
1949 | * in, and just pass those IPIs off to any other overloaded CPU. | ||
1950 | * | ||
1951 | * When a CPU schedules a lower priority task, it only sends an IPI to | ||
1952 | * the "next" CPU that has overloaded RT tasks. This prevents IPI storms, | ||
1953 | * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with | ||
1954 | * RT overloaded tasks would cause 100 IPIs to go out at once. | ||
1955 | * | ||
1956 | * The overloaded RT CPU, when receiving an IPI, will try to push off its | ||
1957 | * overloaded RT tasks and then send an IPI to the next CPU that has | ||
1958 | * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks | ||
1959 | * have completed. Just because a CPU may have pushed off its own overloaded | ||
1960 | * RT task does not mean it should stop sending the IPI around to other | ||
1961 | * overloaded CPUs. There may be another RT task waiting to run on one of | ||
1962 | * those CPUs that are of higher priority than the one that was just | ||
1963 | * pushed. | ||
1964 | * | ||
1965 | * An optimization that could possibly be made is to make a CPU array similar | ||
1966 | * to the cpupri array mask of all running RT tasks, but for the overloaded | ||
1967 | * case, then the IPI could be sent to only the CPU with the highest priority | ||
1968 | * RT task waiting, and that CPU could send off further IPIs to the CPU with | ||
1969 | * the next highest waiting task. Since the overloaded case is much less likely | ||
1970 | * to happen, the complexity of this implementation may not be worth it. | ||
1971 | * Instead, just send an IPI around to all overloaded CPUs. | ||
1972 | * | ||
1973 | * The rq->rt.push_flags holds the status of the IPI that is going around. | ||
1974 | * A run queue can only send out a single IPI at a time. The possible flags | ||
1975 | * for rq->rt.push_flags are: | ||
1976 | * | ||
1977 | * (None or zero): No IPI is going around for the current rq | ||
1978 | * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around | ||
1979 | * RT_PUSH_IPI_RESTART: The priority of the running task for the rq | ||
1980 | * has changed, and the IPI should restart | ||
1981 | * circulating the overloaded CPUs again. | ||
1982 | * | ||
1983 | * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated | ||
1984 | * before sending to the next CPU. | ||
1985 | * | ||
1986 | * Instead of having all CPUs that schedule a lower priority task send | ||
1987 | * an IPI to the same "first" CPU in the RT overload mask, they send it | ||
1988 | * to the next overloaded CPU after their own CPU. This helps distribute | ||
1989 | * the work when there's more than one overloaded CPU and multiple CPUs | ||
1990 | * scheduling in lower priority tasks. | ||
1991 | * | ||
1992 | * When a rq schedules a lower priority task than what was currently | ||
1993 | * running, the next CPU with overloaded RT tasks is examined first. | ||
1994 | * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower | ||
1995 | * priority task, it will send an IPI first to CPU 5, then CPU 5 will | ||
1996 | * send to CPU 1 if it is still overloaded. CPU 1 will clear the | ||
1997 | * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set. | ||
1998 | * | ||
1999 | * The first CPU to notice IPI_RESTART is set will clear that flag and then | ||
2000 | * send an IPI to the next overloaded CPU after the rq->cpu and not the next | ||
2001 | * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3 | ||
2002 | * schedules a lower priority task, and the IPI_RESTART gets set while the | ||
2003 | * handling is being done on CPU 5, it will clear the flag and send it back to | ||
2004 | * CPU 4 instead of CPU 1. | ||
2005 | * | ||
2006 | * Note, the above logic can be disabled by turning off the sched_feature | ||
2007 | * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be | ||
2008 | * taken by the CPU requesting a pull and the waiting RT task will be pulled | ||
2009 | * by that CPU. This may be fine for machines with few CPUs. | ||
2010 | */ | ||
1930 | static void tell_cpu_to_push(struct rq *rq) | 2011 | static void tell_cpu_to_push(struct rq *rq) |
1931 | { | 2012 | { |
1932 | int cpu; | 2013 | int cpu; |
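The long comment added above describes handing a single IPI from one RT-overloaded CPU to the next instead of letting every CPU that just scheduled a lower-priority task contend on the overloaded CPU's runqueue lock. A toy, single-threaded walk of that "forward the IPI to the next overloaded CPU" idea; the rt_overloaded array and the assumption that every push succeeds are invented, and the real mechanism is asynchronous, driven by irq_work and the push_flags/push_cpu state the comment documents:

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

static bool rt_overloaded[NR_CPUS] = {
        false, true, false, false, true, true, false, false,   /* CPUs 1, 4, 5 */
};

/* Find the next overloaded CPU strictly after 'cpu', wrapping around;
 * return -1 if there is none left. */
static int next_overloaded(int cpu)
{
        for (int i = 1; i <= NR_CPUS; i++) {
                int next = (cpu + i) % NR_CPUS;

                if (rt_overloaded[next])
                        return next;
        }
        return -1;
}

int main(void)
{
        int src = 3;    /* CPU 3 just scheduled a lower-priority task */
        int cpu = next_overloaded(src);

        /* Each stop would try to push one queued RT task and then forward
         * the IPI; here we only print the order the IPI would travel. */
        while (cpu != -1 && cpu != src) {
                printf("IPI -> CPU %d\n", cpu); /* 4, then 5, then 1 */
                rt_overloaded[cpu] = false;     /* assume its push succeeded */
                cpu = next_overloaded(cpu);
        }
        return 0;
}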
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h new file mode 100644 index 000000000000..cd200d16529e --- /dev/null +++ b/kernel/sched/sched-pelt.h | |||
@@ -0,0 +1,13 @@ | |||
1 | /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ | ||
2 | |||
3 | static const u32 runnable_avg_yN_inv[] = { | ||
4 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, | ||
5 | 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, | ||
6 | 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, | ||
7 | 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, | ||
8 | 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, | ||
9 | 0x85aac367, 0x82cd8698, | ||
10 | }; | ||
11 | |||
12 | #define LOAD_AVG_PERIOD 32 | ||
13 | #define LOAD_AVG_MAX 47742 | ||
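sched-pelt.h is machine generated: each table entry is (2^32 - 1) * y^n truncated to 32 bits, with y chosen so that y^32 = 1/2, and LOAD_AVG_MAX is the converged value of the period sum. A small independent program that reproduces those values for sanity checking; it is a sketch of what the generator under Documentation/scheduler/ computes, and the last bit of individual entries may differ depending on the C library's pow():

#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define LOAD_AVG_PERIOD 32

int main(void)
{
        double y = pow(0.5, 1.0 / LOAD_AVG_PERIOD);     /* y^32 == 0.5 */
        long sum = 1024, last = 0;

        printf("static const u32 runnable_avg_yN_inv[] = {\n");
        for (int n = 0; n < LOAD_AVG_PERIOD; n++) {
                /* n == 0 gives 0xffffffff; n == 16 is ~1/sqrt(2) in fixed point */
                printf("\t0x%08x,\n", (unsigned)(pow(y, n) * 0xffffffffU));
        }
        printf("};\n");

        /* LOAD_AVG_MAX is the fixed point of sum = sum * y + 1024 with
         * integer truncation at every step; this converges to 47742. */
        while (sum != last) {
                last = sum;
                sum = (long)(last * y) + 1024;
        }
        printf("#define LOAD_AVG_MAX %ld\n", sum);
        return 0;
}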
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5cbf92214ad8..6dda2aab731e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40]; | |||
1331 | #define DEQUEUE_SLEEP 0x01 | 1331 | #define DEQUEUE_SLEEP 0x01 |
1332 | #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ | 1332 | #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ |
1333 | #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ | 1333 | #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ |
1334 | #define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ | ||
1334 | 1335 | ||
1335 | #define ENQUEUE_WAKEUP 0x01 | 1336 | #define ENQUEUE_WAKEUP 0x01 |
1336 | #define ENQUEUE_RESTORE 0x02 | 1337 | #define ENQUEUE_RESTORE 0x02 |
1337 | #define ENQUEUE_MOVE 0x04 | 1338 | #define ENQUEUE_MOVE 0x04 |
1339 | #define ENQUEUE_NOCLOCK 0x08 | ||
1338 | 1340 | ||
1339 | #define ENQUEUE_HEAD 0x08 | 1341 | #define ENQUEUE_HEAD 0x10 |
1340 | #define ENQUEUE_REPLENISH 0x10 | 1342 | #define ENQUEUE_REPLENISH 0x20 |
1341 | #ifdef CONFIG_SMP | 1343 | #ifdef CONFIG_SMP |
1342 | #define ENQUEUE_MIGRATED 0x20 | 1344 | #define ENQUEUE_MIGRATED 0x40 |
1343 | #else | 1345 | #else |
1344 | #define ENQUEUE_MIGRATED 0x00 | 1346 | #define ENQUEUE_MIGRATED 0x00 |
1345 | #endif | 1347 | #endif |
@@ -1465,6 +1467,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) | |||
1465 | } | 1467 | } |
1466 | #endif | 1468 | #endif |
1467 | 1469 | ||
1470 | extern void schedule_idle(void); | ||
1471 | |||
1468 | extern void sysrq_sched_debug_show(void); | 1472 | extern void sysrq_sched_debug_show(void); |
1469 | extern void sched_init_granularity(void); | 1473 | extern void sched_init_granularity(void); |
1470 | extern void update_max_interval(void); | 1474 | extern void update_max_interval(void); |
@@ -1624,6 +1628,7 @@ static inline void sched_avg_update(struct rq *rq) { } | |||
1624 | 1628 | ||
1625 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) | 1629 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) |
1626 | __acquires(rq->lock); | 1630 | __acquires(rq->lock); |
1631 | |||
1627 | struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | 1632 | struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) |
1628 | __acquires(p->pi_lock) | 1633 | __acquires(p->pi_lock) |
1629 | __acquires(rq->lock); | 1634 | __acquires(rq->lock); |
@@ -1645,6 +1650,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) | |||
1645 | raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); | 1650 | raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); |
1646 | } | 1651 | } |
1647 | 1652 | ||
1653 | static inline void | ||
1654 | rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) | ||
1655 | __acquires(rq->lock) | ||
1656 | { | ||
1657 | raw_spin_lock_irqsave(&rq->lock, rf->flags); | ||
1658 | rq_pin_lock(rq, rf); | ||
1659 | } | ||
1660 | |||
1661 | static inline void | ||
1662 | rq_lock_irq(struct rq *rq, struct rq_flags *rf) | ||
1663 | __acquires(rq->lock) | ||
1664 | { | ||
1665 | raw_spin_lock_irq(&rq->lock); | ||
1666 | rq_pin_lock(rq, rf); | ||
1667 | } | ||
1668 | |||
1669 | static inline void | ||
1670 | rq_lock(struct rq *rq, struct rq_flags *rf) | ||
1671 | __acquires(rq->lock) | ||
1672 | { | ||
1673 | raw_spin_lock(&rq->lock); | ||
1674 | rq_pin_lock(rq, rf); | ||
1675 | } | ||
1676 | |||
1677 | static inline void | ||
1678 | rq_relock(struct rq *rq, struct rq_flags *rf) | ||
1679 | __acquires(rq->lock) | ||
1680 | { | ||
1681 | raw_spin_lock(&rq->lock); | ||
1682 | rq_repin_lock(rq, rf); | ||
1683 | } | ||
1684 | |||
1685 | static inline void | ||
1686 | rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) | ||
1687 | __releases(rq->lock) | ||
1688 | { | ||
1689 | rq_unpin_lock(rq, rf); | ||
1690 | raw_spin_unlock_irqrestore(&rq->lock, rf->flags); | ||
1691 | } | ||
1692 | |||
1693 | static inline void | ||
1694 | rq_unlock_irq(struct rq *rq, struct rq_flags *rf) | ||
1695 | __releases(rq->lock) | ||
1696 | { | ||
1697 | rq_unpin_lock(rq, rf); | ||
1698 | raw_spin_unlock_irq(&rq->lock); | ||
1699 | } | ||
1700 | |||
1701 | static inline void | ||
1702 | rq_unlock(struct rq *rq, struct rq_flags *rf) | ||
1703 | __releases(rq->lock) | ||
1704 | { | ||
1705 | rq_unpin_lock(rq, rf); | ||
1706 | raw_spin_unlock(&rq->lock); | ||
1707 | } | ||
1708 | |||
1648 | #ifdef CONFIG_SMP | 1709 | #ifdef CONFIG_SMP |
1649 | #ifdef CONFIG_PREEMPT | 1710 | #ifdef CONFIG_PREEMPT |
1650 | 1711 | ||
@@ -1869,6 +1930,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } | |||
1869 | 1930 | ||
1870 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1931 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
1871 | struct irqtime { | 1932 | struct irqtime { |
1933 | u64 total; | ||
1872 | u64 tick_delta; | 1934 | u64 tick_delta; |
1873 | u64 irq_start_time; | 1935 | u64 irq_start_time; |
1874 | struct u64_stats_sync sync; | 1936 | struct u64_stats_sync sync; |
@@ -1876,16 +1938,20 @@ struct irqtime { | |||
1876 | 1938 | ||
1877 | DECLARE_PER_CPU(struct irqtime, cpu_irqtime); | 1939 | DECLARE_PER_CPU(struct irqtime, cpu_irqtime); |
1878 | 1940 | ||
1941 | /* | ||
1942 | * Returns the irqtime minus the softirq time computed by ksoftirqd. | ||
1943 | * Otherwise ksoftirqd's own runtime would be subtracted from its | ||
1944 | * sum_exec_runtime and the value would never move forward. | ||
1945 | */ | ||
1879 | static inline u64 irq_time_read(int cpu) | 1946 | static inline u64 irq_time_read(int cpu) |
1880 | { | 1947 | { |
1881 | struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); | 1948 | struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); |
1882 | u64 *cpustat = kcpustat_cpu(cpu).cpustat; | ||
1883 | unsigned int seq; | 1949 | unsigned int seq; |
1884 | u64 total; | 1950 | u64 total; |
1885 | 1951 | ||
1886 | do { | 1952 | do { |
1887 | seq = __u64_stats_fetch_begin(&irqtime->sync); | 1953 | seq = __u64_stats_fetch_begin(&irqtime->sync); |
1888 | total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ]; | 1954 | total = irqtime->total; |
1889 | } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); | 1955 | } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); |
1890 | 1956 | ||
1891 | return total; | 1957 | return total; |
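The retry loop in irq_time_read() is the reader half of a sequence-count protected update. A simplified standalone analogue of that pattern (C11 atomics standing in for the kernel's u64_stats_sync helpers, single writer assumed, not a production-grade seqlock) looks like this:

/*
 * Simplified seqcount read/retry analogue.  The writer bumps the sequence
 * to odd, updates the total, then bumps it back to even; readers retry
 * until they observe a stable even sequence.  Illustrative only.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct irqtime_stat {
	atomic_uint seq;	/* odd while an update is in flight */
	uint64_t total;		/* accumulated IRQ time, in ns */
};

static void irqtime_add(struct irqtime_stat *s, uint64_t delta)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_relaxed);	/* -> odd */
	atomic_thread_fence(memory_order_release);
	s->total += delta;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);	/* -> even */
}

static uint64_t irqtime_read(const struct irqtime_stat *s)
{
	unsigned int seq;
	uint64_t total;

	do {
		seq = atomic_load_explicit(&s->seq, memory_order_acquire);
		total = s->total;
		atomic_thread_fence(memory_order_acquire);
	} while ((seq & 1) ||
		 seq != atomic_load_explicit(&s->seq, memory_order_relaxed));
	return total;
}

int main(void)
{
	struct irqtime_stat s = { 0 };

	irqtime_add(&s, 1000);
	printf("%llu\n", (unsigned long long)irqtime_read(&s));	/* 1000 */
	return 0;
}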
diff --git a/kernel/signal.c b/kernel/signal.c index 7e59ebc2c25e..ca92bcfeb322 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, | |||
1237 | } | 1237 | } |
1238 | /* | 1238 | /* |
1239 | * This sighand can be already freed and even reused, but | 1239 | * This sighand can be already freed and even reused, but |
1240 | * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which | 1240 | * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which |
1241 | * initializes ->siglock: this slab can't go away, it has | 1241 | * initializes ->siglock: this slab can't go away, it has |
1242 | * the same object type, ->siglock can't be reinitialized. | 1242 | * the same object type, ->siglock can't be reinitialized. |
1243 | * | 1243 | * |
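The comment above relies on a cache created with SLAB_TYPESAFE_BY_RCU plus a constructor that initializes the lock once per slab object, so a reader under rcu_read_lock() can still take ->siglock even if the object was freed and reused. A hedged kernel-context sketch of that pattern (names, flags and header locations here are illustrative, not copied from this diff):

/* Kernel-context sketch only; not buildable outside the kernel tree. */
#include <linux/slab.h>		/* kmem_cache_create(), SLAB_TYPESAFE_BY_RCU */
#include <linux/sched/signal.h>	/* struct sighand_struct (location may vary) */

static struct kmem_cache *sighand_cachep;

static void sighand_ctor(void *data)
{
	struct sighand_struct *sighand = data;

	/* Runs once per slab object, not on every allocation. */
	spin_lock_init(&sighand->siglock);
}

static void __init sighand_cache_init(void)
{
	sighand_cachep = kmem_cache_create("sighand_cache",
					   sizeof(struct sighand_struct), 0,
					   SLAB_TYPESAFE_BY_RCU, sighand_ctor);
}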
@@ -1318,7 +1318,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) | |||
1318 | } | 1318 | } |
1319 | } | 1319 | } |
1320 | 1320 | ||
1321 | int kill_proc_info(int sig, struct siginfo *info, pid_t pid) | 1321 | static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) |
1322 | { | 1322 | { |
1323 | int error; | 1323 | int error; |
1324 | rcu_read_lock(); | 1324 | rcu_read_lock(); |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 744fa611cae0..4e09821f9d9e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -309,7 +309,7 @@ restart: | |||
309 | account_irq_exit_time(current); | 309 | account_irq_exit_time(current); |
310 | __local_bh_enable(SOFTIRQ_OFFSET); | 310 | __local_bh_enable(SOFTIRQ_OFFSET); |
311 | WARN_ON_ONCE(in_interrupt()); | 311 | WARN_ON_ONCE(in_interrupt()); |
312 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 312 | current_restore_flags(old_flags, PF_MEMALLOC); |
313 | } | 313 | } |
314 | 314 | ||
315 | asmlinkage __visible void do_softirq(void) | 315 | asmlinkage __visible void do_softirq(void) |
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9c15a9124e83..f8edee9c792d 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c | |||
@@ -54,8 +54,8 @@ int snprint_stack_trace(char *buf, size_t size, | |||
54 | EXPORT_SYMBOL_GPL(snprint_stack_trace); | 54 | EXPORT_SYMBOL_GPL(snprint_stack_trace); |
55 | 55 | ||
56 | /* | 56 | /* |
57 | * Architectures that do not implement save_stack_trace_tsk or | 57 | * Architectures that do not implement save_stack_trace_*() |
58 | * save_stack_trace_regs get this weak alias and a once-per-bootup warning | 58 | * get these weak aliases and once-per-bootup warnings |
59 | * (whenever this facility is utilized - for example by procfs): | 59 | * (whenever this facility is utilized - for example by procfs): |
60 | */ | 60 | */ |
61 | __weak void | 61 | __weak void |
@@ -69,3 +69,11 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) | |||
69 | { | 69 | { |
70 | WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); | 70 | WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); |
71 | } | 71 | } |
72 | |||
73 | __weak int | ||
74 | save_stack_trace_tsk_reliable(struct task_struct *tsk, | ||
75 | struct stack_trace *trace) | ||
76 | { | ||
77 | WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk_reliable() not implemented yet.\n"); | ||
78 | return -ENOSYS; | ||
79 | } | ||
diff --git a/kernel/sys.c b/kernel/sys.c index 7ff6d1b10cec..8a94b4eabcaa 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1396,8 +1396,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, | |||
1396 | !capable(CAP_SYS_RESOURCE)) | 1396 | !capable(CAP_SYS_RESOURCE)) |
1397 | retval = -EPERM; | 1397 | retval = -EPERM; |
1398 | if (!retval) | 1398 | if (!retval) |
1399 | retval = security_task_setrlimit(tsk->group_leader, | 1399 | retval = security_task_setrlimit(tsk, resource, new_rlim); |
1400 | resource, new_rlim); | ||
1401 | if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { | 1400 | if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { |
1402 | /* | 1401 | /* |
1403 | * The caller is asking for an immediate RLIMIT_CPU | 1402 | * The caller is asking for an immediate RLIMIT_CPU |
@@ -1432,25 +1431,26 @@ out: | |||
1432 | } | 1431 | } |
1433 | 1432 | ||
1434 | /* rcu lock must be held */ | 1433 | /* rcu lock must be held */ |
1435 | static int check_prlimit_permission(struct task_struct *task) | 1434 | static int check_prlimit_permission(struct task_struct *task, |
1435 | unsigned int flags) | ||
1436 | { | 1436 | { |
1437 | const struct cred *cred = current_cred(), *tcred; | 1437 | const struct cred *cred = current_cred(), *tcred; |
1438 | bool id_match; | ||
1438 | 1439 | ||
1439 | if (current == task) | 1440 | if (current == task) |
1440 | return 0; | 1441 | return 0; |
1441 | 1442 | ||
1442 | tcred = __task_cred(task); | 1443 | tcred = __task_cred(task); |
1443 | if (uid_eq(cred->uid, tcred->euid) && | 1444 | id_match = (uid_eq(cred->uid, tcred->euid) && |
1444 | uid_eq(cred->uid, tcred->suid) && | 1445 | uid_eq(cred->uid, tcred->suid) && |
1445 | uid_eq(cred->uid, tcred->uid) && | 1446 | uid_eq(cred->uid, tcred->uid) && |
1446 | gid_eq(cred->gid, tcred->egid) && | 1447 | gid_eq(cred->gid, tcred->egid) && |
1447 | gid_eq(cred->gid, tcred->sgid) && | 1448 | gid_eq(cred->gid, tcred->sgid) && |
1448 | gid_eq(cred->gid, tcred->gid)) | 1449 | gid_eq(cred->gid, tcred->gid)); |
1449 | return 0; | 1450 | if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) |
1450 | if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) | 1451 | return -EPERM; |
1451 | return 0; | ||
1452 | 1452 | ||
1453 | return -EPERM; | 1453 | return security_task_prlimit(cred, tcred, flags); |
1454 | } | 1454 | } |
1455 | 1455 | ||
1456 | SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, | 1456 | SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, |
@@ -1460,12 +1460,17 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, | |||
1460 | struct rlimit64 old64, new64; | 1460 | struct rlimit64 old64, new64; |
1461 | struct rlimit old, new; | 1461 | struct rlimit old, new; |
1462 | struct task_struct *tsk; | 1462 | struct task_struct *tsk; |
1463 | unsigned int checkflags = 0; | ||
1463 | int ret; | 1464 | int ret; |
1464 | 1465 | ||
1466 | if (old_rlim) | ||
1467 | checkflags |= LSM_PRLIMIT_READ; | ||
1468 | |||
1465 | if (new_rlim) { | 1469 | if (new_rlim) { |
1466 | if (copy_from_user(&new64, new_rlim, sizeof(new64))) | 1470 | if (copy_from_user(&new64, new_rlim, sizeof(new64))) |
1467 | return -EFAULT; | 1471 | return -EFAULT; |
1468 | rlim64_to_rlim(&new64, &new); | 1472 | rlim64_to_rlim(&new64, &new); |
1473 | checkflags |= LSM_PRLIMIT_WRITE; | ||
1469 | } | 1474 | } |
1470 | 1475 | ||
1471 | rcu_read_lock(); | 1476 | rcu_read_lock(); |
@@ -1474,7 +1479,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, | |||
1474 | rcu_read_unlock(); | 1479 | rcu_read_unlock(); |
1475 | return -ESRCH; | 1480 | return -ESRCH; |
1476 | } | 1481 | } |
1477 | ret = check_prlimit_permission(tsk); | 1482 | ret = check_prlimit_permission(tsk, checkflags); |
1478 | if (ret) { | 1483 | if (ret) { |
1479 | rcu_read_unlock(); | 1484 | rcu_read_unlock(); |
1480 | return ret; | 1485 | return ret; |
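The checkflags wiring above maps "caller supplied old_rlim" to LSM_PRLIMIT_READ and "caller supplied new_rlim" to LSM_PRLIMIT_WRITE. The same read/write split is visible from userspace through the glibc prlimit() wrapper around prlimit64; a small example (pid 0 means the calling process):

/*
 * Userspace view of the read/write split: passing only old_limit is a read
 * request, passing new_limit as well is a write request.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit old, new;

	/* Read-only query: new_limit == NULL. */
	if (prlimit(0, RLIMIT_NOFILE, NULL, &old) != 0) {
		perror("prlimit(read)");
		return 1;
	}
	printf("RLIMIT_NOFILE: cur=%llu max=%llu\n",
	       (unsigned long long)old.rlim_cur,
	       (unsigned long long)old.rlim_max);

	/* Read-and-write: lower the soft limit, keep the hard limit. */
	new.rlim_cur = old.rlim_cur / 2;
	new.rlim_max = old.rlim_max;
	if (prlimit(0, RLIMIT_NOFILE, &new, &old) != 0) {
		perror("prlimit(write)");
		return 1;
	}
	return 0;
}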
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8c8714fcb53c..4dfba1a76cc3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1176,6 +1176,8 @@ static struct ctl_table kern_table[] = { | |||
1176 | .maxlen = sizeof(unsigned int), | 1176 | .maxlen = sizeof(unsigned int), |
1177 | .mode = 0644, | 1177 | .mode = 0644, |
1178 | .proc_handler = timer_migration_handler, | 1178 | .proc_handler = timer_migration_handler, |
1179 | .extra1 = &zero, | ||
1180 | .extra2 = &one, | ||
1179 | }, | 1181 | }, |
1180 | #endif | 1182 | #endif |
1181 | #ifdef CONFIG_BPF_SYSCALL | 1183 | #ifdef CONFIG_BPF_SYSCALL |
@@ -2574,7 +2576,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, | |||
2574 | int write, void *data) | 2576 | int write, void *data) |
2575 | { | 2577 | { |
2576 | if (write) { | 2578 | if (write) { |
2577 | if (*lvalp > LONG_MAX / HZ) | 2579 | if (*lvalp > INT_MAX / HZ) |
2578 | return 1; | 2580 | return 1; |
2579 | *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); | 2581 | *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); |
2580 | } else { | 2582 | } else { |
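The bound changes from LONG_MAX/HZ to INT_MAX/HZ because *valp is an int, so it is the int product that must not overflow. A tiny standalone check (HZ hard-coded to 1000 purely for illustration):

/*
 * Why INT_MAX / HZ: the sysctl value is stored in an int, so value * HZ must
 * fit in an int even when it still fits in a long.  Illustration only.
 */
#include <limits.h>
#include <stdio.h>

#define HZ 1000

int main(void)
{
	unsigned long val = (unsigned long)INT_MAX / HZ + 1;	/* just over the limit */
	long long as_ll = (long long)val * HZ;	/* fine as a 64-bit value ... */
	int as_int = (int)as_ll;		/* ... but typically wraps negative as an int */

	printf("val*HZ = %lld, stored as int = %d\n", as_ll, as_int);
	return 0;
}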
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8a5e44236f78..4559e914452b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/pid_namespace.h> | 30 | #include <linux/pid_namespace.h> |
31 | #include <net/genetlink.h> | 31 | #include <net/genetlink.h> |
32 | #include <linux/atomic.h> | 32 | #include <linux/atomic.h> |
33 | #include <linux/sched/cputime.h> | ||
33 | 34 | ||
34 | /* | 35 | /* |
35 | * Maximum length of a cpumask that can be specified in | 36 | * Maximum length of a cpumask that can be specified in |
@@ -210,6 +211,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) | |||
210 | struct task_struct *tsk, *first; | 211 | struct task_struct *tsk, *first; |
211 | unsigned long flags; | 212 | unsigned long flags; |
212 | int rc = -ESRCH; | 213 | int rc = -ESRCH; |
214 | u64 delta, utime, stime; | ||
215 | u64 start_time; | ||
213 | 216 | ||
214 | /* | 217 | /* |
215 | * Add additional stats from live tasks except zombie thread group | 218 | * Add additional stats from live tasks except zombie thread group |
@@ -227,6 +230,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) | |||
227 | memset(stats, 0, sizeof(*stats)); | 230 | memset(stats, 0, sizeof(*stats)); |
228 | 231 | ||
229 | tsk = first; | 232 | tsk = first; |
233 | start_time = ktime_get_ns(); | ||
230 | do { | 234 | do { |
231 | if (tsk->exit_state) | 235 | if (tsk->exit_state) |
232 | continue; | 236 | continue; |
@@ -238,6 +242,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) | |||
238 | */ | 242 | */ |
239 | delayacct_add_tsk(stats, tsk); | 243 | delayacct_add_tsk(stats, tsk); |
240 | 244 | ||
245 | /* calculate task elapsed time in nsec */ | ||
246 | delta = start_time - tsk->start_time; | ||
247 | /* Convert to micro seconds */ | ||
248 | do_div(delta, NSEC_PER_USEC); | ||
249 | stats->ac_etime += delta; | ||
250 | |||
251 | task_cputime(tsk, &utime, &stime); | ||
252 | stats->ac_utime += div_u64(utime, NSEC_PER_USEC); | ||
253 | stats->ac_stime += div_u64(stime, NSEC_PER_USEC); | ||
254 | |||
241 | stats->nvcsw += tsk->nvcsw; | 255 | stats->nvcsw += tsk->nvcsw; |
242 | stats->nivcsw += tsk->nivcsw; | 256 | stats->nivcsw += tsk->nivcsw; |
243 | } while_each_thread(first, tsk); | 257 | } while_each_thread(first, tsk); |
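The lines added above accumulate elapsed, user and system time in microseconds after computing them in nanoseconds; do_div()/div_u64() are used because 64-bit division is not cheap or even directly available on every 32-bit target. A trivial standalone illustration of the same conversion (values made up):

/*
 * Standalone illustration of the ns -> us conversion; plain C division
 * stands in for the kernel's do_div()/div_u64() helpers.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	uint64_t start_time = 123456789012ULL;	/* task start, ns since boot (made up) */
	uint64_t now        = 987654321098ULL;	/* "current" time, ns since boot (made up) */
	uint64_t delta      = now - start_time;	/* elapsed time in ns */

	printf("ac_etime += %" PRIu64 " us\n", delta / NSEC_PER_USEC);
	return 0;
}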
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ce3a31e8eb36..5cb5b0008d97 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -541,7 +541,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, | |||
541 | * | 541 | * |
542 | * Returns the granularity of underlying alarm base clock | 542 | * Returns the granularity of underlying alarm base clock |
543 | */ | 543 | */ |
544 | static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) | 544 | static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp) |
545 | { | 545 | { |
546 | if (!alarmtimer_get_rtcdev()) | 546 | if (!alarmtimer_get_rtcdev()) |
547 | return -EINVAL; | 547 | return -EINVAL; |
@@ -558,14 +558,14 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) | |||
558 | * | 558 | * |
559 | * Provides the underlying alarm base time. | 559 | * Provides the underlying alarm base time. |
560 | */ | 560 | */ |
561 | static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) | 561 | static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) |
562 | { | 562 | { |
563 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; | 563 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; |
564 | 564 | ||
565 | if (!alarmtimer_get_rtcdev()) | 565 | if (!alarmtimer_get_rtcdev()) |
566 | return -EINVAL; | 566 | return -EINVAL; |
567 | 567 | ||
568 | *tp = ktime_to_timespec(base->gettime()); | 568 | *tp = ktime_to_timespec64(base->gettime()); |
569 | return 0; | 569 | return 0; |
570 | } | 570 | } |
571 | 571 | ||
@@ -598,19 +598,19 @@ static int alarm_timer_create(struct k_itimer *new_timer) | |||
598 | * Copies out the current itimerspec data | 598 | * Copies out the current itimerspec data |
599 | */ | 599 | */ |
600 | static void alarm_timer_get(struct k_itimer *timr, | 600 | static void alarm_timer_get(struct k_itimer *timr, |
601 | struct itimerspec *cur_setting) | 601 | struct itimerspec64 *cur_setting) |
602 | { | 602 | { |
603 | ktime_t relative_expiry_time = | 603 | ktime_t relative_expiry_time = |
604 | alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); | 604 | alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); |
605 | 605 | ||
606 | if (ktime_to_ns(relative_expiry_time) > 0) { | 606 | if (ktime_to_ns(relative_expiry_time) > 0) { |
607 | cur_setting->it_value = ktime_to_timespec(relative_expiry_time); | 607 | cur_setting->it_value = ktime_to_timespec64(relative_expiry_time); |
608 | } else { | 608 | } else { |
609 | cur_setting->it_value.tv_sec = 0; | 609 | cur_setting->it_value.tv_sec = 0; |
610 | cur_setting->it_value.tv_nsec = 0; | 610 | cur_setting->it_value.tv_nsec = 0; |
611 | } | 611 | } |
612 | 612 | ||
613 | cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); | 613 | cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval); |
614 | } | 614 | } |
615 | 615 | ||
616 | /** | 616 | /** |
@@ -640,8 +640,8 @@ static int alarm_timer_del(struct k_itimer *timr) | |||
640 | * Sets the timer to new_setting, and starts the timer. | 640 | * Sets the timer to new_setting, and starts the timer. |
641 | */ | 641 | */ |
642 | static int alarm_timer_set(struct k_itimer *timr, int flags, | 642 | static int alarm_timer_set(struct k_itimer *timr, int flags, |
643 | struct itimerspec *new_setting, | 643 | struct itimerspec64 *new_setting, |
644 | struct itimerspec *old_setting) | 644 | struct itimerspec64 *old_setting) |
645 | { | 645 | { |
646 | ktime_t exp; | 646 | ktime_t exp; |
647 | 647 | ||
@@ -659,8 +659,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, | |||
659 | return TIMER_RETRY; | 659 | return TIMER_RETRY; |
660 | 660 | ||
661 | /* start the timer */ | 661 | /* start the timer */ |
662 | timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); | 662 | timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); |
663 | exp = timespec_to_ktime(new_setting->it_value); | 663 | exp = timespec64_to_ktime(new_setting->it_value); |
664 | /* Convert (if necessary) to absolute time */ | 664 | /* Convert (if necessary) to absolute time */ |
665 | if (flags != TIMER_ABSTIME) { | 665 | if (flags != TIMER_ABSTIME) { |
666 | ktime_t now; | 666 | ktime_t now; |
@@ -790,13 +790,14 @@ out: | |||
790 | * Handles clock_nanosleep calls against _ALARM clockids | 790 | * Handles clock_nanosleep calls against _ALARM clockids |
791 | */ | 791 | */ |
792 | static int alarm_timer_nsleep(const clockid_t which_clock, int flags, | 792 | static int alarm_timer_nsleep(const clockid_t which_clock, int flags, |
793 | struct timespec *tsreq, struct timespec __user *rmtp) | 793 | struct timespec64 *tsreq, |
794 | struct timespec __user *rmtp) | ||
794 | { | 795 | { |
795 | enum alarmtimer_type type = clock2alarm(which_clock); | 796 | enum alarmtimer_type type = clock2alarm(which_clock); |
797 | struct restart_block *restart; | ||
796 | struct alarm alarm; | 798 | struct alarm alarm; |
797 | ktime_t exp; | 799 | ktime_t exp; |
798 | int ret = 0; | 800 | int ret = 0; |
799 | struct restart_block *restart; | ||
800 | 801 | ||
801 | if (!alarmtimer_get_rtcdev()) | 802 | if (!alarmtimer_get_rtcdev()) |
802 | return -ENOTSUPP; | 803 | return -ENOTSUPP; |
@@ -809,7 +810,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, | |||
809 | 810 | ||
810 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); | 811 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); |
811 | 812 | ||
812 | exp = timespec_to_ktime(*tsreq); | 813 | exp = timespec64_to_ktime(*tsreq); |
813 | /* Convert (if necessary) to absolute time */ | 814 | /* Convert (if necessary) to absolute time */ |
814 | if (flags != TIMER_ABSTIME) { | 815 | if (flags != TIMER_ABSTIME) { |
815 | ktime_t now = alarm_bases[type].gettime(); | 816 | ktime_t now = alarm_bases[type].gettime(); |
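The alarmtimer changes above, and several files below, switch internal interfaces from timespec/itimerspec to their 64-bit counterparts: on 32-bit targets the legacy tv_sec is 32 bits wide and overflows in January 2038, so the conversion now happens once at the user/kernel boundary. An illustrative standalone sketch of the two layouts and the trivial widening conversion (struct names are local to the example):

/*
 * Illustration of the y2038 motivation behind the timespec64 churn.
 * The struct layouts here are for demonstration, not the kernel's.
 */
#include <stdint.h>
#include <stdio.h>

struct timespec32 {	/* legacy layout on a 32-bit target */
	int32_t tv_sec;
	int32_t tv_nsec;
};

struct timespec64 {	/* what the internal interfaces now carry */
	int64_t tv_sec;
	int32_t tv_nsec;
};

static struct timespec64 to_ts64(struct timespec32 ts)
{
	struct timespec64 ts64 = { .tv_sec = ts.tv_sec, .tv_nsec = ts.tv_nsec };
	return ts64;
}

int main(void)
{
	/* 0x7fffffff seconds is 2038-01-19T03:14:07Z, the 32-bit limit. */
	struct timespec32 last = { .tv_sec = INT32_MAX, .tv_nsec = 0 };
	struct timespec64 ok = to_ts64(last);

	printf("32-bit tv_sec stops at %d; 64-bit carries %lld without wrapping\n",
	       last.tv_sec, (long long)(ok.tv_sec + 1));
	return 0;
}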
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 97ac0951f164..4237e0744e26 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
468 | } | 468 | } |
469 | EXPORT_SYMBOL_GPL(clockevents_register_device); | 469 | EXPORT_SYMBOL_GPL(clockevents_register_device); |
470 | 470 | ||
471 | void clockevents_config(struct clock_event_device *dev, u32 freq) | 471 | static void clockevents_config(struct clock_event_device *dev, u32 freq) |
472 | { | 472 | { |
473 | u64 sec; | 473 | u64 sec; |
474 | 474 | ||
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ec08f527d7ee..ac053bb5296e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -987,7 +987,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); | |||
987 | * Returns: | 987 | * Returns: |
988 | * 0 when the timer was not active | 988 | * 0 when the timer was not active |
989 | * 1 when the timer was active | 989 | * 1 when the timer was active |
990 | * -1 when the timer is currently excuting the callback function and | 990 | * -1 when the timer is currently executing the callback function and |
991 | * cannot be stopped | 991 | * cannot be stopped |
992 | */ | 992 | */ |
993 | int hrtimer_try_to_cancel(struct hrtimer *timer) | 993 | int hrtimer_try_to_cancel(struct hrtimer *timer) |
@@ -1368,10 +1368,7 @@ retry: | |||
1368 | ktime_to_ns(delta)); | 1368 | ktime_to_ns(delta)); |
1369 | } | 1369 | } |
1370 | 1370 | ||
1371 | /* | 1371 | /* called with interrupts disabled */ |
1372 | * local version of hrtimer_peek_ahead_timers() called with interrupts | ||
1373 | * disabled. | ||
1374 | */ | ||
1375 | static inline void __hrtimer_peek_ahead_timers(void) | 1372 | static inline void __hrtimer_peek_ahead_timers(void) |
1376 | { | 1373 | { |
1377 | struct tick_device *td; | 1374 | struct tick_device *td; |
@@ -1506,7 +1503,7 @@ out: | |||
1506 | return ret; | 1503 | return ret; |
1507 | } | 1504 | } |
1508 | 1505 | ||
1509 | long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | 1506 | long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, |
1510 | const enum hrtimer_mode mode, const clockid_t clockid) | 1507 | const enum hrtimer_mode mode, const clockid_t clockid) |
1511 | { | 1508 | { |
1512 | struct restart_block *restart; | 1509 | struct restart_block *restart; |
@@ -1519,7 +1516,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
1519 | slack = 0; | 1516 | slack = 0; |
1520 | 1517 | ||
1521 | hrtimer_init_on_stack(&t.timer, clockid, mode); | 1518 | hrtimer_init_on_stack(&t.timer, clockid, mode); |
1522 | hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); | 1519 | hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); |
1523 | if (do_nanosleep(&t, mode)) | 1520 | if (do_nanosleep(&t, mode)) |
1524 | goto out; | 1521 | goto out; |
1525 | 1522 | ||
@@ -1550,15 +1547,17 @@ out: | |||
1550 | SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, | 1547 | SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, |
1551 | struct timespec __user *, rmtp) | 1548 | struct timespec __user *, rmtp) |
1552 | { | 1549 | { |
1550 | struct timespec64 tu64; | ||
1553 | struct timespec tu; | 1551 | struct timespec tu; |
1554 | 1552 | ||
1555 | if (copy_from_user(&tu, rqtp, sizeof(tu))) | 1553 | if (copy_from_user(&tu, rqtp, sizeof(tu))) |
1556 | return -EFAULT; | 1554 | return -EFAULT; |
1557 | 1555 | ||
1558 | if (!timespec_valid(&tu)) | 1556 | tu64 = timespec_to_timespec64(tu); |
1557 | if (!timespec64_valid(&tu64)) | ||
1559 | return -EINVAL; | 1558 | return -EINVAL; |
1560 | 1559 | ||
1561 | return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); | 1560 | return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); |
1562 | } | 1561 | } |
1563 | 1562 | ||
1564 | /* | 1563 | /* |
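hrtimer_nanosleep() now takes a timespec64 internally, but the user-visible contract of nanosleep(2) is unchanged: an invalid request fails with EINVAL and an interrupted sleep reports the remaining time through the second argument. A small userspace example of that contract:

/*
 * Userspace contract preserved by the hrtimer_nanosleep() changes above:
 * an interrupted sleep reports the time still remaining via rmtp.
 */
#include <errno.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec req = { .tv_sec = 2, .tv_nsec = 500000000 };	/* 2.5 s */
	struct timespec rem = { 0, 0 };

	if (nanosleep(&req, &rem) == -1) {
		if (errno == EINTR)	/* e.g. a signal arrived */
			printf("interrupted, %ld.%09ld s left\n",
			       (long)rem.tv_sec, rem.tv_nsec);
		else
			perror("nanosleep");
	}
	return 0;
}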
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 9cff0ab82b63..31d588d37a17 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c | |||
@@ -297,7 +297,7 @@ out: | |||
297 | return err; | 297 | return err; |
298 | } | 298 | } |
299 | 299 | ||
300 | static int pc_clock_gettime(clockid_t id, struct timespec *ts) | 300 | static int pc_clock_gettime(clockid_t id, struct timespec64 *ts) |
301 | { | 301 | { |
302 | struct posix_clock_desc cd; | 302 | struct posix_clock_desc cd; |
303 | int err; | 303 | int err; |
@@ -316,7 +316,7 @@ static int pc_clock_gettime(clockid_t id, struct timespec *ts) | |||
316 | return err; | 316 | return err; |
317 | } | 317 | } |
318 | 318 | ||
319 | static int pc_clock_getres(clockid_t id, struct timespec *ts) | 319 | static int pc_clock_getres(clockid_t id, struct timespec64 *ts) |
320 | { | 320 | { |
321 | struct posix_clock_desc cd; | 321 | struct posix_clock_desc cd; |
322 | int err; | 322 | int err; |
@@ -335,7 +335,7 @@ static int pc_clock_getres(clockid_t id, struct timespec *ts) | |||
335 | return err; | 335 | return err; |
336 | } | 336 | } |
337 | 337 | ||
338 | static int pc_clock_settime(clockid_t id, const struct timespec *ts) | 338 | static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) |
339 | { | 339 | { |
340 | struct posix_clock_desc cd; | 340 | struct posix_clock_desc cd; |
341 | int err; | 341 | int err; |
@@ -399,7 +399,7 @@ static int pc_timer_delete(struct k_itimer *kit) | |||
399 | return err; | 399 | return err; |
400 | } | 400 | } |
401 | 401 | ||
402 | static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) | 402 | static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts) |
403 | { | 403 | { |
404 | clockid_t id = kit->it_clock; | 404 | clockid_t id = kit->it_clock; |
405 | struct posix_clock_desc cd; | 405 | struct posix_clock_desc cd; |
@@ -414,7 +414,7 @@ static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) | |||
414 | } | 414 | } |
415 | 415 | ||
416 | static int pc_timer_settime(struct k_itimer *kit, int flags, | 416 | static int pc_timer_settime(struct k_itimer *kit, int flags, |
417 | struct itimerspec *ts, struct itimerspec *old) | 417 | struct itimerspec64 *ts, struct itimerspec64 *old) |
418 | { | 418 | { |
419 | clockid_t id = kit->it_clock; | 419 | clockid_t id = kit->it_clock; |
420 | struct posix_clock_desc cd; | 420 | struct posix_clock_desc cd; |
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 4513ad16a253..1370f067fb51 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
@@ -116,7 +116,7 @@ static inline u64 virt_ticks(struct task_struct *p) | |||
116 | } | 116 | } |
117 | 117 | ||
118 | static int | 118 | static int |
119 | posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | 119 | posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) |
120 | { | 120 | { |
121 | int error = check_clock(which_clock); | 121 | int error = check_clock(which_clock); |
122 | if (!error) { | 122 | if (!error) { |
@@ -135,7 +135,7 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) | |||
135 | } | 135 | } |
136 | 136 | ||
137 | static int | 137 | static int |
138 | posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) | 138 | posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp) |
139 | { | 139 | { |
140 | /* | 140 | /* |
141 | * You can never reset a CPU clock, but we check for other errors | 141 | * You can never reset a CPU clock, but we check for other errors |
@@ -261,7 +261,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
261 | 261 | ||
262 | static int posix_cpu_clock_get_task(struct task_struct *tsk, | 262 | static int posix_cpu_clock_get_task(struct task_struct *tsk, |
263 | const clockid_t which_clock, | 263 | const clockid_t which_clock, |
264 | struct timespec *tp) | 264 | struct timespec64 *tp) |
265 | { | 265 | { |
266 | int err = -EINVAL; | 266 | int err = -EINVAL; |
267 | u64 rtn; | 267 | u64 rtn; |
@@ -275,13 +275,13 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, | |||
275 | } | 275 | } |
276 | 276 | ||
277 | if (!err) | 277 | if (!err) |
278 | *tp = ns_to_timespec(rtn); | 278 | *tp = ns_to_timespec64(rtn); |
279 | 279 | ||
280 | return err; | 280 | return err; |
281 | } | 281 | } |
282 | 282 | ||
283 | 283 | ||
284 | static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | 284 | static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp) |
285 | { | 285 | { |
286 | const pid_t pid = CPUCLOCK_PID(which_clock); | 286 | const pid_t pid = CPUCLOCK_PID(which_clock); |
287 | int err = -EINVAL; | 287 | int err = -EINVAL; |
@@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
562 | * and try again. (This happens when the timer is in the middle of firing.) | 562 | * and try again. (This happens when the timer is in the middle of firing.) |
563 | */ | 563 | */ |
564 | static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | 564 | static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, |
565 | struct itimerspec *new, struct itimerspec *old) | 565 | struct itimerspec64 *new, struct itimerspec64 *old) |
566 | { | 566 | { |
567 | unsigned long flags; | 567 | unsigned long flags; |
568 | struct sighand_struct *sighand; | 568 | struct sighand_struct *sighand; |
@@ -572,7 +572,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
572 | 572 | ||
573 | WARN_ON_ONCE(p == NULL); | 573 | WARN_ON_ONCE(p == NULL); |
574 | 574 | ||
575 | new_expires = timespec_to_ns(&new->it_value); | 575 | new_expires = timespec64_to_ns(&new->it_value); |
576 | 576 | ||
577 | /* | 577 | /* |
578 | * Protect against sighand release/switch in exit/exec and p->cpu_timers | 578 | * Protect against sighand release/switch in exit/exec and p->cpu_timers |
@@ -633,7 +633,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
633 | bump_cpu_timer(timer, val); | 633 | bump_cpu_timer(timer, val); |
634 | if (val < timer->it.cpu.expires) { | 634 | if (val < timer->it.cpu.expires) { |
635 | old_expires = timer->it.cpu.expires - val; | 635 | old_expires = timer->it.cpu.expires - val; |
636 | old->it_value = ns_to_timespec(old_expires); | 636 | old->it_value = ns_to_timespec64(old_expires); |
637 | } else { | 637 | } else { |
638 | old->it_value.tv_nsec = 1; | 638 | old->it_value.tv_nsec = 1; |
639 | old->it_value.tv_sec = 0; | 639 | old->it_value.tv_sec = 0; |
@@ -671,7 +671,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
671 | * Install the new reload setting, and | 671 | * Install the new reload setting, and |
672 | * set up the signal and overrun bookkeeping. | 672 | * set up the signal and overrun bookkeeping. |
673 | */ | 673 | */ |
674 | timer->it.cpu.incr = timespec_to_ns(&new->it_interval); | 674 | timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); |
675 | 675 | ||
676 | /* | 676 | /* |
677 | * This acts as a modification timestamp for the timer, | 677 | * This acts as a modification timestamp for the timer, |
@@ -695,12 +695,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
695 | ret = 0; | 695 | ret = 0; |
696 | out: | 696 | out: |
697 | if (old) | 697 | if (old) |
698 | old->it_interval = ns_to_timespec(old_incr); | 698 | old->it_interval = ns_to_timespec64(old_incr); |
699 | 699 | ||
700 | return ret; | 700 | return ret; |
701 | } | 701 | } |
702 | 702 | ||
703 | static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | 703 | static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) |
704 | { | 704 | { |
705 | u64 now; | 705 | u64 now; |
706 | struct task_struct *p = timer->it.cpu.task; | 706 | struct task_struct *p = timer->it.cpu.task; |
@@ -710,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
710 | /* | 710 | /* |
711 | * Easy part: convert the reload time. | 711 | * Easy part: convert the reload time. |
712 | */ | 712 | */ |
713 | itp->it_interval = ns_to_timespec(timer->it.cpu.incr); | 713 | itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); |
714 | 714 | ||
715 | if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ | 715 | if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ |
716 | itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; | 716 | itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; |
@@ -739,7 +739,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
739 | * Call the timer disarmed, nothing else to do. | 739 | * Call the timer disarmed, nothing else to do. |
740 | */ | 740 | */ |
741 | timer->it.cpu.expires = 0; | 741 | timer->it.cpu.expires = 0; |
742 | itp->it_value = ns_to_timespec(timer->it.cpu.expires); | 742 | itp->it_value = ns_to_timespec64(timer->it.cpu.expires); |
743 | return; | 743 | return; |
744 | } else { | 744 | } else { |
745 | cpu_timer_sample_group(timer->it_clock, p, &now); | 745 | cpu_timer_sample_group(timer->it_clock, p, &now); |
@@ -748,7 +748,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
748 | } | 748 | } |
749 | 749 | ||
750 | if (now < timer->it.cpu.expires) { | 750 | if (now < timer->it.cpu.expires) { |
751 | itp->it_value = ns_to_timespec(timer->it.cpu.expires - now); | 751 | itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now); |
752 | } else { | 752 | } else { |
753 | /* | 753 | /* |
754 | * The timer should have expired already, but the firing | 754 | * The timer should have expired already, but the firing |
@@ -825,6 +825,8 @@ static void check_thread_timers(struct task_struct *tsk, | |||
825 | * At the hard limit, we just die. | 825 | * At the hard limit, we just die. |
826 | * No need to calculate anything else now. | 826 | * No need to calculate anything else now. |
827 | */ | 827 | */ |
828 | pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", | ||
829 | tsk->comm, task_pid_nr(tsk)); | ||
828 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | 830 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); |
829 | return; | 831 | return; |
830 | } | 832 | } |
@@ -836,8 +838,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
836 | soft += USEC_PER_SEC; | 838 | soft += USEC_PER_SEC; |
837 | sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; | 839 | sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; |
838 | } | 840 | } |
839 | printk(KERN_INFO | 841 | pr_info("RT Watchdog Timeout (soft): %s[%d]\n", |
840 | "RT Watchdog Timeout: %s[%d]\n", | ||
841 | tsk->comm, task_pid_nr(tsk)); | 842 | tsk->comm, task_pid_nr(tsk)); |
842 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | 843 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); |
843 | } | 844 | } |
@@ -935,6 +936,8 @@ static void check_process_timers(struct task_struct *tsk, | |||
935 | * At the hard limit, we just die. | 936 | * At the hard limit, we just die. |
936 | * No need to calculate anything else now. | 937 | * No need to calculate anything else now. |
937 | */ | 938 | */ |
939 | pr_info("RT Watchdog Timeout (hard): %s[%d]\n", | ||
940 | tsk->comm, task_pid_nr(tsk)); | ||
938 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | 941 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); |
939 | return; | 942 | return; |
940 | } | 943 | } |
@@ -942,6 +945,8 @@ static void check_process_timers(struct task_struct *tsk, | |||
942 | /* | 945 | /* |
943 | * At the soft limit, send a SIGXCPU every second. | 946 | * At the soft limit, send a SIGXCPU every second. |
944 | */ | 947 | */ |
948 | pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", | ||
949 | tsk->comm, task_pid_nr(tsk)); | ||
945 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | 950 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); |
946 | if (soft < hard) { | 951 | if (soft < hard) { |
947 | soft++; | 952 | soft++; |
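The new pr_info() lines fire alongside the existing signals: SIGXCPU at the soft limit (repeated roughly once per second) and SIGKILL at the hard limit. A userspace demonstration of the RLIMIT_CPU side of that behaviour:

/*
 * With RLIMIT_CPU's soft limit at 1 second, a busy loop receives SIGXCPU
 * once per second of CPU time until the 3 second hard limit kills it.
 */
#include <signal.h>
#include <stdio.h>
#include <sys/resource.h>

static volatile sig_atomic_t xcpu_count;

static void on_xcpu(int sig)
{
	(void)sig;
	xcpu_count++;	/* async-signal-safe: just count the hits */
}

int main(void)
{
	struct rlimit rl = { .rlim_cur = 1, .rlim_max = 3 };	/* seconds of CPU time */

	signal(SIGXCPU, on_xcpu);
	if (setrlimit(RLIMIT_CPU, &rl) != 0) {
		perror("setrlimit");
		return 1;
	}
	for (;;) {		/* burn CPU; the hard limit ends this with SIGKILL */
		if (xcpu_count) {
			printf("SIGXCPU #%d\n", (int)xcpu_count);
			xcpu_count = 0;
		}
	}
}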
@@ -1214,7 +1219,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1214 | } | 1219 | } |
1215 | 1220 | ||
1216 | static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | 1221 | static int do_cpu_nanosleep(const clockid_t which_clock, int flags, |
1217 | struct timespec *rqtp, struct itimerspec *it) | 1222 | struct timespec64 *rqtp, struct itimerspec64 *it) |
1218 | { | 1223 | { |
1219 | struct k_itimer timer; | 1224 | struct k_itimer timer; |
1220 | int error; | 1225 | int error; |
@@ -1229,7 +1234,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
1229 | error = posix_cpu_timer_create(&timer); | 1234 | error = posix_cpu_timer_create(&timer); |
1230 | timer.it_process = current; | 1235 | timer.it_process = current; |
1231 | if (!error) { | 1236 | if (!error) { |
1232 | static struct itimerspec zero_it; | 1237 | static struct itimerspec64 zero_it; |
1233 | 1238 | ||
1234 | memset(it, 0, sizeof *it); | 1239 | memset(it, 0, sizeof *it); |
1235 | it->it_value = *rqtp; | 1240 | it->it_value = *rqtp; |
@@ -1264,7 +1269,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
1264 | /* | 1269 | /* |
1265 | * We were interrupted by a signal. | 1270 | * We were interrupted by a signal. |
1266 | */ | 1271 | */ |
1267 | *rqtp = ns_to_timespec(timer.it.cpu.expires); | 1272 | *rqtp = ns_to_timespec64(timer.it.cpu.expires); |
1268 | error = posix_cpu_timer_set(&timer, 0, &zero_it, it); | 1273 | error = posix_cpu_timer_set(&timer, 0, &zero_it, it); |
1269 | if (!error) { | 1274 | if (!error) { |
1270 | /* | 1275 | /* |
@@ -1301,10 +1306,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
1301 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block); | 1306 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block); |
1302 | 1307 | ||
1303 | static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | 1308 | static int posix_cpu_nsleep(const clockid_t which_clock, int flags, |
1304 | struct timespec *rqtp, struct timespec __user *rmtp) | 1309 | struct timespec64 *rqtp, struct timespec __user *rmtp) |
1305 | { | 1310 | { |
1306 | struct restart_block *restart_block = ¤t->restart_block; | 1311 | struct restart_block *restart_block = ¤t->restart_block; |
1307 | struct itimerspec it; | 1312 | struct itimerspec64 it; |
1313 | struct timespec ts; | ||
1308 | int error; | 1314 | int error; |
1309 | 1315 | ||
1310 | /* | 1316 | /* |
@@ -1312,7 +1318,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
1312 | */ | 1318 | */ |
1313 | if (CPUCLOCK_PERTHREAD(which_clock) && | 1319 | if (CPUCLOCK_PERTHREAD(which_clock) && |
1314 | (CPUCLOCK_PID(which_clock) == 0 || | 1320 | (CPUCLOCK_PID(which_clock) == 0 || |
1315 | CPUCLOCK_PID(which_clock) == current->pid)) | 1321 | CPUCLOCK_PID(which_clock) == task_pid_vnr(current))) |
1316 | return -EINVAL; | 1322 | return -EINVAL; |
1317 | 1323 | ||
1318 | error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); | 1324 | error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); |
@@ -1324,13 +1330,14 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
1324 | /* | 1330 | /* |
1325 | * Report back to the user the time still remaining. | 1331 | * Report back to the user the time still remaining. |
1326 | */ | 1332 | */ |
1327 | if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | 1333 | ts = timespec64_to_timespec(it.it_value); |
1334 | if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp))) | ||
1328 | return -EFAULT; | 1335 | return -EFAULT; |
1329 | 1336 | ||
1330 | restart_block->fn = posix_cpu_nsleep_restart; | 1337 | restart_block->fn = posix_cpu_nsleep_restart; |
1331 | restart_block->nanosleep.clockid = which_clock; | 1338 | restart_block->nanosleep.clockid = which_clock; |
1332 | restart_block->nanosleep.rmtp = rmtp; | 1339 | restart_block->nanosleep.rmtp = rmtp; |
1333 | restart_block->nanosleep.expires = timespec_to_ns(rqtp); | 1340 | restart_block->nanosleep.expires = timespec64_to_ns(rqtp); |
1334 | } | 1341 | } |
1335 | return error; | 1342 | return error; |
1336 | } | 1343 | } |
@@ -1338,11 +1345,12 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
1338 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block) | 1345 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block) |
1339 | { | 1346 | { |
1340 | clockid_t which_clock = restart_block->nanosleep.clockid; | 1347 | clockid_t which_clock = restart_block->nanosleep.clockid; |
1341 | struct timespec t; | 1348 | struct itimerspec64 it; |
1342 | struct itimerspec it; | 1349 | struct timespec64 t; |
1350 | struct timespec tmp; | ||
1343 | int error; | 1351 | int error; |
1344 | 1352 | ||
1345 | t = ns_to_timespec(restart_block->nanosleep.expires); | 1353 | t = ns_to_timespec64(restart_block->nanosleep.expires); |
1346 | 1354 | ||
1347 | error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); | 1355 | error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); |
1348 | 1356 | ||
@@ -1351,10 +1359,11 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) | |||
1351 | /* | 1359 | /* |
1352 | * Report back to the user the time still remaining. | 1360 | * Report back to the user the time still remaining. |
1353 | */ | 1361 | */ |
1354 | if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | 1362 | tmp = timespec64_to_timespec(it.it_value); |
1363 | if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp))) | ||
1355 | return -EFAULT; | 1364 | return -EFAULT; |
1356 | 1365 | ||
1357 | restart_block->nanosleep.expires = timespec_to_ns(&t); | 1366 | restart_block->nanosleep.expires = timespec64_to_ns(&t); |
1358 | } | 1367 | } |
1359 | return error; | 1368 | return error; |
1360 | 1369 | ||
@@ -1364,12 +1373,12 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) | |||
1364 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) | 1373 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) |
1365 | 1374 | ||
1366 | static int process_cpu_clock_getres(const clockid_t which_clock, | 1375 | static int process_cpu_clock_getres(const clockid_t which_clock, |
1367 | struct timespec *tp) | 1376 | struct timespec64 *tp) |
1368 | { | 1377 | { |
1369 | return posix_cpu_clock_getres(PROCESS_CLOCK, tp); | 1378 | return posix_cpu_clock_getres(PROCESS_CLOCK, tp); |
1370 | } | 1379 | } |
1371 | static int process_cpu_clock_get(const clockid_t which_clock, | 1380 | static int process_cpu_clock_get(const clockid_t which_clock, |
1372 | struct timespec *tp) | 1381 | struct timespec64 *tp) |
1373 | { | 1382 | { |
1374 | return posix_cpu_clock_get(PROCESS_CLOCK, tp); | 1383 | return posix_cpu_clock_get(PROCESS_CLOCK, tp); |
1375 | } | 1384 | } |
@@ -1379,7 +1388,7 @@ static int process_cpu_timer_create(struct k_itimer *timer) | |||
1379 | return posix_cpu_timer_create(timer); | 1388 | return posix_cpu_timer_create(timer); |
1380 | } | 1389 | } |
1381 | static int process_cpu_nsleep(const clockid_t which_clock, int flags, | 1390 | static int process_cpu_nsleep(const clockid_t which_clock, int flags, |
1382 | struct timespec *rqtp, | 1391 | struct timespec64 *rqtp, |
1383 | struct timespec __user *rmtp) | 1392 | struct timespec __user *rmtp) |
1384 | { | 1393 | { |
1385 | return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); | 1394 | return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); |
@@ -1389,12 +1398,12 @@ static long process_cpu_nsleep_restart(struct restart_block *restart_block) | |||
1389 | return -EINVAL; | 1398 | return -EINVAL; |
1390 | } | 1399 | } |
1391 | static int thread_cpu_clock_getres(const clockid_t which_clock, | 1400 | static int thread_cpu_clock_getres(const clockid_t which_clock, |
1392 | struct timespec *tp) | 1401 | struct timespec64 *tp) |
1393 | { | 1402 | { |
1394 | return posix_cpu_clock_getres(THREAD_CLOCK, tp); | 1403 | return posix_cpu_clock_getres(THREAD_CLOCK, tp); |
1395 | } | 1404 | } |
1396 | static int thread_cpu_clock_get(const clockid_t which_clock, | 1405 | static int thread_cpu_clock_get(const clockid_t which_clock, |
1397 | struct timespec *tp) | 1406 | struct timespec64 *tp) |
1398 | { | 1407 | { |
1399 | return posix_cpu_clock_get(THREAD_CLOCK, tp); | 1408 | return posix_cpu_clock_get(THREAD_CLOCK, tp); |
1400 | } | 1409 | } |
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index cd6716e115e8..c0cd53eb018a 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c | |||
@@ -49,26 +49,32 @@ SYS_NI(alarm); | |||
49 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | 49 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, |
50 | const struct timespec __user *, tp) | 50 | const struct timespec __user *, tp) |
51 | { | 51 | { |
52 | struct timespec64 new_tp64; | ||
52 | struct timespec new_tp; | 53 | struct timespec new_tp; |
53 | 54 | ||
54 | if (which_clock != CLOCK_REALTIME) | 55 | if (which_clock != CLOCK_REALTIME) |
55 | return -EINVAL; | 56 | return -EINVAL; |
56 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) | 57 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) |
57 | return -EFAULT; | 58 | return -EFAULT; |
58 | return do_sys_settimeofday(&new_tp, NULL); | 59 | |
60 | new_tp64 = timespec_to_timespec64(new_tp); | ||
61 | return do_sys_settimeofday64(&new_tp64, NULL); | ||
59 | } | 62 | } |
60 | 63 | ||
61 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | 64 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, |
62 | struct timespec __user *,tp) | 65 | struct timespec __user *,tp) |
63 | { | 66 | { |
67 | struct timespec64 kernel_tp64; | ||
64 | struct timespec kernel_tp; | 68 | struct timespec kernel_tp; |
65 | 69 | ||
66 | switch (which_clock) { | 70 | switch (which_clock) { |
67 | case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break; | 71 | case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; |
68 | case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break; | 72 | case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; |
69 | case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break; | 73 | case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; |
70 | default: return -EINVAL; | 74 | default: return -EINVAL; |
71 | } | 75 | } |
76 | |||
77 | kernel_tp = timespec64_to_timespec(kernel_tp64); | ||
72 | if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) | 78 | if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) |
73 | return -EFAULT; | 79 | return -EFAULT; |
74 | return 0; | 80 | return 0; |
@@ -97,6 +103,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
97 | const struct timespec __user *, rqtp, | 103 | const struct timespec __user *, rqtp, |
98 | struct timespec __user *, rmtp) | 104 | struct timespec __user *, rmtp) |
99 | { | 105 | { |
106 | struct timespec64 t64; | ||
100 | struct timespec t; | 107 | struct timespec t; |
101 | 108 | ||
102 | switch (which_clock) { | 109 | switch (which_clock) { |
@@ -105,9 +112,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
105 | case CLOCK_BOOTTIME: | 112 | case CLOCK_BOOTTIME: |
106 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | 113 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) |
107 | return -EFAULT; | 114 | return -EFAULT; |
108 | if (!timespec_valid(&t)) | 115 | t64 = timespec_to_timespec64(t); |
116 | if (!timespec64_valid(&t64)) | ||
109 | return -EINVAL; | 117 | return -EINVAL; |
110 | return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ? | 118 | return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ? |
111 | HRTIMER_MODE_ABS : HRTIMER_MODE_REL, | 119 | HRTIMER_MODE_ABS : HRTIMER_MODE_REL, |
112 | which_clock); | 120 | which_clock); |
113 | default: | 121 | default: |
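These stubs service clock_settime()/clock_gettime()/clock_nanosleep() for a fixed set of clocks when the full posix-timers code is configured out; from userspace the interface is the ordinary clock_gettime(2) either way. A short example querying the three clocks handled above (CLOCK_BOOTTIME may need a reasonably recent glibc):

/* Query the clocks serviced by the stub path above. */
#define _GNU_SOURCE
#include <stdio.h>
#include <time.h>

static void show(const char *name, clockid_t id)
{
	struct timespec ts;

	if (clock_gettime(id, &ts) == 0)
		printf("%-16s %lld.%09ld\n", name,
		       (long long)ts.tv_sec, ts.tv_nsec);
	else
		perror(name);
}

int main(void)
{
	show("CLOCK_REALTIME", CLOCK_REALTIME);
	show("CLOCK_MONOTONIC", CLOCK_MONOTONIC);
	show("CLOCK_BOOTTIME", CLOCK_BOOTTIME);
	return 0;
}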
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 50a6a47020de..4d7b2ce09c27 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c | |||
@@ -130,12 +130,12 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; | |||
130 | /* | 130 | /* |
131 | * These ones are defined below. | 131 | * These ones are defined below. |
132 | */ | 132 | */ |
133 | static int common_nsleep(const clockid_t, int flags, struct timespec *t, | 133 | static int common_nsleep(const clockid_t, int flags, struct timespec64 *t, |
134 | struct timespec __user *rmtp); | 134 | struct timespec __user *rmtp); |
135 | static int common_timer_create(struct k_itimer *new_timer); | 135 | static int common_timer_create(struct k_itimer *new_timer); |
136 | static void common_timer_get(struct k_itimer *, struct itimerspec *); | 136 | static void common_timer_get(struct k_itimer *, struct itimerspec64 *); |
137 | static int common_timer_set(struct k_itimer *, int, | 137 | static int common_timer_set(struct k_itimer *, int, |
138 | struct itimerspec *, struct itimerspec *); | 138 | struct itimerspec64 *, struct itimerspec64 *); |
139 | static int common_timer_del(struct k_itimer *timer); | 139 | static int common_timer_del(struct k_itimer *timer); |
140 | 140 | ||
141 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); | 141 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); |
@@ -204,17 +204,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) | |||
204 | } | 204 | } |
205 | 205 | ||
206 | /* Get clock_realtime */ | 206 | /* Get clock_realtime */ |
207 | static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) | 207 | static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) |
208 | { | 208 | { |
209 | ktime_get_real_ts(tp); | 209 | ktime_get_real_ts64(tp); |
210 | return 0; | 210 | return 0; |
211 | } | 211 | } |
212 | 212 | ||
213 | /* Set clock_realtime */ | 213 | /* Set clock_realtime */ |
214 | static int posix_clock_realtime_set(const clockid_t which_clock, | 214 | static int posix_clock_realtime_set(const clockid_t which_clock, |
215 | const struct timespec *tp) | 215 | const struct timespec64 *tp) |
216 | { | 216 | { |
217 | return do_sys_settimeofday(tp, NULL); | 217 | return do_sys_settimeofday64(tp, NULL); |
218 | } | 218 | } |
219 | 219 | ||
220 | static int posix_clock_realtime_adj(const clockid_t which_clock, | 220 | static int posix_clock_realtime_adj(const clockid_t which_clock, |
@@ -226,54 +226,54 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, | |||
226 | /* | 226 | /* |
227 | * Get monotonic time for posix timers | 227 | * Get monotonic time for posix timers |
228 | */ | 228 | */ |
229 | static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) | 229 | static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) |
230 | { | 230 | { |
231 | ktime_get_ts(tp); | 231 | ktime_get_ts64(tp); |
232 | return 0; | 232 | return 0; |
233 | } | 233 | } |
234 | 234 | ||
235 | /* | 235 | /* |
236 | * Get monotonic-raw time for posix timers | 236 | * Get monotonic-raw time for posix timers |
237 | */ | 237 | */ |
238 | static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) | 238 | static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) |
239 | { | 239 | { |
240 | getrawmonotonic(tp); | 240 | getrawmonotonic64(tp); |
241 | return 0; | 241 | return 0; |
242 | } | 242 | } |
243 | 243 | ||
244 | 244 | ||
245 | static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) | 245 | static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) |
246 | { | 246 | { |
247 | *tp = current_kernel_time(); | 247 | *tp = current_kernel_time64(); |
248 | return 0; | 248 | return 0; |
249 | } | 249 | } |
250 | 250 | ||
251 | static int posix_get_monotonic_coarse(clockid_t which_clock, | 251 | static int posix_get_monotonic_coarse(clockid_t which_clock, |
252 | struct timespec *tp) | 252 | struct timespec64 *tp) |
253 | { | 253 | { |
254 | *tp = get_monotonic_coarse(); | 254 | *tp = get_monotonic_coarse64(); |
255 | return 0; | 255 | return 0; |
256 | } | 256 | } |
257 | 257 | ||
258 | static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) | 258 | static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp) |
259 | { | 259 | { |
260 | *tp = ktime_to_timespec(KTIME_LOW_RES); | 260 | *tp = ktime_to_timespec64(KTIME_LOW_RES); |
261 | return 0; | 261 | return 0; |
262 | } | 262 | } |
263 | 263 | ||
264 | static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) | 264 | static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) |
265 | { | 265 | { |
266 | get_monotonic_boottime(tp); | 266 | get_monotonic_boottime64(tp); |
267 | return 0; | 267 | return 0; |
268 | } | 268 | } |
269 | 269 | ||
270 | static int posix_get_tai(clockid_t which_clock, struct timespec *tp) | 270 | static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) |
271 | { | 271 | { |
272 | timekeeping_clocktai(tp); | 272 | timekeeping_clocktai64(tp); |
273 | return 0; | 273 | return 0; |
274 | } | 274 | } |
275 | 275 | ||
276 | static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) | 276 | static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) |
277 | { | 277 | { |
278 | tp->tv_sec = 0; | 278 | tp->tv_sec = 0; |
279 | tp->tv_nsec = hrtimer_resolution; | 279 | tp->tv_nsec = hrtimer_resolution; |
@@ -734,18 +734,18 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) | |||
734 | * report. | 734 | * report. |
735 | */ | 735 | */ |
736 | static void | 736 | static void |
737 | common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) | 737 | common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) |
738 | { | 738 | { |
739 | ktime_t now, remaining, iv; | 739 | ktime_t now, remaining, iv; |
740 | struct hrtimer *timer = &timr->it.real.timer; | 740 | struct hrtimer *timer = &timr->it.real.timer; |
741 | 741 | ||
742 | memset(cur_setting, 0, sizeof(struct itimerspec)); | 742 | memset(cur_setting, 0, sizeof(*cur_setting)); |
743 | 743 | ||
744 | iv = timr->it.real.interval; | 744 | iv = timr->it.real.interval; |
745 | 745 | ||
746 | /* interval timer ? */ | 746 | /* interval timer ? */ |
747 | if (iv) | 747 | if (iv) |
748 | cur_setting->it_interval = ktime_to_timespec(iv); | 748 | cur_setting->it_interval = ktime_to_timespec64(iv); |
749 | else if (!hrtimer_active(timer) && | 749 | else if (!hrtimer_active(timer) && |
750 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) | 750 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) |
751 | return; | 751 | return; |
@@ -771,13 +771,14 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) | |||
771 | if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) | 771 | if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) |
772 | cur_setting->it_value.tv_nsec = 1; | 772 | cur_setting->it_value.tv_nsec = 1; |
773 | } else | 773 | } else |
774 | cur_setting->it_value = ktime_to_timespec(remaining); | 774 | cur_setting->it_value = ktime_to_timespec64(remaining); |
775 | } | 775 | } |
776 | 776 | ||
777 | /* Get the time remaining on a POSIX.1b interval timer. */ | 777 | /* Get the time remaining on a POSIX.1b interval timer. */ |
778 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | 778 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, |
779 | struct itimerspec __user *, setting) | 779 | struct itimerspec __user *, setting) |
780 | { | 780 | { |
781 | struct itimerspec64 cur_setting64; | ||
781 | struct itimerspec cur_setting; | 782 | struct itimerspec cur_setting; |
782 | struct k_itimer *timr; | 783 | struct k_itimer *timr; |
783 | struct k_clock *kc; | 784 | struct k_clock *kc; |
@@ -792,10 +793,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | |||
792 | if (WARN_ON_ONCE(!kc || !kc->timer_get)) | 793 | if (WARN_ON_ONCE(!kc || !kc->timer_get)) |
793 | ret = -EINVAL; | 794 | ret = -EINVAL; |
794 | else | 795 | else |
795 | kc->timer_get(timr, &cur_setting); | 796 | kc->timer_get(timr, &cur_setting64); |
796 | 797 | ||
797 | unlock_timer(timr, flags); | 798 | unlock_timer(timr, flags); |
798 | 799 | ||
800 | cur_setting = itimerspec64_to_itimerspec(&cur_setting64); | ||
799 | if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) | 801 | if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) |
800 | return -EFAULT; | 802 | return -EFAULT; |
801 | 803 | ||
@@ -831,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) | |||
831 | /* timr->it_lock is taken. */ | 833 | /* timr->it_lock is taken. */ |
832 | static int | 834 | static int |
833 | common_timer_set(struct k_itimer *timr, int flags, | 835 | common_timer_set(struct k_itimer *timr, int flags, |
834 | struct itimerspec *new_setting, struct itimerspec *old_setting) | 836 | struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) |
835 | { | 837 | { |
836 | struct hrtimer *timer = &timr->it.real.timer; | 838 | struct hrtimer *timer = &timr->it.real.timer; |
837 | enum hrtimer_mode mode; | 839 | enum hrtimer_mode mode; |
@@ -860,10 +862,10 @@ common_timer_set(struct k_itimer *timr, int flags, | |||
860 | hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); | 862 | hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); |
861 | timr->it.real.timer.function = posix_timer_fn; | 863 | timr->it.real.timer.function = posix_timer_fn; |
862 | 864 | ||
863 | hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); | 865 | hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value)); |
864 | 866 | ||
865 | /* Convert interval */ | 867 | /* Convert interval */ |
866 | timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); | 868 | timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval); |
867 | 869 | ||
868 | /* SIGEV_NONE timers are not queued ! See common_timer_get */ | 870 | /* SIGEV_NONE timers are not queued ! See common_timer_get */ |
869 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { | 871 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { |
@@ -883,21 +885,23 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, | |||
883 | const struct itimerspec __user *, new_setting, | 885 | const struct itimerspec __user *, new_setting, |
884 | struct itimerspec __user *, old_setting) | 886 | struct itimerspec __user *, old_setting) |
885 | { | 887 | { |
886 | struct k_itimer *timr; | 888 | struct itimerspec64 new_spec64, old_spec64; |
889 | struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; | ||
887 | struct itimerspec new_spec, old_spec; | 890 | struct itimerspec new_spec, old_spec; |
888 | int error = 0; | 891 | struct k_itimer *timr; |
889 | unsigned long flag; | 892 | unsigned long flag; |
890 | struct itimerspec *rtn = old_setting ? &old_spec : NULL; | ||
891 | struct k_clock *kc; | 893 | struct k_clock *kc; |
894 | int error = 0; | ||
892 | 895 | ||
893 | if (!new_setting) | 896 | if (!new_setting) |
894 | return -EINVAL; | 897 | return -EINVAL; |
895 | 898 | ||
896 | if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) | 899 | if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) |
897 | return -EFAULT; | 900 | return -EFAULT; |
901 | new_spec64 = itimerspec_to_itimerspec64(&new_spec); | ||
898 | 902 | ||
899 | if (!timespec_valid(&new_spec.it_interval) || | 903 | if (!timespec64_valid(&new_spec64.it_interval) || |
900 | !timespec_valid(&new_spec.it_value)) | 904 | !timespec64_valid(&new_spec64.it_value)) |
901 | return -EINVAL; | 905 | return -EINVAL; |
902 | retry: | 906 | retry: |
903 | timr = lock_timer(timer_id, &flag); | 907 | timr = lock_timer(timer_id, &flag); |
@@ -908,7 +912,7 @@ retry: | |||
908 | if (WARN_ON_ONCE(!kc || !kc->timer_set)) | 912 | if (WARN_ON_ONCE(!kc || !kc->timer_set)) |
909 | error = -EINVAL; | 913 | error = -EINVAL; |
910 | else | 914 | else |
911 | error = kc->timer_set(timr, flags, &new_spec, rtn); | 915 | error = kc->timer_set(timr, flags, &new_spec64, rtn); |
912 | 916 | ||
913 | unlock_timer(timr, flag); | 917 | unlock_timer(timr, flag); |
914 | if (error == TIMER_RETRY) { | 918 | if (error == TIMER_RETRY) { |
@@ -916,6 +920,7 @@ retry: | |||
916 | goto retry; | 920 | goto retry; |
917 | } | 921 | } |
918 | 922 | ||
923 | old_spec = itimerspec64_to_itimerspec(&old_spec64); | ||
919 | if (old_setting && !error && | 924 | if (old_setting && !error && |
920 | copy_to_user(old_setting, &old_spec, sizeof (old_spec))) | 925 | copy_to_user(old_setting, &old_spec, sizeof (old_spec))) |
921 | error = -EFAULT; | 926 | error = -EFAULT; |
@@ -1014,6 +1019,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | |||
1014 | const struct timespec __user *, tp) | 1019 | const struct timespec __user *, tp) |
1015 | { | 1020 | { |
1016 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1021 | struct k_clock *kc = clockid_to_kclock(which_clock); |
1022 | struct timespec64 new_tp64; | ||
1017 | struct timespec new_tp; | 1023 | struct timespec new_tp; |
1018 | 1024 | ||
1019 | if (!kc || !kc->clock_set) | 1025 | if (!kc || !kc->clock_set) |
@@ -1021,21 +1027,24 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | |||
1021 | 1027 | ||
1022 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) | 1028 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) |
1023 | return -EFAULT; | 1029 | return -EFAULT; |
1030 | new_tp64 = timespec_to_timespec64(new_tp); | ||
1024 | 1031 | ||
1025 | return kc->clock_set(which_clock, &new_tp); | 1032 | return kc->clock_set(which_clock, &new_tp64); |
1026 | } | 1033 | } |
1027 | 1034 | ||
1028 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | 1035 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, |
1029 | struct timespec __user *,tp) | 1036 | struct timespec __user *,tp) |
1030 | { | 1037 | { |
1031 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1038 | struct k_clock *kc = clockid_to_kclock(which_clock); |
1039 | struct timespec64 kernel_tp64; | ||
1032 | struct timespec kernel_tp; | 1040 | struct timespec kernel_tp; |
1033 | int error; | 1041 | int error; |
1034 | 1042 | ||
1035 | if (!kc) | 1043 | if (!kc) |
1036 | return -EINVAL; | 1044 | return -EINVAL; |
1037 | 1045 | ||
1038 | error = kc->clock_get(which_clock, &kernel_tp); | 1046 | error = kc->clock_get(which_clock, &kernel_tp64); |
1047 | kernel_tp = timespec64_to_timespec(kernel_tp64); | ||
1039 | 1048 | ||
1040 | if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) | 1049 | if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) |
1041 | error = -EFAULT; | 1050 | error = -EFAULT; |
@@ -1070,13 +1079,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, | |||
1070 | struct timespec __user *, tp) | 1079 | struct timespec __user *, tp) |
1071 | { | 1080 | { |
1072 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1081 | struct k_clock *kc = clockid_to_kclock(which_clock); |
1082 | struct timespec64 rtn_tp64; | ||
1073 | struct timespec rtn_tp; | 1083 | struct timespec rtn_tp; |
1074 | int error; | 1084 | int error; |
1075 | 1085 | ||
1076 | if (!kc) | 1086 | if (!kc) |
1077 | return -EINVAL; | 1087 | return -EINVAL; |
1078 | 1088 | ||
1079 | error = kc->clock_getres(which_clock, &rtn_tp); | 1089 | error = kc->clock_getres(which_clock, &rtn_tp64); |
1090 | rtn_tp = timespec64_to_timespec(rtn_tp64); | ||
1080 | 1091 | ||
1081 | if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) | 1092 | if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) |
1082 | error = -EFAULT; | 1093 | error = -EFAULT; |
@@ -1088,7 +1099,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, | |||
1088 | * nanosleep for monotonic and realtime clocks | 1099 | * nanosleep for monotonic and realtime clocks |
1089 | */ | 1100 | */ |
1090 | static int common_nsleep(const clockid_t which_clock, int flags, | 1101 | static int common_nsleep(const clockid_t which_clock, int flags, |
1091 | struct timespec *tsave, struct timespec __user *rmtp) | 1102 | struct timespec64 *tsave, struct timespec __user *rmtp) |
1092 | { | 1103 | { |
1093 | return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? | 1104 | return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? |
1094 | HRTIMER_MODE_ABS : HRTIMER_MODE_REL, | 1105 | HRTIMER_MODE_ABS : HRTIMER_MODE_REL, |
@@ -1100,6 +1111,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
1100 | struct timespec __user *, rmtp) | 1111 | struct timespec __user *, rmtp) |
1101 | { | 1112 | { |
1102 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1113 | struct k_clock *kc = clockid_to_kclock(which_clock); |
1114 | struct timespec64 t64; | ||
1103 | struct timespec t; | 1115 | struct timespec t; |
1104 | 1116 | ||
1105 | if (!kc) | 1117 | if (!kc) |
@@ -1110,10 +1122,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
1110 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | 1122 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) |
1111 | return -EFAULT; | 1123 | return -EFAULT; |
1112 | 1124 | ||
1113 | if (!timespec_valid(&t)) | 1125 | t64 = timespec_to_timespec64(t); |
1126 | if (!timespec64_valid(&t64)) | ||
1114 | return -EINVAL; | 1127 | return -EINVAL; |
1115 | 1128 | ||
1116 | return kc->nsleep(which_clock, flags, &t, rmtp); | 1129 | return kc->nsleep(which_clock, flags, &t64, rmtp); |
1117 | } | 1130 | } |
1118 | 1131 | ||
1119 | /* | 1132 | /* |
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index ea6b610c4c57..2d8f05aad442 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
@@ -206,6 +206,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) | |||
206 | 206 | ||
207 | update_clock_read_data(&rd); | 207 | update_clock_read_data(&rd); |
208 | 208 | ||
209 | if (sched_clock_timer.function != NULL) { | ||
210 | /* update timeout for clock wrap */ | ||
211 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | ||
212 | } | ||
213 | |||
209 | r = rate; | 214 | r = rate; |
210 | if (r >= 4000000) { | 215 | if (r >= 4000000) { |
211 | r /= 1000000; | 216 | r /= 1000000; |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7fe53be86077..64c97fc130c4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -993,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void) | |||
993 | return ts->sleep_length; | 993 | return ts->sleep_length; |
994 | } | 994 | } |
995 | 995 | ||
996 | /** | ||
997 | * tick_nohz_get_idle_calls - return the current idle calls counter value | ||
998 | * | ||
999 | * Called from the schedutil frequency scaling governor in scheduler context. | ||
1000 | */ | ||
1001 | unsigned long tick_nohz_get_idle_calls(void) | ||
1002 | { | ||
1003 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); | ||
1004 | |||
1005 | return ts->idle_calls; | ||
1006 | } | ||
1007 | |||
996 | static void tick_nohz_account_idle_ticks(struct tick_sched *ts) | 1008 | static void tick_nohz_account_idle_ticks(struct tick_sched *ts) |
997 | { | 1009 | { |
998 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 1010 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
diff --git a/kernel/time/time.c b/kernel/time/time.c index 25bdd2504571..49c73c6ed648 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
@@ -193,8 +193,8 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz | |||
193 | SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, | 193 | SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, |
194 | struct timezone __user *, tz) | 194 | struct timezone __user *, tz) |
195 | { | 195 | { |
196 | struct timespec64 new_ts; | ||
196 | struct timeval user_tv; | 197 | struct timeval user_tv; |
197 | struct timespec new_ts; | ||
198 | struct timezone new_tz; | 198 | struct timezone new_tz; |
199 | 199 | ||
200 | if (tv) { | 200 | if (tv) { |
@@ -212,7 +212,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, | |||
212 | return -EFAULT; | 212 | return -EFAULT; |
213 | } | 213 | } |
214 | 214 | ||
215 | return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); | 215 | return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); |
216 | } | 216 | } |
217 | 217 | ||
218 | SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) | 218 | SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) |
@@ -230,20 +230,6 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) | |||
230 | return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; | 230 | return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; |
231 | } | 231 | } |
232 | 232 | ||
233 | /** | ||
234 | * current_fs_time - Return FS time | ||
235 | * @sb: Superblock. | ||
236 | * | ||
237 | * Return the current time truncated to the time granularity supported by | ||
238 | * the fs. | ||
239 | */ | ||
240 | struct timespec current_fs_time(struct super_block *sb) | ||
241 | { | ||
242 | struct timespec now = current_kernel_time(); | ||
243 | return timespec_trunc(now, sb->s_time_gran); | ||
244 | } | ||
245 | EXPORT_SYMBOL(current_fs_time); | ||
246 | |||
247 | /* | 233 | /* |
248 | * Convert jiffies to milliseconds and back. | 234 | * Convert jiffies to milliseconds and back. |
249 | * | 235 | * |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5b63a2102c29..9652bc57fd09 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -996,8 +996,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history, | |||
996 | return 0; | 996 | return 0; |
997 | 997 | ||
998 | /* Interpolate shortest distance from beginning or end of history */ | 998 | /* Interpolate shortest distance from beginning or end of history */ |
999 | interp_forward = partial_history_cycles > total_history_cycles/2 ? | 999 | interp_forward = partial_history_cycles > total_history_cycles / 2; |
1000 | true : false; | ||
1001 | partial_history_cycles = interp_forward ? | 1000 | partial_history_cycles = interp_forward ? |
1002 | total_history_cycles - partial_history_cycles : | 1001 | total_history_cycles - partial_history_cycles : |
1003 | partial_history_cycles; | 1002 | partial_history_cycles; |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1dc0256bfb6e..152a706ef8b8 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -241,7 +241,7 @@ int timer_migration_handler(struct ctl_table *table, int write, | |||
241 | int ret; | 241 | int ret; |
242 | 242 | ||
243 | mutex_lock(&mutex); | 243 | mutex_lock(&mutex); |
244 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 244 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
245 | if (!ret && write) | 245 | if (!ret && write) |
246 | timers_update_migration(false); | 246 | timers_update_migration(false); |
247 | mutex_unlock(&mutex); | 247 | mutex_unlock(&mutex); |
@@ -1120,7 +1120,7 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
1120 | EXPORT_SYMBOL_GPL(add_timer_on); | 1120 | EXPORT_SYMBOL_GPL(add_timer_on); |
1121 | 1121 | ||
1122 | /** | 1122 | /** |
1123 | * del_timer - deactive a timer. | 1123 | * del_timer - deactivate a timer. |
1124 | * @timer: the timer to be deactivated | 1124 | * @timer: the timer to be deactivated |
1125 | * | 1125 | * |
1126 | * del_timer() deactivates a timer - this works on both active and inactive | 1126 | * del_timer() deactivates a timer - this works on both active and inactive |
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ff8d5c13d04b..0e7f5428a148 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> |
18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> |
19 | #include <linux/nmi.h> | ||
19 | 20 | ||
20 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
21 | 22 | ||
@@ -86,6 +87,9 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, | |||
86 | 87 | ||
87 | next_one: | 88 | next_one: |
88 | i = 0; | 89 | i = 0; |
90 | |||
91 | touch_nmi_watchdog(); | ||
92 | |||
89 | raw_spin_lock_irqsave(&base->cpu_base->lock, flags); | 93 | raw_spin_lock_irqsave(&base->cpu_base->lock, flags); |
90 | 94 | ||
91 | curr = timerqueue_getnext(&base->active); | 95 | curr = timerqueue_getnext(&base->active); |
@@ -197,6 +201,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) | |||
197 | { | 201 | { |
198 | struct clock_event_device *dev = td->evtdev; | 202 | struct clock_event_device *dev = td->evtdev; |
199 | 203 | ||
204 | touch_nmi_watchdog(); | ||
205 | |||
200 | SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); | 206 | SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); |
201 | if (cpu < 0) | 207 | if (cpu < 0) |
202 | SEQ_printf(m, "Broadcast device\n"); | 208 | SEQ_printf(m, "Broadcast device\n"); |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d4a06e714645..7e06f04e98fe 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -134,7 +134,8 @@ config FUNCTION_TRACER | |||
134 | select KALLSYMS | 134 | select KALLSYMS |
135 | select GENERIC_TRACER | 135 | select GENERIC_TRACER |
136 | select CONTEXT_SWITCH_TRACER | 136 | select CONTEXT_SWITCH_TRACER |
137 | select GLOB | 137 | select GLOB |
138 | select TASKS_RCU if PREEMPT | ||
138 | help | 139 | help |
139 | Enable the kernel to trace every kernel function. This is done | 140 | Enable the kernel to trace every kernel function. This is done |
140 | by using a compiler feature to insert a small, 5-byte No-Operation | 141 | by using a compiler feature to insert a small, 5-byte No-Operation |
@@ -455,7 +456,7 @@ config UPROBE_EVENTS | |||
455 | select UPROBES | 456 | select UPROBES |
456 | select PROBE_EVENTS | 457 | select PROBE_EVENTS |
457 | select TRACING | 458 | select TRACING |
458 | default n | 459 | default y |
459 | help | 460 | help |
460 | This allows the user to add tracing events on top of userspace | 461 | This allows the user to add tracing events on top of userspace |
461 | dynamic events (similar to tracepoints) on the fly via the trace | 462 | dynamic events (similar to tracepoints) on the fly via the trace |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b2058a7f94bd..193c5f5e3f79 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q) | |||
690 | 690 | ||
691 | /** | 691 | /** |
692 | * blk_add_trace_rq - Add a trace for a request oriented action | 692 | * blk_add_trace_rq - Add a trace for a request oriented action |
693 | * @q: queue the io is for | ||
694 | * @rq: the source request | 693 | * @rq: the source request |
694 | * @error: return status to log | ||
695 | * @nr_bytes: number of completed bytes | 695 | * @nr_bytes: number of completed bytes |
696 | * @what: the action | 696 | * @what: the action |
697 | * | 697 | * |
@@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q) | |||
699 | * Records an action against a request. Will log the bio offset + size. | 699 | * Records an action against a request. Will log the bio offset + size. |
700 | * | 700 | * |
701 | **/ | 701 | **/ |
702 | static void blk_add_trace_rq(struct request_queue *q, struct request *rq, | 702 | static void blk_add_trace_rq(struct request *rq, int error, |
703 | unsigned int nr_bytes, u32 what) | 703 | unsigned int nr_bytes, u32 what) |
704 | { | 704 | { |
705 | struct blk_trace *bt = q->blk_trace; | 705 | struct blk_trace *bt = rq->q->blk_trace; |
706 | 706 | ||
707 | if (likely(!bt)) | 707 | if (likely(!bt)) |
708 | return; | 708 | return; |
@@ -713,40 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, | |||
713 | what |= BLK_TC_ACT(BLK_TC_FS); | 713 | what |= BLK_TC_ACT(BLK_TC_FS); |
714 | 714 | ||
715 | __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), | 715 | __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), |
716 | rq->cmd_flags, what, rq->errors, 0, NULL); | 716 | rq->cmd_flags, what, error, 0, NULL); |
717 | } | ||
718 | |||
719 | static void blk_add_trace_rq_abort(void *ignore, | ||
720 | struct request_queue *q, struct request *rq) | ||
721 | { | ||
722 | blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT); | ||
723 | } | 717 | } |
724 | 718 | ||
725 | static void blk_add_trace_rq_insert(void *ignore, | 719 | static void blk_add_trace_rq_insert(void *ignore, |
726 | struct request_queue *q, struct request *rq) | 720 | struct request_queue *q, struct request *rq) |
727 | { | 721 | { |
728 | blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); | 722 | blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); |
729 | } | 723 | } |
730 | 724 | ||
731 | static void blk_add_trace_rq_issue(void *ignore, | 725 | static void blk_add_trace_rq_issue(void *ignore, |
732 | struct request_queue *q, struct request *rq) | 726 | struct request_queue *q, struct request *rq) |
733 | { | 727 | { |
734 | blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); | 728 | blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); |
735 | } | 729 | } |
736 | 730 | ||
737 | static void blk_add_trace_rq_requeue(void *ignore, | 731 | static void blk_add_trace_rq_requeue(void *ignore, |
738 | struct request_queue *q, | 732 | struct request_queue *q, |
739 | struct request *rq) | 733 | struct request *rq) |
740 | { | 734 | { |
741 | blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); | 735 | blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); |
742 | } | 736 | } |
743 | 737 | ||
744 | static void blk_add_trace_rq_complete(void *ignore, | 738 | static void blk_add_trace_rq_complete(void *ignore, struct request *rq, |
745 | struct request_queue *q, | 739 | int error, unsigned int nr_bytes) |
746 | struct request *rq, | ||
747 | unsigned int nr_bytes) | ||
748 | { | 740 | { |
749 | blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); | 741 | blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); |
750 | } | 742 | } |
751 | 743 | ||
752 | /** | 744 | /** |
@@ -941,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore, | |||
941 | r.sector_from = cpu_to_be64(from); | 933 | r.sector_from = cpu_to_be64(from); |
942 | 934 | ||
943 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), | 935 | __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), |
944 | rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, | 936 | rq_data_dir(rq), 0, BLK_TA_REMAP, 0, |
945 | sizeof(r), &r); | 937 | sizeof(r), &r); |
946 | } | 938 | } |
947 | 939 | ||
@@ -966,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q, | |||
966 | return; | 958 | return; |
967 | 959 | ||
968 | __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, | 960 | __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, |
969 | BLK_TA_DRV_DATA, rq->errors, len, data); | 961 | BLK_TA_DRV_DATA, 0, len, data); |
970 | } | 962 | } |
971 | EXPORT_SYMBOL_GPL(blk_add_driver_data); | 963 | EXPORT_SYMBOL_GPL(blk_add_driver_data); |
972 | 964 | ||
@@ -974,8 +966,6 @@ static void blk_register_tracepoints(void) | |||
974 | { | 966 | { |
975 | int ret; | 967 | int ret; |
976 | 968 | ||
977 | ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); | ||
978 | WARN_ON(ret); | ||
979 | ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); | 969 | ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); |
980 | WARN_ON(ret); | 970 | WARN_ON(ret); |
981 | ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); | 971 | ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); |
@@ -1028,7 +1018,6 @@ static void blk_unregister_tracepoints(void) | |||
1028 | unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); | 1018 | unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); |
1029 | unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); | 1019 | unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); |
1030 | unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); | 1020 | unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); |
1031 | unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); | ||
1032 | 1021 | ||
1033 | tracepoint_synchronize_unregister(); | 1022 | tracepoint_synchronize_unregister(); |
1034 | } | 1023 | } |
@@ -1673,14 +1662,14 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, | |||
1673 | goto out; | 1662 | goto out; |
1674 | 1663 | ||
1675 | if (attr == &dev_attr_act_mask) { | 1664 | if (attr == &dev_attr_act_mask) { |
1676 | if (sscanf(buf, "%llx", &value) != 1) { | 1665 | if (kstrtoull(buf, 0, &value)) { |
1677 | /* Assume it is a list of trace category names */ | 1666 | /* Assume it is a list of trace category names */ |
1678 | ret = blk_trace_str2mask(buf); | 1667 | ret = blk_trace_str2mask(buf); |
1679 | if (ret < 0) | 1668 | if (ret < 0) |
1680 | goto out; | 1669 | goto out; |
1681 | value = ret; | 1670 | value = ret; |
1682 | } | 1671 | } |
1683 | } else if (sscanf(buf, "%llu", &value) != 1) | 1672 | } else if (kstrtoull(buf, 0, &value)) |
1684 | goto out; | 1673 | goto out; |
1685 | 1674 | ||
1686 | ret = -ENXIO; | 1675 | ret = -ENXIO; |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cee9802cf3e0..460a031c77e5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
@@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, | |||
96 | if (unlikely(in_interrupt() || | 96 | if (unlikely(in_interrupt() || |
97 | current->flags & (PF_KTHREAD | PF_EXITING))) | 97 | current->flags & (PF_KTHREAD | PF_EXITING))) |
98 | return -EPERM; | 98 | return -EPERM; |
99 | if (unlikely(segment_eq(get_fs(), KERNEL_DS))) | 99 | if (unlikely(uaccess_kernel())) |
100 | return -EPERM; | 100 | return -EPERM; |
101 | if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) | 101 | if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) |
102 | return -EPERM; | 102 | return -EPERM; |
@@ -501,16 +501,11 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type | |||
501 | return true; | 501 | return true; |
502 | } | 502 | } |
503 | 503 | ||
504 | static const struct bpf_verifier_ops kprobe_prog_ops = { | 504 | const struct bpf_verifier_ops kprobe_prog_ops = { |
505 | .get_func_proto = kprobe_prog_func_proto, | 505 | .get_func_proto = kprobe_prog_func_proto, |
506 | .is_valid_access = kprobe_prog_is_valid_access, | 506 | .is_valid_access = kprobe_prog_is_valid_access, |
507 | }; | 507 | }; |
508 | 508 | ||
509 | static struct bpf_prog_type_list kprobe_tl __ro_after_init = { | ||
510 | .ops = &kprobe_prog_ops, | ||
511 | .type = BPF_PROG_TYPE_KPROBE, | ||
512 | }; | ||
513 | |||
514 | BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, | 509 | BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, |
515 | u64, flags, void *, data, u64, size) | 510 | u64, flags, void *, data, u64, size) |
516 | { | 511 | { |
@@ -584,16 +579,11 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type | |||
584 | return true; | 579 | return true; |
585 | } | 580 | } |
586 | 581 | ||
587 | static const struct bpf_verifier_ops tracepoint_prog_ops = { | 582 | const struct bpf_verifier_ops tracepoint_prog_ops = { |
588 | .get_func_proto = tp_prog_func_proto, | 583 | .get_func_proto = tp_prog_func_proto, |
589 | .is_valid_access = tp_prog_is_valid_access, | 584 | .is_valid_access = tp_prog_is_valid_access, |
590 | }; | 585 | }; |
591 | 586 | ||
592 | static struct bpf_prog_type_list tracepoint_tl __ro_after_init = { | ||
593 | .ops = &tracepoint_prog_ops, | ||
594 | .type = BPF_PROG_TYPE_TRACEPOINT, | ||
595 | }; | ||
596 | |||
597 | static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, | 587 | static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
598 | enum bpf_reg_type *reg_type) | 588 | enum bpf_reg_type *reg_type) |
599 | { | 589 | { |
@@ -642,22 +632,8 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, | |||
642 | return insn - insn_buf; | 632 | return insn - insn_buf; |
643 | } | 633 | } |
644 | 634 | ||
645 | static const struct bpf_verifier_ops perf_event_prog_ops = { | 635 | const struct bpf_verifier_ops perf_event_prog_ops = { |
646 | .get_func_proto = tp_prog_func_proto, | 636 | .get_func_proto = tp_prog_func_proto, |
647 | .is_valid_access = pe_prog_is_valid_access, | 637 | .is_valid_access = pe_prog_is_valid_access, |
648 | .convert_ctx_access = pe_prog_convert_ctx_access, | 638 | .convert_ctx_access = pe_prog_convert_ctx_access, |
649 | }; | 639 | }; |
650 | |||
651 | static struct bpf_prog_type_list perf_event_tl __ro_after_init = { | ||
652 | .ops = &perf_event_prog_ops, | ||
653 | .type = BPF_PROG_TYPE_PERF_EVENT, | ||
654 | }; | ||
655 | |||
656 | static int __init register_kprobe_prog_ops(void) | ||
657 | { | ||
658 | bpf_register_prog_type(&kprobe_tl); | ||
659 | bpf_register_prog_type(&tracepoint_tl); | ||
660 | bpf_register_prog_type(&perf_event_tl); | ||
661 | return 0; | ||
662 | } | ||
663 | late_initcall(register_kprobe_prog_ops); | ||
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dd3e91d68dc7..74fdfe9ed3db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -36,6 +36,7 @@ | |||
36 | 36 | ||
37 | #include <trace/events/sched.h> | 37 | #include <trace/events/sched.h> |
38 | 38 | ||
39 | #include <asm/sections.h> | ||
39 | #include <asm/setup.h> | 40 | #include <asm/setup.h> |
40 | 41 | ||
41 | #include "trace_output.h" | 42 | #include "trace_output.h" |
@@ -1095,22 +1096,20 @@ static bool update_all_ops; | |||
1095 | # error Dynamic ftrace depends on MCOUNT_RECORD | 1096 | # error Dynamic ftrace depends on MCOUNT_RECORD |
1096 | #endif | 1097 | #endif |
1097 | 1098 | ||
1098 | static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; | ||
1099 | |||
1100 | struct ftrace_func_probe { | ||
1101 | struct hlist_node node; | ||
1102 | struct ftrace_probe_ops *ops; | ||
1103 | unsigned long flags; | ||
1104 | unsigned long ip; | ||
1105 | void *data; | ||
1106 | struct list_head free_list; | ||
1107 | }; | ||
1108 | |||
1109 | struct ftrace_func_entry { | 1099 | struct ftrace_func_entry { |
1110 | struct hlist_node hlist; | 1100 | struct hlist_node hlist; |
1111 | unsigned long ip; | 1101 | unsigned long ip; |
1112 | }; | 1102 | }; |
1113 | 1103 | ||
1104 | struct ftrace_func_probe { | ||
1105 | struct ftrace_probe_ops *probe_ops; | ||
1106 | struct ftrace_ops ops; | ||
1107 | struct trace_array *tr; | ||
1108 | struct list_head list; | ||
1109 | void *data; | ||
1110 | int ref; | ||
1111 | }; | ||
1112 | |||
1114 | /* | 1113 | /* |
1115 | * We make these constant because no one should touch them, | 1114 | * We make these constant because no one should touch them, |
1116 | * but they are used as the default "empty hash", to avoid allocating | 1115 | * but they are used as the default "empty hash", to avoid allocating |
@@ -1271,7 +1270,7 @@ static void | |||
1271 | remove_hash_entry(struct ftrace_hash *hash, | 1270 | remove_hash_entry(struct ftrace_hash *hash, |
1272 | struct ftrace_func_entry *entry) | 1271 | struct ftrace_func_entry *entry) |
1273 | { | 1272 | { |
1274 | hlist_del(&entry->hlist); | 1273 | hlist_del_rcu(&entry->hlist); |
1275 | hash->count--; | 1274 | hash->count--; |
1276 | } | 1275 | } |
1277 | 1276 | ||
@@ -2807,18 +2806,28 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
2807 | * callers are done before leaving this function. | 2806 | * callers are done before leaving this function. |
2808 | * The same goes for freeing the per_cpu data of the per_cpu | 2807 | * The same goes for freeing the per_cpu data of the per_cpu |
2809 | * ops. | 2808 | * ops. |
2810 | * | ||
2811 | * Again, normal synchronize_sched() is not good enough. | ||
2812 | * We need to do a hard force of sched synchronization. | ||
2813 | * This is because we use preempt_disable() to do RCU, but | ||
2814 | * the function tracers can be called where RCU is not watching | ||
2815 | * (like before user_exit()). We can not rely on the RCU | ||
2816 | * infrastructure to do the synchronization, thus we must do it | ||
2817 | * ourselves. | ||
2818 | */ | 2809 | */ |
2819 | if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { | 2810 | if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { |
2811 | /* | ||
2812 | * We need to do a hard force of sched synchronization. | ||
2813 | * This is because we use preempt_disable() to do RCU, but | ||
2814 | * the function tracers can be called where RCU is not watching | ||
2815 | * (like before user_exit()). We can not rely on the RCU | ||
2816 | * infrastructure to do the synchronization, thus we must do it | ||
2817 | * ourselves. | ||
2818 | */ | ||
2820 | schedule_on_each_cpu(ftrace_sync); | 2819 | schedule_on_each_cpu(ftrace_sync); |
2821 | 2820 | ||
2821 | /* | ||
2822 | * When the kernel is preemptive, tasks can be preempted | ||
2823 | * while on a ftrace trampoline. Just scheduling a task on | ||
2824 | * a CPU is not good enough to flush them. Calling | ||
2825 | * synchronize_rcu_tasks() will wait for those tasks to | ||
2826 | * execute and either schedule voluntarily or enter user space. | ||
2827 | */ | ||
2828 | if (IS_ENABLED(CONFIG_PREEMPT)) | ||
2829 | synchronize_rcu_tasks(); | ||
2830 | |||
2822 | arch_ftrace_trampoline_free(ops); | 2831 | arch_ftrace_trampoline_free(ops); |
2823 | 2832 | ||
2824 | if (ops->flags & FTRACE_OPS_FL_PER_CPU) | 2833 | if (ops->flags & FTRACE_OPS_FL_PER_CPU) |
@@ -3055,34 +3064,63 @@ struct ftrace_iterator { | |||
3055 | struct ftrace_page *pg; | 3064 | struct ftrace_page *pg; |
3056 | struct dyn_ftrace *func; | 3065 | struct dyn_ftrace *func; |
3057 | struct ftrace_func_probe *probe; | 3066 | struct ftrace_func_probe *probe; |
3067 | struct ftrace_func_entry *probe_entry; | ||
3058 | struct trace_parser parser; | 3068 | struct trace_parser parser; |
3059 | struct ftrace_hash *hash; | 3069 | struct ftrace_hash *hash; |
3060 | struct ftrace_ops *ops; | 3070 | struct ftrace_ops *ops; |
3061 | int hidx; | 3071 | int pidx; |
3062 | int idx; | 3072 | int idx; |
3063 | unsigned flags; | 3073 | unsigned flags; |
3064 | }; | 3074 | }; |
3065 | 3075 | ||
3066 | static void * | 3076 | static void * |
3067 | t_hash_next(struct seq_file *m, loff_t *pos) | 3077 | t_probe_next(struct seq_file *m, loff_t *pos) |
3068 | { | 3078 | { |
3069 | struct ftrace_iterator *iter = m->private; | 3079 | struct ftrace_iterator *iter = m->private; |
3080 | struct trace_array *tr = iter->ops->private; | ||
3081 | struct list_head *func_probes; | ||
3082 | struct ftrace_hash *hash; | ||
3083 | struct list_head *next; | ||
3070 | struct hlist_node *hnd = NULL; | 3084 | struct hlist_node *hnd = NULL; |
3071 | struct hlist_head *hhd; | 3085 | struct hlist_head *hhd; |
3086 | int size; | ||
3072 | 3087 | ||
3073 | (*pos)++; | 3088 | (*pos)++; |
3074 | iter->pos = *pos; | 3089 | iter->pos = *pos; |
3075 | 3090 | ||
3076 | if (iter->probe) | 3091 | if (!tr) |
3077 | hnd = &iter->probe->node; | ||
3078 | retry: | ||
3079 | if (iter->hidx >= FTRACE_FUNC_HASHSIZE) | ||
3080 | return NULL; | 3092 | return NULL; |
3081 | 3093 | ||
3082 | hhd = &ftrace_func_hash[iter->hidx]; | 3094 | func_probes = &tr->func_probes; |
3095 | if (list_empty(func_probes)) | ||
3096 | return NULL; | ||
3097 | |||
3098 | if (!iter->probe) { | ||
3099 | next = func_probes->next; | ||
3100 | iter->probe = list_entry(next, struct ftrace_func_probe, list); | ||
3101 | } | ||
3102 | |||
3103 | if (iter->probe_entry) | ||
3104 | hnd = &iter->probe_entry->hlist; | ||
3105 | |||
3106 | hash = iter->probe->ops.func_hash->filter_hash; | ||
3107 | size = 1 << hash->size_bits; | ||
3108 | |||
3109 | retry: | ||
3110 | if (iter->pidx >= size) { | ||
3111 | if (iter->probe->list.next == func_probes) | ||
3112 | return NULL; | ||
3113 | next = iter->probe->list.next; | ||
3114 | iter->probe = list_entry(next, struct ftrace_func_probe, list); | ||
3115 | hash = iter->probe->ops.func_hash->filter_hash; | ||
3116 | size = 1 << hash->size_bits; | ||
3117 | iter->pidx = 0; | ||
3118 | } | ||
3119 | |||
3120 | hhd = &hash->buckets[iter->pidx]; | ||
3083 | 3121 | ||
3084 | if (hlist_empty(hhd)) { | 3122 | if (hlist_empty(hhd)) { |
3085 | iter->hidx++; | 3123 | iter->pidx++; |
3086 | hnd = NULL; | 3124 | hnd = NULL; |
3087 | goto retry; | 3125 | goto retry; |
3088 | } | 3126 | } |
@@ -3092,7 +3130,7 @@ t_hash_next(struct seq_file *m, loff_t *pos) | |||
3092 | else { | 3130 | else { |
3093 | hnd = hnd->next; | 3131 | hnd = hnd->next; |
3094 | if (!hnd) { | 3132 | if (!hnd) { |
3095 | iter->hidx++; | 3133 | iter->pidx++; |
3096 | goto retry; | 3134 | goto retry; |
3097 | } | 3135 | } |
3098 | } | 3136 | } |
@@ -3100,26 +3138,28 @@ t_hash_next(struct seq_file *m, loff_t *pos) | |||
3100 | if (WARN_ON_ONCE(!hnd)) | 3138 | if (WARN_ON_ONCE(!hnd)) |
3101 | return NULL; | 3139 | return NULL; |
3102 | 3140 | ||
3103 | iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); | 3141 | iter->probe_entry = hlist_entry(hnd, struct ftrace_func_entry, hlist); |
3104 | 3142 | ||
3105 | return iter; | 3143 | return iter; |
3106 | } | 3144 | } |
3107 | 3145 | ||
3108 | static void *t_hash_start(struct seq_file *m, loff_t *pos) | 3146 | static void *t_probe_start(struct seq_file *m, loff_t *pos) |
3109 | { | 3147 | { |
3110 | struct ftrace_iterator *iter = m->private; | 3148 | struct ftrace_iterator *iter = m->private; |
3111 | void *p = NULL; | 3149 | void *p = NULL; |
3112 | loff_t l; | 3150 | loff_t l; |
3113 | 3151 | ||
3114 | if (!(iter->flags & FTRACE_ITER_DO_HASH)) | 3152 | if (!(iter->flags & FTRACE_ITER_DO_PROBES)) |
3115 | return NULL; | 3153 | return NULL; |
3116 | 3154 | ||
3117 | if (iter->func_pos > *pos) | 3155 | if (iter->func_pos > *pos) |
3118 | return NULL; | 3156 | return NULL; |
3119 | 3157 | ||
3120 | iter->hidx = 0; | 3158 | iter->probe = NULL; |
3159 | iter->probe_entry = NULL; | ||
3160 | iter->pidx = 0; | ||
3121 | for (l = 0; l <= (*pos - iter->func_pos); ) { | 3161 | for (l = 0; l <= (*pos - iter->func_pos); ) { |
3122 | p = t_hash_next(m, &l); | 3162 | p = t_probe_next(m, &l); |
3123 | if (!p) | 3163 | if (!p) |
3124 | break; | 3164 | break; |
3125 | } | 3165 | } |
@@ -3127,50 +3167,42 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) | |||
3127 | return NULL; | 3167 | return NULL; |
3128 | 3168 | ||
3129 | /* Only set this if we have an item */ | 3169 | /* Only set this if we have an item */ |
3130 | iter->flags |= FTRACE_ITER_HASH; | 3170 | iter->flags |= FTRACE_ITER_PROBE; |
3131 | 3171 | ||
3132 | return iter; | 3172 | return iter; |
3133 | } | 3173 | } |
3134 | 3174 | ||
3135 | static int | 3175 | static int |
3136 | t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) | 3176 | t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) |
3137 | { | 3177 | { |
3138 | struct ftrace_func_probe *rec; | 3178 | struct ftrace_func_entry *probe_entry; |
3179 | struct ftrace_probe_ops *probe_ops; | ||
3180 | struct ftrace_func_probe *probe; | ||
3181 | |||
3182 | probe = iter->probe; | ||
3183 | probe_entry = iter->probe_entry; | ||
3139 | 3184 | ||
3140 | rec = iter->probe; | 3185 | if (WARN_ON_ONCE(!probe || !probe_entry)) |
3141 | if (WARN_ON_ONCE(!rec)) | ||
3142 | return -EIO; | 3186 | return -EIO; |
3143 | 3187 | ||
3144 | if (rec->ops->print) | 3188 | probe_ops = probe->probe_ops; |
3145 | return rec->ops->print(m, rec->ip, rec->ops, rec->data); | ||
3146 | 3189 | ||
3147 | seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); | 3190 | if (probe_ops->print) |
3191 | return probe_ops->print(m, probe_entry->ip, probe_ops, probe->data); | ||
3148 | 3192 | ||
3149 | if (rec->data) | 3193 | seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip, |
3150 | seq_printf(m, ":%p", rec->data); | 3194 | (void *)probe_ops->func); |
3151 | seq_putc(m, '\n'); | ||
3152 | 3195 | ||
3153 | return 0; | 3196 | return 0; |
3154 | } | 3197 | } |
3155 | 3198 | ||
3156 | static void * | 3199 | static void * |
3157 | t_next(struct seq_file *m, void *v, loff_t *pos) | 3200 | t_func_next(struct seq_file *m, loff_t *pos) |
3158 | { | 3201 | { |
3159 | struct ftrace_iterator *iter = m->private; | 3202 | struct ftrace_iterator *iter = m->private; |
3160 | struct ftrace_ops *ops = iter->ops; | ||
3161 | struct dyn_ftrace *rec = NULL; | 3203 | struct dyn_ftrace *rec = NULL; |
3162 | 3204 | ||
3163 | if (unlikely(ftrace_disabled)) | ||
3164 | return NULL; | ||
3165 | |||
3166 | if (iter->flags & FTRACE_ITER_HASH) | ||
3167 | return t_hash_next(m, pos); | ||
3168 | |||
3169 | (*pos)++; | 3205 | (*pos)++; |
3170 | iter->pos = iter->func_pos = *pos; | ||
3171 | |||
3172 | if (iter->flags & FTRACE_ITER_PRINTALL) | ||
3173 | return t_hash_start(m, pos); | ||
3174 | 3206 | ||
3175 | retry: | 3207 | retry: |
3176 | if (iter->idx >= iter->pg->index) { | 3208 | if (iter->idx >= iter->pg->index) { |
@@ -3181,11 +3213,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
3181 | } | 3213 | } |
3182 | } else { | 3214 | } else { |
3183 | rec = &iter->pg->records[iter->idx++]; | 3215 | rec = &iter->pg->records[iter->idx++]; |
3184 | if (((iter->flags & FTRACE_ITER_FILTER) && | 3216 | if (((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) && |
3185 | !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || | 3217 | !ftrace_lookup_ip(iter->hash, rec->ip)) || |
3186 | |||
3187 | ((iter->flags & FTRACE_ITER_NOTRACE) && | ||
3188 | !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) || | ||
3189 | 3218 | ||
3190 | ((iter->flags & FTRACE_ITER_ENABLED) && | 3219 | ((iter->flags & FTRACE_ITER_ENABLED) && |
3191 | !(rec->flags & FTRACE_FL_ENABLED))) { | 3220 | !(rec->flags & FTRACE_FL_ENABLED))) { |
@@ -3196,24 +3225,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
3196 | } | 3225 | } |
3197 | 3226 | ||
3198 | if (!rec) | 3227 | if (!rec) |
3199 | return t_hash_start(m, pos); | 3228 | return NULL; |
3200 | 3229 | ||
3230 | iter->pos = iter->func_pos = *pos; | ||
3201 | iter->func = rec; | 3231 | iter->func = rec; |
3202 | 3232 | ||
3203 | return iter; | 3233 | return iter; |
3204 | } | 3234 | } |
3205 | 3235 | ||
3236 | static void * | ||
3237 | t_next(struct seq_file *m, void *v, loff_t *pos) | ||
3238 | { | ||
3239 | struct ftrace_iterator *iter = m->private; | ||
3240 | loff_t l = *pos; /* t_probe_start() must use original pos */ | ||
3241 | void *ret; | ||
3242 | |||
3243 | if (unlikely(ftrace_disabled)) | ||
3244 | return NULL; | ||
3245 | |||
3246 | if (iter->flags & FTRACE_ITER_PROBE) | ||
3247 | return t_probe_next(m, pos); | ||
3248 | |||
3249 | if (iter->flags & FTRACE_ITER_PRINTALL) { | ||
3250 | /* next must increment pos, and t_probe_start does not */ | ||
3251 | (*pos)++; | ||
3252 | return t_probe_start(m, &l); | ||
3253 | } | ||
3254 | |||
3255 | ret = t_func_next(m, pos); | ||
3256 | |||
3257 | if (!ret) | ||
3258 | return t_probe_start(m, &l); | ||
3259 | |||
3260 | return ret; | ||
3261 | } | ||
3262 | |||
3206 | static void reset_iter_read(struct ftrace_iterator *iter) | 3263 | static void reset_iter_read(struct ftrace_iterator *iter) |
3207 | { | 3264 | { |
3208 | iter->pos = 0; | 3265 | iter->pos = 0; |
3209 | iter->func_pos = 0; | 3266 | iter->func_pos = 0; |
3210 | iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); | 3267 | iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); |
3211 | } | 3268 | } |
3212 | 3269 | ||
3213 | static void *t_start(struct seq_file *m, loff_t *pos) | 3270 | static void *t_start(struct seq_file *m, loff_t *pos) |
3214 | { | 3271 | { |
3215 | struct ftrace_iterator *iter = m->private; | 3272 | struct ftrace_iterator *iter = m->private; |
3216 | struct ftrace_ops *ops = iter->ops; | ||
3217 | void *p = NULL; | 3273 | void *p = NULL; |
3218 | loff_t l; | 3274 | loff_t l; |
3219 | 3275 | ||
@@ -3233,20 +3289,19 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
3233 | * off, we can short cut and just print out that all | 3289 | * off, we can short cut and just print out that all |
3234 | * functions are enabled. | 3290 | * functions are enabled. |
3235 | */ | 3291 | */ |
3236 | if ((iter->flags & FTRACE_ITER_FILTER && | 3292 | if ((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) && |
3237 | ftrace_hash_empty(ops->func_hash->filter_hash)) || | 3293 | ftrace_hash_empty(iter->hash)) { |
3238 | (iter->flags & FTRACE_ITER_NOTRACE && | 3294 | iter->func_pos = 1; /* Account for the message */ |
3239 | ftrace_hash_empty(ops->func_hash->notrace_hash))) { | ||
3240 | if (*pos > 0) | 3295 | if (*pos > 0) |
3241 | return t_hash_start(m, pos); | 3296 | return t_probe_start(m, pos); |
3242 | iter->flags |= FTRACE_ITER_PRINTALL; | 3297 | iter->flags |= FTRACE_ITER_PRINTALL; |
3243 | /* reset in case of seek/pread */ | 3298 | /* reset in case of seek/pread */ |
3244 | iter->flags &= ~FTRACE_ITER_HASH; | 3299 | iter->flags &= ~FTRACE_ITER_PROBE; |
3245 | return iter; | 3300 | return iter; |
3246 | } | 3301 | } |
3247 | 3302 | ||
3248 | if (iter->flags & FTRACE_ITER_HASH) | 3303 | if (iter->flags & FTRACE_ITER_PROBE) |
3249 | return t_hash_start(m, pos); | 3304 | return t_probe_start(m, pos); |
3250 | 3305 | ||
3251 | /* | 3306 | /* |
3252 | * Unfortunately, we need to restart at ftrace_pages_start | 3307 | * Unfortunately, we need to restart at ftrace_pages_start |
@@ -3256,13 +3311,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
3256 | iter->pg = ftrace_pages_start; | 3311 | iter->pg = ftrace_pages_start; |
3257 | iter->idx = 0; | 3312 | iter->idx = 0; |
3258 | for (l = 0; l <= *pos; ) { | 3313 | for (l = 0; l <= *pos; ) { |
3259 | p = t_next(m, p, &l); | 3314 | p = t_func_next(m, &l); |
3260 | if (!p) | 3315 | if (!p) |
3261 | break; | 3316 | break; |
3262 | } | 3317 | } |
3263 | 3318 | ||
3264 | if (!p) | 3319 | if (!p) |
3265 | return t_hash_start(m, pos); | 3320 | return t_probe_start(m, pos); |
3266 | 3321 | ||
3267 | return iter; | 3322 | return iter; |
3268 | } | 3323 | } |
@@ -3293,8 +3348,8 @@ static int t_show(struct seq_file *m, void *v) | |||
3293 | struct ftrace_iterator *iter = m->private; | 3348 | struct ftrace_iterator *iter = m->private; |
3294 | struct dyn_ftrace *rec; | 3349 | struct dyn_ftrace *rec; |
3295 | 3350 | ||
3296 | if (iter->flags & FTRACE_ITER_HASH) | 3351 | if (iter->flags & FTRACE_ITER_PROBE) |
3297 | return t_hash_show(m, iter); | 3352 | return t_probe_show(m, iter); |
3298 | 3353 | ||
3299 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 3354 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
3300 | if (iter->flags & FTRACE_ITER_NOTRACE) | 3355 | if (iter->flags & FTRACE_ITER_NOTRACE) |
@@ -3355,12 +3410,13 @@ ftrace_avail_open(struct inode *inode, struct file *file) | |||
3355 | return -ENODEV; | 3410 | return -ENODEV; |
3356 | 3411 | ||
3357 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); | 3412 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
3358 | if (iter) { | 3413 | if (!iter) |
3359 | iter->pg = ftrace_pages_start; | 3414 | return -ENOMEM; |
3360 | iter->ops = &global_ops; | 3415 | |
3361 | } | 3416 | iter->pg = ftrace_pages_start; |
3417 | iter->ops = &global_ops; | ||
3362 | 3418 | ||
3363 | return iter ? 0 : -ENOMEM; | 3419 | return 0; |
3364 | } | 3420 | } |
3365 | 3421 | ||
3366 | static int | 3422 | static int |
@@ -3369,13 +3425,14 @@ ftrace_enabled_open(struct inode *inode, struct file *file) | |||
3369 | struct ftrace_iterator *iter; | 3425 | struct ftrace_iterator *iter; |
3370 | 3426 | ||
3371 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); | 3427 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
3372 | if (iter) { | 3428 | if (!iter) |
3373 | iter->pg = ftrace_pages_start; | 3429 | return -ENOMEM; |
3374 | iter->flags = FTRACE_ITER_ENABLED; | 3430 | |
3375 | iter->ops = &global_ops; | 3431 | iter->pg = ftrace_pages_start; |
3376 | } | 3432 | iter->flags = FTRACE_ITER_ENABLED; |
3433 | iter->ops = &global_ops; | ||
3377 | 3434 | ||
3378 | return iter ? 0 : -ENOMEM; | 3435 | return 0; |
3379 | } | 3436 | } |
3380 | 3437 | ||
3381 | /** | 3438 | /** |
@@ -3440,7 +3497,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
3440 | ret = -ENOMEM; | 3497 | ret = -ENOMEM; |
3441 | goto out_unlock; | 3498 | goto out_unlock; |
3442 | } | 3499 | } |
3443 | } | 3500 | } else |
3501 | iter->hash = hash; | ||
3444 | 3502 | ||
3445 | if (file->f_mode & FMODE_READ) { | 3503 | if (file->f_mode & FMODE_READ) { |
3446 | iter->pg = ftrace_pages_start; | 3504 | iter->pg = ftrace_pages_start; |
@@ -3470,7 +3528,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) | |||
3470 | struct ftrace_ops *ops = inode->i_private; | 3528 | struct ftrace_ops *ops = inode->i_private; |
3471 | 3529 | ||
3472 | return ftrace_regex_open(ops, | 3530 | return ftrace_regex_open(ops, |
3473 | FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, | 3531 | FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES, |
3474 | inode, file); | 3532 | inode, file); |
3475 | } | 3533 | } |
3476 | 3534 | ||
@@ -3573,22 +3631,20 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, | |||
3573 | /* blank module name to match all modules */ | 3631 | /* blank module name to match all modules */ |
3574 | if (!mod_g->len) { | 3632 | if (!mod_g->len) { |
3575 | /* blank module globbing: modname xor exclude_mod */ | 3633 | /* blank module globbing: modname xor exclude_mod */ |
3576 | if ((!exclude_mod) != (!modname)) | 3634 | if (!exclude_mod != !modname) |
3577 | goto func_match; | 3635 | goto func_match; |
3578 | return 0; | 3636 | return 0; |
3579 | } | 3637 | } |
3580 | 3638 | ||
3581 | /* not matching the module */ | 3639 | /* |
3582 | if (!modname || !mod_matches) { | 3640 | * exclude_mod is set to trace everything but the given |
3583 | if (exclude_mod) | 3641 | * module. If it is set and the module matches, then |
3584 | goto func_match; | 3642 | * return 0. If it is not set, and the module doesn't match |
3585 | else | 3643 | * also return 0. Otherwise, check the function to see if |
3586 | return 0; | 3644 | * that matches. |
3587 | } | 3645 | */ |
3588 | 3646 | if (!mod_matches == !exclude_mod) | |
3589 | if (mod_matches && exclude_mod) | ||
3590 | return 0; | 3647 | return 0; |
3591 | |||
3592 | func_match: | 3648 | func_match: |
3593 | /* blank search means to match all funcs in the mod */ | 3649 | /* blank search means to match all funcs in the mod */ |
3594 | if (!func_g->len) | 3650 | if (!func_g->len) |
@@ -3654,6 +3710,56 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) | |||
3654 | return match_records(hash, buff, len, NULL); | 3710 | return match_records(hash, buff, len, NULL); |
3655 | } | 3711 | } |
3656 | 3712 | ||
3713 | static void ftrace_ops_update_code(struct ftrace_ops *ops, | ||
3714 | struct ftrace_ops_hash *old_hash) | ||
3715 | { | ||
3716 | struct ftrace_ops *op; | ||
3717 | |||
3718 | if (!ftrace_enabled) | ||
3719 | return; | ||
3720 | |||
3721 | if (ops->flags & FTRACE_OPS_FL_ENABLED) { | ||
3722 | ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); | ||
3723 | return; | ||
3724 | } | ||
3725 | |||
3726 | /* | ||
3727 | * If this is the shared global_ops filter, then we need to | ||
3728 | * check if there is another ops that shares it and is enabled. | ||
3729 | * If so, we still need to run the modify code. | ||
3730 | */ | ||
3731 | if (ops->func_hash != &global_ops.local_hash) | ||
3732 | return; | ||
3733 | |||
3734 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
3735 | if (op->func_hash == &global_ops.local_hash && | ||
3736 | op->flags & FTRACE_OPS_FL_ENABLED) { | ||
3737 | ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); | ||
3738 | /* Only need to do this once */ | ||
3739 | return; | ||
3740 | } | ||
3741 | } while_for_each_ftrace_op(op); | ||
3742 | } | ||
3743 | |||
3744 | static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, | ||
3745 | struct ftrace_hash **orig_hash, | ||
3746 | struct ftrace_hash *hash, | ||
3747 | int enable) | ||
3748 | { | ||
3749 | struct ftrace_ops_hash old_hash_ops; | ||
3750 | struct ftrace_hash *old_hash; | ||
3751 | int ret; | ||
3752 | |||
3753 | old_hash = *orig_hash; | ||
3754 | old_hash_ops.filter_hash = ops->func_hash->filter_hash; | ||
3755 | old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; | ||
3756 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | ||
3757 | if (!ret) { | ||
3758 | ftrace_ops_update_code(ops, &old_hash_ops); | ||
3759 | free_ftrace_hash_rcu(old_hash); | ||
3760 | } | ||
3761 | return ret; | ||
3762 | } | ||
3657 | 3763 | ||
3658 | /* | 3764 | /* |
3659 | * We register the module command as a template to show others how | 3765 | * We register the module command as a template to show others how |
@@ -3661,7 +3767,7 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) | |||
3661 | */ | 3767 | */ |
3662 | 3768 | ||
3663 | static int | 3769 | static int |
3664 | ftrace_mod_callback(struct ftrace_hash *hash, | 3770 | ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, |
3665 | char *func, char *cmd, char *module, int enable) | 3771 | char *func, char *cmd, char *module, int enable) |
3666 | { | 3772 | { |
3667 | int ret; | 3773 | int ret; |
@@ -3695,16 +3801,11 @@ core_initcall(ftrace_mod_cmd_init); | |||
3695 | static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, | 3801 | static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, |
3696 | struct ftrace_ops *op, struct pt_regs *pt_regs) | 3802 | struct ftrace_ops *op, struct pt_regs *pt_regs) |
3697 | { | 3803 | { |
3698 | struct ftrace_func_probe *entry; | 3804 | struct ftrace_probe_ops *probe_ops; |
3699 | struct hlist_head *hhd; | 3805 | struct ftrace_func_probe *probe; |
3700 | unsigned long key; | ||
3701 | 3806 | ||
3702 | key = hash_long(ip, FTRACE_HASH_BITS); | 3807 | probe = container_of(op, struct ftrace_func_probe, ops); |
3703 | 3808 | probe_ops = probe->probe_ops; | |
3704 | hhd = &ftrace_func_hash[key]; | ||
3705 | |||
3706 | if (hlist_empty(hhd)) | ||
3707 | return; | ||
3708 | 3809 | ||
3709 | /* | 3810 | /* |
3710 | * Disable preemption for these calls to prevent a RCU grace | 3811 | * Disable preemption for these calls to prevent a RCU grace |
@@ -3712,213 +3813,340 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, | |||
3712 | * on the hash. rcu_read_lock is too dangerous here. | 3813 | * on the hash. rcu_read_lock is too dangerous here. |
3713 | */ | 3814 | */ |
3714 | preempt_disable_notrace(); | 3815 | preempt_disable_notrace(); |
3715 | hlist_for_each_entry_rcu_notrace(entry, hhd, node) { | 3816 | probe_ops->func(ip, parent_ip, probe->tr, probe_ops, probe->data); |
3716 | if (entry->ip == ip) | ||
3717 | entry->ops->func(ip, parent_ip, &entry->data); | ||
3718 | } | ||
3719 | preempt_enable_notrace(); | 3817 | preempt_enable_notrace(); |
3720 | } | 3818 | } |
3721 | 3819 | ||
3722 | static struct ftrace_ops trace_probe_ops __read_mostly = | 3820 | struct ftrace_func_map { |
3723 | { | 3821 | struct ftrace_func_entry entry; |
3724 | .func = function_trace_probe_call, | 3822 | void *data; |
3725 | .flags = FTRACE_OPS_FL_INITIALIZED, | ||
3726 | INIT_OPS_HASH(trace_probe_ops) | ||
3727 | }; | 3823 | }; |
3728 | 3824 | ||
3729 | static int ftrace_probe_registered; | 3825 | struct ftrace_func_mapper { |
3826 | struct ftrace_hash hash; | ||
3827 | }; | ||
3730 | 3828 | ||
3731 | static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) | 3829 | /** |
3830 | * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper | ||
3831 | * | ||
3832 | * Returns a ftrace_func_mapper descriptor that can be used to map ips to data. | ||
3833 | */ | ||
3834 | struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) | ||
3732 | { | 3835 | { |
3733 | int ret; | 3836 | struct ftrace_hash *hash; |
3734 | int i; | ||
3735 | 3837 | ||
3736 | if (ftrace_probe_registered) { | 3838 | /* |
3737 | /* still need to update the function call sites */ | 3839 | * The mapper is simply a ftrace_hash, but since the entries |
3738 | if (ftrace_enabled) | 3840 | * in the hash are not ftrace_func_entry type, we define it |
3739 | ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, | 3841 | * as a separate structure. |
3740 | old_hash); | 3842 | */ |
3741 | return; | 3843 | hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); |
3742 | } | 3844 | return (struct ftrace_func_mapper *)hash; |
3845 | } | ||
3743 | 3846 | ||
3744 | for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { | 3847 | /** |
3745 | struct hlist_head *hhd = &ftrace_func_hash[i]; | 3848 | * ftrace_func_mapper_find_ip - Find some data mapped to an ip |
3746 | if (hhd->first) | 3849 | * @mapper: The mapper that has the ip maps |
3747 | break; | 3850 | * @ip: the instruction pointer to find the data for |
3748 | } | 3851 | * |
3749 | /* Nothing registered? */ | 3852 | * Returns the data mapped to @ip if found, otherwise NULL. The return |
3750 | if (i == FTRACE_FUNC_HASHSIZE) | 3853 | * is actually the address of the mapper data pointer. The address is |
3751 | return; | 3854 | * returned for use cases where the data is no bigger than a long, and |
3855 | * the user can use the data pointer as its data instead of having to | ||
3856 | * allocate more memory for the reference. | ||
3857 | */ | ||
3858 | void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, | ||
3859 | unsigned long ip) | ||
3860 | { | ||
3861 | struct ftrace_func_entry *entry; | ||
3862 | struct ftrace_func_map *map; | ||
3752 | 3863 | ||
3753 | ret = ftrace_startup(&trace_probe_ops, 0); | 3864 | entry = ftrace_lookup_ip(&mapper->hash, ip); |
3865 | if (!entry) | ||
3866 | return NULL; | ||
3754 | 3867 | ||
3755 | ftrace_probe_registered = 1; | 3868 | map = (struct ftrace_func_map *)entry; |
3869 | return &map->data; | ||
3756 | } | 3870 | } |
3757 | 3871 | ||
3758 | static bool __disable_ftrace_function_probe(void) | 3872 | /** |
3873 | * ftrace_func_mapper_add_ip - Map some data to an ip | ||
3874 | * @mapper: The mapper that has the ip maps | ||
3875 | * @ip: The instruction pointer address to map @data to | ||
3876 | * @data: The data to map to @ip | ||
3877 | * | ||
3878 | * Returns 0 on success, otherwise an error. | ||
3879 | */ | ||
3880 | int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, | ||
3881 | unsigned long ip, void *data) | ||
3759 | { | 3882 | { |
3760 | int i; | 3883 | struct ftrace_func_entry *entry; |
3884 | struct ftrace_func_map *map; | ||
3761 | 3885 | ||
3762 | if (!ftrace_probe_registered) | 3886 | entry = ftrace_lookup_ip(&mapper->hash, ip); |
3763 | return false; | 3887 | if (entry) |
3888 | return -EBUSY; | ||
3764 | 3889 | ||
3765 | for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { | 3890 | map = kmalloc(sizeof(*map), GFP_KERNEL); |
3766 | struct hlist_head *hhd = &ftrace_func_hash[i]; | 3891 | if (!map) |
3767 | if (hhd->first) | 3892 | return -ENOMEM; |
3768 | return false; | ||
3769 | } | ||
3770 | 3893 | ||
3771 | /* no more funcs left */ | 3894 | map->entry.ip = ip; |
3772 | ftrace_shutdown(&trace_probe_ops, 0); | 3895 | map->data = data; |
3773 | 3896 | ||
3774 | ftrace_probe_registered = 0; | 3897 | __add_hash_entry(&mapper->hash, &map->entry); |
3775 | return true; | ||
3776 | } | ||
3777 | 3898 | ||
3899 | return 0; | ||
3900 | } | ||
3778 | 3901 | ||
3779 | static void ftrace_free_entry(struct ftrace_func_probe *entry) | 3902 | /** |
3903 | * ftrace_func_mapper_remove_ip - Remove an ip from the mapping | ||
3904 | * @mapper: The mapper that has the ip maps | ||
3905 | * @ip: The instruction pointer address to remove the data from | ||
3906 | * | ||
3907 | * Returns the data if it is found, otherwise NULL. | ||
3908 | * Note, if the data pointer is used as the data itself (see | ||
3909 | * ftrace_func_mapper_find_ip()), then the return value may be meaningless | ||
3910 | * if the data pointer was set to zero. | ||
3911 | */ | ||
3912 | void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, | ||
3913 | unsigned long ip) | ||
3780 | { | 3914 | { |
3781 | if (entry->ops->free) | 3915 | struct ftrace_func_entry *entry; |
3782 | entry->ops->free(entry->ops, entry->ip, &entry->data); | 3916 | struct ftrace_func_map *map; |
3917 | void *data; | ||
3918 | |||
3919 | entry = ftrace_lookup_ip(&mapper->hash, ip); | ||
3920 | if (!entry) | ||
3921 | return NULL; | ||
3922 | |||
3923 | map = (struct ftrace_func_map *)entry; | ||
3924 | data = map->data; | ||
3925 | |||
3926 | remove_hash_entry(&mapper->hash, entry); | ||
3783 | kfree(entry); | 3927 | kfree(entry); |
3928 | |||
3929 | return data; | ||
3930 | } | ||
3931 | |||
3932 | /** | ||
3933 | * free_ftrace_func_mapper - free a mapping of ips and data | ||
3934 | * @mapper: The mapper that has the ip maps | ||
3935 | * @free_func: A function to be called on each data item. | ||
3936 | * | ||
3937 | * This is used to free the function mapper. The @free_func is optional | ||
3938 | * and can be used if the data needs to be freed as well. | ||
3939 | */ | ||
3940 | void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, | ||
3941 | ftrace_mapper_func free_func) | ||
3942 | { | ||
3943 | struct ftrace_func_entry *entry; | ||
3944 | struct ftrace_func_map *map; | ||
3945 | struct hlist_head *hhd; | ||
3946 | int size = 1 << mapper->hash.size_bits; | ||
3947 | int i; | ||
3948 | |||
3949 | if (free_func && mapper->hash.count) { | ||
3950 | for (i = 0; i < size; i++) { | ||
3951 | hhd = &mapper->hash.buckets[i]; | ||
3952 | hlist_for_each_entry(entry, hhd, hlist) { | ||
3953 | map = (struct ftrace_func_map *)entry; | ||
3954 | free_func(map); | ||
3955 | } | ||
3956 | } | ||
3957 | } | ||
3958 | free_ftrace_hash(&mapper->hash); | ||
3959 | } | ||
3960 | |||
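The mapper above replaces the old per-entry ->data field of struct ftrace_func_probe: a probe_ops implementation now allocates one ftrace_func_mapper and keys whatever it needs off the instruction pointer. A hedged sketch of how the three callbacks might use it; the my_probe_* names are hypothetical, only the mapper helpers and the callback signatures are taken from this patch:

	/* Hypothetical probe_ops callbacks built on the mapper above. */
	static int my_probe_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
				 unsigned long ip, void *init_data, void **data)
	{
		struct ftrace_func_mapper *mapper = *data;

		if (!mapper) {
			mapper = allocate_ftrace_func_mapper();
			if (!mapper)
				return -ENOMEM;
			*data = mapper;		/* becomes probe->data */
		}
		/* remember the caller supplied value (used as a counter here) for this ip */
		return ftrace_func_mapper_add_ip(mapper, ip, init_data);
	}

	static void my_probe_func(unsigned long ip, unsigned long parent_ip,
				  struct trace_array *tr,
				  struct ftrace_probe_ops *ops, void *data)
	{
		struct ftrace_func_mapper *mapper = data;
		long *count;

		/* the returned address is the mapper's data slot itself */
		count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
		if (count && *count > 0)
			(*count)--;
	}

	static void my_probe_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
				  unsigned long ip, void *data)
	{
		struct ftrace_func_mapper *mapper = data;

		if (!mapper)
			return;
		if (!ip) {			/* ip == 0: tear down everything */
			free_ftrace_func_mapper(mapper, NULL);
			return;
		}
		ftrace_func_mapper_remove_ip(mapper, ip);
	}
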
3961 | static void release_probe(struct ftrace_func_probe *probe) | ||
3962 | { | ||
3963 | struct ftrace_probe_ops *probe_ops; | ||
3964 | |||
3965 | mutex_lock(&ftrace_lock); | ||
3966 | |||
3967 | WARN_ON(probe->ref <= 0); | ||
3968 | |||
3969 | /* Subtract the ref that was used to protect this instance */ | ||
3970 | probe->ref--; | ||
3971 | |||
3972 | if (!probe->ref) { | ||
3973 | probe_ops = probe->probe_ops; | ||
3974 | /* | ||
3975 | * Sending zero as ip tells probe_ops to free | ||
3976 | * the probe->data itself | ||
3977 | */ | ||
3978 | if (probe_ops->free) | ||
3979 | probe_ops->free(probe_ops, probe->tr, 0, probe->data); | ||
3980 | list_del(&probe->list); | ||
3981 | kfree(probe); | ||
3982 | } | ||
3983 | mutex_unlock(&ftrace_lock); | ||
3984 | } | ||
3985 | |||
3986 | static void acquire_probe_locked(struct ftrace_func_probe *probe) | ||
3987 | { | ||
3988 | /* | ||
3989 | * Add one ref to keep it from being freed when releasing the | ||
3990 | * ftrace_lock mutex. | ||
3991 | */ | ||
3992 | probe->ref++; | ||
3784 | } | 3993 | } |
3785 | 3994 | ||
3786 | int | 3995 | int |
3787 | register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | 3996 | register_ftrace_function_probe(char *glob, struct trace_array *tr, |
3788 | void *data) | 3997 | struct ftrace_probe_ops *probe_ops, |
3998 | void *data) | ||
3789 | { | 3999 | { |
3790 | struct ftrace_ops_hash old_hash_ops; | 4000 | struct ftrace_func_entry *entry; |
3791 | struct ftrace_func_probe *entry; | 4001 | struct ftrace_func_probe *probe; |
3792 | struct ftrace_glob func_g; | 4002 | struct ftrace_hash **orig_hash; |
3793 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; | 4003 | struct ftrace_hash *old_hash; |
3794 | struct ftrace_hash *old_hash = *orig_hash; | ||
3795 | struct ftrace_hash *hash; | 4004 | struct ftrace_hash *hash; |
3796 | struct ftrace_page *pg; | ||
3797 | struct dyn_ftrace *rec; | ||
3798 | int not; | ||
3799 | unsigned long key; | ||
3800 | int count = 0; | 4005 | int count = 0; |
4006 | int size; | ||
3801 | int ret; | 4007 | int ret; |
4008 | int i; | ||
3802 | 4009 | ||
3803 | func_g.type = filter_parse_regex(glob, strlen(glob), | 4010 | if (WARN_ON(!tr)) |
3804 | &func_g.search, ¬); | ||
3805 | func_g.len = strlen(func_g.search); | ||
3806 | |||
3807 | /* we do not support '!' for function probes */ | ||
3808 | if (WARN_ON(not)) | ||
3809 | return -EINVAL; | 4011 | return -EINVAL; |
3810 | 4012 | ||
3811 | mutex_lock(&trace_probe_ops.func_hash->regex_lock); | 4013 | /* We do not support '!' for function probes */ |
4014 | if (WARN_ON(glob[0] == '!')) | ||
4015 | return -EINVAL; | ||
3812 | 4016 | ||
3813 | old_hash_ops.filter_hash = old_hash; | ||
3814 | /* Probes only have filters */ | ||
3815 | old_hash_ops.notrace_hash = NULL; | ||
3816 | 4017 | ||
3817 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); | 4018 | mutex_lock(&ftrace_lock); |
3818 | if (!hash) { | 4019 | /* Check if the probe_ops is already registered */ |
3819 | count = -ENOMEM; | 4020 | list_for_each_entry(probe, &tr->func_probes, list) { |
3820 | goto out; | 4021 | if (probe->probe_ops == probe_ops) |
4022 | break; | ||
3821 | } | 4023 | } |
3822 | 4024 | if (&probe->list == &tr->func_probes) { | |
3823 | if (unlikely(ftrace_disabled)) { | 4025 | probe = kzalloc(sizeof(*probe), GFP_KERNEL); |
3824 | count = -ENODEV; | 4026 | if (!probe) { |
3825 | goto out; | 4027 | mutex_unlock(&ftrace_lock); |
4028 | return -ENOMEM; | ||
4029 | } | ||
4030 | probe->probe_ops = probe_ops; | ||
4031 | probe->ops.func = function_trace_probe_call; | ||
4032 | probe->tr = tr; | ||
4033 | ftrace_ops_init(&probe->ops); | ||
4034 | list_add(&probe->list, &tr->func_probes); | ||
3826 | } | 4035 | } |
3827 | 4036 | ||
3828 | mutex_lock(&ftrace_lock); | 4037 | acquire_probe_locked(probe); |
3829 | 4038 | ||
3830 | do_for_each_ftrace_rec(pg, rec) { | 4039 | mutex_unlock(&ftrace_lock); |
3831 | 4040 | ||
3832 | if (rec->flags & FTRACE_FL_DISABLED) | 4041 | mutex_lock(&probe->ops.func_hash->regex_lock); |
3833 | continue; | ||
3834 | 4042 | ||
3835 | if (!ftrace_match_record(rec, &func_g, NULL, 0)) | 4043 | orig_hash = &probe->ops.func_hash->filter_hash; |
3836 | continue; | 4044 | old_hash = *orig_hash; |
4045 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); | ||
3837 | 4046 | ||
3838 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | 4047 | ret = ftrace_match_records(hash, glob, strlen(glob)); |
3839 | if (!entry) { | ||
3840 | /* If we did not process any, then return error */ | ||
3841 | if (!count) | ||
3842 | count = -ENOMEM; | ||
3843 | goto out_unlock; | ||
3844 | } | ||
3845 | 4048 | ||
3846 | count++; | 4049 | /* Nothing found? */ |
4050 | if (!ret) | ||
4051 | ret = -EINVAL; | ||
3847 | 4052 | ||
3848 | entry->data = data; | 4053 | if (ret < 0) |
4054 | goto out; | ||
3849 | 4055 | ||
3850 | /* | 4056 | size = 1 << hash->size_bits; |
3851 | * The caller might want to do something special | 4057 | for (i = 0; i < size; i++) { |
3852 | * for each function we find. We call the callback | 4058 | hlist_for_each_entry(entry, &hash->buckets[i], hlist) { |
3853 | * to give the caller an opportunity to do so. | 4059 | if (ftrace_lookup_ip(old_hash, entry->ip)) |
3854 | */ | ||
3855 | if (ops->init) { | ||
3856 | if (ops->init(ops, rec->ip, &entry->data) < 0) { | ||
3857 | /* caller does not like this func */ | ||
3858 | kfree(entry); | ||
3859 | continue; | 4060 | continue; |
4061 | /* | ||
4062 | * The caller might want to do something special | ||
4063 | * for each function we find. We call the callback | ||
4064 | * to give the caller an opportunity to do so. | ||
4065 | */ | ||
4066 | if (probe_ops->init) { | ||
4067 | ret = probe_ops->init(probe_ops, tr, | ||
4068 | entry->ip, data, | ||
4069 | &probe->data); | ||
4070 | if (ret < 0) { | ||
4071 | if (probe_ops->free && count) | ||
4072 | probe_ops->free(probe_ops, tr, | ||
4073 | 0, probe->data); | ||
4074 | probe->data = NULL; | ||
4075 | goto out; | ||
4076 | } | ||
3860 | } | 4077 | } |
4078 | count++; | ||
3861 | } | 4079 | } |
4080 | } | ||
3862 | 4081 | ||
3863 | ret = enter_record(hash, rec, 0); | 4082 | mutex_lock(&ftrace_lock); |
3864 | if (ret < 0) { | ||
3865 | kfree(entry); | ||
3866 | count = ret; | ||
3867 | goto out_unlock; | ||
3868 | } | ||
3869 | |||
3870 | entry->ops = ops; | ||
3871 | entry->ip = rec->ip; | ||
3872 | |||
3873 | key = hash_long(entry->ip, FTRACE_HASH_BITS); | ||
3874 | hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); | ||
3875 | 4083 | ||
3876 | } while_for_each_ftrace_rec(); | 4084 | if (!count) { |
4085 | /* Nothing was added? */ | ||
4086 | ret = -EINVAL; | ||
4087 | goto out_unlock; | ||
4088 | } | ||
3877 | 4089 | ||
3878 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); | 4090 | ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, |
4091 | hash, 1); | ||
4092 | if (ret < 0) | ||
4093 | goto err_unlock; | ||
3879 | 4094 | ||
3880 | __enable_ftrace_function_probe(&old_hash_ops); | 4095 | /* One ref for each new function traced */ |
4096 | probe->ref += count; | ||
3881 | 4097 | ||
3882 | if (!ret) | 4098 | if (!(probe->ops.flags & FTRACE_OPS_FL_ENABLED)) |
3883 | free_ftrace_hash_rcu(old_hash); | 4099 | ret = ftrace_startup(&probe->ops, 0); |
3884 | else | ||
3885 | count = ret; | ||
3886 | 4100 | ||
3887 | out_unlock: | 4101 | out_unlock: |
3888 | mutex_unlock(&ftrace_lock); | 4102 | mutex_unlock(&ftrace_lock); |
4103 | |||
4104 | if (!ret) | ||
4105 | ret = count; | ||
3889 | out: | 4106 | out: |
3890 | mutex_unlock(&trace_probe_ops.func_hash->regex_lock); | 4107 | mutex_unlock(&probe->ops.func_hash->regex_lock); |
3891 | free_ftrace_hash(hash); | 4108 | free_ftrace_hash(hash); |
3892 | 4109 | ||
3893 | return count; | 4110 | release_probe(probe); |
3894 | } | ||
3895 | 4111 | ||
3896 | enum { | 4112 | return ret; |
3897 | PROBE_TEST_FUNC = 1, | ||
3898 | PROBE_TEST_DATA = 2 | ||
3899 | }; | ||
3900 | 4113 | ||
3901 | static void | 4114 | err_unlock: |
3902 | __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | 4115 | if (!probe_ops->free || !count) |
3903 | void *data, int flags) | 4116 | goto out_unlock; |
4117 | |||
4118 | /* Failed to do the move, need to call the free functions */ | ||
4119 | for (i = 0; i < size; i++) { | ||
4120 | hlist_for_each_entry(entry, &hash->buckets[i], hlist) { | ||
4121 | if (ftrace_lookup_ip(old_hash, entry->ip)) | ||
4122 | continue; | ||
4123 | probe_ops->free(probe_ops, tr, entry->ip, probe->data); | ||
4124 | } | ||
4125 | } | ||
4126 | goto out_unlock; | ||
4127 | } | ||
4128 | |||
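register_ftrace_function_probe() now takes the trace_array the probe belongs to, keeps one struct ftrace_func_probe per (instance, probe_ops) pair on tr->func_probes, and returns the number of functions that were hooked (or a negative error). A minimal, hedged sketch of a caller, reusing the hypothetical my_probe_* callbacks from the mapper example above; the glob and the init_data value are only illustrative:

	static struct ftrace_probe_ops my_probe_ops = {
		.func	= my_probe_func,
		.init	= my_probe_init,
		.free	= my_probe_free,
	};

	/* Attach the probe to every function matching "vfs_*" in this instance.
	 * (void *)16 is the init_data handed to ->init() for each matched ip,
	 * used as a counter in the sketch above. */
	static int my_probe_enable(struct trace_array *tr)
	{
		int ret;

		ret = register_ftrace_function_probe("vfs_*", tr, &my_probe_ops,
						     (void *)16);
		return ret < 0 ? ret : 0;	/* ret > 0 is the number of functions hooked */
	}
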
4129 | int | ||
4130 | unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, | ||
4131 | struct ftrace_probe_ops *probe_ops) | ||
3904 | { | 4132 | { |
3905 | struct ftrace_ops_hash old_hash_ops; | 4133 | struct ftrace_ops_hash old_hash_ops; |
3906 | struct ftrace_func_entry *rec_entry; | 4134 | struct ftrace_func_entry *entry; |
3907 | struct ftrace_func_probe *entry; | 4135 | struct ftrace_func_probe *probe; |
3908 | struct ftrace_func_probe *p; | ||
3909 | struct ftrace_glob func_g; | 4136 | struct ftrace_glob func_g; |
3910 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; | 4137 | struct ftrace_hash **orig_hash; |
3911 | struct ftrace_hash *old_hash = *orig_hash; | 4138 | struct ftrace_hash *old_hash; |
3912 | struct list_head free_list; | 4139 | struct ftrace_hash *hash = NULL; |
3913 | struct ftrace_hash *hash; | ||
3914 | struct hlist_node *tmp; | 4140 | struct hlist_node *tmp; |
4141 | struct hlist_head hhd; | ||
3915 | char str[KSYM_SYMBOL_LEN]; | 4142 | char str[KSYM_SYMBOL_LEN]; |
3916 | int i, ret; | 4143 | int count = 0; |
3917 | bool disabled; | 4144 | int i, ret = -ENODEV; |
4145 | int size; | ||
3918 | 4146 | ||
3919 | if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) | 4147 | if (!glob || !strlen(glob) || !strcmp(glob, "*")) |
3920 | func_g.search = NULL; | 4148 | func_g.search = NULL; |
3921 | else if (glob) { | 4149 | else { |
3922 | int not; | 4150 | int not; |
3923 | 4151 | ||
3924 | func_g.type = filter_parse_regex(glob, strlen(glob), | 4152 | func_g.type = filter_parse_regex(glob, strlen(glob), |
@@ -3928,95 +4156,112 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
3928 | 4156 | ||
3929 | /* we do not support '!' for function probes */ | 4157 | /* we do not support '!' for function probes */ |
3930 | if (WARN_ON(not)) | 4158 | if (WARN_ON(not)) |
3931 | return; | 4159 | return -EINVAL; |
3932 | } | 4160 | } |
3933 | 4161 | ||
3934 | mutex_lock(&trace_probe_ops.func_hash->regex_lock); | 4162 | mutex_lock(&ftrace_lock); |
4163 | /* Check if the probe_ops is already registered */ | ||
4164 | list_for_each_entry(probe, &tr->func_probes, list) { | ||
4165 | if (probe->probe_ops == probe_ops) | ||
4166 | break; | ||
4167 | } | ||
4168 | if (&probe->list == &tr->func_probes) | ||
4169 | goto err_unlock_ftrace; | ||
4170 | |||
4171 | ret = -EINVAL; | ||
4172 | if (!(probe->ops.flags & FTRACE_OPS_FL_INITIALIZED)) | ||
4173 | goto err_unlock_ftrace; | ||
4174 | |||
4175 | acquire_probe_locked(probe); | ||
4176 | |||
4177 | mutex_unlock(&ftrace_lock); | ||
4178 | |||
4179 | mutex_lock(&probe->ops.func_hash->regex_lock); | ||
4180 | |||
4181 | orig_hash = &probe->ops.func_hash->filter_hash; | ||
4182 | old_hash = *orig_hash; | ||
4183 | |||
4184 | if (ftrace_hash_empty(old_hash)) | ||
4185 | goto out_unlock; | ||
3935 | 4186 | ||
3936 | old_hash_ops.filter_hash = old_hash; | 4187 | old_hash_ops.filter_hash = old_hash; |
3937 | /* Probes only have filters */ | 4188 | /* Probes only have filters */ |
3938 | old_hash_ops.notrace_hash = NULL; | 4189 | old_hash_ops.notrace_hash = NULL; |
3939 | 4190 | ||
3940 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | 4191 | ret = -ENOMEM; |
4192 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); | ||
3941 | if (!hash) | 4193 | if (!hash) |
3942 | /* Hmm, should report this somehow */ | ||
3943 | goto out_unlock; | 4194 | goto out_unlock; |
3944 | 4195 | ||
3945 | INIT_LIST_HEAD(&free_list); | 4196 | INIT_HLIST_HEAD(&hhd); |
3946 | |||
3947 | for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { | ||
3948 | struct hlist_head *hhd = &ftrace_func_hash[i]; | ||
3949 | 4197 | ||
3950 | hlist_for_each_entry_safe(entry, tmp, hhd, node) { | 4198 | size = 1 << hash->size_bits; |
3951 | 4199 | for (i = 0; i < size; i++) { | |
3952 | /* break up if statements for readability */ | 4200 | hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { |
3953 | if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) | ||
3954 | continue; | ||
3955 | |||
3956 | if ((flags & PROBE_TEST_DATA) && entry->data != data) | ||
3957 | continue; | ||
3958 | 4201 | ||
3959 | /* do this last, since it is the most expensive */ | ||
3960 | if (func_g.search) { | 4202 | if (func_g.search) { |
3961 | kallsyms_lookup(entry->ip, NULL, NULL, | 4203 | kallsyms_lookup(entry->ip, NULL, NULL, |
3962 | NULL, str); | 4204 | NULL, str); |
3963 | if (!ftrace_match(str, &func_g)) | 4205 | if (!ftrace_match(str, &func_g)) |
3964 | continue; | 4206 | continue; |
3965 | } | 4207 | } |
3966 | 4208 | count++; | |
3967 | rec_entry = ftrace_lookup_ip(hash, entry->ip); | 4209 | remove_hash_entry(hash, entry); |
3968 | /* It is possible more than one entry had this ip */ | 4210 | hlist_add_head(&entry->hlist, &hhd); |
3969 | if (rec_entry) | ||
3970 | free_hash_entry(hash, rec_entry); | ||
3971 | |||
3972 | hlist_del_rcu(&entry->node); | ||
3973 | list_add(&entry->free_list, &free_list); | ||
3974 | } | 4211 | } |
3975 | } | 4212 | } |
4213 | |||
4214 | /* Nothing found? */ | ||
4215 | if (!count) { | ||
4216 | ret = -EINVAL; | ||
4217 | goto out_unlock; | ||
4218 | } | ||
4219 | |||
3976 | mutex_lock(&ftrace_lock); | 4220 | mutex_lock(&ftrace_lock); |
3977 | disabled = __disable_ftrace_function_probe(); | 4221 | |
3978 | /* | 4222 | WARN_ON(probe->ref < count); |
3979 | * Remove after the disable is called. Otherwise, if the last | 4223 | |
3980 | * probe is removed, a null hash means *all enabled*. | 4224 | probe->ref -= count; |
3981 | */ | 4225 | |
3982 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); | 4226 | if (ftrace_hash_empty(hash)) |
4227 | ftrace_shutdown(&probe->ops, 0); | ||
4228 | |||
4229 | ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, | ||
4230 | hash, 1); | ||
3983 | 4231 | ||
3984 | /* still need to update the function call sites */ | 4232 | /* still need to update the function call sites */ |
3985 | if (ftrace_enabled && !disabled) | 4233 | if (ftrace_enabled && !ftrace_hash_empty(hash)) |
3986 | ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, | 4234 | ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS, |
3987 | &old_hash_ops); | 4235 | &old_hash_ops); |
3988 | synchronize_sched(); | 4236 | synchronize_sched(); |
3989 | if (!ret) | ||
3990 | free_ftrace_hash_rcu(old_hash); | ||
3991 | 4237 | ||
3992 | list_for_each_entry_safe(entry, p, &free_list, free_list) { | 4238 | hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { |
3993 | list_del(&entry->free_list); | 4239 | hlist_del(&entry->hlist); |
3994 | ftrace_free_entry(entry); | 4240 | if (probe_ops->free) |
4241 | probe_ops->free(probe_ops, tr, entry->ip, probe->data); | ||
4242 | kfree(entry); | ||
3995 | } | 4243 | } |
3996 | mutex_unlock(&ftrace_lock); | 4244 | mutex_unlock(&ftrace_lock); |
3997 | 4245 | ||
3998 | out_unlock: | 4246 | out_unlock: |
3999 | mutex_unlock(&trace_probe_ops.func_hash->regex_lock); | 4247 | mutex_unlock(&probe->ops.func_hash->regex_lock); |
4000 | free_ftrace_hash(hash); | 4248 | free_ftrace_hash(hash); |
4001 | } | ||
4002 | 4249 | ||
4003 | void | 4250 | release_probe(probe); |
4004 | unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | ||
4005 | void *data) | ||
4006 | { | ||
4007 | __unregister_ftrace_function_probe(glob, ops, data, | ||
4008 | PROBE_TEST_FUNC | PROBE_TEST_DATA); | ||
4009 | } | ||
4010 | 4251 | ||
4011 | void | 4252 | return ret; |
4012 | unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) | 4253 | |
4013 | { | 4254 | err_unlock_ftrace: |
4014 | __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); | 4255 | mutex_unlock(&ftrace_lock); |
4256 | return ret; | ||
4015 | } | 4257 | } |
4016 | 4258 | ||
4017 | void unregister_ftrace_function_probe_all(char *glob) | 4259 | void clear_ftrace_function_probes(struct trace_array *tr) |
4018 | { | 4260 | { |
4019 | __unregister_ftrace_function_probe(glob, NULL, NULL, 0); | 4261 | struct ftrace_func_probe *probe, *n; |
4262 | |||
4263 | list_for_each_entry_safe(probe, n, &tr->func_probes, list) | ||
4264 | unregister_ftrace_function_probe_func(NULL, tr, probe->probe_ops); | ||
4020 | } | 4265 | } |
4021 | 4266 | ||
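Teardown is the mirror image: unregister_ftrace_function_probe_func() strips the probe from the functions matching the glob (a NULL or "*" glob means all of them), calls ->free() for each removed ip, and drops the references taken at registration; clear_ftrace_function_probes() does that for every probe still on tr->func_probes, which is the natural call when an instance is being torn down. Continuing the hypothetical sketch from above:

	static void my_probe_disable(struct trace_array *tr)
	{
		/* NULL glob: take the probe off every function it is attached to;
		 * a pattern such as "vfs_read" would remove just that function. */
		unregister_ftrace_function_probe_func(NULL, tr, &my_probe_ops);
	}
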
4022 | static LIST_HEAD(ftrace_commands); | 4267 | static LIST_HEAD(ftrace_commands); |
@@ -4068,9 +4313,11 @@ __init int unregister_ftrace_command(struct ftrace_func_command *cmd) | |||
4068 | return ret; | 4313 | return ret; |
4069 | } | 4314 | } |
4070 | 4315 | ||
4071 | static int ftrace_process_regex(struct ftrace_hash *hash, | 4316 | static int ftrace_process_regex(struct ftrace_iterator *iter, |
4072 | char *buff, int len, int enable) | 4317 | char *buff, int len, int enable) |
4073 | { | 4318 | { |
4319 | struct ftrace_hash *hash = iter->hash; | ||
4320 | struct trace_array *tr = iter->ops->private; | ||
4074 | char *func, *command, *next = buff; | 4321 | char *func, *command, *next = buff; |
4075 | struct ftrace_func_command *p; | 4322 | struct ftrace_func_command *p; |
4076 | int ret = -EINVAL; | 4323 | int ret = -EINVAL; |
@@ -4090,10 +4337,13 @@ static int ftrace_process_regex(struct ftrace_hash *hash, | |||
4090 | 4337 | ||
4091 | command = strsep(&next, ":"); | 4338 | command = strsep(&next, ":"); |
4092 | 4339 | ||
4340 | if (WARN_ON_ONCE(!tr)) | ||
4341 | return -EINVAL; | ||
4342 | |||
4093 | mutex_lock(&ftrace_cmd_mutex); | 4343 | mutex_lock(&ftrace_cmd_mutex); |
4094 | list_for_each_entry(p, &ftrace_commands, list) { | 4344 | list_for_each_entry(p, &ftrace_commands, list) { |
4095 | if (strcmp(p->name, command) == 0) { | 4345 | if (strcmp(p->name, command) == 0) { |
4096 | ret = p->func(hash, func, command, next, enable); | 4346 | ret = p->func(tr, hash, func, command, next, enable); |
4097 | goto out_unlock; | 4347 | goto out_unlock; |
4098 | } | 4348 | } |
4099 | } | 4349 | } |
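Since ftrace_process_regex() now pulls the trace_array out of iter->ops->private and refuses to run without one, every command registered through register_ftrace_command() gains the tr argument, exactly as ftrace_mod_callback() does at the top of this patch. A hedged sketch of what such a command looks like after this change; the my_cmd_* names are hypothetical:

	static int my_cmd_callback(struct trace_array *tr, struct ftrace_hash *hash,
				   char *func, char *cmd, char *param, int enable)
	{
		/* func:  the pattern before the first ':' of the written line
		 * cmd:   the command name itself ("my_cmd")
		 * param: whatever follows the second ':', may be NULL */
		if (WARN_ON_ONCE(!tr))
			return -ENODEV;

		/* operate on @hash and/or @tr here */
		return 0;
	}

	static struct ftrace_func_command my_cmd = {
		.name	= "my_cmd",
		.func	= my_cmd_callback,
	};

	static __init int my_cmd_init(void)
	{
		return register_ftrace_command(&my_cmd);
	}
	core_initcall(my_cmd_init);
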
@@ -4130,7 +4380,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
4130 | 4380 | ||
4131 | if (read >= 0 && trace_parser_loaded(parser) && | 4381 | if (read >= 0 && trace_parser_loaded(parser) && |
4132 | !trace_parser_cont(parser)) { | 4382 | !trace_parser_cont(parser)) { |
4133 | ret = ftrace_process_regex(iter->hash, parser->buffer, | 4383 | ret = ftrace_process_regex(iter, parser->buffer, |
4134 | parser->idx, enable); | 4384 | parser->idx, enable); |
4135 | trace_parser_clear(parser); | 4385 | trace_parser_clear(parser); |
4136 | if (ret < 0) | 4386 | if (ret < 0) |
@@ -4175,44 +4425,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) | |||
4175 | return add_hash_entry(hash, ip); | 4425 | return add_hash_entry(hash, ip); |
4176 | } | 4426 | } |
4177 | 4427 | ||
4178 | static void ftrace_ops_update_code(struct ftrace_ops *ops, | ||
4179 | struct ftrace_ops_hash *old_hash) | ||
4180 | { | ||
4181 | struct ftrace_ops *op; | ||
4182 | |||
4183 | if (!ftrace_enabled) | ||
4184 | return; | ||
4185 | |||
4186 | if (ops->flags & FTRACE_OPS_FL_ENABLED) { | ||
4187 | ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); | ||
4188 | return; | ||
4189 | } | ||
4190 | |||
4191 | /* | ||
4192 | * If this is the shared global_ops filter, then we need to | ||
4193 | * check if there is another ops that shares it, is enabled. | ||
4194 | * If so, we still need to run the modify code. | ||
4195 | */ | ||
4196 | if (ops->func_hash != &global_ops.local_hash) | ||
4197 | return; | ||
4198 | |||
4199 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
4200 | if (op->func_hash == &global_ops.local_hash && | ||
4201 | op->flags & FTRACE_OPS_FL_ENABLED) { | ||
4202 | ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); | ||
4203 | /* Only need to do this once */ | ||
4204 | return; | ||
4205 | } | ||
4206 | } while_for_each_ftrace_op(op); | ||
4207 | } | ||
4208 | |||
4209 | static int | 4428 | static int |
4210 | ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | 4429 | ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, |
4211 | unsigned long ip, int remove, int reset, int enable) | 4430 | unsigned long ip, int remove, int reset, int enable) |
4212 | { | 4431 | { |
4213 | struct ftrace_hash **orig_hash; | 4432 | struct ftrace_hash **orig_hash; |
4214 | struct ftrace_ops_hash old_hash_ops; | ||
4215 | struct ftrace_hash *old_hash; | ||
4216 | struct ftrace_hash *hash; | 4433 | struct ftrace_hash *hash; |
4217 | int ret; | 4434 | int ret; |
4218 | 4435 | ||
@@ -4247,14 +4464,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
4247 | } | 4464 | } |
4248 | 4465 | ||
4249 | mutex_lock(&ftrace_lock); | 4466 | mutex_lock(&ftrace_lock); |
4250 | old_hash = *orig_hash; | 4467 | ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable); |
4251 | old_hash_ops.filter_hash = ops->func_hash->filter_hash; | ||
4252 | old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; | ||
4253 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | ||
4254 | if (!ret) { | ||
4255 | ftrace_ops_update_code(ops, &old_hash_ops); | ||
4256 | free_ftrace_hash_rcu(old_hash); | ||
4257 | } | ||
4258 | mutex_unlock(&ftrace_lock); | 4468 | mutex_unlock(&ftrace_lock); |
4259 | 4469 | ||
4260 | out_regex_unlock: | 4470 | out_regex_unlock: |
@@ -4493,10 +4703,8 @@ static void __init set_ftrace_early_filters(void) | |||
4493 | int ftrace_regex_release(struct inode *inode, struct file *file) | 4703 | int ftrace_regex_release(struct inode *inode, struct file *file) |
4494 | { | 4704 | { |
4495 | struct seq_file *m = (struct seq_file *)file->private_data; | 4705 | struct seq_file *m = (struct seq_file *)file->private_data; |
4496 | struct ftrace_ops_hash old_hash_ops; | ||
4497 | struct ftrace_iterator *iter; | 4706 | struct ftrace_iterator *iter; |
4498 | struct ftrace_hash **orig_hash; | 4707 | struct ftrace_hash **orig_hash; |
4499 | struct ftrace_hash *old_hash; | ||
4500 | struct trace_parser *parser; | 4708 | struct trace_parser *parser; |
4501 | int filter_hash; | 4709 | int filter_hash; |
4502 | int ret; | 4710 | int ret; |
@@ -4526,16 +4734,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
4526 | orig_hash = &iter->ops->func_hash->notrace_hash; | 4734 | orig_hash = &iter->ops->func_hash->notrace_hash; |
4527 | 4735 | ||
4528 | mutex_lock(&ftrace_lock); | 4736 | mutex_lock(&ftrace_lock); |
4529 | old_hash = *orig_hash; | 4737 | ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash, |
4530 | old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; | 4738 | iter->hash, filter_hash); |
4531 | old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash; | ||
4532 | ret = ftrace_hash_move(iter->ops, filter_hash, | ||
4533 | orig_hash, iter->hash); | ||
4534 | if (!ret) { | ||
4535 | ftrace_ops_update_code(iter->ops, &old_hash_ops); | ||
4536 | free_ftrace_hash_rcu(old_hash); | ||
4537 | } | ||
4538 | mutex_unlock(&ftrace_lock); | 4739 | mutex_unlock(&ftrace_lock); |
4740 | } else { | ||
4741 | /* For read only, the hash is the ops hash */ | ||
4742 | iter->hash = NULL; | ||
4539 | } | 4743 | } |
4540 | 4744 | ||
4541 | mutex_unlock(&iter->ops->func_hash->regex_lock); | 4745 | mutex_unlock(&iter->ops->func_hash->regex_lock); |
@@ -5274,6 +5478,50 @@ void ftrace_module_init(struct module *mod) | |||
5274 | } | 5478 | } |
5275 | #endif /* CONFIG_MODULES */ | 5479 | #endif /* CONFIG_MODULES */ |
5276 | 5480 | ||
5481 | void __init ftrace_free_init_mem(void) | ||
5482 | { | ||
5483 | unsigned long start = (unsigned long)(&__init_begin); | ||
5484 | unsigned long end = (unsigned long)(&__init_end); | ||
5485 | struct ftrace_page **last_pg = &ftrace_pages_start; | ||
5486 | struct ftrace_page *pg; | ||
5487 | struct dyn_ftrace *rec; | ||
5488 | struct dyn_ftrace key; | ||
5489 | int order; | ||
5490 | |||
5491 | key.ip = start; | ||
5492 | key.flags = end; /* overload flags, as it is unsigned long */ | ||
5493 | |||
5494 | mutex_lock(&ftrace_lock); | ||
5495 | |||
5496 | for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) { | ||
5497 | if (end < pg->records[0].ip || | ||
5498 | start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) | ||
5499 | continue; | ||
5500 | again: | ||
5501 | rec = bsearch(&key, pg->records, pg->index, | ||
5502 | sizeof(struct dyn_ftrace), | ||
5503 | ftrace_cmp_recs); | ||
5504 | if (!rec) | ||
5505 | continue; | ||
5506 | pg->index--; | ||
5507 | if (!pg->index) { | ||
5508 | *last_pg = pg->next; | ||
5509 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); | ||
5510 | free_pages((unsigned long)pg->records, order); | ||
5511 | kfree(pg); | ||
5512 | pg = container_of(last_pg, struct ftrace_page, next); | ||
5513 | if (!(*last_pg)) | ||
5514 | ftrace_pages = pg; | ||
5515 | continue; | ||
5516 | } | ||
5517 | memmove(rec, rec + 1, | ||
5518 | (pg->index - (rec - pg->records)) * sizeof(*rec)); | ||
5519 | /* More than one function may be in this block */ | ||
5520 | goto again; | ||
5521 | } | ||
5522 | mutex_unlock(&ftrace_lock); | ||
5523 | } | ||
5524 | |||
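ftrace_free_init_mem() walks the ftrace_pages list and drops every dyn_ftrace record that falls inside [__init_begin, __init_end), so functions about to be discarded can no longer be filtered on or patched. It only makes sense right before the init sections are released; a hedged sketch of the expected ordering (the exact call site is an assumption of this note):

	/* late in boot, in the init thread, before init memory goes away: */
	ftrace_free_init_mem();		/* forget mcount records in __init text */
	free_initmem();			/* now the init pages can actually be freed */
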
5277 | void __init ftrace_init(void) | 5525 | void __init ftrace_init(void) |
5278 | { | 5526 | { |
5279 | extern unsigned long __start_mcount_loc[]; | 5527 | extern unsigned long __start_mcount_loc[]; |
@@ -5316,25 +5564,13 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) | |||
5316 | 5564 | ||
5317 | static void ftrace_update_trampoline(struct ftrace_ops *ops) | 5565 | static void ftrace_update_trampoline(struct ftrace_ops *ops) |
5318 | { | 5566 | { |
5319 | |||
5320 | /* | ||
5321 | * Currently there's no safe way to free a trampoline when the kernel | ||
5322 | * is configured with PREEMPT. That is because a task could be preempted | ||
5323 | * when it jumped to the trampoline, it may be preempted for a long time | ||
5324 | * depending on the system load, and currently there's no way to know | ||
5325 | * when it will be off the trampoline. If the trampoline is freed | ||
5326 | * too early, when the task runs again, it will be executing on freed | ||
5327 | * memory and crash. | ||
5328 | */ | ||
5329 | #ifdef CONFIG_PREEMPT | ||
5330 | /* Currently, only non dynamic ops can have a trampoline */ | ||
5331 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC) | ||
5332 | return; | ||
5333 | #endif | ||
5334 | |||
5335 | arch_ftrace_update_trampoline(ops); | 5567 | arch_ftrace_update_trampoline(ops); |
5336 | } | 5568 | } |
5337 | 5569 | ||
5570 | void ftrace_init_trace_array(struct trace_array *tr) | ||
5571 | { | ||
5572 | INIT_LIST_HEAD(&tr->func_probes); | ||
5573 | } | ||
5338 | #else | 5574 | #else |
5339 | 5575 | ||
5340 | static struct ftrace_ops global_ops = { | 5576 | static struct ftrace_ops global_ops = { |
@@ -5389,6 +5625,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr) | |||
5389 | { | 5625 | { |
5390 | tr->ops = &global_ops; | 5626 | tr->ops = &global_ops; |
5391 | tr->ops->private = tr; | 5627 | tr->ops->private = tr; |
5628 | ftrace_init_trace_array(tr); | ||
5392 | } | 5629 | } |
5393 | 5630 | ||
5394 | void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) | 5631 | void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) |
@@ -5543,6 +5780,43 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, | |||
5543 | trace_ignore_this_task(pid_list, next)); | 5780 | trace_ignore_this_task(pid_list, next)); |
5544 | } | 5781 | } |
5545 | 5782 | ||
5783 | static void | ||
5784 | ftrace_pid_follow_sched_process_fork(void *data, | ||
5785 | struct task_struct *self, | ||
5786 | struct task_struct *task) | ||
5787 | { | ||
5788 | struct trace_pid_list *pid_list; | ||
5789 | struct trace_array *tr = data; | ||
5790 | |||
5791 | pid_list = rcu_dereference_sched(tr->function_pids); | ||
5792 | trace_filter_add_remove_task(pid_list, self, task); | ||
5793 | } | ||
5794 | |||
5795 | static void | ||
5796 | ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task) | ||
5797 | { | ||
5798 | struct trace_pid_list *pid_list; | ||
5799 | struct trace_array *tr = data; | ||
5800 | |||
5801 | pid_list = rcu_dereference_sched(tr->function_pids); | ||
5802 | trace_filter_add_remove_task(pid_list, NULL, task); | ||
5803 | } | ||
5804 | |||
5805 | void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) | ||
5806 | { | ||
5807 | if (enable) { | ||
5808 | register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, | ||
5809 | tr); | ||
5810 | register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, | ||
5811 | tr); | ||
5812 | } else { | ||
5813 | unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, | ||
5814 | tr); | ||
5815 | unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, | ||
5816 | tr); | ||
5817 | } | ||
5818 | } | ||
5819 | |||
5546 | static void clear_ftrace_pids(struct trace_array *tr) | 5820 | static void clear_ftrace_pids(struct trace_array *tr) |
5547 | { | 5821 | { |
5548 | struct trace_pid_list *pid_list; | 5822 | struct trace_pid_list *pid_list; |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ca47a4fa2986..4ae268e687fe 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -438,6 +438,7 @@ struct ring_buffer_per_cpu { | |||
438 | raw_spinlock_t reader_lock; /* serialize readers */ | 438 | raw_spinlock_t reader_lock; /* serialize readers */ |
439 | arch_spinlock_t lock; | 439 | arch_spinlock_t lock; |
440 | struct lock_class_key lock_key; | 440 | struct lock_class_key lock_key; |
441 | struct buffer_data_page *free_page; | ||
441 | unsigned long nr_pages; | 442 | unsigned long nr_pages; |
442 | unsigned int current_context; | 443 | unsigned int current_context; |
443 | struct list_head *pages; | 444 | struct list_head *pages; |
@@ -4389,9 +4390,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); | |||
4389 | */ | 4390 | */ |
4390 | void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) | 4391 | void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) |
4391 | { | 4392 | { |
4392 | struct buffer_data_page *bpage; | 4393 | struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; |
4394 | struct buffer_data_page *bpage = NULL; | ||
4395 | unsigned long flags; | ||
4393 | struct page *page; | 4396 | struct page *page; |
4394 | 4397 | ||
4398 | local_irq_save(flags); | ||
4399 | arch_spin_lock(&cpu_buffer->lock); | ||
4400 | |||
4401 | if (cpu_buffer->free_page) { | ||
4402 | bpage = cpu_buffer->free_page; | ||
4403 | cpu_buffer->free_page = NULL; | ||
4404 | } | ||
4405 | |||
4406 | arch_spin_unlock(&cpu_buffer->lock); | ||
4407 | local_irq_restore(flags); | ||
4408 | |||
4409 | if (bpage) | ||
4410 | goto out; | ||
4411 | |||
4395 | page = alloc_pages_node(cpu_to_node(cpu), | 4412 | page = alloc_pages_node(cpu_to_node(cpu), |
4396 | GFP_KERNEL | __GFP_NORETRY, 0); | 4413 | GFP_KERNEL | __GFP_NORETRY, 0); |
4397 | if (!page) | 4414 | if (!page) |
@@ -4399,6 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) | |||
4399 | 4416 | ||
4400 | bpage = page_address(page); | 4417 | bpage = page_address(page); |
4401 | 4418 | ||
4419 | out: | ||
4402 | rb_init_page(bpage); | 4420 | rb_init_page(bpage); |
4403 | 4421 | ||
4404 | return bpage; | 4422 | return bpage; |
@@ -4408,13 +4426,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); | |||
4408 | /** | 4426 | /** |
4409 | * ring_buffer_free_read_page - free an allocated read page | 4427 | * ring_buffer_free_read_page - free an allocated read page |
4410 | * @buffer: the buffer the page was allocated for | 4428 | * @buffer: the buffer the page was allocated for |
4429 | * @cpu: the cpu buffer the page came from | ||
4411 | * @data: the page to free | 4430 | * @data: the page to free |
4412 | * | 4431 | * |
4413 | * Free a page allocated from ring_buffer_alloc_read_page. | 4432 | * Free a page allocated from ring_buffer_alloc_read_page. |
4414 | */ | 4433 | */ |
4415 | void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) | 4434 | void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) |
4416 | { | 4435 | { |
4417 | free_page((unsigned long)data); | 4436 | struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; |
4437 | struct buffer_data_page *bpage = data; | ||
4438 | unsigned long flags; | ||
4439 | |||
4440 | local_irq_save(flags); | ||
4441 | arch_spin_lock(&cpu_buffer->lock); | ||
4442 | |||
4443 | if (!cpu_buffer->free_page) { | ||
4444 | cpu_buffer->free_page = bpage; | ||
4445 | bpage = NULL; | ||
4446 | } | ||
4447 | |||
4448 | arch_spin_unlock(&cpu_buffer->lock); | ||
4449 | local_irq_restore(flags); | ||
4450 | |||
4451 | free_page((unsigned long)bpage); | ||
4418 | } | 4452 | } |
4419 | EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); | 4453 | EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); |
4420 | 4454 | ||
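With the per-CPU free_page slot above, ring_buffer_alloc_read_page() can hand back the page the last reader returned instead of going to the page allocator, and ring_buffer_free_read_page() needs to know which CPU buffer should cache it, hence the new cpu argument (the benchmark below and the tracing_buffers read/splice paths later in this patch show the updated calls). A hedged sketch of the whole read cycle; my_read_one_page is hypothetical:

	static int my_read_one_page(struct ring_buffer *buffer, int cpu)
	{
		void *page;
		int ret;

		page = ring_buffer_alloc_read_page(buffer, cpu);	/* may reuse the cached spare */
		if (!page)
			return -ENOMEM;

		/* swap a page of events out of the per-CPU buffer
		 * (full == 1: only succeed when a full page is available) */
		ret = ring_buffer_read_page(buffer, &page, PAGE_SIZE, cpu, 1);
		if (ret >= 0) {
			/* ... consume the events in "page" here ... */
		}

		/* hand the page back; it becomes the spare if the slot is empty */
		ring_buffer_free_read_page(buffer, cpu, page);

		return ret < 0 ? ret : 0;
	}
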
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index c190a4d5013c..9fbcaf567886 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
@@ -171,7 +171,7 @@ static enum event_status read_page(int cpu) | |||
171 | } | 171 | } |
172 | } | 172 | } |
173 | } | 173 | } |
174 | ring_buffer_free_read_page(buffer, bpage); | 174 | ring_buffer_free_read_page(buffer, cpu, bpage); |
175 | 175 | ||
176 | if (ret < 0) | 176 | if (ret < 0) |
177 | return EVENT_DROPPED; | 177 | return EVENT_DROPPED; |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0ad75e9698f6..1122f151466f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -257,7 +257,7 @@ unsigned long long ns2usecs(u64 nsec) | |||
257 | 257 | ||
258 | /* trace_flags that are default zero for instances */ | 258 | /* trace_flags that are default zero for instances */ |
259 | #define ZEROED_TRACE_FLAGS \ | 259 | #define ZEROED_TRACE_FLAGS \ |
260 | TRACE_ITER_EVENT_FORK | 260 | (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK) |
261 | 261 | ||
262 | /* | 262 | /* |
263 | * The global_trace is the descriptor that holds the top-level tracing | 263 | * The global_trace is the descriptor that holds the top-level tracing |
@@ -757,7 +757,7 @@ __trace_buffer_lock_reserve(struct ring_buffer *buffer, | |||
757 | return event; | 757 | return event; |
758 | } | 758 | } |
759 | 759 | ||
760 | static void tracer_tracing_on(struct trace_array *tr) | 760 | void tracer_tracing_on(struct trace_array *tr) |
761 | { | 761 | { |
762 | if (tr->trace_buffer.buffer) | 762 | if (tr->trace_buffer.buffer) |
763 | ring_buffer_record_on(tr->trace_buffer.buffer); | 763 | ring_buffer_record_on(tr->trace_buffer.buffer); |
@@ -894,23 +894,8 @@ int __trace_bputs(unsigned long ip, const char *str) | |||
894 | EXPORT_SYMBOL_GPL(__trace_bputs); | 894 | EXPORT_SYMBOL_GPL(__trace_bputs); |
895 | 895 | ||
896 | #ifdef CONFIG_TRACER_SNAPSHOT | 896 | #ifdef CONFIG_TRACER_SNAPSHOT |
897 | /** | 897 | static void tracing_snapshot_instance(struct trace_array *tr) |
898 | * trace_snapshot - take a snapshot of the current buffer. | ||
899 | * | ||
900 | * This causes a swap between the snapshot buffer and the current live | ||
901 | * tracing buffer. You can use this to take snapshots of the live | ||
902 | * trace when some condition is triggered, but continue to trace. | ||
903 | * | ||
904 | * Note, make sure to allocate the snapshot with either | ||
905 | * a tracing_snapshot_alloc(), or by doing it manually | ||
906 | * with: echo 1 > /sys/kernel/debug/tracing/snapshot | ||
907 | * | ||
908 | * If the snapshot buffer is not allocated, it will stop tracing. | ||
909 | * Basically making a permanent snapshot. | ||
910 | */ | ||
911 | void tracing_snapshot(void) | ||
912 | { | 898 | { |
913 | struct trace_array *tr = &global_trace; | ||
914 | struct tracer *tracer = tr->current_trace; | 899 | struct tracer *tracer = tr->current_trace; |
915 | unsigned long flags; | 900 | unsigned long flags; |
916 | 901 | ||
@@ -938,6 +923,27 @@ void tracing_snapshot(void) | |||
938 | update_max_tr(tr, current, smp_processor_id()); | 923 | update_max_tr(tr, current, smp_processor_id()); |
939 | local_irq_restore(flags); | 924 | local_irq_restore(flags); |
940 | } | 925 | } |
926 | |||
927 | /** | ||
928 | * trace_snapshot - take a snapshot of the current buffer. | ||
929 | * | ||
930 | * This causes a swap between the snapshot buffer and the current live | ||
931 | * tracing buffer. You can use this to take snapshots of the live | ||
932 | * trace when some condition is triggered, but continue to trace. | ||
933 | * | ||
934 | * Note, make sure to allocate the snapshot with either | ||
935 | * a tracing_snapshot_alloc(), or by doing it manually | ||
936 | * with: echo 1 > /sys/kernel/debug/tracing/snapshot | ||
937 | * | ||
938 | * If the snapshot buffer is not allocated, it will stop tracing. | ||
939 | * Basically making a permanent snapshot. | ||
940 | */ | ||
941 | void tracing_snapshot(void) | ||
942 | { | ||
943 | struct trace_array *tr = &global_trace; | ||
944 | |||
945 | tracing_snapshot_instance(tr); | ||
946 | } | ||
941 | EXPORT_SYMBOL_GPL(tracing_snapshot); | 947 | EXPORT_SYMBOL_GPL(tracing_snapshot); |
942 | 948 | ||
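Splitting tracing_snapshot_instance() out lets per-instance users (such as the reworked snapshot probe at the end of this patch) snapshot the trace_array they are attached to, while tracing_snapshot() keeps its old behaviour of acting on global_trace. For in-kernel callers nothing changes; a short hedged sketch, where the my_* names and the trigger condition are hypothetical:

	static int __init my_snapshot_setup(void)
	{
		/* allocate the snapshot buffer up front, otherwise the first
		 * snapshot permanently stops tracing (see the comment above) */
		tracing_snapshot_alloc();
		return 0;
	}
	late_initcall(my_snapshot_setup);

	static void my_interesting_event(void)
	{
		tracing_snapshot();	/* swap live and snapshot buffers, keep tracing */
	}
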
943 | static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, | 949 | static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, |
@@ -1039,7 +1045,7 @@ void tracing_snapshot_alloc(void) | |||
1039 | EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); | 1045 | EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); |
1040 | #endif /* CONFIG_TRACER_SNAPSHOT */ | 1046 | #endif /* CONFIG_TRACER_SNAPSHOT */ |
1041 | 1047 | ||
1042 | static void tracer_tracing_off(struct trace_array *tr) | 1048 | void tracer_tracing_off(struct trace_array *tr) |
1043 | { | 1049 | { |
1044 | if (tr->trace_buffer.buffer) | 1050 | if (tr->trace_buffer.buffer) |
1045 | ring_buffer_record_off(tr->trace_buffer.buffer); | 1051 | ring_buffer_record_off(tr->trace_buffer.buffer); |
@@ -1424,6 +1430,28 @@ static int wait_on_pipe(struct trace_iterator *iter, bool full) | |||
1424 | } | 1430 | } |
1425 | 1431 | ||
1426 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 1432 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
1433 | static bool selftests_can_run; | ||
1434 | |||
1435 | struct trace_selftests { | ||
1436 | struct list_head list; | ||
1437 | struct tracer *type; | ||
1438 | }; | ||
1439 | |||
1440 | static LIST_HEAD(postponed_selftests); | ||
1441 | |||
1442 | static int save_selftest(struct tracer *type) | ||
1443 | { | ||
1444 | struct trace_selftests *selftest; | ||
1445 | |||
1446 | selftest = kmalloc(sizeof(*selftest), GFP_KERNEL); | ||
1447 | if (!selftest) | ||
1448 | return -ENOMEM; | ||
1449 | |||
1450 | selftest->type = type; | ||
1451 | list_add(&selftest->list, &postponed_selftests); | ||
1452 | return 0; | ||
1453 | } | ||
1454 | |||
1427 | static int run_tracer_selftest(struct tracer *type) | 1455 | static int run_tracer_selftest(struct tracer *type) |
1428 | { | 1456 | { |
1429 | struct trace_array *tr = &global_trace; | 1457 | struct trace_array *tr = &global_trace; |
@@ -1434,6 +1462,14 @@ static int run_tracer_selftest(struct tracer *type) | |||
1434 | return 0; | 1462 | return 0; |
1435 | 1463 | ||
1436 | /* | 1464 | /* |
1465 | * If a tracer registers early in boot up (before scheduling is | ||
1466 | * initialized and such), then do not run its selftests yet. | ||
1467 | * Instead, run it a little later in the boot process. | ||
1468 | */ | ||
1469 | if (!selftests_can_run) | ||
1470 | return save_selftest(type); | ||
1471 | |||
1472 | /* | ||
1437 | * Run a selftest on this tracer. | 1473 | * Run a selftest on this tracer. |
1438 | * Here we reset the trace buffer, and set the current | 1474 | * Here we reset the trace buffer, and set the current |
1439 | * tracer to be this tracer. The tracer can then run some | 1475 | * tracer to be this tracer. The tracer can then run some |
@@ -1482,6 +1518,47 @@ static int run_tracer_selftest(struct tracer *type) | |||
1482 | printk(KERN_CONT "PASSED\n"); | 1518 | printk(KERN_CONT "PASSED\n"); |
1483 | return 0; | 1519 | return 0; |
1484 | } | 1520 | } |
1521 | |||
1522 | static __init int init_trace_selftests(void) | ||
1523 | { | ||
1524 | struct trace_selftests *p, *n; | ||
1525 | struct tracer *t, **last; | ||
1526 | int ret; | ||
1527 | |||
1528 | selftests_can_run = true; | ||
1529 | |||
1530 | mutex_lock(&trace_types_lock); | ||
1531 | |||
1532 | if (list_empty(&postponed_selftests)) | ||
1533 | goto out; | ||
1534 | |||
1535 | pr_info("Running postponed tracer tests:\n"); | ||
1536 | |||
1537 | list_for_each_entry_safe(p, n, &postponed_selftests, list) { | ||
1538 | ret = run_tracer_selftest(p->type); | ||
1539 | /* If the test fails, then warn and remove from available_tracers */ | ||
1540 | if (ret < 0) { | ||
1541 | WARN(1, "tracer: %s failed selftest, disabling\n", | ||
1542 | p->type->name); | ||
1543 | last = &trace_types; | ||
1544 | for (t = trace_types; t; t = t->next) { | ||
1545 | if (t == p->type) { | ||
1546 | *last = t->next; | ||
1547 | break; | ||
1548 | } | ||
1549 | last = &t->next; | ||
1550 | } | ||
1551 | } | ||
1552 | list_del(&p->list); | ||
1553 | kfree(p); | ||
1554 | } | ||
1555 | |||
1556 | out: | ||
1557 | mutex_unlock(&trace_types_lock); | ||
1558 | |||
1559 | return 0; | ||
1560 | } | ||
1561 | core_initcall(init_trace_selftests); | ||
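Because selftests_can_run starts out false, a tracer registered before core_initcall time (for instance from the early boot tracing setup, assuming CONFIG_FTRACE_STARTUP_TEST=y) is only queued by save_selftest(); init_trace_selftests() then runs the queued tests once the scheduler and friends are up, and unhooks any tracer whose test fails. Roughly, with a fully hypothetical do-nothing tracer and selftest:

	static int my_tracer_init(struct trace_array *tr)
	{
		return 0;
	}

	static void my_tracer_reset(struct trace_array *tr)
	{
	}

	static int trace_selftest_startup_my_tracer(struct tracer *trace,
						    struct trace_array *tr)
	{
		return 0;		/* pretend the test passed */
	}

	static struct tracer my_early_tracer __read_mostly = {
		.name		= "my_early_tracer",
		.init		= my_tracer_init,
		.reset		= my_tracer_reset,
		.selftest	= trace_selftest_startup_my_tracer,
	};

	/* called from an early boot path, before core_initcalls run */
	static void __init my_early_tracing_init(void)
	{
		/* run_tracer_selftest() sees !selftests_can_run and only queues the
		 * tracer; the test itself runs later from init_trace_selftests() */
		register_tracer(&my_early_tracer);
	}
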
1485 | #else | 1562 | #else |
1486 | static inline int run_tracer_selftest(struct tracer *type) | 1563 | static inline int run_tracer_selftest(struct tracer *type) |
1487 | { | 1564 | { |
@@ -1899,7 +1976,7 @@ static void __trace_find_cmdline(int pid, char comm[]) | |||
1899 | 1976 | ||
1900 | map = savedcmd->map_pid_to_cmdline[pid]; | 1977 | map = savedcmd->map_pid_to_cmdline[pid]; |
1901 | if (map != NO_CMDLINE_MAP) | 1978 | if (map != NO_CMDLINE_MAP) |
1902 | strcpy(comm, get_saved_cmdlines(map)); | 1979 | strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); |
1903 | else | 1980 | else |
1904 | strcpy(comm, "<...>"); | 1981 | strcpy(comm, "<...>"); |
1905 | } | 1982 | } |
@@ -1927,6 +2004,18 @@ void tracing_record_cmdline(struct task_struct *tsk) | |||
1927 | __this_cpu_write(trace_cmdline_save, false); | 2004 | __this_cpu_write(trace_cmdline_save, false); |
1928 | } | 2005 | } |
1929 | 2006 | ||
2007 | /* | ||
2008 | * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq | ||
2009 | * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function | ||
2010 | * simplifies those functions and keeps them in sync. | ||
2011 | */ | ||
2012 | enum print_line_t trace_handle_return(struct trace_seq *s) | ||
2013 | { | ||
2014 | return trace_seq_has_overflowed(s) ? | ||
2015 | TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; | ||
2016 | } | ||
2017 | EXPORT_SYMBOL_GPL(trace_handle_return); | ||
2018 | |||
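trace_handle_return() is meant to be the last line of an event's output callback: print into the trace_seq and let the helper collapse the overflow check into the right return value. A typical handler would look like the following sketch; the my_event_* name and format string are hypothetical:

	static enum print_line_t my_event_trace_output(struct trace_iterator *iter,
						       int flags,
						       struct trace_event *event)
	{
		struct trace_seq *s = &iter->seq;

		trace_seq_printf(s, "my_event on cpu %d\n", iter->cpu);

		/* TRACE_TYPE_PARTIAL_LINE if the seq overflowed, else TRACE_TYPE_HANDLED */
		return trace_handle_return(s);
	}
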
1930 | void | 2019 | void |
1931 | tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, | 2020 | tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, |
1932 | int pc) | 2021 | int pc) |
@@ -2479,7 +2568,36 @@ static inline void ftrace_trace_stack(struct trace_array *tr, | |||
2479 | void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, | 2568 | void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, |
2480 | int pc) | 2569 | int pc) |
2481 | { | 2570 | { |
2482 | __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); | 2571 | struct ring_buffer *buffer = tr->trace_buffer.buffer; |
2572 | |||
2573 | if (rcu_is_watching()) { | ||
2574 | __ftrace_trace_stack(buffer, flags, skip, pc, NULL); | ||
2575 | return; | ||
2576 | } | ||
2577 | |||
2578 | /* | ||
2579 | * When an NMI triggers, RCU is enabled via rcu_nmi_enter(), | ||
2580 | * but if the above rcu_is_watching() failed, then the NMI | ||
2581 | * triggered someplace critical, and rcu_irq_enter() should | ||
2582 | * not be called from NMI. | ||
2583 | */ | ||
2584 | if (unlikely(in_nmi())) | ||
2585 | return; | ||
2586 | |||
2587 | /* | ||
2588 | * It is possible that a function is being traced in a | ||
2589 | * location that RCU is not watching. A call to | ||
2590 | * rcu_irq_enter() will make sure that it is, but there's | ||
2591 | * a few internal rcu functions that could be traced | ||
2592 | * where that won't work either. In those cases, we just | ||
2593 | * do nothing. | ||
2594 | */ | ||
2595 | if (unlikely(rcu_irq_enter_disabled())) | ||
2596 | return; | ||
2597 | |||
2598 | rcu_irq_enter_irqson(); | ||
2599 | __ftrace_trace_stack(buffer, flags, skip, pc, NULL); | ||
2600 | rcu_irq_exit_irqson(); | ||
2483 | } | 2601 | } |
2484 | 2602 | ||
2485 | /** | 2603 | /** |
@@ -3222,13 +3340,14 @@ static void test_cpu_buff_start(struct trace_iterator *iter) | |||
3222 | if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) | 3340 | if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) |
3223 | return; | 3341 | return; |
3224 | 3342 | ||
3225 | if (iter->started && cpumask_test_cpu(iter->cpu, iter->started)) | 3343 | if (cpumask_available(iter->started) && |
3344 | cpumask_test_cpu(iter->cpu, iter->started)) | ||
3226 | return; | 3345 | return; |
3227 | 3346 | ||
3228 | if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) | 3347 | if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) |
3229 | return; | 3348 | return; |
3230 | 3349 | ||
3231 | if (iter->started) | 3350 | if (cpumask_available(iter->started)) |
3232 | cpumask_set_cpu(iter->cpu, iter->started); | 3351 | cpumask_set_cpu(iter->cpu, iter->started); |
3233 | 3352 | ||
3234 | /* Don't print started cpu buffer for the first entry of the trace */ | 3353 | /* Don't print started cpu buffer for the first entry of the trace */ |
@@ -4122,6 +4241,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) | |||
4122 | if (mask == TRACE_ITER_EVENT_FORK) | 4241 | if (mask == TRACE_ITER_EVENT_FORK) |
4123 | trace_event_follow_fork(tr, enabled); | 4242 | trace_event_follow_fork(tr, enabled); |
4124 | 4243 | ||
4244 | if (mask == TRACE_ITER_FUNC_FORK) | ||
4245 | ftrace_pid_follow_fork(tr, enabled); | ||
4246 | |||
4125 | if (mask == TRACE_ITER_OVERWRITE) { | 4247 | if (mask == TRACE_ITER_OVERWRITE) { |
4126 | ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); | 4248 | ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); |
4127 | #ifdef CONFIG_TRACER_MAX_TRACE | 4249 | #ifdef CONFIG_TRACER_MAX_TRACE |
@@ -4355,6 +4477,7 @@ static const char readme_msg[] = | |||
4355 | "\t -:[<group>/]<event>\n" | 4477 | "\t -:[<group>/]<event>\n" |
4356 | #ifdef CONFIG_KPROBE_EVENTS | 4478 | #ifdef CONFIG_KPROBE_EVENTS |
4357 | "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" | 4479 | "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" |
4480 | "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n" | ||
4358 | #endif | 4481 | #endif |
4359 | #ifdef CONFIG_UPROBE_EVENTS | 4482 | #ifdef CONFIG_UPROBE_EVENTS |
4360 | "\t place: <path>:<offset>\n" | 4483 | "\t place: <path>:<offset>\n" |
@@ -5529,7 +5652,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
5529 | .partial = partial_def, | 5652 | .partial = partial_def, |
5530 | .nr_pages = 0, /* This gets updated below. */ | 5653 | .nr_pages = 0, /* This gets updated below. */ |
5531 | .nr_pages_max = PIPE_DEF_BUFFERS, | 5654 | .nr_pages_max = PIPE_DEF_BUFFERS, |
5532 | .flags = flags, | ||
5533 | .ops = &tracing_pipe_buf_ops, | 5655 | .ops = &tracing_pipe_buf_ops, |
5534 | .spd_release = tracing_spd_release_pipe, | 5656 | .spd_release = tracing_spd_release_pipe, |
5535 | }; | 5657 | }; |
@@ -5962,6 +6084,7 @@ static int tracing_clock_open(struct inode *inode, struct file *file) | |||
5962 | struct ftrace_buffer_info { | 6084 | struct ftrace_buffer_info { |
5963 | struct trace_iterator iter; | 6085 | struct trace_iterator iter; |
5964 | void *spare; | 6086 | void *spare; |
6087 | unsigned int spare_cpu; | ||
5965 | unsigned int read; | 6088 | unsigned int read; |
5966 | }; | 6089 | }; |
5967 | 6090 | ||
@@ -6291,9 +6414,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
6291 | return -EBUSY; | 6414 | return -EBUSY; |
6292 | #endif | 6415 | #endif |
6293 | 6416 | ||
6294 | if (!info->spare) | 6417 | if (!info->spare) { |
6295 | info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, | 6418 | info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, |
6296 | iter->cpu_file); | 6419 | iter->cpu_file); |
6420 | info->spare_cpu = iter->cpu_file; | ||
6421 | } | ||
6297 | if (!info->spare) | 6422 | if (!info->spare) |
6298 | return -ENOMEM; | 6423 | return -ENOMEM; |
6299 | 6424 | ||
@@ -6353,7 +6478,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) | |||
6353 | __trace_array_put(iter->tr); | 6478 | __trace_array_put(iter->tr); |
6354 | 6479 | ||
6355 | if (info->spare) | 6480 | if (info->spare) |
6356 | ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); | 6481 | ring_buffer_free_read_page(iter->trace_buffer->buffer, |
6482 | info->spare_cpu, info->spare); | ||
6357 | kfree(info); | 6483 | kfree(info); |
6358 | 6484 | ||
6359 | mutex_unlock(&trace_types_lock); | 6485 | mutex_unlock(&trace_types_lock); |
@@ -6364,6 +6490,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) | |||
6364 | struct buffer_ref { | 6490 | struct buffer_ref { |
6365 | struct ring_buffer *buffer; | 6491 | struct ring_buffer *buffer; |
6366 | void *page; | 6492 | void *page; |
6493 | int cpu; | ||
6367 | int ref; | 6494 | int ref; |
6368 | }; | 6495 | }; |
6369 | 6496 | ||
@@ -6375,7 +6502,7 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, | |||
6375 | if (--ref->ref) | 6502 | if (--ref->ref) |
6376 | return; | 6503 | return; |
6377 | 6504 | ||
6378 | ring_buffer_free_read_page(ref->buffer, ref->page); | 6505 | ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); |
6379 | kfree(ref); | 6506 | kfree(ref); |
6380 | buf->private = 0; | 6507 | buf->private = 0; |
6381 | } | 6508 | } |
@@ -6409,7 +6536,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) | |||
6409 | if (--ref->ref) | 6536 | if (--ref->ref) |
6410 | return; | 6537 | return; |
6411 | 6538 | ||
6412 | ring_buffer_free_read_page(ref->buffer, ref->page); | 6539 | ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); |
6413 | kfree(ref); | 6540 | kfree(ref); |
6414 | spd->partial[i].private = 0; | 6541 | spd->partial[i].private = 0; |
6415 | } | 6542 | } |
@@ -6427,7 +6554,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
6427 | .pages = pages_def, | 6554 | .pages = pages_def, |
6428 | .partial = partial_def, | 6555 | .partial = partial_def, |
6429 | .nr_pages_max = PIPE_DEF_BUFFERS, | 6556 | .nr_pages_max = PIPE_DEF_BUFFERS, |
6430 | .flags = flags, | ||
6431 | .ops = &buffer_pipe_buf_ops, | 6557 | .ops = &buffer_pipe_buf_ops, |
6432 | .spd_release = buffer_spd_release, | 6558 | .spd_release = buffer_spd_release, |
6433 | }; | 6559 | }; |
@@ -6474,11 +6600,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
6474 | kfree(ref); | 6600 | kfree(ref); |
6475 | break; | 6601 | break; |
6476 | } | 6602 | } |
6603 | ref->cpu = iter->cpu_file; | ||
6477 | 6604 | ||
6478 | r = ring_buffer_read_page(ref->buffer, &ref->page, | 6605 | r = ring_buffer_read_page(ref->buffer, &ref->page, |
6479 | len, iter->cpu_file, 1); | 6606 | len, iter->cpu_file, 1); |
6480 | if (r < 0) { | 6607 | if (r < 0) { |
6481 | ring_buffer_free_read_page(ref->buffer, ref->page); | 6608 | ring_buffer_free_read_page(ref->buffer, ref->cpu, |
6609 | ref->page); | ||
6482 | kfree(ref); | 6610 | kfree(ref); |
6483 | break; | 6611 | break; |
6484 | } | 6612 | } |
@@ -6649,43 +6777,89 @@ static const struct file_operations tracing_dyn_info_fops = { | |||
6649 | 6777 | ||
6650 | #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) | 6778 | #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) |
6651 | static void | 6779 | static void |
6652 | ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) | 6780 | ftrace_snapshot(unsigned long ip, unsigned long parent_ip, |
6781 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
6782 | void *data) | ||
6653 | { | 6783 | { |
6654 | tracing_snapshot(); | 6784 | tracing_snapshot_instance(tr); |
6655 | } | 6785 | } |
6656 | 6786 | ||
6657 | static void | 6787 | static void |
6658 | ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) | 6788 | ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, |
6789 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
6790 | void *data) | ||
6659 | { | 6791 | { |
6660 | unsigned long *count = (long *)data; | 6792 | struct ftrace_func_mapper *mapper = data; |
6793 | long *count = NULL; | ||
6661 | 6794 | ||
6662 | if (!*count) | 6795 | if (mapper) |
6663 | return; | 6796 | count = (long *)ftrace_func_mapper_find_ip(mapper, ip); |
6797 | |||
6798 | if (count) { | ||
6799 | |||
6800 | if (*count <= 0) | ||
6801 | return; | ||
6664 | 6802 | ||
6665 | if (*count != -1) | ||
6666 | (*count)--; | 6803 | (*count)--; |
6804 | } | ||
6667 | 6805 | ||
6668 | tracing_snapshot(); | 6806 | tracing_snapshot_instance(tr); |
6669 | } | 6807 | } |
6670 | 6808 | ||
6671 | static int | 6809 | static int |
6672 | ftrace_snapshot_print(struct seq_file *m, unsigned long ip, | 6810 | ftrace_snapshot_print(struct seq_file *m, unsigned long ip, |
6673 | struct ftrace_probe_ops *ops, void *data) | 6811 | struct ftrace_probe_ops *ops, void *data) |
6674 | { | 6812 | { |
6675 | long count = (long)data; | 6813 | struct ftrace_func_mapper *mapper = data; |
6814 | long *count = NULL; | ||
6676 | 6815 | ||
6677 | seq_printf(m, "%ps:", (void *)ip); | 6816 | seq_printf(m, "%ps:", (void *)ip); |
6678 | 6817 | ||
6679 | seq_puts(m, "snapshot"); | 6818 | seq_puts(m, "snapshot"); |
6680 | 6819 | ||
6681 | if (count == -1) | 6820 | if (mapper) |
6682 | seq_puts(m, ":unlimited\n"); | 6821 | count = (long *)ftrace_func_mapper_find_ip(mapper, ip); |
6822 | |||
6823 | if (count) | ||
6824 | seq_printf(m, ":count=%ld\n", *count); | ||
6683 | else | 6825 | else |
6684 | seq_printf(m, ":count=%ld\n", count); | 6826 | seq_puts(m, ":unlimited\n"); |
6685 | 6827 | ||
6686 | return 0; | 6828 | return 0; |
6687 | } | 6829 | } |
6688 | 6830 | ||
6831 | static int | ||
6832 | ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, | ||
6833 | unsigned long ip, void *init_data, void **data) | ||
6834 | { | ||
6835 | struct ftrace_func_mapper *mapper = *data; | ||
6836 | |||
6837 | if (!mapper) { | ||
6838 | mapper = allocate_ftrace_func_mapper(); | ||
6839 | if (!mapper) | ||
6840 | return -ENOMEM; | ||
6841 | *data = mapper; | ||
6842 | } | ||
6843 | |||
6844 | return ftrace_func_mapper_add_ip(mapper, ip, init_data); | ||
6845 | } | ||
6846 | |||
6847 | static void | ||
6848 | ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, | ||
6849 | unsigned long ip, void *data) | ||
6850 | { | ||
6851 | struct ftrace_func_mapper *mapper = data; | ||
6852 | |||
6853 | if (!ip) { | ||
6854 | if (!mapper) | ||
6855 | return; | ||
6856 | free_ftrace_func_mapper(mapper, NULL); | ||
6857 | return; | ||
6858 | } | ||
6859 | |||
6860 | ftrace_func_mapper_remove_ip(mapper, ip); | ||
6861 | } | ||
6862 | |||
6689 | static struct ftrace_probe_ops snapshot_probe_ops = { | 6863 | static struct ftrace_probe_ops snapshot_probe_ops = { |
6690 | .func = ftrace_snapshot, | 6864 | .func = ftrace_snapshot, |
6691 | .print = ftrace_snapshot_print, | 6865 | .print = ftrace_snapshot_print, |
@@ -6694,10 +6868,12 @@ static struct ftrace_probe_ops snapshot_probe_ops = { | |||
6694 | static struct ftrace_probe_ops snapshot_count_probe_ops = { | 6868 | static struct ftrace_probe_ops snapshot_count_probe_ops = { |
6695 | .func = ftrace_count_snapshot, | 6869 | .func = ftrace_count_snapshot, |
6696 | .print = ftrace_snapshot_print, | 6870 | .print = ftrace_snapshot_print, |
6871 | .init = ftrace_snapshot_init, | ||
6872 | .free = ftrace_snapshot_free, | ||
6697 | }; | 6873 | }; |
6698 | 6874 | ||
6699 | static int | 6875 | static int |
6700 | ftrace_trace_snapshot_callback(struct ftrace_hash *hash, | 6876 | ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, |
6701 | char *glob, char *cmd, char *param, int enable) | 6877 | char *glob, char *cmd, char *param, int enable) |
6702 | { | 6878 | { |
6703 | struct ftrace_probe_ops *ops; | 6879 | struct ftrace_probe_ops *ops; |
@@ -6711,10 +6887,8 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, | |||
6711 | 6887 | ||
6712 | ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; | 6888 | ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; |
6713 | 6889 | ||
6714 | if (glob[0] == '!') { | 6890 | if (glob[0] == '!') |
6715 | unregister_ftrace_function_probe_func(glob+1, ops); | 6891 | return unregister_ftrace_function_probe_func(glob+1, tr, ops); |
6716 | return 0; | ||
6717 | } | ||
6718 | 6892 | ||
6719 | if (!param) | 6893 | if (!param) |
6720 | goto out_reg; | 6894 | goto out_reg; |
@@ -6733,11 +6907,11 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, | |||
6733 | return ret; | 6907 | return ret; |
6734 | 6908 | ||
6735 | out_reg: | 6909 | out_reg: |
6736 | ret = alloc_snapshot(&global_trace); | 6910 | ret = alloc_snapshot(tr); |
6737 | if (ret < 0) | 6911 | if (ret < 0) |
6738 | goto out; | 6912 | goto out; |
6739 | 6913 | ||
6740 | ret = register_ftrace_function_probe(glob, ops, count); | 6914 | ret = register_ftrace_function_probe(glob, tr, ops, count); |
6741 | 6915 | ||
6742 | out: | 6916 | out: |
6743 | return ret < 0 ? ret : 0; | 6917 | return ret < 0 ? ret : 0; |
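All of the counted probes in this series follow the same shape: per-ip state lives in an ftrace_func_mapper that .init allocates lazily, .func looks up, and .free tears down. A hedged skeleton of that trio (the probe itself is hypothetical; the mapper calls and probe-ops signatures are the ones declared in trace.h further down):

    /* Hypothetical countdown probe; only the mapper usage mirrors this series. */
    static void my_probe(unsigned long ip, unsigned long parent_ip,
                         struct trace_array *tr, struct ftrace_probe_ops *ops,
                         void *data)
    {
            struct ftrace_func_mapper *mapper = data;
            long *count;

            count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
            if (count && *count > 0)
                    (*count)--;             /* do the probe's real work here */
    }

    static int my_probe_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
                             unsigned long ip, void *init_data, void **data)
    {
            struct ftrace_func_mapper *mapper = *data;

            if (!mapper) {
                    mapper = allocate_ftrace_func_mapper();
                    if (!mapper)
                            return -ENOMEM;
                    *data = mapper;
            }
            /* init_data is the per-ip payload handed in at registration */
            return ftrace_func_mapper_add_ip(mapper, ip, init_data);
    }

    static void my_probe_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
                              unsigned long ip, void *data)
    {
            struct ftrace_func_mapper *mapper = data;

            if (!ip) {      /* ip == 0 means the whole probe is going away */
                    if (mapper)
                            free_ftrace_func_mapper(mapper, NULL);
                    return;
            }
            ftrace_func_mapper_remove_ip(mapper, ip);
    }

Registration then goes through register_ftrace_function_probe(glob, tr, &my_probe_ops, init_data), matching the out_reg path above (my_probe_ops being a hypothetical struct ftrace_probe_ops wired to the three callbacks).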
@@ -7348,6 +7522,8 @@ static int instance_mkdir(const char *name) | |||
7348 | goto out_free_tr; | 7522 | goto out_free_tr; |
7349 | } | 7523 | } |
7350 | 7524 | ||
7525 | ftrace_init_trace_array(tr); | ||
7526 | |||
7351 | init_tracer_tracefs(tr, tr->dir); | 7527 | init_tracer_tracefs(tr, tr->dir); |
7352 | init_trace_flags_index(tr); | 7528 | init_trace_flags_index(tr); |
7353 | __update_tracer_options(tr); | 7529 | __update_tracer_options(tr); |
@@ -7403,6 +7579,7 @@ static int instance_rmdir(const char *name) | |||
7403 | } | 7579 | } |
7404 | 7580 | ||
7405 | tracing_set_nop(tr); | 7581 | tracing_set_nop(tr); |
7582 | clear_ftrace_function_probes(tr); | ||
7406 | event_trace_del_tracer(tr); | 7583 | event_trace_del_tracer(tr); |
7407 | ftrace_clear_pids(tr); | 7584 | ftrace_clear_pids(tr); |
7408 | ftrace_destroy_function_files(tr); | 7585 | ftrace_destroy_function_files(tr); |
@@ -7968,6 +8145,9 @@ __init static int tracer_alloc_buffers(void) | |||
7968 | 8145 | ||
7969 | register_tracer(&nop_trace); | 8146 | register_tracer(&nop_trace); |
7970 | 8147 | ||
8148 | /* Function tracing may start here (via kernel command line) */ | ||
8149 | init_function_trace(); | ||
8150 | |||
7971 | /* All seems OK, enable tracing */ | 8151 | /* All seems OK, enable tracing */ |
7972 | tracing_disabled = 0; | 8152 | tracing_disabled = 0; |
7973 | 8153 | ||
@@ -8002,7 +8182,7 @@ out: | |||
8002 | return ret; | 8182 | return ret; |
8003 | } | 8183 | } |
8004 | 8184 | ||
8005 | void __init trace_init(void) | 8185 | void __init early_trace_init(void) |
8006 | { | 8186 | { |
8007 | if (tracepoint_printk) { | 8187 | if (tracepoint_printk) { |
8008 | tracepoint_print_iter = | 8188 | tracepoint_print_iter = |
@@ -8013,6 +8193,10 @@ void __init trace_init(void) | |||
8013 | static_key_enable(&tracepoint_printk_key.key); | 8193 | static_key_enable(&tracepoint_printk_key.key); |
8014 | } | 8194 | } |
8015 | tracer_alloc_buffers(); | 8195 | tracer_alloc_buffers(); |
8196 | } | ||
8197 | |||
8198 | void __init trace_init(void) | ||
8199 | { | ||
8016 | trace_event_init(); | 8200 | trace_event_init(); |
8017 | } | 8201 | } |
8018 | 8202 | ||
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d19d52d600d6..39fd77330aab 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -262,6 +262,9 @@ struct trace_array { | |||
262 | #ifdef CONFIG_FUNCTION_TRACER | 262 | #ifdef CONFIG_FUNCTION_TRACER |
263 | struct ftrace_ops *ops; | 263 | struct ftrace_ops *ops; |
264 | struct trace_pid_list __rcu *function_pids; | 264 | struct trace_pid_list __rcu *function_pids; |
265 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
266 | struct list_head func_probes; | ||
267 | #endif | ||
265 | /* function tracing enabled */ | 268 | /* function tracing enabled */ |
266 | int function_enabled; | 269 | int function_enabled; |
267 | #endif | 270 | #endif |
@@ -579,6 +582,8 @@ void tracing_reset_all_online_cpus(void); | |||
579 | int tracing_open_generic(struct inode *inode, struct file *filp); | 582 | int tracing_open_generic(struct inode *inode, struct file *filp); |
580 | bool tracing_is_disabled(void); | 583 | bool tracing_is_disabled(void); |
581 | int tracer_tracing_is_on(struct trace_array *tr); | 584 | int tracer_tracing_is_on(struct trace_array *tr); |
585 | void tracer_tracing_on(struct trace_array *tr); | ||
586 | void tracer_tracing_off(struct trace_array *tr); | ||
582 | struct dentry *trace_create_file(const char *name, | 587 | struct dentry *trace_create_file(const char *name, |
583 | umode_t mode, | 588 | umode_t mode, |
584 | struct dentry *parent, | 589 | struct dentry *parent, |
@@ -696,6 +701,9 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); | |||
696 | 701 | ||
697 | #ifdef CONFIG_DYNAMIC_FTRACE | 702 | #ifdef CONFIG_DYNAMIC_FTRACE |
698 | extern unsigned long ftrace_update_tot_cnt; | 703 | extern unsigned long ftrace_update_tot_cnt; |
704 | void ftrace_init_trace_array(struct trace_array *tr); | ||
705 | #else | ||
706 | static inline void ftrace_init_trace_array(struct trace_array *tr) { } | ||
699 | #endif | 707 | #endif |
700 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func | 708 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func |
701 | extern int DYN_FTRACE_TEST_NAME(void); | 709 | extern int DYN_FTRACE_TEST_NAME(void); |
@@ -880,6 +888,14 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) | |||
880 | extern struct list_head ftrace_pids; | 888 | extern struct list_head ftrace_pids; |
881 | 889 | ||
882 | #ifdef CONFIG_FUNCTION_TRACER | 890 | #ifdef CONFIG_FUNCTION_TRACER |
891 | struct ftrace_func_command { | ||
892 | struct list_head list; | ||
893 | char *name; | ||
894 | int (*func)(struct trace_array *tr, | ||
895 | struct ftrace_hash *hash, | ||
896 | char *func, char *cmd, | ||
897 | char *params, int enable); | ||
898 | }; | ||
883 | extern bool ftrace_filter_param __initdata; | 899 | extern bool ftrace_filter_param __initdata; |
884 | static inline int ftrace_trace_task(struct trace_array *tr) | 900 | static inline int ftrace_trace_task(struct trace_array *tr) |
885 | { | 901 | { |
@@ -897,6 +913,8 @@ void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); | |||
897 | void ftrace_init_tracefs_toplevel(struct trace_array *tr, | 913 | void ftrace_init_tracefs_toplevel(struct trace_array *tr, |
898 | struct dentry *d_tracer); | 914 | struct dentry *d_tracer); |
899 | void ftrace_clear_pids(struct trace_array *tr); | 915 | void ftrace_clear_pids(struct trace_array *tr); |
916 | int init_function_trace(void); | ||
917 | void ftrace_pid_follow_fork(struct trace_array *tr, bool enable); | ||
900 | #else | 918 | #else |
901 | static inline int ftrace_trace_task(struct trace_array *tr) | 919 | static inline int ftrace_trace_task(struct trace_array *tr) |
902 | { | 920 | { |
@@ -916,15 +934,75 @@ static inline void ftrace_reset_array_ops(struct trace_array *tr) { } | |||
916 | static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } | 934 | static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } |
917 | static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } | 935 | static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } |
918 | static inline void ftrace_clear_pids(struct trace_array *tr) { } | 936 | static inline void ftrace_clear_pids(struct trace_array *tr) { } |
937 | static inline int init_function_trace(void) { return 0; } | ||
938 | static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { } | ||
919 | /* ftrace_func_t type is not defined, use macro instead of static inline */ | 939 | /* ftrace_func_t type is not defined, use macro instead of static inline */ |
920 | #define ftrace_init_array_ops(tr, func) do { } while (0) | 940 | #define ftrace_init_array_ops(tr, func) do { } while (0) |
921 | #endif /* CONFIG_FUNCTION_TRACER */ | 941 | #endif /* CONFIG_FUNCTION_TRACER */ |
922 | 942 | ||
923 | #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) | 943 | #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) |
944 | |||
945 | struct ftrace_probe_ops { | ||
946 | void (*func)(unsigned long ip, | ||
947 | unsigned long parent_ip, | ||
948 | struct trace_array *tr, | ||
949 | struct ftrace_probe_ops *ops, | ||
950 | void *data); | ||
951 | int (*init)(struct ftrace_probe_ops *ops, | ||
952 | struct trace_array *tr, | ||
953 | unsigned long ip, void *init_data, | ||
954 | void **data); | ||
955 | void (*free)(struct ftrace_probe_ops *ops, | ||
956 | struct trace_array *tr, | ||
957 | unsigned long ip, void *data); | ||
958 | int (*print)(struct seq_file *m, | ||
959 | unsigned long ip, | ||
960 | struct ftrace_probe_ops *ops, | ||
961 | void *data); | ||
962 | }; | ||
963 | |||
964 | struct ftrace_func_mapper; | ||
965 | typedef int (*ftrace_mapper_func)(void *data); | ||
966 | |||
967 | struct ftrace_func_mapper *allocate_ftrace_func_mapper(void); | ||
968 | void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, | ||
969 | unsigned long ip); | ||
970 | int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, | ||
971 | unsigned long ip, void *data); | ||
972 | void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, | ||
973 | unsigned long ip); | ||
974 | void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, | ||
975 | ftrace_mapper_func free_func); | ||
976 | |||
977 | extern int | ||
978 | register_ftrace_function_probe(char *glob, struct trace_array *tr, | ||
979 | struct ftrace_probe_ops *ops, void *data); | ||
980 | extern int | ||
981 | unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, | ||
982 | struct ftrace_probe_ops *ops); | ||
983 | extern void clear_ftrace_function_probes(struct trace_array *tr); | ||
984 | |||
985 | int register_ftrace_command(struct ftrace_func_command *cmd); | ||
986 | int unregister_ftrace_command(struct ftrace_func_command *cmd); | ||
987 | |||
924 | void ftrace_create_filter_files(struct ftrace_ops *ops, | 988 | void ftrace_create_filter_files(struct ftrace_ops *ops, |
925 | struct dentry *parent); | 989 | struct dentry *parent); |
926 | void ftrace_destroy_filter_files(struct ftrace_ops *ops); | 990 | void ftrace_destroy_filter_files(struct ftrace_ops *ops); |
927 | #else | 991 | #else |
992 | struct ftrace_func_command; | ||
993 | |||
994 | static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) | ||
995 | { | ||
996 | return -EINVAL; | ||
997 | } | ||
998 | static inline __init int unregister_ftrace_command(char *cmd_name) | ||
999 | { | ||
1000 | return -EINVAL; | ||
1001 | } | ||
1002 | static inline void clear_ftrace_function_probes(struct trace_array *tr) | ||
1003 | { | ||
1004 | } | ||
1005 | |||
928 | /* | 1006 | /* |
929 | * The ops parameter passed in is usually undefined. | 1007 | * The ops parameter passed in is usually undefined. |
930 | * This must be a macro. | 1008 | * This must be a macro. |
@@ -989,11 +1067,13 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, | |||
989 | 1067 | ||
990 | #ifdef CONFIG_FUNCTION_TRACER | 1068 | #ifdef CONFIG_FUNCTION_TRACER |
991 | # define FUNCTION_FLAGS \ | 1069 | # define FUNCTION_FLAGS \ |
992 | C(FUNCTION, "function-trace"), | 1070 | C(FUNCTION, "function-trace"), \ |
1071 | C(FUNC_FORK, "function-fork"), | ||
993 | # define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION | 1072 | # define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION |
994 | #else | 1073 | #else |
995 | # define FUNCTION_FLAGS | 1074 | # define FUNCTION_FLAGS |
996 | # define FUNCTION_DEFAULT_FLAGS 0UL | 1075 | # define FUNCTION_DEFAULT_FLAGS 0UL |
1076 | # define TRACE_ITER_FUNC_FORK 0UL | ||
997 | #endif | 1077 | #endif |
998 | 1078 | ||
999 | #ifdef CONFIG_STACKTRACE | 1079 | #ifdef CONFIG_STACKTRACE |
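With ftrace_func_command now carrying the trace_array, a command registered against set_ftrace_filter can act on the instance it was written to rather than on the global array. A sketch of the wiring, assuming a hypothetical "mycmd" command (only the structure layout and register_ftrace_command() come from the declarations above):

    static int mycmd_callback(struct trace_array *tr, struct ftrace_hash *hash,
                              char *glob, char *cmd, char *param, int enable)
    {
            /* operate on 'tr', the instance whose filter file was written */
            return 0;
    }

    static struct ftrace_func_command mycmd = {
            .name = "mycmd",
            .func = mycmd_callback,
    };

    static __init int mycmd_init(void)
    {
            return register_ftrace_command(&mycmd);
    }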
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index e49fbe901cfc..16a8cf02eee9 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c | |||
@@ -153,10 +153,18 @@ static int benchmark_event_kthread(void *arg) | |||
153 | trace_do_benchmark(); | 153 | trace_do_benchmark(); |
154 | 154 | ||
155 | /* | 155 | /* |
156 | * We don't go to sleep, but let others | 156 | * We don't go to sleep, but let others run as well. |
157 | * run as well. | 157 | * This is basically a "yield()" to let any task that |
158 | * wants to run, schedule in, but if the CPU is idle, | ||
159 | * we'll keep burning cycles. | ||
160 | * | ||
161 | * Note the _rcu_qs() version of cond_resched() will | ||
162 | * notify synchronize_rcu_tasks() that this thread has | ||
163 | * passed a quiescent state for rcu_tasks. Otherwise | ||
164 | * this thread will never voluntarily schedule, which would | ||
165 | * block synchronize_rcu_tasks() indefinitely. | ||
158 | */ | 166 | */ |
159 | cond_resched(); | 167 | cond_resched_rcu_qs(); |
160 | } | 168 | } |
161 | 169 | ||
162 | return 0; | 170 | return 0; |
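The rewritten comment is the whole point of this change: a kthread that spins without ever blocking must still report an RCU-tasks quiescent state, or synchronize_rcu_tasks() can stall behind it. The same pattern in a stand-alone busy loop (the work function is hypothetical):

    #include <linux/kthread.h>
    #include <linux/sched.h>

    static int busy_thread(void *arg)
    {
            while (!kthread_should_stop()) {
                    do_work();      /* hypothetical, never blocks */

                    /*
                     * Yield if anyone else wants the CPU and, unlike plain
                     * cond_resched(), also note a quiescent state for
                     * RCU-tasks so synchronize_rcu_tasks() can make progress.
                     */
                    cond_resched_rcu_qs();
            }
            return 0;
    }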
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c203ac4df791..adcdbbeae010 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -348,14 +348,14 @@ FTRACE_ENTRY(hwlat, hwlat_entry, | |||
348 | __field( u64, duration ) | 348 | __field( u64, duration ) |
349 | __field( u64, outer_duration ) | 349 | __field( u64, outer_duration ) |
350 | __field( u64, nmi_total_ts ) | 350 | __field( u64, nmi_total_ts ) |
351 | __field_struct( struct timespec, timestamp ) | 351 | __field_struct( struct timespec64, timestamp ) |
352 | __field_desc( long, timestamp, tv_sec ) | 352 | __field_desc( s64, timestamp, tv_sec ) |
353 | __field_desc( long, timestamp, tv_nsec ) | 353 | __field_desc( long, timestamp, tv_nsec ) |
354 | __field( unsigned int, nmi_count ) | 354 | __field( unsigned int, nmi_count ) |
355 | __field( unsigned int, seqnum ) | 355 | __field( unsigned int, seqnum ) |
356 | ), | 356 | ), |
357 | 357 | ||
358 | F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", | 358 | F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", |
359 | __entry->seqnum, | 359 | __entry->seqnum, |
360 | __entry->tv_sec, | 360 | __entry->tv_sec, |
361 | __entry->tv_nsec, | 361 | __entry->tv_nsec, |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 93116549a284..e7973e10398c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -2460,15 +2460,8 @@ struct event_probe_data { | |||
2460 | bool enable; | 2460 | bool enable; |
2461 | }; | 2461 | }; |
2462 | 2462 | ||
2463 | static void | 2463 | static void update_event_probe(struct event_probe_data *data) |
2464 | event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) | ||
2465 | { | 2464 | { |
2466 | struct event_probe_data **pdata = (struct event_probe_data **)_data; | ||
2467 | struct event_probe_data *data = *pdata; | ||
2468 | |||
2469 | if (!data) | ||
2470 | return; | ||
2471 | |||
2472 | if (data->enable) | 2465 | if (data->enable) |
2473 | clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); | 2466 | clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); |
2474 | else | 2467 | else |
@@ -2476,77 +2469,141 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) | |||
2476 | } | 2469 | } |
2477 | 2470 | ||
2478 | static void | 2471 | static void |
2479 | event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) | 2472 | event_enable_probe(unsigned long ip, unsigned long parent_ip, |
2473 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
2474 | void *data) | ||
2480 | { | 2475 | { |
2481 | struct event_probe_data **pdata = (struct event_probe_data **)_data; | 2476 | struct ftrace_func_mapper *mapper = data; |
2482 | struct event_probe_data *data = *pdata; | 2477 | struct event_probe_data *edata; |
2478 | void **pdata; | ||
2483 | 2479 | ||
2484 | if (!data) | 2480 | pdata = ftrace_func_mapper_find_ip(mapper, ip); |
2481 | if (!pdata || !*pdata) | ||
2482 | return; | ||
2483 | |||
2484 | edata = *pdata; | ||
2485 | update_event_probe(edata); | ||
2486 | } | ||
2487 | |||
2488 | static void | ||
2489 | event_enable_count_probe(unsigned long ip, unsigned long parent_ip, | ||
2490 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
2491 | void *data) | ||
2492 | { | ||
2493 | struct ftrace_func_mapper *mapper = data; | ||
2494 | struct event_probe_data *edata; | ||
2495 | void **pdata; | ||
2496 | |||
2497 | pdata = ftrace_func_mapper_find_ip(mapper, ip); | ||
2498 | if (!pdata || !*pdata) | ||
2485 | return; | 2499 | return; |
2486 | 2500 | ||
2487 | if (!data->count) | 2501 | edata = *pdata; |
2502 | |||
2503 | if (!edata->count) | ||
2488 | return; | 2504 | return; |
2489 | 2505 | ||
2490 | /* Skip if the event is in a state we want to switch to */ | 2506 | /* Skip if the event is in a state we want to switch to */ |
2491 | if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) | 2507 | if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) |
2492 | return; | 2508 | return; |
2493 | 2509 | ||
2494 | if (data->count != -1) | 2510 | if (edata->count != -1) |
2495 | (data->count)--; | 2511 | (edata->count)--; |
2496 | 2512 | ||
2497 | event_enable_probe(ip, parent_ip, _data); | 2513 | update_event_probe(edata); |
2498 | } | 2514 | } |
2499 | 2515 | ||
2500 | static int | 2516 | static int |
2501 | event_enable_print(struct seq_file *m, unsigned long ip, | 2517 | event_enable_print(struct seq_file *m, unsigned long ip, |
2502 | struct ftrace_probe_ops *ops, void *_data) | 2518 | struct ftrace_probe_ops *ops, void *data) |
2503 | { | 2519 | { |
2504 | struct event_probe_data *data = _data; | 2520 | struct ftrace_func_mapper *mapper = data; |
2521 | struct event_probe_data *edata; | ||
2522 | void **pdata; | ||
2523 | |||
2524 | pdata = ftrace_func_mapper_find_ip(mapper, ip); | ||
2525 | |||
2526 | if (WARN_ON_ONCE(!pdata || !*pdata)) | ||
2527 | return 0; | ||
2528 | |||
2529 | edata = *pdata; | ||
2505 | 2530 | ||
2506 | seq_printf(m, "%ps:", (void *)ip); | 2531 | seq_printf(m, "%ps:", (void *)ip); |
2507 | 2532 | ||
2508 | seq_printf(m, "%s:%s:%s", | 2533 | seq_printf(m, "%s:%s:%s", |
2509 | data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, | 2534 | edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, |
2510 | data->file->event_call->class->system, | 2535 | edata->file->event_call->class->system, |
2511 | trace_event_name(data->file->event_call)); | 2536 | trace_event_name(edata->file->event_call)); |
2512 | 2537 | ||
2513 | if (data->count == -1) | 2538 | if (edata->count == -1) |
2514 | seq_puts(m, ":unlimited\n"); | 2539 | seq_puts(m, ":unlimited\n"); |
2515 | else | 2540 | else |
2516 | seq_printf(m, ":count=%ld\n", data->count); | 2541 | seq_printf(m, ":count=%ld\n", edata->count); |
2517 | 2542 | ||
2518 | return 0; | 2543 | return 0; |
2519 | } | 2544 | } |
2520 | 2545 | ||
2521 | static int | 2546 | static int |
2522 | event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, | 2547 | event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr, |
2523 | void **_data) | 2548 | unsigned long ip, void *init_data, void **data) |
2524 | { | 2549 | { |
2525 | struct event_probe_data **pdata = (struct event_probe_data **)_data; | 2550 | struct ftrace_func_mapper *mapper = *data; |
2526 | struct event_probe_data *data = *pdata; | 2551 | struct event_probe_data *edata = init_data; |
2552 | int ret; | ||
2553 | |||
2554 | if (!mapper) { | ||
2555 | mapper = allocate_ftrace_func_mapper(); | ||
2556 | if (!mapper) | ||
2557 | return -ENODEV; | ||
2558 | *data = mapper; | ||
2559 | } | ||
2560 | |||
2561 | ret = ftrace_func_mapper_add_ip(mapper, ip, edata); | ||
2562 | if (ret < 0) | ||
2563 | return ret; | ||
2564 | |||
2565 | edata->ref++; | ||
2527 | 2566 | ||
2528 | data->ref++; | 2567 | return 0; |
2568 | } | ||
2569 | |||
2570 | static int free_probe_data(void *data) | ||
2571 | { | ||
2572 | struct event_probe_data *edata = data; | ||
2573 | |||
2574 | edata->ref--; | ||
2575 | if (!edata->ref) { | ||
2576 | /* Remove the SOFT_MODE flag */ | ||
2577 | __ftrace_event_enable_disable(edata->file, 0, 1); | ||
2578 | module_put(edata->file->event_call->mod); | ||
2579 | kfree(edata); | ||
2580 | } | ||
2529 | return 0; | 2581 | return 0; |
2530 | } | 2582 | } |
2531 | 2583 | ||
2532 | static void | 2584 | static void |
2533 | event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, | 2585 | event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr, |
2534 | void **_data) | 2586 | unsigned long ip, void *data) |
2535 | { | 2587 | { |
2536 | struct event_probe_data **pdata = (struct event_probe_data **)_data; | 2588 | struct ftrace_func_mapper *mapper = data; |
2537 | struct event_probe_data *data = *pdata; | 2589 | struct event_probe_data *edata; |
2538 | 2590 | ||
2539 | if (WARN_ON_ONCE(data->ref <= 0)) | 2591 | if (!ip) { |
2592 | if (!mapper) | ||
2593 | return; | ||
2594 | free_ftrace_func_mapper(mapper, free_probe_data); | ||
2540 | return; | 2595 | return; |
2541 | |||
2542 | data->ref--; | ||
2543 | if (!data->ref) { | ||
2544 | /* Remove the SOFT_MODE flag */ | ||
2545 | __ftrace_event_enable_disable(data->file, 0, 1); | ||
2546 | module_put(data->file->event_call->mod); | ||
2547 | kfree(data); | ||
2548 | } | 2596 | } |
2549 | *pdata = NULL; | 2597 | |
2598 | edata = ftrace_func_mapper_remove_ip(mapper, ip); | ||
2599 | |||
2600 | if (WARN_ON_ONCE(!edata)) | ||
2601 | return; | ||
2602 | |||
2603 | if (WARN_ON_ONCE(edata->ref <= 0)) | ||
2604 | return; | ||
2605 | |||
2606 | free_probe_data(edata); | ||
2550 | } | 2607 | } |
2551 | 2608 | ||
2552 | static struct ftrace_probe_ops event_enable_probe_ops = { | 2609 | static struct ftrace_probe_ops event_enable_probe_ops = { |
@@ -2578,10 +2635,9 @@ static struct ftrace_probe_ops event_disable_count_probe_ops = { | |||
2578 | }; | 2635 | }; |
2579 | 2636 | ||
2580 | static int | 2637 | static int |
2581 | event_enable_func(struct ftrace_hash *hash, | 2638 | event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, |
2582 | char *glob, char *cmd, char *param, int enabled) | 2639 | char *glob, char *cmd, char *param, int enabled) |
2583 | { | 2640 | { |
2584 | struct trace_array *tr = top_trace_array(); | ||
2585 | struct trace_event_file *file; | 2641 | struct trace_event_file *file; |
2586 | struct ftrace_probe_ops *ops; | 2642 | struct ftrace_probe_ops *ops; |
2587 | struct event_probe_data *data; | 2643 | struct event_probe_data *data; |
@@ -2619,12 +2675,12 @@ event_enable_func(struct ftrace_hash *hash, | |||
2619 | ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; | 2675 | ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; |
2620 | 2676 | ||
2621 | if (glob[0] == '!') { | 2677 | if (glob[0] == '!') { |
2622 | unregister_ftrace_function_probe_func(glob+1, ops); | 2678 | ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); |
2623 | ret = 0; | ||
2624 | goto out; | 2679 | goto out; |
2625 | } | 2680 | } |
2626 | 2681 | ||
2627 | ret = -ENOMEM; | 2682 | ret = -ENOMEM; |
2683 | |||
2628 | data = kzalloc(sizeof(*data), GFP_KERNEL); | 2684 | data = kzalloc(sizeof(*data), GFP_KERNEL); |
2629 | if (!data) | 2685 | if (!data) |
2630 | goto out; | 2686 | goto out; |
@@ -2661,7 +2717,8 @@ event_enable_func(struct ftrace_hash *hash, | |||
2661 | ret = __ftrace_event_enable_disable(file, 1, 1); | 2717 | ret = __ftrace_event_enable_disable(file, 1, 1); |
2662 | if (ret < 0) | 2718 | if (ret < 0) |
2663 | goto out_put; | 2719 | goto out_put; |
2664 | ret = register_ftrace_function_probe(glob, ops, data); | 2720 | |
2721 | ret = register_ftrace_function_probe(glob, tr, ops, data); | ||
2665 | /* | 2722 | /* |
2666 | * The above returns on success the # of functions enabled, | 2723 | * The above returns on success the # of functions enabled, |
2667 | * but if it didn't find any functions it returns zero. | 2724 | * but if it didn't find any functions it returns zero. |
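Note how teardown is split: per-ip removal drops one reference, while the ip == 0 case hands free_probe_data() to free_ftrace_func_mapper() so every remaining entry is released in one pass. The callback contract in isolation (payload type and names are hypothetical; the mapper API is the one used above):

    /* Called once for each entry still in the mapper. */
    static int drop_entry(void *data)
    {
            struct my_payload *p = data;    /* hypothetical per-ip payload */

            if (--p->ref == 0)
                    kfree(p);
            return 0;
    }

    static void drop_all(struct ftrace_func_mapper *mapper)
    {
            /* invokes drop_entry() for every registered ip, then frees the mapper */
            free_ftrace_func_mapper(mapper, drop_entry);
    }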
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 0efa00d80623..a3bddbfd0874 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -267,10 +267,14 @@ static struct tracer function_trace __tracer_data = | |||
267 | }; | 267 | }; |
268 | 268 | ||
269 | #ifdef CONFIG_DYNAMIC_FTRACE | 269 | #ifdef CONFIG_DYNAMIC_FTRACE |
270 | static void update_traceon_count(void **data, bool on) | 270 | static void update_traceon_count(struct ftrace_probe_ops *ops, |
271 | unsigned long ip, | ||
272 | struct trace_array *tr, bool on, | ||
273 | void *data) | ||
271 | { | 274 | { |
272 | long *count = (long *)data; | 275 | struct ftrace_func_mapper *mapper = data; |
273 | long old_count = *count; | 276 | long *count; |
277 | long old_count; | ||
274 | 278 | ||
275 | /* | 279 | /* |
276 | * Tracing gets disabled (or enabled) once per count. | 280 | * Tracing gets disabled (or enabled) once per count. |
@@ -301,23 +305,22 @@ static void update_traceon_count(void **data, bool on) | |||
301 | * setting the tracing_on file. But we currently don't care | 305 | * setting the tracing_on file. But we currently don't care |
302 | * about that. | 306 | * about that. |
303 | */ | 307 | */ |
304 | if (!old_count) | 308 | count = (long *)ftrace_func_mapper_find_ip(mapper, ip); |
309 | old_count = *count; | ||
310 | |||
311 | if (old_count <= 0) | ||
305 | return; | 312 | return; |
306 | 313 | ||
307 | /* Make sure we see count before checking tracing state */ | 314 | /* Make sure we see count before checking tracing state */ |
308 | smp_rmb(); | 315 | smp_rmb(); |
309 | 316 | ||
310 | if (on == !!tracing_is_on()) | 317 | if (on == !!tracer_tracing_is_on(tr)) |
311 | return; | 318 | return; |
312 | 319 | ||
313 | if (on) | 320 | if (on) |
314 | tracing_on(); | 321 | tracer_tracing_on(tr); |
315 | else | 322 | else |
316 | tracing_off(); | 323 | tracer_tracing_off(tr); |
317 | |||
318 | /* unlimited? */ | ||
319 | if (old_count == -1) | ||
320 | return; | ||
321 | 324 | ||
322 | /* Make sure tracing state is visible before updating count */ | 325 | /* Make sure tracing state is visible before updating count */ |
323 | smp_wmb(); | 326 | smp_wmb(); |
@@ -326,33 +329,41 @@ static void update_traceon_count(void **data, bool on) | |||
326 | } | 329 | } |
327 | 330 | ||
328 | static void | 331 | static void |
329 | ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) | 332 | ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, |
333 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
334 | void *data) | ||
330 | { | 335 | { |
331 | update_traceon_count(data, 1); | 336 | update_traceon_count(ops, ip, tr, 1, data); |
332 | } | 337 | } |
333 | 338 | ||
334 | static void | 339 | static void |
335 | ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) | 340 | ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, |
341 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
342 | void *data) | ||
336 | { | 343 | { |
337 | update_traceon_count(data, 0); | 344 | update_traceon_count(ops, ip, tr, 0, data); |
338 | } | 345 | } |
339 | 346 | ||
340 | static void | 347 | static void |
341 | ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) | 348 | ftrace_traceon(unsigned long ip, unsigned long parent_ip, |
349 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
350 | void *data) | ||
342 | { | 351 | { |
343 | if (tracing_is_on()) | 352 | if (tracer_tracing_is_on(tr)) |
344 | return; | 353 | return; |
345 | 354 | ||
346 | tracing_on(); | 355 | tracer_tracing_on(tr); |
347 | } | 356 | } |
348 | 357 | ||
349 | static void | 358 | static void |
350 | ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) | 359 | ftrace_traceoff(unsigned long ip, unsigned long parent_ip, |
360 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
361 | void *data) | ||
351 | { | 362 | { |
352 | if (!tracing_is_on()) | 363 | if (!tracer_tracing_is_on(tr)) |
353 | return; | 364 | return; |
354 | 365 | ||
355 | tracing_off(); | 366 | tracer_tracing_off(tr); |
356 | } | 367 | } |
357 | 368 | ||
358 | /* | 369 | /* |
@@ -364,144 +375,218 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) | |||
364 | */ | 375 | */ |
365 | #define STACK_SKIP 4 | 376 | #define STACK_SKIP 4 |
366 | 377 | ||
378 | static __always_inline void trace_stack(struct trace_array *tr) | ||
379 | { | ||
380 | unsigned long flags; | ||
381 | int pc; | ||
382 | |||
383 | local_save_flags(flags); | ||
384 | pc = preempt_count(); | ||
385 | |||
386 | __trace_stack(tr, flags, STACK_SKIP, pc); | ||
387 | } | ||
388 | |||
367 | static void | 389 | static void |
368 | ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) | 390 | ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, |
391 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
392 | void *data) | ||
369 | { | 393 | { |
370 | trace_dump_stack(STACK_SKIP); | 394 | trace_stack(tr); |
371 | } | 395 | } |
372 | 396 | ||
373 | static void | 397 | static void |
374 | ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) | 398 | ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, |
399 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
400 | void *data) | ||
375 | { | 401 | { |
376 | long *count = (long *)data; | 402 | struct ftrace_func_mapper *mapper = data; |
403 | long *count; | ||
377 | long old_count; | 404 | long old_count; |
378 | long new_count; | 405 | long new_count; |
379 | 406 | ||
407 | if (!tracing_is_on()) | ||
408 | return; | ||
409 | |||
410 | /* unlimited? */ | ||
411 | if (!mapper) { | ||
412 | trace_stack(tr); | ||
413 | return; | ||
414 | } | ||
415 | |||
416 | count = (long *)ftrace_func_mapper_find_ip(mapper, ip); | ||
417 | |||
380 | /* | 418 | /* |
381 | * Stack traces should only execute the number of times the | 419 | * Stack traces should only execute the number of times the |
382 | * user specified in the counter. | 420 | * user specified in the counter. |
383 | */ | 421 | */ |
384 | do { | 422 | do { |
385 | |||
386 | if (!tracing_is_on()) | ||
387 | return; | ||
388 | |||
389 | old_count = *count; | 423 | old_count = *count; |
390 | 424 | ||
391 | if (!old_count) | 425 | if (!old_count) |
392 | return; | 426 | return; |
393 | 427 | ||
394 | /* unlimited? */ | ||
395 | if (old_count == -1) { | ||
396 | trace_dump_stack(STACK_SKIP); | ||
397 | return; | ||
398 | } | ||
399 | |||
400 | new_count = old_count - 1; | 428 | new_count = old_count - 1; |
401 | new_count = cmpxchg(count, old_count, new_count); | 429 | new_count = cmpxchg(count, old_count, new_count); |
402 | if (new_count == old_count) | 430 | if (new_count == old_count) |
403 | trace_dump_stack(STACK_SKIP); | 431 | trace_stack(tr); |
432 | |||
433 | if (!tracing_is_on()) | ||
434 | return; | ||
404 | 435 | ||
405 | } while (new_count != old_count); | 436 | } while (new_count != old_count); |
406 | } | 437 | } |
407 | 438 | ||
408 | static int update_count(void **data) | 439 | static int update_count(struct ftrace_probe_ops *ops, unsigned long ip, |
440 | void *data) | ||
409 | { | 441 | { |
410 | unsigned long *count = (long *)data; | 442 | struct ftrace_func_mapper *mapper = data; |
443 | long *count = NULL; | ||
411 | 444 | ||
412 | if (!*count) | 445 | if (mapper) |
413 | return 0; | 446 | count = (long *)ftrace_func_mapper_find_ip(mapper, ip); |
414 | 447 | ||
415 | if (*count != -1) | 448 | if (count) { |
449 | if (*count <= 0) | ||
450 | return 0; | ||
416 | (*count)--; | 451 | (*count)--; |
452 | } | ||
417 | 453 | ||
418 | return 1; | 454 | return 1; |
419 | } | 455 | } |
420 | 456 | ||
421 | static void | 457 | static void |
422 | ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) | 458 | ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, |
459 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
460 | void *data) | ||
423 | { | 461 | { |
424 | if (update_count(data)) | 462 | if (update_count(ops, ip, data)) |
425 | ftrace_dump(DUMP_ALL); | 463 | ftrace_dump(DUMP_ALL); |
426 | } | 464 | } |
427 | 465 | ||
428 | /* Only dump the current CPU buffer. */ | 466 | /* Only dump the current CPU buffer. */ |
429 | static void | 467 | static void |
430 | ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) | 468 | ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, |
469 | struct trace_array *tr, struct ftrace_probe_ops *ops, | ||
470 | void *data) | ||
431 | { | 471 | { |
432 | if (update_count(data)) | 472 | if (update_count(ops, ip, data)) |
433 | ftrace_dump(DUMP_ORIG); | 473 | ftrace_dump(DUMP_ORIG); |
434 | } | 474 | } |
435 | 475 | ||
436 | static int | 476 | static int |
437 | ftrace_probe_print(const char *name, struct seq_file *m, | 477 | ftrace_probe_print(const char *name, struct seq_file *m, |
438 | unsigned long ip, void *data) | 478 | unsigned long ip, struct ftrace_probe_ops *ops, |
479 | void *data) | ||
439 | { | 480 | { |
440 | long count = (long)data; | 481 | struct ftrace_func_mapper *mapper = data; |
482 | long *count = NULL; | ||
441 | 483 | ||
442 | seq_printf(m, "%ps:%s", (void *)ip, name); | 484 | seq_printf(m, "%ps:%s", (void *)ip, name); |
443 | 485 | ||
444 | if (count == -1) | 486 | if (mapper) |
445 | seq_puts(m, ":unlimited\n"); | 487 | count = (long *)ftrace_func_mapper_find_ip(mapper, ip); |
488 | |||
489 | if (count) | ||
490 | seq_printf(m, ":count=%ld\n", *count); | ||
446 | else | 491 | else |
447 | seq_printf(m, ":count=%ld\n", count); | 492 | seq_puts(m, ":unlimited\n"); |
448 | 493 | ||
449 | return 0; | 494 | return 0; |
450 | } | 495 | } |
451 | 496 | ||
452 | static int | 497 | static int |
453 | ftrace_traceon_print(struct seq_file *m, unsigned long ip, | 498 | ftrace_traceon_print(struct seq_file *m, unsigned long ip, |
454 | struct ftrace_probe_ops *ops, void *data) | 499 | struct ftrace_probe_ops *ops, |
500 | void *data) | ||
455 | { | 501 | { |
456 | return ftrace_probe_print("traceon", m, ip, data); | 502 | return ftrace_probe_print("traceon", m, ip, ops, data); |
457 | } | 503 | } |
458 | 504 | ||
459 | static int | 505 | static int |
460 | ftrace_traceoff_print(struct seq_file *m, unsigned long ip, | 506 | ftrace_traceoff_print(struct seq_file *m, unsigned long ip, |
461 | struct ftrace_probe_ops *ops, void *data) | 507 | struct ftrace_probe_ops *ops, void *data) |
462 | { | 508 | { |
463 | return ftrace_probe_print("traceoff", m, ip, data); | 509 | return ftrace_probe_print("traceoff", m, ip, ops, data); |
464 | } | 510 | } |
465 | 511 | ||
466 | static int | 512 | static int |
467 | ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, | 513 | ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, |
468 | struct ftrace_probe_ops *ops, void *data) | 514 | struct ftrace_probe_ops *ops, void *data) |
469 | { | 515 | { |
470 | return ftrace_probe_print("stacktrace", m, ip, data); | 516 | return ftrace_probe_print("stacktrace", m, ip, ops, data); |
471 | } | 517 | } |
472 | 518 | ||
473 | static int | 519 | static int |
474 | ftrace_dump_print(struct seq_file *m, unsigned long ip, | 520 | ftrace_dump_print(struct seq_file *m, unsigned long ip, |
475 | struct ftrace_probe_ops *ops, void *data) | 521 | struct ftrace_probe_ops *ops, void *data) |
476 | { | 522 | { |
477 | return ftrace_probe_print("dump", m, ip, data); | 523 | return ftrace_probe_print("dump", m, ip, ops, data); |
478 | } | 524 | } |
479 | 525 | ||
480 | static int | 526 | static int |
481 | ftrace_cpudump_print(struct seq_file *m, unsigned long ip, | 527 | ftrace_cpudump_print(struct seq_file *m, unsigned long ip, |
482 | struct ftrace_probe_ops *ops, void *data) | 528 | struct ftrace_probe_ops *ops, void *data) |
483 | { | 529 | { |
484 | return ftrace_probe_print("cpudump", m, ip, data); | 530 | return ftrace_probe_print("cpudump", m, ip, ops, data); |
531 | } | ||
532 | |||
533 | |||
534 | static int | ||
535 | ftrace_count_init(struct ftrace_probe_ops *ops, struct trace_array *tr, | ||
536 | unsigned long ip, void *init_data, void **data) | ||
537 | { | ||
538 | struct ftrace_func_mapper *mapper = *data; | ||
539 | |||
540 | if (!mapper) { | ||
541 | mapper = allocate_ftrace_func_mapper(); | ||
542 | if (!mapper) | ||
543 | return -ENOMEM; | ||
544 | *data = mapper; | ||
545 | } | ||
546 | |||
547 | return ftrace_func_mapper_add_ip(mapper, ip, init_data); | ||
548 | } | ||
549 | |||
550 | static void | ||
551 | ftrace_count_free(struct ftrace_probe_ops *ops, struct trace_array *tr, | ||
552 | unsigned long ip, void *data) | ||
553 | { | ||
554 | struct ftrace_func_mapper *mapper = data; | ||
555 | |||
556 | if (!ip) { | ||
557 | free_ftrace_func_mapper(mapper, NULL); | ||
558 | return; | ||
559 | } | ||
560 | |||
561 | ftrace_func_mapper_remove_ip(mapper, ip); | ||
485 | } | 562 | } |
486 | 563 | ||
487 | static struct ftrace_probe_ops traceon_count_probe_ops = { | 564 | static struct ftrace_probe_ops traceon_count_probe_ops = { |
488 | .func = ftrace_traceon_count, | 565 | .func = ftrace_traceon_count, |
489 | .print = ftrace_traceon_print, | 566 | .print = ftrace_traceon_print, |
567 | .init = ftrace_count_init, | ||
568 | .free = ftrace_count_free, | ||
490 | }; | 569 | }; |
491 | 570 | ||
492 | static struct ftrace_probe_ops traceoff_count_probe_ops = { | 571 | static struct ftrace_probe_ops traceoff_count_probe_ops = { |
493 | .func = ftrace_traceoff_count, | 572 | .func = ftrace_traceoff_count, |
494 | .print = ftrace_traceoff_print, | 573 | .print = ftrace_traceoff_print, |
574 | .init = ftrace_count_init, | ||
575 | .free = ftrace_count_free, | ||
495 | }; | 576 | }; |
496 | 577 | ||
497 | static struct ftrace_probe_ops stacktrace_count_probe_ops = { | 578 | static struct ftrace_probe_ops stacktrace_count_probe_ops = { |
498 | .func = ftrace_stacktrace_count, | 579 | .func = ftrace_stacktrace_count, |
499 | .print = ftrace_stacktrace_print, | 580 | .print = ftrace_stacktrace_print, |
581 | .init = ftrace_count_init, | ||
582 | .free = ftrace_count_free, | ||
500 | }; | 583 | }; |
501 | 584 | ||
502 | static struct ftrace_probe_ops dump_probe_ops = { | 585 | static struct ftrace_probe_ops dump_probe_ops = { |
503 | .func = ftrace_dump_probe, | 586 | .func = ftrace_dump_probe, |
504 | .print = ftrace_dump_print, | 587 | .print = ftrace_dump_print, |
588 | .init = ftrace_count_init, | ||
589 | .free = ftrace_count_free, | ||
505 | }; | 590 | }; |
506 | 591 | ||
507 | static struct ftrace_probe_ops cpudump_probe_ops = { | 592 | static struct ftrace_probe_ops cpudump_probe_ops = { |
@@ -525,7 +610,8 @@ static struct ftrace_probe_ops stacktrace_probe_ops = { | |||
525 | }; | 610 | }; |
526 | 611 | ||
527 | static int | 612 | static int |
528 | ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, | 613 | ftrace_trace_probe_callback(struct trace_array *tr, |
614 | struct ftrace_probe_ops *ops, | ||
529 | struct ftrace_hash *hash, char *glob, | 615 | struct ftrace_hash *hash, char *glob, |
530 | char *cmd, char *param, int enable) | 616 | char *cmd, char *param, int enable) |
531 | { | 617 | { |
@@ -537,10 +623,8 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, | |||
537 | if (!enable) | 623 | if (!enable) |
538 | return -EINVAL; | 624 | return -EINVAL; |
539 | 625 | ||
540 | if (glob[0] == '!') { | 626 | if (glob[0] == '!') |
541 | unregister_ftrace_function_probe_func(glob+1, ops); | 627 | return unregister_ftrace_function_probe_func(glob+1, tr, ops); |
542 | return 0; | ||
543 | } | ||
544 | 628 | ||
545 | if (!param) | 629 | if (!param) |
546 | goto out_reg; | 630 | goto out_reg; |
@@ -559,13 +643,13 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, | |||
559 | return ret; | 643 | return ret; |
560 | 644 | ||
561 | out_reg: | 645 | out_reg: |
562 | ret = register_ftrace_function_probe(glob, ops, count); | 646 | ret = register_ftrace_function_probe(glob, tr, ops, count); |
563 | 647 | ||
564 | return ret < 0 ? ret : 0; | 648 | return ret < 0 ? ret : 0; |
565 | } | 649 | } |
566 | 650 | ||
567 | static int | 651 | static int |
568 | ftrace_trace_onoff_callback(struct ftrace_hash *hash, | 652 | ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash, |
569 | char *glob, char *cmd, char *param, int enable) | 653 | char *glob, char *cmd, char *param, int enable) |
570 | { | 654 | { |
571 | struct ftrace_probe_ops *ops; | 655 | struct ftrace_probe_ops *ops; |
@@ -576,24 +660,24 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, | |||
576 | else | 660 | else |
577 | ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; | 661 | ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; |
578 | 662 | ||
579 | return ftrace_trace_probe_callback(ops, hash, glob, cmd, | 663 | return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, |
580 | param, enable); | 664 | param, enable); |
581 | } | 665 | } |
582 | 666 | ||
583 | static int | 667 | static int |
584 | ftrace_stacktrace_callback(struct ftrace_hash *hash, | 668 | ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash, |
585 | char *glob, char *cmd, char *param, int enable) | 669 | char *glob, char *cmd, char *param, int enable) |
586 | { | 670 | { |
587 | struct ftrace_probe_ops *ops; | 671 | struct ftrace_probe_ops *ops; |
588 | 672 | ||
589 | ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; | 673 | ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; |
590 | 674 | ||
591 | return ftrace_trace_probe_callback(ops, hash, glob, cmd, | 675 | return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, |
592 | param, enable); | 676 | param, enable); |
593 | } | 677 | } |
594 | 678 | ||
595 | static int | 679 | static int |
596 | ftrace_dump_callback(struct ftrace_hash *hash, | 680 | ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash, |
597 | char *glob, char *cmd, char *param, int enable) | 681 | char *glob, char *cmd, char *param, int enable) |
598 | { | 682 | { |
599 | struct ftrace_probe_ops *ops; | 683 | struct ftrace_probe_ops *ops; |
@@ -601,12 +685,12 @@ ftrace_dump_callback(struct ftrace_hash *hash, | |||
601 | ops = &dump_probe_ops; | 685 | ops = &dump_probe_ops; |
602 | 686 | ||
603 | /* Only dump once. */ | 687 | /* Only dump once. */ |
604 | return ftrace_trace_probe_callback(ops, hash, glob, cmd, | 688 | return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, |
605 | "1", enable); | 689 | "1", enable); |
606 | } | 690 | } |
607 | 691 | ||
608 | static int | 692 | static int |
609 | ftrace_cpudump_callback(struct ftrace_hash *hash, | 693 | ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash, |
610 | char *glob, char *cmd, char *param, int enable) | 694 | char *glob, char *cmd, char *param, int enable) |
611 | { | 695 | { |
612 | struct ftrace_probe_ops *ops; | 696 | struct ftrace_probe_ops *ops; |
@@ -614,7 +698,7 @@ ftrace_cpudump_callback(struct ftrace_hash *hash, | |||
614 | ops = &cpudump_probe_ops; | 698 | ops = &cpudump_probe_ops; |
615 | 699 | ||
616 | /* Only dump once. */ | 700 | /* Only dump once. */ |
617 | return ftrace_trace_probe_callback(ops, hash, glob, cmd, | 701 | return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, |
618 | "1", enable); | 702 | "1", enable); |
619 | } | 703 | } |
620 | 704 | ||
@@ -687,9 +771,8 @@ static inline int init_func_cmd_traceon(void) | |||
687 | } | 771 | } |
688 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 772 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
689 | 773 | ||
690 | static __init int init_function_trace(void) | 774 | __init int init_function_trace(void) |
691 | { | 775 | { |
692 | init_func_cmd_traceon(); | 776 | init_func_cmd_traceon(); |
693 | return register_tracer(&function_trace); | 777 | return register_tracer(&function_trace); |
694 | } | 778 | } |
695 | core_initcall(init_function_trace); | ||
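ftrace_stacktrace_count() above decrements its per-ip counter with cmpxchg() so that concurrent hits on the same function neither lose a decrement nor emit more stack traces than the user asked for; only the CPU that wins the exchange does the work. The same countdown pattern in isolation (a sketch, not the patch's code):

    /* Atomically consume one ticket from *count; true for the caller that won. */
    static bool take_one(long *count)
    {
            long old, new;

            do {
                    old = READ_ONCE(*count);
                    if (old <= 0)
                            return false;           /* exhausted */
                    new = old - 1;
            } while (cmpxchg(count, old, new) != old);

            return true;
    }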
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 21ea6ae77d93..d7c8e4ec3d9d 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c | |||
@@ -79,12 +79,12 @@ static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; | |||
79 | 79 | ||
80 | /* Individual latency samples are stored here when detected. */ | 80 | /* Individual latency samples are stored here when detected. */ |
81 | struct hwlat_sample { | 81 | struct hwlat_sample { |
82 | u64 seqnum; /* unique sequence */ | 82 | u64 seqnum; /* unique sequence */ |
83 | u64 duration; /* delta */ | 83 | u64 duration; /* delta */ |
84 | u64 outer_duration; /* delta (outer loop) */ | 84 | u64 outer_duration; /* delta (outer loop) */ |
85 | u64 nmi_total_ts; /* Total time spent in NMIs */ | 85 | u64 nmi_total_ts; /* Total time spent in NMIs */ |
86 | struct timespec timestamp; /* wall time */ | 86 | struct timespec64 timestamp; /* wall time */ |
87 | int nmi_count; /* # NMIs during this sample */ | 87 | int nmi_count; /* # NMIs during this sample */ |
88 | }; | 88 | }; |
89 | 89 | ||
90 | /* keep the global state somewhere. */ | 90 | /* keep the global state somewhere. */ |
@@ -250,7 +250,7 @@ static int get_sample(void) | |||
250 | s.seqnum = hwlat_data.count; | 250 | s.seqnum = hwlat_data.count; |
251 | s.duration = sample; | 251 | s.duration = sample; |
252 | s.outer_duration = outer_sample; | 252 | s.outer_duration = outer_sample; |
253 | s.timestamp = CURRENT_TIME; | 253 | ktime_get_real_ts64(&s.timestamp); |
254 | s.nmi_total_ts = nmi_total_ts; | 254 | s.nmi_total_ts = nmi_total_ts; |
255 | s.nmi_count = nmi_count; | 255 | s.nmi_count = nmi_count; |
256 | trace_hwlat_sample(&s); | 256 | trace_hwlat_sample(&s); |
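Replacing CURRENT_TIME with ktime_get_real_ts64() makes the sample timestamp y2038-safe: tv_sec becomes a 64-bit time64_t, which is why the trace_output.c hunks below cast it to long long for printing. The new pattern in miniature:

    #include <linux/printk.h>
    #include <linux/timekeeping.h>

    static void show_wall_time(void)
    {
            struct timespec64 ts;

            ktime_get_real_ts64(&ts);
            /* tv_sec is time64_t now; cast it for the format string */
            pr_info("wall time: %lld.%09ld\n",
                    (long long)ts.tv_sec, ts.tv_nsec);
    }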
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5f688cc724f0..c129fca6ec99 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include "trace_probe.h" | 25 | #include "trace_probe.h" |
26 | 26 | ||
27 | #define KPROBE_EVENT_SYSTEM "kprobes" | 27 | #define KPROBE_EVENT_SYSTEM "kprobes" |
28 | #define KRETPROBE_MAXACTIVE_MAX 4096 | ||
28 | 29 | ||
29 | /** | 30 | /** |
30 | * Kprobe event core functions | 31 | * Kprobe event core functions |
@@ -282,6 +283,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, | |||
282 | void *addr, | 283 | void *addr, |
283 | const char *symbol, | 284 | const char *symbol, |
284 | unsigned long offs, | 285 | unsigned long offs, |
286 | int maxactive, | ||
285 | int nargs, bool is_return) | 287 | int nargs, bool is_return) |
286 | { | 288 | { |
287 | struct trace_kprobe *tk; | 289 | struct trace_kprobe *tk; |
@@ -309,6 +311,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, | |||
309 | else | 311 | else |
310 | tk->rp.kp.pre_handler = kprobe_dispatcher; | 312 | tk->rp.kp.pre_handler = kprobe_dispatcher; |
311 | 313 | ||
314 | tk->rp.maxactive = maxactive; | ||
315 | |||
312 | if (!event || !is_good_name(event)) { | 316 | if (!event || !is_good_name(event)) { |
313 | ret = -EINVAL; | 317 | ret = -EINVAL; |
314 | goto error; | 318 | goto error; |
@@ -598,8 +602,10 @@ static int create_trace_kprobe(int argc, char **argv) | |||
598 | { | 602 | { |
599 | /* | 603 | /* |
600 | * Argument syntax: | 604 | * Argument syntax: |
601 | * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] | 605 | * - Add kprobe: |
602 | * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] | 606 | * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] |
607 | * - Add kretprobe: | ||
608 | * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] | ||
603 | * Fetch args: | 609 | * Fetch args: |
604 | * $retval : fetch return value | 610 | * $retval : fetch return value |
605 | * $stack : fetch stack address | 611 | * $stack : fetch stack address |
@@ -619,6 +625,7 @@ static int create_trace_kprobe(int argc, char **argv) | |||
619 | int i, ret = 0; | 625 | int i, ret = 0; |
620 | bool is_return = false, is_delete = false; | 626 | bool is_return = false, is_delete = false; |
621 | char *symbol = NULL, *event = NULL, *group = NULL; | 627 | char *symbol = NULL, *event = NULL, *group = NULL; |
628 | int maxactive = 0; | ||
622 | char *arg; | 629 | char *arg; |
623 | unsigned long offset = 0; | 630 | unsigned long offset = 0; |
624 | void *addr = NULL; | 631 | void *addr = NULL; |
@@ -637,8 +644,28 @@ static int create_trace_kprobe(int argc, char **argv) | |||
637 | return -EINVAL; | 644 | return -EINVAL; |
638 | } | 645 | } |
639 | 646 | ||
640 | if (argv[0][1] == ':') { | 647 | event = strchr(&argv[0][1], ':'); |
641 | event = &argv[0][2]; | 648 | if (event) { |
649 | event[0] = '\0'; | ||
650 | event++; | ||
651 | } | ||
652 | if (is_return && isdigit(argv[0][1])) { | ||
653 | ret = kstrtouint(&argv[0][1], 0, &maxactive); | ||
654 | if (ret) { | ||
655 | pr_info("Failed to parse maxactive.\n"); | ||
656 | return ret; | ||
657 | } | ||
658 | /* kretprobe instances are iterated over via a list. The | ||
659 | * maximum should stay reasonable. | ||
660 | */ | ||
661 | if (maxactive > KRETPROBE_MAXACTIVE_MAX) { | ||
662 | pr_info("Maxactive is too big (%d > %d).\n", | ||
663 | maxactive, KRETPROBE_MAXACTIVE_MAX); | ||
664 | return -E2BIG; | ||
665 | } | ||
666 | } | ||
667 | |||
668 | if (event) { | ||
642 | if (strchr(event, '/')) { | 669 | if (strchr(event, '/')) { |
643 | group = event; | 670 | group = event; |
644 | event = strchr(group, '/') + 1; | 671 | event = strchr(group, '/') + 1; |
@@ -681,10 +708,6 @@ static int create_trace_kprobe(int argc, char **argv) | |||
681 | return -EINVAL; | 708 | return -EINVAL; |
682 | } | 709 | } |
683 | if (isdigit(argv[1][0])) { | 710 | if (isdigit(argv[1][0])) { |
684 | if (is_return) { | ||
685 | pr_info("Return probe point must be a symbol.\n"); | ||
686 | return -EINVAL; | ||
687 | } | ||
688 | /* an address specified */ | 711 | /* an address specified */ |
689 | ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); | 712 | ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); |
690 | if (ret) { | 713 | if (ret) { |
@@ -700,8 +723,9 @@ static int create_trace_kprobe(int argc, char **argv) | |||
700 | pr_info("Failed to parse symbol.\n"); | 723 | pr_info("Failed to parse symbol.\n"); |
701 | return ret; | 724 | return ret; |
702 | } | 725 | } |
703 | if (offset && is_return) { | 726 | if (offset && is_return && |
704 | pr_info("Return probe must be used without offset.\n"); | 727 | !function_offset_within_entry(NULL, symbol, offset)) { |
728 | pr_info("Given offset is not valid for return probe.\n"); | ||
705 | return -EINVAL; | 729 | return -EINVAL; |
706 | } | 730 | } |
707 | } | 731 | } |
@@ -718,8 +742,8 @@ static int create_trace_kprobe(int argc, char **argv) | |||
718 | is_return ? 'r' : 'p', addr); | 742 | is_return ? 'r' : 'p', addr); |
719 | event = buf; | 743 | event = buf; |
720 | } | 744 | } |
721 | tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, | 745 | tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, |
722 | is_return); | 746 | argc, is_return); |
723 | if (IS_ERR(tk)) { | 747 | if (IS_ERR(tk)) { |
724 | pr_info("Failed to allocate trace_probe.(%d)\n", | 748 | pr_info("Failed to allocate trace_probe.(%d)\n", |
725 | (int)PTR_ERR(tk)); | 749 | (int)PTR_ERR(tk)); |
@@ -1511,6 +1535,11 @@ static __init int kprobe_trace_self_tests_init(void) | |||
1511 | 1535 | ||
1512 | end: | 1536 | end: |
1513 | release_all_trace_kprobes(); | 1537 | release_all_trace_kprobes(); |
1538 | /* | ||
1539 | * Wait for the optimizer work to finish. Otherwise it might fiddle | ||
1540 | * with probes in already freed __init text. | ||
1541 | */ | ||
1542 | wait_for_kprobe_optimizer(); | ||
1514 | if (warn) | 1543 | if (warn) |
1515 | pr_cont("NG: Some tests are failed. Please check them.\n"); | 1544 | pr_cont("NG: Some tests are failed. Please check them.\n"); |
1516 | else | 1545 | else |
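Editor's note: the self-test must wait for the asynchronous kprobe optimizer before its own __init text is freed. A module-style sketch of the same general pattern, flushing async work that may still reference an object before that object is released (names, interval and the heap buffer are illustrative, not the optimizer's actual code):

/*
 * Illustrative module sketch: flush asynchronous work that may still
 * reference an object (here a heap buffer, in the patch the __init text)
 * before that object is released.
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/workqueue.h>

static char *shared_buf;
static struct delayed_work scan_work;

static void scan_fn(struct work_struct *work)
{
	pr_info("scanning %s\n", shared_buf);	/* touches shared_buf */
}

static int __init demo_init(void)
{
	shared_buf = kstrdup("demo data", GFP_KERNEL);
	if (!shared_buf)
		return -ENOMEM;
	INIT_DELAYED_WORK(&scan_work, scan_fn);
	schedule_delayed_work(&scan_work, HZ);
	return 0;
}

static void __exit demo_exit(void)
{
	/* wait for any pending or running instance before freeing */
	cancel_delayed_work_sync(&scan_work);
	kfree(shared_buf);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");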
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02a4aeb22c47..08f9bab8089e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -4,7 +4,6 @@ | |||
4 | * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> | 4 | * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> |
5 | * | 5 | * |
6 | */ | 6 | */ |
7 | |||
8 | #include <linux/module.h> | 7 | #include <linux/module.h> |
9 | #include <linux/mutex.h> | 8 | #include <linux/mutex.h> |
10 | #include <linux/ftrace.h> | 9 | #include <linux/ftrace.h> |
@@ -1161,11 +1160,11 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, | |||
1161 | 1160 | ||
1162 | trace_assign_type(field, entry); | 1161 | trace_assign_type(field, entry); |
1163 | 1162 | ||
1164 | trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", | 1163 | trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld", |
1165 | field->seqnum, | 1164 | field->seqnum, |
1166 | field->duration, | 1165 | field->duration, |
1167 | field->outer_duration, | 1166 | field->outer_duration, |
1168 | field->timestamp.tv_sec, | 1167 | (long long)field->timestamp.tv_sec, |
1169 | field->timestamp.tv_nsec); | 1168 | field->timestamp.tv_nsec); |
1170 | 1169 | ||
1171 | if (field->nmi_count) { | 1170 | if (field->nmi_count) { |
@@ -1195,10 +1194,10 @@ trace_hwlat_raw(struct trace_iterator *iter, int flags, | |||
1195 | 1194 | ||
1196 | trace_assign_type(field, iter->ent); | 1195 | trace_assign_type(field, iter->ent); |
1197 | 1196 | ||
1198 | trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", | 1197 | trace_seq_printf(s, "%llu %lld %lld %09ld %u\n", |
1199 | field->duration, | 1198 | field->duration, |
1200 | field->outer_duration, | 1199 | field->outer_duration, |
1201 | field->timestamp.tv_sec, | 1200 | (long long)field->timestamp.tv_sec, |
1202 | field->timestamp.tv_nsec, | 1201 | field->timestamp.tv_nsec, |
1203 | field->seqnum); | 1202 | field->seqnum); |
1204 | 1203 | ||
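Editor's note: tv_sec here is a time64_t, which is wider than long on 32-bit builds, so both format strings switch to %lld with an explicit cast. A minimal userspace analogue of the portable idiom (the struct is a stand-in, not the kernel type):

/*
 * Userspace analogue of the format fix: when a seconds field may be wider
 * than long (as time64_t is on 32-bit kernels), print it with %lld and an
 * explicit cast so the format matches the promoted argument everywhere.
 */
#include <inttypes.h>
#include <stdio.h>

struct sample_ts {
	int64_t tv_sec;		/* stands in for time64_t */
	long	tv_nsec;
};

int main(void)
{
	struct sample_ts ts = { .tv_sec = 1493000000, .tv_nsec = 123456789 };

	printf("ts:%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}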
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5fb1f2c87e6b..76aa04d4c925 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -35,7 +35,7 @@ unsigned long stack_trace_max_size; | |||
35 | arch_spinlock_t stack_trace_max_lock = | 35 | arch_spinlock_t stack_trace_max_lock = |
36 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 36 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
37 | 37 | ||
38 | static DEFINE_PER_CPU(int, trace_active); | 38 | DEFINE_PER_CPU(int, disable_stack_tracer); |
39 | static DEFINE_MUTEX(stack_sysctl_mutex); | 39 | static DEFINE_MUTEX(stack_sysctl_mutex); |
40 | 40 | ||
41 | int stack_tracer_enabled; | 41 | int stack_tracer_enabled; |
@@ -96,6 +96,14 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
96 | if (in_nmi()) | 96 | if (in_nmi()) |
97 | return; | 97 | return; |
98 | 98 | ||
99 | /* | ||
100 | * There's a slight chance that we are tracing inside the | ||
101 | * RCU infrastructure, and rcu_irq_enter() will not work | ||
102 | * as expected. | ||
103 | */ | ||
104 | if (unlikely(rcu_irq_enter_disabled())) | ||
105 | return; | ||
106 | |||
99 | local_irq_save(flags); | 107 | local_irq_save(flags); |
100 | arch_spin_lock(&stack_trace_max_lock); | 108 | arch_spin_lock(&stack_trace_max_lock); |
101 | 109 | ||
@@ -207,13 +215,12 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, | |||
207 | struct ftrace_ops *op, struct pt_regs *pt_regs) | 215 | struct ftrace_ops *op, struct pt_regs *pt_regs) |
208 | { | 216 | { |
209 | unsigned long stack; | 217 | unsigned long stack; |
210 | int cpu; | ||
211 | 218 | ||
212 | preempt_disable_notrace(); | 219 | preempt_disable_notrace(); |
213 | 220 | ||
214 | cpu = raw_smp_processor_id(); | ||
215 | /* no atomic needed, we only modify this variable by this cpu */ | 221 | /* no atomic needed, we only modify this variable by this cpu */ |
216 | if (per_cpu(trace_active, cpu)++ != 0) | 222 | __this_cpu_inc(disable_stack_tracer); |
223 | if (__this_cpu_read(disable_stack_tracer) != 1) | ||
217 | goto out; | 224 | goto out; |
218 | 225 | ||
219 | ip += MCOUNT_INSN_SIZE; | 226 | ip += MCOUNT_INSN_SIZE; |
@@ -221,7 +228,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, | |||
221 | check_stack(ip, &stack); | 228 | check_stack(ip, &stack); |
222 | 229 | ||
223 | out: | 230 | out: |
224 | per_cpu(trace_active, cpu)--; | 231 | __this_cpu_dec(disable_stack_tracer); |
225 | /* prevent recursion in schedule */ | 232 | /* prevent recursion in schedule */ |
226 | preempt_enable_notrace(); | 233 | preempt_enable_notrace(); |
227 | } | 234 | } |
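Editor's note: the callback still guards against recursing into itself, now by bumping the per-CPU disable_stack_tracer counter and only doing real work when it is the sole active level. A userspace sketch of that guard, with a thread-local counter standing in for the per-CPU variable:

/*
 * Userspace sketch of the recursion guard this hunk keeps: only the
 * outermost entry (counter == 1) does the real work; nested entries just
 * balance the counter and return.
 */
#include <stdio.h>

static _Thread_local int guard;

static void do_real_work(void);

static void traced_hook(void)
{
	guard++;
	if (guard != 1)		/* nested call: bail out */
		goto out;
	do_real_work();
out:
	guard--;
}

static void do_real_work(void)
{
	traced_hook();		/* re-entry is harmless: inner call is skipped */
	puts("did the real work once");
}

int main(void)
{
	traced_hook();
	return 0;
}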
@@ -253,7 +260,6 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, | |||
253 | long *ptr = filp->private_data; | 260 | long *ptr = filp->private_data; |
254 | unsigned long val, flags; | 261 | unsigned long val, flags; |
255 | int ret; | 262 | int ret; |
256 | int cpu; | ||
257 | 263 | ||
258 | ret = kstrtoul_from_user(ubuf, count, 10, &val); | 264 | ret = kstrtoul_from_user(ubuf, count, 10, &val); |
259 | if (ret) | 265 | if (ret) |
@@ -264,16 +270,15 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, | |||
264 | /* | 270 | /* |
265 | * In case we trace inside arch_spin_lock() or after (NMI), | 271 | * In case we trace inside arch_spin_lock() or after (NMI), |
266 | * we will cause circular lock, so we also need to increase | 272 | * we will cause circular lock, so we also need to increase |
267 | * the percpu trace_active here. | 273 | * the percpu disable_stack_tracer here. |
268 | */ | 274 | */ |
269 | cpu = smp_processor_id(); | 275 | __this_cpu_inc(disable_stack_tracer); |
270 | per_cpu(trace_active, cpu)++; | ||
271 | 276 | ||
272 | arch_spin_lock(&stack_trace_max_lock); | 277 | arch_spin_lock(&stack_trace_max_lock); |
273 | *ptr = val; | 278 | *ptr = val; |
274 | arch_spin_unlock(&stack_trace_max_lock); | 279 | arch_spin_unlock(&stack_trace_max_lock); |
275 | 280 | ||
276 | per_cpu(trace_active, cpu)--; | 281 | __this_cpu_dec(disable_stack_tracer); |
277 | local_irq_restore(flags); | 282 | local_irq_restore(flags); |
278 | 283 | ||
279 | return count; | 284 | return count; |
@@ -307,12 +312,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
307 | 312 | ||
308 | static void *t_start(struct seq_file *m, loff_t *pos) | 313 | static void *t_start(struct seq_file *m, loff_t *pos) |
309 | { | 314 | { |
310 | int cpu; | ||
311 | |||
312 | local_irq_disable(); | 315 | local_irq_disable(); |
313 | 316 | ||
314 | cpu = smp_processor_id(); | 317 | __this_cpu_inc(disable_stack_tracer); |
315 | per_cpu(trace_active, cpu)++; | ||
316 | 318 | ||
317 | arch_spin_lock(&stack_trace_max_lock); | 319 | arch_spin_lock(&stack_trace_max_lock); |
318 | 320 | ||
@@ -324,12 +326,9 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
324 | 326 | ||
325 | static void t_stop(struct seq_file *m, void *p) | 327 | static void t_stop(struct seq_file *m, void *p) |
326 | { | 328 | { |
327 | int cpu; | ||
328 | |||
329 | arch_spin_unlock(&stack_trace_max_lock); | 329 | arch_spin_unlock(&stack_trace_max_lock); |
330 | 330 | ||
331 | cpu = smp_processor_id(); | 331 | __this_cpu_dec(disable_stack_tracer); |
332 | per_cpu(trace_active, cpu)--; | ||
333 | 332 | ||
334 | local_irq_enable(); | 333 | local_irq_enable(); |
335 | } | 334 | } |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c0168b7da1ea..c74bf39ef764 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -3209,9 +3209,8 @@ static int init_worker_pool(struct worker_pool *pool) | |||
3209 | INIT_LIST_HEAD(&pool->idle_list); | 3209 | INIT_LIST_HEAD(&pool->idle_list); |
3210 | hash_init(pool->busy_hash); | 3210 | hash_init(pool->busy_hash); |
3211 | 3211 | ||
3212 | init_timer_deferrable(&pool->idle_timer); | 3212 | setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout, |
3213 | pool->idle_timer.function = idle_worker_timeout; | 3213 | (unsigned long)pool); |
3214 | pool->idle_timer.data = (unsigned long)pool; | ||
3215 | 3214 | ||
3216 | setup_timer(&pool->mayday_timer, pool_mayday_timeout, | 3215 | setup_timer(&pool->mayday_timer, pool_mayday_timeout, |
3217 | (unsigned long)pool); | 3216 | (unsigned long)pool); |
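Editor's note: the open-coded init_timer_deferrable() plus two field assignments collapse into one setup_deferrable_timer() call. A module-style sketch of the helper as used in this era's timer API, where the callback still takes an unsigned long cookie (names, cookie and interval are illustrative; this predates the later timer_setup() conversion):

/*
 * Module-style sketch of the helper this hunk switches to: the callback
 * receives the cookie passed as the third setup argument, and the timer is
 * deferrable (it will not wake an idle CPU just to fire).
 */
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/timer.h>

static struct timer_list demo_timer;

static void demo_timeout(unsigned long data)
{
	pr_info("deferrable timer fired, cookie=%lu\n", data);
}

static int __init demo_init(void)
{
	/* one call replaces init_timer_deferrable() + two field assignments */
	setup_deferrable_timer(&demo_timer, demo_timeout, 42UL);
	mod_timer(&demo_timer, jiffies + 10 * HZ);
	return 0;
}

static void __exit demo_exit(void)
{
	del_timer_sync(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");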
@@ -4735,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) | |||
4735 | return wfc.ret; | 4734 | return wfc.ret; |
4736 | } | 4735 | } |
4737 | EXPORT_SYMBOL_GPL(work_on_cpu); | 4736 | EXPORT_SYMBOL_GPL(work_on_cpu); |
4737 | |||
4738 | /** | ||
4739 | * work_on_cpu_safe - run a function in thread context on a particular cpu | ||
4740 | * @cpu: the cpu to run on | ||
4741 | * @fn: the function to run | ||
4742 | * @arg: the function argument | ||
4743 | * | ||
4744 | * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold | ||
4745 | * any locks which would prevent @fn from completing. | ||
4746 | * | ||
4747 | * Return: The value @fn returns. | ||
4748 | */ | ||
4749 | long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) | ||
4750 | { | ||
4751 | long ret = -ENODEV; | ||
4752 | |||
4753 | get_online_cpus(); | ||
4754 | if (cpu_online(cpu)) | ||
4755 | ret = work_on_cpu(cpu, fn, arg); | ||
4756 | put_online_cpus(); | ||
4757 | return ret; | ||
4758 | } | ||
4759 | EXPORT_SYMBOL_GPL(work_on_cpu_safe); | ||
4738 | #endif /* CONFIG_SMP */ | 4760 | #endif /* CONFIG_SMP */ |
4739 | 4761 | ||
4740 | #ifdef CONFIG_FREEZER | 4762 | #ifdef CONFIG_FREEZER |
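Editor's note: the new work_on_cpu_safe() wraps work_on_cpu() in get_online_cpus()/put_online_cpus() so the target CPU cannot be unplugged mid-call. A hedged caller sketch, matching the signature added above (the "query" body is purely illustrative):

/*
 * Caller sketch for the new helper: run a function in thread context on
 * CPU 0 with hotplug held off for the duration.
 */
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static long query_on_cpu(void *arg)
{
	/* runs in a workqueue worker bound to the requested CPU */
	return (long)smp_processor_id();
}

static int __init demo_init(void)
{
	long ret = work_on_cpu_safe(0, query_on_cpu, NULL);

	if (ret < 0)		/* -ENODEV if the CPU was offline */
		return ret;
	pr_info("ran on cpu %ld\n", ret);
	return 0;
}

static void __exit demo_exit(void)
{
	/* nothing to clean up */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");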