| author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2015-02-10 14:35:36 -0500 |
|---|---|---|
| committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2015-02-10 14:35:36 -0500 |
| commit | 4ba24fef3eb3b142197135223b90ced2f319cd53 | |
| tree | a20c125b27740ec7b4c761b11d801108e1b316b2 /kernel | |
| parent | 47c1ffb2b6b630894e9a16442611c056ab21c057 | |
| parent | 98a4a59ee31a12105a2b84f5b8b515ac2cb208ef | |
Merge branch 'next' into for-linus
Prepare first round of input updates for 3.20.
Diffstat (limited to 'kernel')
171 files changed, 12976 insertions, 5148 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index dc5c77544fd6..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o | |||
| 57 | obj-$(CONFIG_USER_NS) += user_namespace.o | 57 | obj-$(CONFIG_USER_NS) += user_namespace.o |
| 58 | obj-$(CONFIG_PID_NS) += pid_namespace.o | 58 | obj-$(CONFIG_PID_NS) += pid_namespace.o |
| 59 | obj-$(CONFIG_IKCONFIG) += configs.o | 59 | obj-$(CONFIG_IKCONFIG) += configs.o |
| 60 | obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o | ||
| 61 | obj-$(CONFIG_SMP) += stop_machine.o | 60 | obj-$(CONFIG_SMP) += stop_machine.o |
| 62 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o | 61 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o |
| 63 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o | 62 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o |
| @@ -86,7 +85,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ | |||
| 86 | obj-$(CONFIG_TRACEPOINTS) += trace/ | 85 | obj-$(CONFIG_TRACEPOINTS) += trace/ |
| 87 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | 86 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
| 88 | obj-$(CONFIG_CPU_PM) += cpu_pm.o | 87 | obj-$(CONFIG_CPU_PM) += cpu_pm.o |
| 89 | obj-$(CONFIG_NET) += bpf/ | 88 | obj-$(CONFIG_BPF) += bpf/ |
| 90 | 89 | ||
| 91 | obj-$(CONFIG_PERF_EVENTS) += events/ | 90 | obj-$(CONFIG_PERF_EVENTS) += events/ |
| 92 | 91 | ||
diff --git a/kernel/acct.c b/kernel/acct.c
index b4c667d22e79..33738ef972f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
| @@ -472,7 +472,6 @@ static void do_acct_process(struct bsd_acct_struct *acct) | |||
| 472 | acct_t ac; | 472 | acct_t ac; |
| 473 | unsigned long flim; | 473 | unsigned long flim; |
| 474 | const struct cred *orig_cred; | 474 | const struct cred *orig_cred; |
| 475 | struct pid_namespace *ns = acct->ns; | ||
| 476 | struct file *file = acct->file; | 475 | struct file *file = acct->file; |
| 477 | 476 | ||
| 478 | /* | 477 | /* |
| @@ -500,10 +499,15 @@ static void do_acct_process(struct bsd_acct_struct *acct) | |||
| 500 | ac.ac_gid16 = ac.ac_gid; | 499 | ac.ac_gid16 = ac.ac_gid; |
| 501 | #endif | 500 | #endif |
| 502 | #if ACCT_VERSION == 3 | 501 | #if ACCT_VERSION == 3 |
| 503 | ac.ac_pid = task_tgid_nr_ns(current, ns); | 502 | { |
| 504 | rcu_read_lock(); | 503 | struct pid_namespace *ns = acct->ns; |
| 505 | ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); | 504 | |
| 506 | rcu_read_unlock(); | 505 | ac.ac_pid = task_tgid_nr_ns(current, ns); |
| 506 | rcu_read_lock(); | ||
| 507 | ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), | ||
| 508 | ns); | ||
| 509 | rcu_read_unlock(); | ||
| 510 | } | ||
| 507 | #endif | 511 | #endif |
| 508 | /* | 512 | /* |
| 509 | * Get freeze protection. If the fs is frozen, just skip the write | 513 | * Get freeze protection. If the fs is frozen, just skip the write |
diff --git a/kernel/async.c b/kernel/async.c
index 61f023ce0228..4c3773c0bf63 100644
--- a/kernel/async.c
+++ b/kernel/async.c
| @@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work) | |||
| 115 | 115 | ||
| 116 | /* 1) run (and print duration) */ | 116 | /* 1) run (and print duration) */ |
| 117 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 117 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
| 118 | printk(KERN_DEBUG "calling %lli_%pF @ %i\n", | 118 | pr_debug("calling %lli_%pF @ %i\n", |
| 119 | (long long)entry->cookie, | 119 | (long long)entry->cookie, |
| 120 | entry->func, task_pid_nr(current)); | 120 | entry->func, task_pid_nr(current)); |
| 121 | calltime = ktime_get(); | 121 | calltime = ktime_get(); |
| @@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work) | |||
| 124 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 124 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
| 125 | rettime = ktime_get(); | 125 | rettime = ktime_get(); |
| 126 | delta = ktime_sub(rettime, calltime); | 126 | delta = ktime_sub(rettime, calltime); |
| 127 | printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", | 127 | pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", |
| 128 | (long long)entry->cookie, | 128 | (long long)entry->cookie, |
| 129 | entry->func, | 129 | entry->func, |
| 130 | (long long)ktime_to_ns(delta) >> 10); | 130 | (long long)ktime_to_ns(delta) >> 10); |
| @@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain | |||
| 285 | ktime_t uninitialized_var(starttime), delta, endtime; | 285 | ktime_t uninitialized_var(starttime), delta, endtime; |
| 286 | 286 | ||
| 287 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 287 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
| 288 | printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); | 288 | pr_debug("async_waiting @ %i\n", task_pid_nr(current)); |
| 289 | starttime = ktime_get(); | 289 | starttime = ktime_get(); |
| 290 | } | 290 | } |
| 291 | 291 | ||
| @@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain | |||
| 295 | endtime = ktime_get(); | 295 | endtime = ktime_get(); |
| 296 | delta = ktime_sub(endtime, starttime); | 296 | delta = ktime_sub(endtime, starttime); |
| 297 | 297 | ||
| 298 | printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", | 298 | pr_debug("async_continuing @ %i after %lli usec\n", |
| 299 | task_pid_nr(current), | 299 | task_pid_nr(current), |
| 300 | (long long)ktime_to_ns(delta) >> 10); | 300 | (long long)ktime_to_ns(delta) >> 10); |
| 301 | } | 301 | } |
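
The hunks above swap printk(KERN_DEBUG ...) for pr_debug(). Note that pr_debug() compiles to a no-op unless DEBUG is defined for the file or CONFIG_DYNAMIC_DEBUG enables the call site at runtime, and it picks up the file's pr_fmt() prefix. A minimal sketch of the idiom, with a hypothetical "async_demo" prefix and function that are not part of the commit:

```c
/* Illustrative only: the pr_debug() idiom the hunks above switch to. */
#define pr_fmt(fmt) "async_demo: " fmt	/* must precede the printk.h include */

#include <linux/printk.h>
#include <linux/ktime.h>

static void async_demo_report(ktime_t calltime)
{
	ktime_t delta = ktime_sub(ktime_get(), calltime);

	/* Silent unless DEBUG or dynamic debug enables this call site. */
	pr_debug("completed after %lld usecs\n",
		 (long long)ktime_to_ns(delta) >> 10);
}
```
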
diff --git a/kernel/audit.c b/kernel/audit.c
index ba2ff5a5c600..72ab759a0b43 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); | |||
| 126 | 126 | ||
| 127 | /* The netlink socket. */ | 127 | /* The netlink socket. */ |
| 128 | static struct sock *audit_sock; | 128 | static struct sock *audit_sock; |
| 129 | int audit_net_id; | 129 | static int audit_net_id; |
| 130 | 130 | ||
| 131 | /* Hash for inode-based rules */ | 131 | /* Hash for inode-based rules */ |
| 132 | struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; | 132 | struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; |
| @@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb) | |||
| 429 | * This function doesn't consume an skb as might be expected since it has to | 429 | * This function doesn't consume an skb as might be expected since it has to |
| 430 | * copy it anyways. | 430 | * copy it anyways. |
| 431 | */ | 431 | */ |
| 432 | static void kauditd_send_multicast_skb(struct sk_buff *skb) | 432 | static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) |
| 433 | { | 433 | { |
| 434 | struct sk_buff *copy; | 434 | struct sk_buff *copy; |
| 435 | struct audit_net *aunet = net_generic(&init_net, audit_net_id); | 435 | struct audit_net *aunet = net_generic(&init_net, audit_net_id); |
| @@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb) | |||
| 448 | * no reason for new multicast clients to continue with this | 448 | * no reason for new multicast clients to continue with this |
| 449 | * non-compliance. | 449 | * non-compliance. |
| 450 | */ | 450 | */ |
| 451 | copy = skb_copy(skb, GFP_KERNEL); | 451 | copy = skb_copy(skb, gfp_mask); |
| 452 | if (!copy) | 452 | if (!copy) |
| 453 | return; | 453 | return; |
| 454 | 454 | ||
| 455 | nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); | 455 | nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask); |
| 456 | } | 456 | } |
| 457 | 457 | ||
| 458 | /* | 458 | /* |
| @@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy) | |||
| 499 | set_freezable(); | 499 | set_freezable(); |
| 500 | while (!kthread_should_stop()) { | 500 | while (!kthread_should_stop()) { |
| 501 | struct sk_buff *skb; | 501 | struct sk_buff *skb; |
| 502 | DECLARE_WAITQUEUE(wait, current); | ||
| 503 | 502 | ||
| 504 | flush_hold_queue(); | 503 | flush_hold_queue(); |
| 505 | 504 | ||
| @@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy) | |||
| 514 | audit_printk_skb(skb); | 513 | audit_printk_skb(skb); |
| 515 | continue; | 514 | continue; |
| 516 | } | 515 | } |
| 517 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 518 | add_wait_queue(&kauditd_wait, &wait); | ||
| 519 | 516 | ||
| 520 | if (!skb_queue_len(&audit_skb_queue)) { | 517 | wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); |
| 521 | try_to_freeze(); | ||
| 522 | schedule(); | ||
| 523 | } | ||
| 524 | |||
| 525 | __set_current_state(TASK_RUNNING); | ||
| 526 | remove_wait_queue(&kauditd_wait, &wait); | ||
| 527 | } | 518 | } |
| 528 | return 0; | 519 | return 0; |
| 529 | } | 520 | } |
| @@ -724,7 +715,7 @@ static int audit_get_feature(struct sk_buff *skb) | |||
| 724 | 715 | ||
| 725 | seq = nlmsg_hdr(skb)->nlmsg_seq; | 716 | seq = nlmsg_hdr(skb)->nlmsg_seq; |
| 726 | 717 | ||
| 727 | audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); | 718 | audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af)); |
| 728 | 719 | ||
| 729 | return 0; | 720 | return 0; |
| 730 | } | 721 | } |
| @@ -739,7 +730,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature | |||
| 739 | 730 | ||
| 740 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); | 731 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); |
| 741 | audit_log_task_info(ab, current); | 732 | audit_log_task_info(ab, current); |
| 742 | audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", | 733 | audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", |
| 743 | audit_feature_names[which], !!old_feature, !!new_feature, | 734 | audit_feature_names[which], !!old_feature, !!new_feature, |
| 744 | !!old_lock, !!new_lock, res); | 735 | !!old_lock, !!new_lock, res); |
| 745 | audit_log_end(ab); | 736 | audit_log_end(ab); |
| @@ -750,7 +741,7 @@ static int audit_set_feature(struct sk_buff *skb) | |||
| 750 | struct audit_features *uaf; | 741 | struct audit_features *uaf; |
| 751 | int i; | 742 | int i; |
| 752 | 743 | ||
| 753 | BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); | 744 | BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); |
| 754 | uaf = nlmsg_data(nlmsg_hdr(skb)); | 745 | uaf = nlmsg_data(nlmsg_hdr(skb)); |
| 755 | 746 | ||
| 756 | /* if there is ever a version 2 we should handle that here */ | 747 | /* if there is ever a version 2 we should handle that here */ |
| @@ -842,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 842 | s.backlog_limit = audit_backlog_limit; | 833 | s.backlog_limit = audit_backlog_limit; |
| 843 | s.lost = atomic_read(&audit_lost); | 834 | s.lost = atomic_read(&audit_lost); |
| 844 | s.backlog = skb_queue_len(&audit_skb_queue); | 835 | s.backlog = skb_queue_len(&audit_skb_queue); |
| 845 | s.version = AUDIT_VERSION_LATEST; | 836 | s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; |
| 846 | s.backlog_wait_time = audit_backlog_wait_time; | 837 | s.backlog_wait_time = audit_backlog_wait_time; |
| 847 | audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); | 838 | audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); |
| 848 | break; | 839 | break; |
| @@ -1109,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb) | |||
| 1109 | } | 1100 | } |
| 1110 | 1101 | ||
| 1111 | /* Run custom bind function on netlink socket group connect or bind requests. */ | 1102 | /* Run custom bind function on netlink socket group connect or bind requests. */ |
| 1112 | static int audit_bind(int group) | 1103 | static int audit_bind(struct net *net, int group) |
| 1113 | { | 1104 | { |
| 1114 | if (!capable(CAP_AUDIT_READ)) | 1105 | if (!capable(CAP_AUDIT_READ)) |
| 1115 | return -EPERM; | 1106 | return -EPERM; |
| @@ -1301,19 +1292,9 @@ err: | |||
| 1301 | */ | 1292 | */ |
| 1302 | unsigned int audit_serial(void) | 1293 | unsigned int audit_serial(void) |
| 1303 | { | 1294 | { |
| 1304 | static DEFINE_SPINLOCK(serial_lock); | 1295 | static atomic_t serial = ATOMIC_INIT(0); |
| 1305 | static unsigned int serial = 0; | ||
| 1306 | |||
| 1307 | unsigned long flags; | ||
| 1308 | unsigned int ret; | ||
| 1309 | |||
| 1310 | spin_lock_irqsave(&serial_lock, flags); | ||
| 1311 | do { | ||
| 1312 | ret = ++serial; | ||
| 1313 | } while (unlikely(!ret)); | ||
| 1314 | spin_unlock_irqrestore(&serial_lock, flags); | ||
| 1315 | 1296 | ||
| 1316 | return ret; | 1297 | return atomic_add_return(1, &serial); |
| 1317 | } | 1298 | } |
| 1318 | 1299 | ||
| 1319 | static inline void audit_get_stamp(struct audit_context *ctx, | 1300 | static inline void audit_get_stamp(struct audit_context *ctx, |
| @@ -1681,7 +1662,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) | |||
| 1681 | } | 1662 | } |
| 1682 | } | 1663 | } |
| 1683 | 1664 | ||
| 1684 | void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) | 1665 | static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) |
| 1685 | { | 1666 | { |
| 1686 | kernel_cap_t *perm = &name->fcap.permitted; | 1667 | kernel_cap_t *perm = &name->fcap.permitted; |
| 1687 | kernel_cap_t *inh = &name->fcap.inheritable; | 1668 | kernel_cap_t *inh = &name->fcap.inheritable; |
| @@ -1860,7 +1841,7 @@ EXPORT_SYMBOL(audit_log_task_context); | |||
| 1860 | void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | 1841 | void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) |
| 1861 | { | 1842 | { |
| 1862 | const struct cred *cred; | 1843 | const struct cred *cred; |
| 1863 | char name[sizeof(tsk->comm)]; | 1844 | char comm[sizeof(tsk->comm)]; |
| 1864 | struct mm_struct *mm = tsk->mm; | 1845 | struct mm_struct *mm = tsk->mm; |
| 1865 | char *tty; | 1846 | char *tty; |
| 1866 | 1847 | ||
| @@ -1894,9 +1875,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | |||
| 1894 | from_kgid(&init_user_ns, cred->fsgid), | 1875 | from_kgid(&init_user_ns, cred->fsgid), |
| 1895 | tty, audit_get_sessionid(tsk)); | 1876 | tty, audit_get_sessionid(tsk)); |
| 1896 | 1877 | ||
| 1897 | get_task_comm(name, tsk); | ||
| 1898 | audit_log_format(ab, " comm="); | 1878 | audit_log_format(ab, " comm="); |
| 1899 | audit_log_untrustedstring(ab, name); | 1879 | audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); |
| 1900 | 1880 | ||
| 1901 | if (mm) { | 1881 | if (mm) { |
| 1902 | down_read(&mm->mmap_sem); | 1882 | down_read(&mm->mmap_sem); |
| @@ -1959,7 +1939,8 @@ void audit_log_end(struct audit_buffer *ab) | |||
| 1959 | } else { | 1939 | } else { |
| 1960 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | 1940 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); |
| 1961 | 1941 | ||
| 1962 | kauditd_send_multicast_skb(ab->skb); | 1942 | nlh->nlmsg_len = ab->skb->len; |
| 1943 | kauditd_send_multicast_skb(ab->skb, ab->gfp_mask); | ||
| 1963 | 1944 | ||
| 1964 | /* | 1945 | /* |
| 1965 | * The original kaudit unicast socket sends up messages with | 1946 | * The original kaudit unicast socket sends up messages with |
| @@ -1970,7 +1951,7 @@ void audit_log_end(struct audit_buffer *ab) | |||
| 1970 | * protocol between the kaudit kernel subsystem and the auditd | 1951 | * protocol between the kaudit kernel subsystem and the auditd |
| 1971 | * userspace code. | 1952 | * userspace code. |
| 1972 | */ | 1953 | */ |
| 1973 | nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; | 1954 | nlh->nlmsg_len -= NLMSG_HDRLEN; |
| 1974 | 1955 | ||
| 1975 | if (audit_pid) { | 1956 | if (audit_pid) { |
| 1976 | skb_queue_tail(&audit_skb_queue, ab->skb); | 1957 | skb_queue_tail(&audit_skb_queue, ab->skb); |
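
In the kauditd_thread() hunk above, the open-coded prepare-to-wait sequence (DECLARE_WAITQUEUE, set_current_state(), add_wait_queue(), try_to_freeze(), schedule()) collapses into a single wait_event_freezable() call, and audit_serial() drops its spinlock in favor of an atomic counter. A minimal sketch of the freezable-kthread pattern, with hypothetical names:

```c
/* Illustrative only: a freezable kthread that sleeps until work is queued,
 * in the style kauditd_thread() uses after the hunk above.
 */
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/skbuff.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wait);
static struct sk_buff_head demo_queue;

static int demo_thread(void *unused)
{
	set_freezable();
	while (!kthread_should_stop()) {
		struct sk_buff *skb = skb_dequeue(&demo_queue);

		if (skb) {
			/* ... process one buffer ... */
			consume_skb(skb);
			continue;
		}
		/* Sleeps interruptibly, cooperating with the freezer. */
		wait_event_freezable(demo_wait, skb_queue_len(&demo_queue));
	}
	return 0;
}
```
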
diff --git a/kernel/audit.h b/kernel/audit.h
index 7bb65730c890..3cdffad5a1d9 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
| @@ -222,7 +222,6 @@ extern void audit_copy_inode(struct audit_names *name, | |||
| 222 | const struct inode *inode); | 222 | const struct inode *inode); |
| 223 | extern void audit_log_cap(struct audit_buffer *ab, char *prefix, | 223 | extern void audit_log_cap(struct audit_buffer *ab, char *prefix, |
| 224 | kernel_cap_t *cap); | 224 | kernel_cap_t *cap); |
| 225 | extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name); | ||
| 226 | extern void audit_log_name(struct audit_context *context, | 225 | extern void audit_log_name(struct audit_context *context, |
| 227 | struct audit_names *n, struct path *path, | 226 | struct audit_names *n, struct path *path, |
| 228 | int record_num, int *call_panic); | 227 | int record_num, int *call_panic); |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 135944a7b28a..2e0c97427b33 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
| @@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count) | |||
| 154 | chunk->owners[i].index = i; | 154 | chunk->owners[i].index = i; |
| 155 | } | 155 | } |
| 156 | fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); | 156 | fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); |
| 157 | chunk->mark.mask = FS_IN_IGNORED; | ||
| 157 | return chunk; | 158 | return chunk; |
| 158 | } | 159 | } |
| 159 | 160 | ||
| @@ -173,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk) | |||
| 173 | struct fsnotify_mark *entry = &chunk->mark; | 174 | struct fsnotify_mark *entry = &chunk->mark; |
| 174 | struct list_head *list; | 175 | struct list_head *list; |
| 175 | 176 | ||
| 176 | if (!entry->i.inode) | 177 | if (!entry->inode) |
| 177 | return; | 178 | return; |
| 178 | list = chunk_hash(entry->i.inode); | 179 | list = chunk_hash(entry->inode); |
| 179 | list_add_rcu(&chunk->hash, list); | 180 | list_add_rcu(&chunk->hash, list); |
| 180 | } | 181 | } |
| 181 | 182 | ||
| @@ -187,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) | |||
| 187 | 188 | ||
| 188 | list_for_each_entry_rcu(p, list, hash) { | 189 | list_for_each_entry_rcu(p, list, hash) { |
| 189 | /* mark.inode may have gone NULL, but who cares? */ | 190 | /* mark.inode may have gone NULL, but who cares? */ |
| 190 | if (p->mark.i.inode == inode) { | 191 | if (p->mark.inode == inode) { |
| 191 | atomic_long_inc(&p->refs); | 192 | atomic_long_inc(&p->refs); |
| 192 | return p; | 193 | return p; |
| 193 | } | 194 | } |
| @@ -230,7 +231,7 @@ static void untag_chunk(struct node *p) | |||
| 230 | new = alloc_chunk(size); | 231 | new = alloc_chunk(size); |
| 231 | 232 | ||
| 232 | spin_lock(&entry->lock); | 233 | spin_lock(&entry->lock); |
| 233 | if (chunk->dead || !entry->i.inode) { | 234 | if (chunk->dead || !entry->inode) { |
| 234 | spin_unlock(&entry->lock); | 235 | spin_unlock(&entry->lock); |
| 235 | if (new) | 236 | if (new) |
| 236 | free_chunk(new); | 237 | free_chunk(new); |
| @@ -257,7 +258,7 @@ static void untag_chunk(struct node *p) | |||
| 257 | goto Fallback; | 258 | goto Fallback; |
| 258 | 259 | ||
| 259 | fsnotify_duplicate_mark(&new->mark, entry); | 260 | fsnotify_duplicate_mark(&new->mark, entry); |
| 260 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { | 261 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { |
| 261 | fsnotify_put_mark(&new->mark); | 262 | fsnotify_put_mark(&new->mark); |
| 262 | goto Fallback; | 263 | goto Fallback; |
| 263 | } | 264 | } |
| @@ -385,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
| 385 | chunk_entry = &chunk->mark; | 386 | chunk_entry = &chunk->mark; |
| 386 | 387 | ||
| 387 | spin_lock(&old_entry->lock); | 388 | spin_lock(&old_entry->lock); |
| 388 | if (!old_entry->i.inode) { | 389 | if (!old_entry->inode) { |
| 389 | /* old_entry is being shot, lets just lie */ | 390 | /* old_entry is being shot, lets just lie */ |
| 390 | spin_unlock(&old_entry->lock); | 391 | spin_unlock(&old_entry->lock); |
| 391 | fsnotify_put_mark(old_entry); | 392 | fsnotify_put_mark(old_entry); |
| @@ -394,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
| 394 | } | 395 | } |
| 395 | 396 | ||
| 396 | fsnotify_duplicate_mark(chunk_entry, old_entry); | 397 | fsnotify_duplicate_mark(chunk_entry, old_entry); |
| 397 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { | 398 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { |
| 398 | spin_unlock(&old_entry->lock); | 399 | spin_unlock(&old_entry->lock); |
| 399 | fsnotify_put_mark(chunk_entry); | 400 | fsnotify_put_mark(chunk_entry); |
| 400 | fsnotify_put_mark(old_entry); | 401 | fsnotify_put_mark(old_entry); |
| @@ -449,7 +450,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
| 449 | return 0; | 450 | return 0; |
| 450 | } | 451 | } |
| 451 | 452 | ||
| 452 | static void audit_log_remove_rule(struct audit_krule *rule) | 453 | static void audit_tree_log_remove_rule(struct audit_krule *rule) |
| 453 | { | 454 | { |
| 454 | struct audit_buffer *ab; | 455 | struct audit_buffer *ab; |
| 455 | 456 | ||
| @@ -457,7 +458,7 @@ static void audit_log_remove_rule(struct audit_krule *rule) | |||
| 457 | if (unlikely(!ab)) | 458 | if (unlikely(!ab)) |
| 458 | return; | 459 | return; |
| 459 | audit_log_format(ab, "op="); | 460 | audit_log_format(ab, "op="); |
| 460 | audit_log_string(ab, "remove rule"); | 461 | audit_log_string(ab, "remove_rule"); |
| 461 | audit_log_format(ab, " dir="); | 462 | audit_log_format(ab, " dir="); |
| 462 | audit_log_untrustedstring(ab, rule->tree->pathname); | 463 | audit_log_untrustedstring(ab, rule->tree->pathname); |
| 463 | audit_log_key(ab, rule->filterkey); | 464 | audit_log_key(ab, rule->filterkey); |
| @@ -476,7 +477,7 @@ static void kill_rules(struct audit_tree *tree) | |||
| 476 | list_del_init(&rule->rlist); | 477 | list_del_init(&rule->rlist); |
| 477 | if (rule->tree) { | 478 | if (rule->tree) { |
| 478 | /* not a half-baked one */ | 479 | /* not a half-baked one */ |
| 479 | audit_log_remove_rule(rule); | 480 | audit_tree_log_remove_rule(rule); |
| 480 | rule->tree = NULL; | 481 | rule->tree = NULL; |
| 481 | list_del_rcu(&entry->list); | 482 | list_del_rcu(&entry->list); |
| 482 | list_del(&entry->rule.list); | 483 | list_del(&entry->rule.list); |
| @@ -610,7 +611,7 @@ void audit_trim_trees(void) | |||
| 610 | list_for_each_entry(node, &tree->chunks, list) { | 611 | list_for_each_entry(node, &tree->chunks, list) { |
| 611 | struct audit_chunk *chunk = find_chunk(node); | 612 | struct audit_chunk *chunk = find_chunk(node); |
| 612 | /* this could be NULL if the watch is dying else where... */ | 613 | /* this could be NULL if the watch is dying else where... */ |
| 613 | struct inode *inode = chunk->mark.i.inode; | 614 | struct inode *inode = chunk->mark.inode; |
| 614 | node->index |= 1U<<31; | 615 | node->index |= 1U<<31; |
| 615 | if (iterate_mounts(compare_root, inode, root_mnt)) | 616 | if (iterate_mounts(compare_root, inode, root_mnt)) |
| 616 | node->index &= ~(1U<<31); | 617 | node->index &= ~(1U<<31); |
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 70b4554d2fbe..ad9c1682f616 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
| @@ -314,7 +314,7 @@ static void audit_update_watch(struct audit_parent *parent, | |||
| 314 | &nentry->rule.list); | 314 | &nentry->rule.list); |
| 315 | } | 315 | } |
| 316 | 316 | ||
| 317 | audit_watch_log_rule_change(r, owatch, "updated rules"); | 317 | audit_watch_log_rule_change(r, owatch, "updated_rules"); |
| 318 | 318 | ||
| 319 | call_rcu(&oentry->rcu, audit_free_rule_rcu); | 319 | call_rcu(&oentry->rcu, audit_free_rule_rcu); |
| 320 | } | 320 | } |
| @@ -342,7 +342,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) | |||
| 342 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { | 342 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { |
| 343 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { | 343 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { |
| 344 | e = container_of(r, struct audit_entry, rule); | 344 | e = container_of(r, struct audit_entry, rule); |
| 345 | audit_watch_log_rule_change(r, w, "remove rule"); | 345 | audit_watch_log_rule_change(r, w, "remove_rule"); |
| 346 | list_del(&r->rlist); | 346 | list_del(&r->rlist); |
| 347 | list_del(&r->list); | 347 | list_del(&r->list); |
| 348 | list_del_rcu(&e->list); | 348 | list_del_rcu(&e->list); |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index c447cd9848d1..4f68a326d92e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
| @@ -71,6 +71,24 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { | |||
| 71 | 71 | ||
| 72 | DEFINE_MUTEX(audit_filter_mutex); | 72 | DEFINE_MUTEX(audit_filter_mutex); |
| 73 | 73 | ||
| 74 | static void audit_free_lsm_field(struct audit_field *f) | ||
| 75 | { | ||
| 76 | switch (f->type) { | ||
| 77 | case AUDIT_SUBJ_USER: | ||
| 78 | case AUDIT_SUBJ_ROLE: | ||
| 79 | case AUDIT_SUBJ_TYPE: | ||
| 80 | case AUDIT_SUBJ_SEN: | ||
| 81 | case AUDIT_SUBJ_CLR: | ||
| 82 | case AUDIT_OBJ_USER: | ||
| 83 | case AUDIT_OBJ_ROLE: | ||
| 84 | case AUDIT_OBJ_TYPE: | ||
| 85 | case AUDIT_OBJ_LEV_LOW: | ||
| 86 | case AUDIT_OBJ_LEV_HIGH: | ||
| 87 | kfree(f->lsm_str); | ||
| 88 | security_audit_rule_free(f->lsm_rule); | ||
| 89 | } | ||
| 90 | } | ||
| 91 | |||
| 74 | static inline void audit_free_rule(struct audit_entry *e) | 92 | static inline void audit_free_rule(struct audit_entry *e) |
| 75 | { | 93 | { |
| 76 | int i; | 94 | int i; |
| @@ -80,11 +98,8 @@ static inline void audit_free_rule(struct audit_entry *e) | |||
| 80 | if (erule->watch) | 98 | if (erule->watch) |
| 81 | audit_put_watch(erule->watch); | 99 | audit_put_watch(erule->watch); |
| 82 | if (erule->fields) | 100 | if (erule->fields) |
| 83 | for (i = 0; i < erule->field_count; i++) { | 101 | for (i = 0; i < erule->field_count; i++) |
| 84 | struct audit_field *f = &erule->fields[i]; | 102 | audit_free_lsm_field(&erule->fields[i]); |
| 85 | kfree(f->lsm_str); | ||
| 86 | security_audit_rule_free(f->lsm_rule); | ||
| 87 | } | ||
| 88 | kfree(erule->fields); | 103 | kfree(erule->fields); |
| 89 | kfree(erule->filterkey); | 104 | kfree(erule->filterkey); |
| 90 | kfree(e); | 105 | kfree(e); |
| @@ -148,7 +163,7 @@ static inline int audit_to_inode(struct audit_krule *krule, | |||
| 148 | struct audit_field *f) | 163 | struct audit_field *f) |
| 149 | { | 164 | { |
| 150 | if (krule->listnr != AUDIT_FILTER_EXIT || | 165 | if (krule->listnr != AUDIT_FILTER_EXIT || |
| 151 | krule->watch || krule->inode_f || krule->tree || | 166 | krule->inode_f || krule->watch || krule->tree || |
| 152 | (f->op != Audit_equal && f->op != Audit_not_equal)) | 167 | (f->op != Audit_equal && f->op != Audit_not_equal)) |
| 153 | return -EINVAL; | 168 | return -EINVAL; |
| 154 | 169 | ||
| @@ -422,28 +437,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 422 | 437 | ||
| 423 | f->type = data->fields[i]; | 438 | f->type = data->fields[i]; |
| 424 | f->val = data->values[i]; | 439 | f->val = data->values[i]; |
| 425 | f->uid = INVALID_UID; | ||
| 426 | f->gid = INVALID_GID; | ||
| 427 | f->lsm_str = NULL; | ||
| 428 | f->lsm_rule = NULL; | ||
| 429 | 440 | ||
| 430 | /* Support legacy tests for a valid loginuid */ | 441 | /* Support legacy tests for a valid loginuid */ |
| 431 | if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { | 442 | if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { |
| 432 | f->type = AUDIT_LOGINUID_SET; | 443 | f->type = AUDIT_LOGINUID_SET; |
| 433 | f->val = 0; | 444 | f->val = 0; |
| 434 | } | 445 | entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; |
| 435 | |||
| 436 | if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) { | ||
| 437 | struct pid *pid; | ||
| 438 | rcu_read_lock(); | ||
| 439 | pid = find_vpid(f->val); | ||
| 440 | if (!pid) { | ||
| 441 | rcu_read_unlock(); | ||
| 442 | err = -ESRCH; | ||
| 443 | goto exit_free; | ||
| 444 | } | ||
| 445 | f->val = pid_nr(pid); | ||
| 446 | rcu_read_unlock(); | ||
| 447 | } | 446 | } |
| 448 | 447 | ||
| 449 | err = audit_field_valid(entry, f); | 448 | err = audit_field_valid(entry, f); |
| @@ -619,6 +618,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | |||
| 619 | data->buflen += data->values[i] = | 618 | data->buflen += data->values[i] = |
| 620 | audit_pack_string(&bufp, krule->filterkey); | 619 | audit_pack_string(&bufp, krule->filterkey); |
| 621 | break; | 620 | break; |
| 621 | case AUDIT_LOGINUID_SET: | ||
| 622 | if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { | ||
| 623 | data->fields[i] = AUDIT_LOGINUID; | ||
| 624 | data->values[i] = AUDIT_UID_UNSET; | ||
| 625 | break; | ||
| 626 | } | ||
| 627 | /* fallthrough if set */ | ||
| 622 | default: | 628 | default: |
| 623 | data->values[i] = f->val; | 629 | data->values[i] = f->val; |
| 624 | } | 630 | } |
| @@ -635,6 +641,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
| 635 | int i; | 641 | int i; |
| 636 | 642 | ||
| 637 | if (a->flags != b->flags || | 643 | if (a->flags != b->flags || |
| 644 | a->pflags != b->pflags || | ||
| 638 | a->listnr != b->listnr || | 645 | a->listnr != b->listnr || |
| 639 | a->action != b->action || | 646 | a->action != b->action || |
| 640 | a->field_count != b->field_count) | 647 | a->field_count != b->field_count) |
| @@ -753,6 +760,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) | |||
| 753 | new = &entry->rule; | 760 | new = &entry->rule; |
| 754 | new->vers_ops = old->vers_ops; | 761 | new->vers_ops = old->vers_ops; |
| 755 | new->flags = old->flags; | 762 | new->flags = old->flags; |
| 763 | new->pflags = old->pflags; | ||
| 756 | new->listnr = old->listnr; | 764 | new->listnr = old->listnr; |
| 757 | new->action = old->action; | 765 | new->action = old->action; |
| 758 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) | 766 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) |
| @@ -1053,30 +1061,27 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, | |||
| 1053 | int err = 0; | 1061 | int err = 0; |
| 1054 | struct audit_entry *entry; | 1062 | struct audit_entry *entry; |
| 1055 | 1063 | ||
| 1064 | entry = audit_data_to_entry(data, datasz); | ||
| 1065 | if (IS_ERR(entry)) | ||
| 1066 | return PTR_ERR(entry); | ||
| 1067 | |||
| 1056 | switch (type) { | 1068 | switch (type) { |
| 1057 | case AUDIT_ADD_RULE: | 1069 | case AUDIT_ADD_RULE: |
| 1058 | entry = audit_data_to_entry(data, datasz); | ||
| 1059 | if (IS_ERR(entry)) | ||
| 1060 | return PTR_ERR(entry); | ||
| 1061 | |||
| 1062 | err = audit_add_rule(entry); | 1070 | err = audit_add_rule(entry); |
| 1063 | audit_log_rule_change("add rule", &entry->rule, !err); | 1071 | audit_log_rule_change("add_rule", &entry->rule, !err); |
| 1064 | if (err) | ||
| 1065 | audit_free_rule(entry); | ||
| 1066 | break; | 1072 | break; |
| 1067 | case AUDIT_DEL_RULE: | 1073 | case AUDIT_DEL_RULE: |
| 1068 | entry = audit_data_to_entry(data, datasz); | ||
| 1069 | if (IS_ERR(entry)) | ||
| 1070 | return PTR_ERR(entry); | ||
| 1071 | |||
| 1072 | err = audit_del_rule(entry); | 1074 | err = audit_del_rule(entry); |
| 1073 | audit_log_rule_change("remove rule", &entry->rule, !err); | 1075 | audit_log_rule_change("remove_rule", &entry->rule, !err); |
| 1074 | audit_free_rule(entry); | ||
| 1075 | break; | 1076 | break; |
| 1076 | default: | 1077 | default: |
| 1077 | return -EINVAL; | 1078 | err = -EINVAL; |
| 1079 | WARN_ON(1); | ||
| 1078 | } | 1080 | } |
| 1079 | 1081 | ||
| 1082 | if (err || type == AUDIT_DEL_RULE) | ||
| 1083 | audit_free_rule(entry); | ||
| 1084 | |||
| 1080 | return err; | 1085 | return err; |
| 1081 | } | 1086 | } |
| 1082 | 1087 | ||
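
The audit_rule_change() hunk above hoists the audit_data_to_entry() parse out of the switch and funnels every error path (and the delete path, which never keeps the entry) through one audit_free_rule() call. A sketch of that shape, with hypothetical names:

```c
/* Illustrative only: parse once, dispatch, free in a single place. */
#include <linux/types.h>
#include <linux/err.h>
#include <linux/bug.h>

struct demo_entry;
struct demo_entry *demo_data_to_entry(void *data, size_t datasz);
int demo_add_rule(struct demo_entry *entry);	/* keeps entry on success */
int demo_del_rule(struct demo_entry *entry);	/* never keeps entry */
void demo_free_entry(struct demo_entry *entry);

static int demo_rule_change(int type, void *data, size_t datasz)
{
	struct demo_entry *entry;
	int err = 0;

	entry = demo_data_to_entry(data, datasz);
	if (IS_ERR(entry))
		return PTR_ERR(entry);

	switch (type) {
	case 1:	/* add */
		err = demo_add_rule(entry);
		break;
	case 2:	/* delete */
		err = demo_del_rule(entry);
		break;
	default:
		err = -EINVAL;
		WARN_ON(1);
	}

	if (err || type == 2 /* delete */)
		demo_free_entry(entry);

	return err;
}
```
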
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 21eae3c05ec0..072566dd0caf 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -67,10 +67,13 @@ | |||
| 67 | #include <linux/binfmts.h> | 67 | #include <linux/binfmts.h> |
| 68 | #include <linux/highmem.h> | 68 | #include <linux/highmem.h> |
| 69 | #include <linux/syscalls.h> | 69 | #include <linux/syscalls.h> |
| 70 | #include <asm/syscall.h> | ||
| 70 | #include <linux/capability.h> | 71 | #include <linux/capability.h> |
| 71 | #include <linux/fs_struct.h> | 72 | #include <linux/fs_struct.h> |
| 72 | #include <linux/compat.h> | 73 | #include <linux/compat.h> |
| 73 | #include <linux/ctype.h> | 74 | #include <linux/ctype.h> |
| 75 | #include <linux/string.h> | ||
| 76 | #include <uapi/linux/limits.h> | ||
| 74 | 77 | ||
| 75 | #include "audit.h" | 78 | #include "audit.h" |
| 76 | 79 | ||
| @@ -125,14 +128,6 @@ struct audit_tree_refs { | |||
| 125 | struct audit_chunk *c[31]; | 128 | struct audit_chunk *c[31]; |
| 126 | }; | 129 | }; |
| 127 | 130 | ||
| 128 | static inline int open_arg(int flags, int mask) | ||
| 129 | { | ||
| 130 | int n = ACC_MODE(flags); | ||
| 131 | if (flags & (O_TRUNC | O_CREAT)) | ||
| 132 | n |= AUDIT_PERM_WRITE; | ||
| 133 | return n & mask; | ||
| 134 | } | ||
| 135 | |||
| 136 | static int audit_match_perm(struct audit_context *ctx, int mask) | 131 | static int audit_match_perm(struct audit_context *ctx, int mask) |
| 137 | { | 132 | { |
| 138 | unsigned n; | 133 | unsigned n; |
| @@ -1505,7 +1500,6 @@ void __audit_free(struct task_struct *tsk) | |||
| 1505 | 1500 | ||
| 1506 | /** | 1501 | /** |
| 1507 | * audit_syscall_entry - fill in an audit record at syscall entry | 1502 | * audit_syscall_entry - fill in an audit record at syscall entry |
| 1508 | * @arch: architecture type | ||
| 1509 | * @major: major syscall type (function) | 1503 | * @major: major syscall type (function) |
| 1510 | * @a1: additional syscall register 1 | 1504 | * @a1: additional syscall register 1 |
| 1511 | * @a2: additional syscall register 2 | 1505 | * @a2: additional syscall register 2 |
| @@ -1520,9 +1514,8 @@ void __audit_free(struct task_struct *tsk) | |||
| 1520 | * will only be written if another part of the kernel requests that it | 1514 | * will only be written if another part of the kernel requests that it |
| 1521 | * be written). | 1515 | * be written). |
| 1522 | */ | 1516 | */ |
| 1523 | void __audit_syscall_entry(int arch, int major, | 1517 | void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, |
| 1524 | unsigned long a1, unsigned long a2, | 1518 | unsigned long a3, unsigned long a4) |
| 1525 | unsigned long a3, unsigned long a4) | ||
| 1526 | { | 1519 | { |
| 1527 | struct task_struct *tsk = current; | 1520 | struct task_struct *tsk = current; |
| 1528 | struct audit_context *context = tsk->audit_context; | 1521 | struct audit_context *context = tsk->audit_context; |
| @@ -1536,7 +1529,7 @@ void __audit_syscall_entry(int arch, int major, | |||
| 1536 | if (!audit_enabled) | 1529 | if (!audit_enabled) |
| 1537 | return; | 1530 | return; |
| 1538 | 1531 | ||
| 1539 | context->arch = arch; | 1532 | context->arch = syscall_get_arch(); |
| 1540 | context->major = major; | 1533 | context->major = major; |
| 1541 | context->argv[0] = a1; | 1534 | context->argv[0] = a1; |
| 1542 | context->argv[1] = a2; | 1535 | context->argv[1] = a2; |
| @@ -1870,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, | |||
| 1870 | } | 1863 | } |
| 1871 | 1864 | ||
| 1872 | list_for_each_entry_reverse(n, &context->names_list, list) { | 1865 | list_for_each_entry_reverse(n, &context->names_list, list) { |
| 1873 | /* does the name pointer match? */ | 1866 | if (!n->name || strcmp(n->name->name, name->name)) |
| 1874 | if (!n->name || n->name->name != name->name) | ||
| 1875 | continue; | 1867 | continue; |
| 1876 | 1868 | ||
| 1877 | /* match the correct record type */ | 1869 | /* match the correct record type */ |
| @@ -1886,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, | |||
| 1886 | } | 1878 | } |
| 1887 | 1879 | ||
| 1888 | out_alloc: | 1880 | out_alloc: |
| 1889 | /* unable to find the name from a previous getname(). Allocate a new | 1881 | /* unable to find an entry with both a matching name and type */ |
| 1890 | * anonymous entry. | 1882 | n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); |
| 1891 | */ | ||
| 1892 | n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); | ||
| 1893 | if (!n) | 1883 | if (!n) |
| 1894 | return; | 1884 | return; |
| 1885 | /* unfortunately, while we may have a path name to record with the | ||
| 1886 | * inode, we can't always rely on the string lasting until the end of | ||
| 1887 | * the syscall so we need to create our own copy, it may fail due to | ||
| 1888 | * memory allocation issues, but we do our best */ | ||
| 1889 | if (name) { | ||
| 1890 | /* we can't use getname_kernel() due to size limits */ | ||
| 1891 | size_t len = strlen(name->name) + 1; | ||
| 1892 | struct filename *new = __getname(); | ||
| 1893 | |||
| 1894 | if (unlikely(!new)) | ||
| 1895 | goto out; | ||
| 1896 | |||
| 1897 | if (len <= (PATH_MAX - sizeof(*new))) { | ||
| 1898 | new->name = (char *)(new) + sizeof(*new); | ||
| 1899 | new->separate = false; | ||
| 1900 | } else if (len <= PATH_MAX) { | ||
| 1901 | /* this looks odd, but is due to final_putname() */ | ||
| 1902 | struct filename *new2; | ||
| 1903 | |||
| 1904 | new2 = kmalloc(sizeof(*new2), GFP_KERNEL); | ||
| 1905 | if (unlikely(!new2)) { | ||
| 1906 | __putname(new); | ||
| 1907 | goto out; | ||
| 1908 | } | ||
| 1909 | new2->name = (char *)new; | ||
| 1910 | new2->separate = true; | ||
| 1911 | new = new2; | ||
| 1912 | } else { | ||
| 1913 | /* we should never get here, but let's be safe */ | ||
| 1914 | __putname(new); | ||
| 1915 | goto out; | ||
| 1916 | } | ||
| 1917 | strlcpy((char *)new->name, name->name, len); | ||
| 1918 | new->uptr = NULL; | ||
| 1919 | new->aname = n; | ||
| 1920 | n->name = new; | ||
| 1921 | n->name_put = true; | ||
| 1922 | } | ||
| 1895 | out: | 1923 | out: |
| 1896 | if (parent) { | 1924 | if (parent) { |
| 1897 | n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; | 1925 | n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; |
| @@ -1906,6 +1934,11 @@ out: | |||
| 1906 | audit_copy_inode(n, dentry, inode); | 1934 | audit_copy_inode(n, dentry, inode); |
| 1907 | } | 1935 | } |
| 1908 | 1936 | ||
| 1937 | void __audit_file(const struct file *file) | ||
| 1938 | { | ||
| 1939 | __audit_inode(NULL, file->f_path.dentry, 0); | ||
| 1940 | } | ||
| 1941 | |||
| 1909 | /** | 1942 | /** |
| 1910 | * __audit_inode_child - collect inode info for created/removed objects | 1943 | * __audit_inode_child - collect inode info for created/removed objects |
| 1911 | * @parent: inode of dentry parent | 1944 | * @parent: inode of dentry parent |
| @@ -2382,7 +2415,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, | |||
| 2382 | ax->d.next = context->aux; | 2415 | ax->d.next = context->aux; |
| 2383 | context->aux = (void *)ax; | 2416 | context->aux = (void *)ax; |
| 2384 | 2417 | ||
| 2385 | dentry = dget(bprm->file->f_dentry); | 2418 | dentry = dget(bprm->file->f_path.dentry); |
| 2386 | get_vfs_caps_from_disk(dentry, &vcaps); | 2419 | get_vfs_caps_from_disk(dentry, &vcaps); |
| 2387 | dput(dentry); | 2420 | dput(dentry); |
| 2388 | 2421 | ||
| @@ -2406,7 +2439,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, | |||
| 2406 | * @new: the new credentials | 2439 | * @new: the new credentials |
| 2407 | * @old: the old (current) credentials | 2440 | * @old: the old (current) credentials |
| 2408 | * | 2441 | * |
| 2409 | * Record the aguments userspace sent to sys_capset for later printing by the | 2442 | * Record the arguments userspace sent to sys_capset for later printing by the |
| 2410 | * audit system if applicable | 2443 | * audit system if applicable |
| 2411 | */ | 2444 | */ |
| 2412 | void __audit_log_capset(const struct cred *new, const struct cred *old) | 2445 | void __audit_log_capset(const struct cred *new, const struct cred *old) |
| @@ -2433,6 +2466,7 @@ static void audit_log_task(struct audit_buffer *ab) | |||
| 2433 | kgid_t gid; | 2466 | kgid_t gid; |
| 2434 | unsigned int sessionid; | 2467 | unsigned int sessionid; |
| 2435 | struct mm_struct *mm = current->mm; | 2468 | struct mm_struct *mm = current->mm; |
| 2469 | char comm[sizeof(current->comm)]; | ||
| 2436 | 2470 | ||
| 2437 | auid = audit_get_loginuid(current); | 2471 | auid = audit_get_loginuid(current); |
| 2438 | sessionid = audit_get_sessionid(current); | 2472 | sessionid = audit_get_sessionid(current); |
| @@ -2445,7 +2479,7 @@ static void audit_log_task(struct audit_buffer *ab) | |||
| 2445 | sessionid); | 2479 | sessionid); |
| 2446 | audit_log_task_context(ab); | 2480 | audit_log_task_context(ab); |
| 2447 | audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); | 2481 | audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); |
| 2448 | audit_log_untrustedstring(ab, current->comm); | 2482 | audit_log_untrustedstring(ab, get_task_comm(comm, current)); |
| 2449 | if (mm) { | 2483 | if (mm) { |
| 2450 | down_read(&mm->mmap_sem); | 2484 | down_read(&mm->mmap_sem); |
| 2451 | if (mm->exe_file) | 2485 | if (mm->exe_file) |
| @@ -2488,11 +2522,9 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) | |||
| 2488 | if (unlikely(!ab)) | 2522 | if (unlikely(!ab)) |
| 2489 | return; | 2523 | return; |
| 2490 | audit_log_task(ab); | 2524 | audit_log_task(ab); |
| 2491 | audit_log_format(ab, " sig=%ld", signr); | 2525 | audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", |
| 2492 | audit_log_format(ab, " syscall=%ld", syscall); | 2526 | signr, syscall_get_arch(), syscall, is_compat_task(), |
| 2493 | audit_log_format(ab, " compat=%d", is_compat_task()); | 2527 | KSTK_EIP(current), code); |
| 2494 | audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); | ||
| 2495 | audit_log_format(ab, " code=0x%x", code); | ||
| 2496 | audit_log_end(ab); | 2528 | audit_log_end(ab); |
| 2497 | } | 2529 | } |
| 2498 | 2530 | ||
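
With the __audit_syscall_entry() change above, the arch word is no longer threaded through from the architecture entry code; the audit context derives the ABI itself via syscall_get_arch() on the current task. A hypothetical caller sketch (not the actual <linux/audit.h> wrapper):

```c
/* Hypothetical caller: only the syscall number and arguments are passed. */
#include <linux/audit.h>
#include <linux/sched.h>

static inline void demo_audit_syscall_entry(int major, unsigned long a1,
					    unsigned long a2, unsigned long a3,
					    unsigned long a4)
{
	if (unlikely(current->audit_context))
		__audit_syscall_entry(major, a1, a2, a3, a4);
}
```
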
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a71145e2769..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
| @@ -1 +1,5 @@ | |||
| 1 | obj-y := core.o | 1 | obj-y := core.o |
| 2 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o | ||
| 3 | ifdef CONFIG_TEST_BPF | ||
| 4 | obj-$(CONFIG_BPF_SYSCALL) += test_stub.o | ||
| 5 | endif | ||
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..9eb4d8a7cd87
--- /dev/null
+++ b/kernel/bpf/arraymap.c
| @@ -0,0 +1,156 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | ||
| 12 | #include <linux/bpf.h> | ||
| 13 | #include <linux/err.h> | ||
| 14 | #include <linux/vmalloc.h> | ||
| 15 | #include <linux/slab.h> | ||
| 16 | #include <linux/mm.h> | ||
| 17 | |||
| 18 | struct bpf_array { | ||
| 19 | struct bpf_map map; | ||
| 20 | u32 elem_size; | ||
| 21 | char value[0] __aligned(8); | ||
| 22 | }; | ||
| 23 | |||
| 24 | /* Called from syscall */ | ||
| 25 | static struct bpf_map *array_map_alloc(union bpf_attr *attr) | ||
| 26 | { | ||
| 27 | struct bpf_array *array; | ||
| 28 | u32 elem_size, array_size; | ||
| 29 | |||
| 30 | /* check sanity of attributes */ | ||
| 31 | if (attr->max_entries == 0 || attr->key_size != 4 || | ||
| 32 | attr->value_size == 0) | ||
| 33 | return ERR_PTR(-EINVAL); | ||
| 34 | |||
| 35 | elem_size = round_up(attr->value_size, 8); | ||
| 36 | |||
| 37 | /* check round_up into zero and u32 overflow */ | ||
| 38 | if (elem_size == 0 || | ||
| 39 | attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) | ||
| 40 | return ERR_PTR(-ENOMEM); | ||
| 41 | |||
| 42 | array_size = sizeof(*array) + attr->max_entries * elem_size; | ||
| 43 | |||
| 44 | /* allocate all map elements and zero-initialize them */ | ||
| 45 | array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); | ||
| 46 | if (!array) { | ||
| 47 | array = vzalloc(array_size); | ||
| 48 | if (!array) | ||
| 49 | return ERR_PTR(-ENOMEM); | ||
| 50 | } | ||
| 51 | |||
| 52 | /* copy mandatory map attributes */ | ||
| 53 | array->map.key_size = attr->key_size; | ||
| 54 | array->map.value_size = attr->value_size; | ||
| 55 | array->map.max_entries = attr->max_entries; | ||
| 56 | |||
| 57 | array->elem_size = elem_size; | ||
| 58 | |||
| 59 | return &array->map; | ||
| 60 | } | ||
| 61 | |||
| 62 | /* Called from syscall or from eBPF program */ | ||
| 63 | static void *array_map_lookup_elem(struct bpf_map *map, void *key) | ||
| 64 | { | ||
| 65 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
| 66 | u32 index = *(u32 *)key; | ||
| 67 | |||
| 68 | if (index >= array->map.max_entries) | ||
| 69 | return NULL; | ||
| 70 | |||
| 71 | return array->value + array->elem_size * index; | ||
| 72 | } | ||
| 73 | |||
| 74 | /* Called from syscall */ | ||
| 75 | static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | ||
| 76 | { | ||
| 77 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
| 78 | u32 index = *(u32 *)key; | ||
| 79 | u32 *next = (u32 *)next_key; | ||
| 80 | |||
| 81 | if (index >= array->map.max_entries) { | ||
| 82 | *next = 0; | ||
| 83 | return 0; | ||
| 84 | } | ||
| 85 | |||
| 86 | if (index == array->map.max_entries - 1) | ||
| 87 | return -ENOENT; | ||
| 88 | |||
| 89 | *next = index + 1; | ||
| 90 | return 0; | ||
| 91 | } | ||
| 92 | |||
| 93 | /* Called from syscall or from eBPF program */ | ||
| 94 | static int array_map_update_elem(struct bpf_map *map, void *key, void *value, | ||
| 95 | u64 map_flags) | ||
| 96 | { | ||
| 97 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
| 98 | u32 index = *(u32 *)key; | ||
| 99 | |||
| 100 | if (map_flags > BPF_EXIST) | ||
| 101 | /* unknown flags */ | ||
| 102 | return -EINVAL; | ||
| 103 | |||
| 104 | if (index >= array->map.max_entries) | ||
| 105 | /* all elements were pre-allocated, cannot insert a new one */ | ||
| 106 | return -E2BIG; | ||
| 107 | |||
| 108 | if (map_flags == BPF_NOEXIST) | ||
| 109 | /* all elements already exist */ | ||
| 110 | return -EEXIST; | ||
| 111 | |||
| 112 | memcpy(array->value + array->elem_size * index, value, array->elem_size); | ||
| 113 | return 0; | ||
| 114 | } | ||
| 115 | |||
| 116 | /* Called from syscall or from eBPF program */ | ||
| 117 | static int array_map_delete_elem(struct bpf_map *map, void *key) | ||
| 118 | { | ||
| 119 | return -EINVAL; | ||
| 120 | } | ||
| 121 | |||
| 122 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ | ||
| 123 | static void array_map_free(struct bpf_map *map) | ||
| 124 | { | ||
| 125 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
| 126 | |||
| 127 | /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, | ||
| 128 | * so the programs (can be more than one that used this map) were | ||
| 129 | * disconnected from events. Wait for outstanding programs to complete | ||
| 130 | * and free the array | ||
| 131 | */ | ||
| 132 | synchronize_rcu(); | ||
| 133 | |||
| 134 | kvfree(array); | ||
| 135 | } | ||
| 136 | |||
| 137 | static struct bpf_map_ops array_ops = { | ||
| 138 | .map_alloc = array_map_alloc, | ||
| 139 | .map_free = array_map_free, | ||
| 140 | .map_get_next_key = array_map_get_next_key, | ||
| 141 | .map_lookup_elem = array_map_lookup_elem, | ||
| 142 | .map_update_elem = array_map_update_elem, | ||
| 143 | .map_delete_elem = array_map_delete_elem, | ||
| 144 | }; | ||
| 145 | |||
| 146 | static struct bpf_map_type_list tl = { | ||
| 147 | .ops = &array_ops, | ||
| 148 | .type = BPF_MAP_TYPE_ARRAY, | ||
| 149 | }; | ||
| 150 | |||
| 151 | static int __init register_array_map(void) | ||
| 152 | { | ||
| 153 | bpf_register_map_type(&tl); | ||
| 154 | return 0; | ||
| 155 | } | ||
| 156 | late_initcall(register_array_map); | ||
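
The new BPF_MAP_TYPE_ARRAY map is driven from user space through the bpf(2) syscall. A minimal sketch, assuming a kernel built with CONFIG_BPF_SYSCALL, the 3.19-era <linux/bpf.h> uapi, and toolchain headers that define __NR_bpf; error handling is omitted:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
	union bpf_attr attr;
	uint32_t key = 3;
	uint64_t val = 42, out = 0;
	int map_fd;

	/* Create the array: keys are u32 indices, all slots pre-allocated. */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(val);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr);

	/* Update slot 3, then read it back. */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (uint64_t)(unsigned long)&key;
	attr.value = (uint64_t)(unsigned long)&val;
	attr.flags = BPF_ANY;	/* BPF_NOEXIST would fail: slots always exist */
	sys_bpf(BPF_MAP_UPDATE_ELEM, &attr);

	attr.value = (uint64_t)(unsigned long)&out;
	sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr);
	printf("slot %u = %llu\n", key, (unsigned long long)out);
	return 0;
}
```
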
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7f0dbcbb34af..d6594e457a25 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
| @@ -20,9 +20,14 @@ | |||
| 20 | * Andi Kleen - Fix a few bad bugs and races. | 20 | * Andi Kleen - Fix a few bad bugs and races. |
| 21 | * Kris Katterjohn - Added many additional checks in bpf_check_classic() | 21 | * Kris Katterjohn - Added many additional checks in bpf_check_classic() |
| 22 | */ | 22 | */ |
| 23 | |||
| 23 | #include <linux/filter.h> | 24 | #include <linux/filter.h> |
| 24 | #include <linux/skbuff.h> | 25 | #include <linux/skbuff.h> |
| 26 | #include <linux/vmalloc.h> | ||
| 27 | #include <linux/random.h> | ||
| 28 | #include <linux/moduleloader.h> | ||
| 25 | #include <asm/unaligned.h> | 29 | #include <asm/unaligned.h> |
| 30 | #include <linux/bpf.h> | ||
| 26 | 31 | ||
| 27 | /* Registers */ | 32 | /* Registers */ |
| 28 | #define BPF_R0 regs[BPF_REG_0] | 33 | #define BPF_R0 regs[BPF_REG_0] |
| @@ -63,6 +68,105 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns | |||
| 63 | return NULL; | 68 | return NULL; |
| 64 | } | 69 | } |
| 65 | 70 | ||
| 71 | struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) | ||
| 72 | { | ||
| 73 | gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | | ||
| 74 | gfp_extra_flags; | ||
| 75 | struct bpf_prog_aux *aux; | ||
| 76 | struct bpf_prog *fp; | ||
| 77 | |||
| 78 | size = round_up(size, PAGE_SIZE); | ||
| 79 | fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); | ||
| 80 | if (fp == NULL) | ||
| 81 | return NULL; | ||
| 82 | |||
| 83 | aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); | ||
| 84 | if (aux == NULL) { | ||
| 85 | vfree(fp); | ||
| 86 | return NULL; | ||
| 87 | } | ||
| 88 | |||
| 89 | fp->pages = size / PAGE_SIZE; | ||
| 90 | fp->aux = aux; | ||
| 91 | |||
| 92 | return fp; | ||
| 93 | } | ||
| 94 | EXPORT_SYMBOL_GPL(bpf_prog_alloc); | ||
| 95 | |||
| 96 | struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, | ||
| 97 | gfp_t gfp_extra_flags) | ||
| 98 | { | ||
| 99 | gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | | ||
| 100 | gfp_extra_flags; | ||
| 101 | struct bpf_prog *fp; | ||
| 102 | |||
| 103 | BUG_ON(fp_old == NULL); | ||
| 104 | |||
| 105 | size = round_up(size, PAGE_SIZE); | ||
| 106 | if (size <= fp_old->pages * PAGE_SIZE) | ||
| 107 | return fp_old; | ||
| 108 | |||
| 109 | fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); | ||
| 110 | if (fp != NULL) { | ||
| 111 | memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); | ||
| 112 | fp->pages = size / PAGE_SIZE; | ||
| 113 | |||
| 114 | /* We keep fp->aux from fp_old around in the new | ||
| 115 | * reallocated structure. | ||
| 116 | */ | ||
| 117 | fp_old->aux = NULL; | ||
| 118 | __bpf_prog_free(fp_old); | ||
| 119 | } | ||
| 120 | |||
| 121 | return fp; | ||
| 122 | } | ||
| 123 | EXPORT_SYMBOL_GPL(bpf_prog_realloc); | ||
| 124 | |||
| 125 | void __bpf_prog_free(struct bpf_prog *fp) | ||
| 126 | { | ||
| 127 | kfree(fp->aux); | ||
| 128 | vfree(fp); | ||
| 129 | } | ||
| 130 | EXPORT_SYMBOL_GPL(__bpf_prog_free); | ||
| 131 | |||
| 132 | #ifdef CONFIG_BPF_JIT | ||
| 133 | struct bpf_binary_header * | ||
| 134 | bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, | ||
| 135 | unsigned int alignment, | ||
| 136 | bpf_jit_fill_hole_t bpf_fill_ill_insns) | ||
| 137 | { | ||
| 138 | struct bpf_binary_header *hdr; | ||
| 139 | unsigned int size, hole, start; | ||
| 140 | |||
| 141 | /* Most of BPF filters are really small, but if some of them | ||
| 142 | * fill a page, allow at least 128 extra bytes to insert a | ||
| 143 | * random section of illegal instructions. | ||
| 144 | */ | ||
| 145 | size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); | ||
| 146 | hdr = module_alloc(size); | ||
| 147 | if (hdr == NULL) | ||
| 148 | return NULL; | ||
| 149 | |||
| 150 | /* Fill space with illegal/arch-dep instructions. */ | ||
| 151 | bpf_fill_ill_insns(hdr, size); | ||
| 152 | |||
| 153 | hdr->pages = size / PAGE_SIZE; | ||
| 154 | hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), | ||
| 155 | PAGE_SIZE - sizeof(*hdr)); | ||
| 156 | start = (prandom_u32() % hole) & ~(alignment - 1); | ||
| 157 | |||
| 158 | /* Leave a random number of instructions before BPF code. */ | ||
| 159 | *image_ptr = &hdr->image[start]; | ||
| 160 | |||
| 161 | return hdr; | ||
| 162 | } | ||
| 163 | |||
| 164 | void bpf_jit_binary_free(struct bpf_binary_header *hdr) | ||
| 165 | { | ||
| 166 | module_free(NULL, hdr); | ||
| 167 | } | ||
| 168 | #endif /* CONFIG_BPF_JIT */ | ||
| 169 | |||
| 66 | /* Base function for offset calculation. Needs to go into .text section, | 170 | /* Base function for offset calculation. Needs to go into .text section, |
| 67 | * therefore keeping it non-static as well; will also be used by JITs | 171 | * therefore keeping it non-static as well; will also be used by JITs |
| 68 | * anyway later on, so do not let the compiler omit it. | 172 | * anyway later on, so do not let the compiler omit it. |
| @@ -180,6 +284,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) | |||
| 180 | [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, | 284 | [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, |
| 181 | [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, | 285 | [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, |
| 182 | [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, | 286 | [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, |
| 287 | [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, | ||
| 183 | }; | 288 | }; |
| 184 | void *ptr; | 289 | void *ptr; |
| 185 | int off; | 290 | int off; |
| @@ -239,6 +344,10 @@ select_insn: | |||
| 239 | ALU64_MOV_K: | 344 | ALU64_MOV_K: |
| 240 | DST = IMM; | 345 | DST = IMM; |
| 241 | CONT; | 346 | CONT; |
| 347 | LD_IMM_DW: | ||
| 348 | DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; | ||
| 349 | insn++; | ||
| 350 | CONT; | ||
| 242 | ALU64_ARSH_X: | 351 | ALU64_ARSH_X: |
| 243 | (*(s64 *) &DST) >>= SRC; | 352 | (*(s64 *) &DST) >>= SRC; |
| 244 | CONT; | 353 | CONT; |
| @@ -523,12 +632,35 @@ void bpf_prog_select_runtime(struct bpf_prog *fp) | |||
| 523 | 632 | ||
| 524 | /* Probe if internal BPF can be JITed */ | 633 | /* Probe if internal BPF can be JITed */ |
| 525 | bpf_int_jit_compile(fp); | 634 | bpf_int_jit_compile(fp); |
| 635 | /* Lock whole bpf_prog as read-only */ | ||
| 636 | bpf_prog_lock_ro(fp); | ||
| 526 | } | 637 | } |
| 527 | EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); | 638 | EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); |
| 528 | 639 | ||
| 529 | /* free internal BPF program */ | 640 | static void bpf_prog_free_deferred(struct work_struct *work) |
| 641 | { | ||
| 642 | struct bpf_prog_aux *aux; | ||
| 643 | |||
| 644 | aux = container_of(work, struct bpf_prog_aux, work); | ||
| 645 | bpf_jit_free(aux->prog); | ||
| 646 | } | ||
| 647 | |||
| 648 | /* Free internal BPF program */ | ||
| 530 | void bpf_prog_free(struct bpf_prog *fp) | 649 | void bpf_prog_free(struct bpf_prog *fp) |
| 531 | { | 650 | { |
| 532 | bpf_jit_free(fp); | 651 | struct bpf_prog_aux *aux = fp->aux; |
| 652 | |||
| 653 | INIT_WORK(&aux->work, bpf_prog_free_deferred); | ||
| 654 | aux->prog = fp; | ||
| 655 | schedule_work(&aux->work); | ||
| 533 | } | 656 | } |
| 534 | EXPORT_SYMBOL_GPL(bpf_prog_free); | 657 | EXPORT_SYMBOL_GPL(bpf_prog_free); |
| 658 | |||
| 659 | /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call | ||
| 660 | * skb_copy_bits(), so provide a weak definition of it for NET-less config. | ||
| 661 | */ | ||
| 662 | int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, | ||
| 663 | int len) | ||
| 664 | { | ||
| 665 | return -EFAULT; | ||
| 666 | } | ||
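The LD_IMM_DW case added to the interpreter above consumes two consecutive instructions and splices their 32-bit `imm` fields into one 64-bit constant; the extra `insn++` before `CONT` steps over the pseudo instruction that carries the upper half. A minimal userspace sketch of that reassembly (not part of the patch; the immediate values below are made up):

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int32_t imm_lo = 0x11223344;          /* stands in for insn[0].imm */
	int32_t imm_hi = (int32_t)0xdeadbeef; /* stands in for insn[1].imm */

	/* Same expression as the LD_IMM_DW handler: zero-extend each 32-bit
	 * immediate, then place the second one in the upper half of DST.
	 */
	uint64_t dst = (uint64_t)(uint32_t)imm_lo |
		       ((uint64_t)(uint32_t)imm_hi) << 32;

	printf("DST = 0x%016llx\n", (unsigned long long)dst);
	return 0;
}
```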
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c new file mode 100644 index 000000000000..b3ba43674310 --- /dev/null +++ b/kernel/bpf/hashtab.c | |||
| @@ -0,0 +1,367 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | ||
| 12 | #include <linux/bpf.h> | ||
| 13 | #include <linux/jhash.h> | ||
| 14 | #include <linux/filter.h> | ||
| 15 | #include <linux/vmalloc.h> | ||
| 16 | |||
| 17 | struct bpf_htab { | ||
| 18 | struct bpf_map map; | ||
| 19 | struct hlist_head *buckets; | ||
| 20 | spinlock_t lock; | ||
| 21 | u32 count; /* number of elements in this hashtable */ | ||
| 22 | u32 n_buckets; /* number of hash buckets */ | ||
| 23 | u32 elem_size; /* size of each element in bytes */ | ||
| 24 | }; | ||
| 25 | |||
| 26 | /* each htab element is struct htab_elem + key + value */ | ||
| 27 | struct htab_elem { | ||
| 28 | struct hlist_node hash_node; | ||
| 29 | struct rcu_head rcu; | ||
| 30 | u32 hash; | ||
| 31 | char key[0] __aligned(8); | ||
| 32 | }; | ||
| 33 | |||
| 34 | /* Called from syscall */ | ||
| 35 | static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | ||
| 36 | { | ||
| 37 | struct bpf_htab *htab; | ||
| 38 | int err, i; | ||
| 39 | |||
| 40 | htab = kzalloc(sizeof(*htab), GFP_USER); | ||
| 41 | if (!htab) | ||
| 42 | return ERR_PTR(-ENOMEM); | ||
| 43 | |||
| 44 | /* mandatory map attributes */ | ||
| 45 | htab->map.key_size = attr->key_size; | ||
| 46 | htab->map.value_size = attr->value_size; | ||
| 47 | htab->map.max_entries = attr->max_entries; | ||
| 48 | |||
| 49 | /* check sanity of attributes. | ||
| 50 | * value_size == 0 may be allowed in the future to use map as a set | ||
| 51 | */ | ||
| 52 | err = -EINVAL; | ||
| 53 | if (htab->map.max_entries == 0 || htab->map.key_size == 0 || | ||
| 54 | htab->map.value_size == 0) | ||
| 55 | goto free_htab; | ||
| 56 | |||
| 57 | /* hash table size must be power of 2 */ | ||
| 58 | htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); | ||
| 59 | |||
| 60 | err = -E2BIG; | ||
| 61 | if (htab->map.key_size > MAX_BPF_STACK) | ||
| 62 | /* eBPF programs initialize keys on stack, so they cannot be | ||
| 63 | * larger than max stack size | ||
| 64 | */ | ||
| 65 | goto free_htab; | ||
| 66 | |||
| 67 | err = -ENOMEM; | ||
| 68 | /* prevent zero size kmalloc and check for u32 overflow */ | ||
| 69 | if (htab->n_buckets == 0 || | ||
| 70 | htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) | ||
| 71 | goto free_htab; | ||
| 72 | |||
| 73 | htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), | ||
| 74 | GFP_USER | __GFP_NOWARN); | ||
| 75 | |||
| 76 | if (!htab->buckets) { | ||
| 77 | htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); | ||
| 78 | if (!htab->buckets) | ||
| 79 | goto free_htab; | ||
| 80 | } | ||
| 81 | |||
| 82 | for (i = 0; i < htab->n_buckets; i++) | ||
| 83 | INIT_HLIST_HEAD(&htab->buckets[i]); | ||
| 84 | |||
| 85 | spin_lock_init(&htab->lock); | ||
| 86 | htab->count = 0; | ||
| 87 | |||
| 88 | htab->elem_size = sizeof(struct htab_elem) + | ||
| 89 | round_up(htab->map.key_size, 8) + | ||
| 90 | htab->map.value_size; | ||
| 91 | return &htab->map; | ||
| 92 | |||
| 93 | free_htab: | ||
| 94 | kfree(htab); | ||
| 95 | return ERR_PTR(err); | ||
| 96 | } | ||
| 97 | |||
| 98 | static inline u32 htab_map_hash(const void *key, u32 key_len) | ||
| 99 | { | ||
| 100 | return jhash(key, key_len, 0); | ||
| 101 | } | ||
| 102 | |||
| 103 | static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) | ||
| 104 | { | ||
| 105 | return &htab->buckets[hash & (htab->n_buckets - 1)]; | ||
| 106 | } | ||
| 107 | |||
| 108 | static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, | ||
| 109 | void *key, u32 key_size) | ||
| 110 | { | ||
| 111 | struct htab_elem *l; | ||
| 112 | |||
| 113 | hlist_for_each_entry_rcu(l, head, hash_node) | ||
| 114 | if (l->hash == hash && !memcmp(&l->key, key, key_size)) | ||
| 115 | return l; | ||
| 116 | |||
| 117 | return NULL; | ||
| 118 | } | ||
| 119 | |||
| 120 | /* Called from syscall or from eBPF program */ | ||
| 121 | static void *htab_map_lookup_elem(struct bpf_map *map, void *key) | ||
| 122 | { | ||
| 123 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 124 | struct hlist_head *head; | ||
| 125 | struct htab_elem *l; | ||
| 126 | u32 hash, key_size; | ||
| 127 | |||
| 128 | /* Must be called with rcu_read_lock. */ | ||
| 129 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 130 | |||
| 131 | key_size = map->key_size; | ||
| 132 | |||
| 133 | hash = htab_map_hash(key, key_size); | ||
| 134 | |||
| 135 | head = select_bucket(htab, hash); | ||
| 136 | |||
| 137 | l = lookup_elem_raw(head, hash, key, key_size); | ||
| 138 | |||
| 139 | if (l) | ||
| 140 | return l->key + round_up(map->key_size, 8); | ||
| 141 | |||
| 142 | return NULL; | ||
| 143 | } | ||
| 144 | |||
| 145 | /* Called from syscall */ | ||
| 146 | static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | ||
| 147 | { | ||
| 148 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 149 | struct hlist_head *head; | ||
| 150 | struct htab_elem *l, *next_l; | ||
| 151 | u32 hash, key_size; | ||
| 152 | int i; | ||
| 153 | |||
| 154 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 155 | |||
| 156 | key_size = map->key_size; | ||
| 157 | |||
| 158 | hash = htab_map_hash(key, key_size); | ||
| 159 | |||
| 160 | head = select_bucket(htab, hash); | ||
| 161 | |||
| 162 | /* lookup the key */ | ||
| 163 | l = lookup_elem_raw(head, hash, key, key_size); | ||
| 164 | |||
| 165 | if (!l) { | ||
| 166 | i = 0; | ||
| 167 | goto find_first_elem; | ||
| 168 | } | ||
| 169 | |||
| 170 | /* key was found, get next key in the same bucket */ | ||
| 171 | next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), | ||
| 172 | struct htab_elem, hash_node); | ||
| 173 | |||
| 174 | if (next_l) { | ||
| 175 | /* if next elem in this hash list is non-zero, just return it */ | ||
| 176 | memcpy(next_key, next_l->key, key_size); | ||
| 177 | return 0; | ||
| 178 | } | ||
| 179 | |||
| 180 | /* no more elements in this hash list, go to the next bucket */ | ||
| 181 | i = hash & (htab->n_buckets - 1); | ||
| 182 | i++; | ||
| 183 | |||
| 184 | find_first_elem: | ||
| 185 | /* iterate over buckets */ | ||
| 186 | for (; i < htab->n_buckets; i++) { | ||
| 187 | head = select_bucket(htab, i); | ||
| 188 | |||
| 189 | /* pick first element in the bucket */ | ||
| 190 | next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), | ||
| 191 | struct htab_elem, hash_node); | ||
| 192 | if (next_l) { | ||
| 193 | /* if it's not empty, just return it */ | ||
| 194 | memcpy(next_key, next_l->key, key_size); | ||
| 195 | return 0; | ||
| 196 | } | ||
| 197 | } | ||
| 198 | |||
| 199 | /* iterated over all buckets and all elements */ | ||
| 200 | return -ENOENT; | ||
| 201 | } | ||
| 202 | |||
| 203 | /* Called from syscall or from eBPF program */ | ||
| 204 | static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, | ||
| 205 | u64 map_flags) | ||
| 206 | { | ||
| 207 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 208 | struct htab_elem *l_new, *l_old; | ||
| 209 | struct hlist_head *head; | ||
| 210 | unsigned long flags; | ||
| 211 | u32 key_size; | ||
| 212 | int ret; | ||
| 213 | |||
| 214 | if (map_flags > BPF_EXIST) | ||
| 215 | /* unknown flags */ | ||
| 216 | return -EINVAL; | ||
| 217 | |||
| 218 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 219 | |||
| 220 | /* allocate new element outside of lock */ | ||
| 221 | l_new = kmalloc(htab->elem_size, GFP_ATOMIC); | ||
| 222 | if (!l_new) | ||
| 223 | return -ENOMEM; | ||
| 224 | |||
| 225 | key_size = map->key_size; | ||
| 226 | |||
| 227 | memcpy(l_new->key, key, key_size); | ||
| 228 | memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); | ||
| 229 | |||
| 230 | l_new->hash = htab_map_hash(l_new->key, key_size); | ||
| 231 | |||
| 232 | /* bpf_map_update_elem() can be called in_irq() */ | ||
| 233 | spin_lock_irqsave(&htab->lock, flags); | ||
| 234 | |||
| 235 | head = select_bucket(htab, l_new->hash); | ||
| 236 | |||
| 237 | l_old = lookup_elem_raw(head, l_new->hash, key, key_size); | ||
| 238 | |||
| 239 | if (!l_old && unlikely(htab->count >= map->max_entries)) { | ||
| 240 | /* if elem with this 'key' doesn't exist and we've reached | ||
| 241 | * max_entries limit, fail insertion of new elem | ||
| 242 | */ | ||
| 243 | ret = -E2BIG; | ||
| 244 | goto err; | ||
| 245 | } | ||
| 246 | |||
| 247 | if (l_old && map_flags == BPF_NOEXIST) { | ||
| 248 | /* elem already exists */ | ||
| 249 | ret = -EEXIST; | ||
| 250 | goto err; | ||
| 251 | } | ||
| 252 | |||
| 253 | if (!l_old && map_flags == BPF_EXIST) { | ||
| 254 | /* elem doesn't exist, cannot update it */ | ||
| 255 | ret = -ENOENT; | ||
| 256 | goto err; | ||
| 257 | } | ||
| 258 | |||
| 259 | /* add new element to the head of the list, so that concurrent | ||
| 260 | * search will find it before old elem | ||
| 261 | */ | ||
| 262 | hlist_add_head_rcu(&l_new->hash_node, head); | ||
| 263 | if (l_old) { | ||
| 264 | hlist_del_rcu(&l_old->hash_node); | ||
| 265 | kfree_rcu(l_old, rcu); | ||
| 266 | } else { | ||
| 267 | htab->count++; | ||
| 268 | } | ||
| 269 | spin_unlock_irqrestore(&htab->lock, flags); | ||
| 270 | |||
| 271 | return 0; | ||
| 272 | err: | ||
| 273 | spin_unlock_irqrestore(&htab->lock, flags); | ||
| 274 | kfree(l_new); | ||
| 275 | return ret; | ||
| 276 | } | ||
| 277 | |||
| 278 | /* Called from syscall or from eBPF program */ | ||
| 279 | static int htab_map_delete_elem(struct bpf_map *map, void *key) | ||
| 280 | { | ||
| 281 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 282 | struct hlist_head *head; | ||
| 283 | struct htab_elem *l; | ||
| 284 | unsigned long flags; | ||
| 285 | u32 hash, key_size; | ||
| 286 | int ret = -ENOENT; | ||
| 287 | |||
| 288 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 289 | |||
| 290 | key_size = map->key_size; | ||
| 291 | |||
| 292 | hash = htab_map_hash(key, key_size); | ||
| 293 | |||
| 294 | spin_lock_irqsave(&htab->lock, flags); | ||
| 295 | |||
| 296 | head = select_bucket(htab, hash); | ||
| 297 | |||
| 298 | l = lookup_elem_raw(head, hash, key, key_size); | ||
| 299 | |||
| 300 | if (l) { | ||
| 301 | hlist_del_rcu(&l->hash_node); | ||
| 302 | htab->count--; | ||
| 303 | kfree_rcu(l, rcu); | ||
| 304 | ret = 0; | ||
| 305 | } | ||
| 306 | |||
| 307 | spin_unlock_irqrestore(&htab->lock, flags); | ||
| 308 | return ret; | ||
| 309 | } | ||
| 310 | |||
| 311 | static void delete_all_elements(struct bpf_htab *htab) | ||
| 312 | { | ||
| 313 | int i; | ||
| 314 | |||
| 315 | for (i = 0; i < htab->n_buckets; i++) { | ||
| 316 | struct hlist_head *head = select_bucket(htab, i); | ||
| 317 | struct hlist_node *n; | ||
| 318 | struct htab_elem *l; | ||
| 319 | |||
| 320 | hlist_for_each_entry_safe(l, n, head, hash_node) { | ||
| 321 | hlist_del_rcu(&l->hash_node); | ||
| 322 | htab->count--; | ||
| 323 | kfree(l); | ||
| 324 | } | ||
| 325 | } | ||
| 326 | } | ||
| 327 | |||
| 328 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ | ||
| 329 | static void htab_map_free(struct bpf_map *map) | ||
| 330 | { | ||
| 331 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 332 | |||
| 333 | /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, | ||
| 334 | * so the programs (can be more than one that used this map) were | ||
| 335 | * disconnected from events. Wait for outstanding critical sections in | ||
| 336 | * these programs to complete | ||
| 337 | */ | ||
| 338 | synchronize_rcu(); | ||
| 339 | |||
| 340 | /* some of the kfree_rcu() callbacks for elements of this map may not have | ||
| 341 | * executed. It's ok. Proceed to free residual elements and map itself | ||
| 342 | */ | ||
| 343 | delete_all_elements(htab); | ||
| 344 | kvfree(htab->buckets); | ||
| 345 | kfree(htab); | ||
| 346 | } | ||
| 347 | |||
| 348 | static struct bpf_map_ops htab_ops = { | ||
| 349 | .map_alloc = htab_map_alloc, | ||
| 350 | .map_free = htab_map_free, | ||
| 351 | .map_get_next_key = htab_map_get_next_key, | ||
| 352 | .map_lookup_elem = htab_map_lookup_elem, | ||
| 353 | .map_update_elem = htab_map_update_elem, | ||
| 354 | .map_delete_elem = htab_map_delete_elem, | ||
| 355 | }; | ||
| 356 | |||
| 357 | static struct bpf_map_type_list tl = { | ||
| 358 | .ops = &htab_ops, | ||
| 359 | .type = BPF_MAP_TYPE_HASH, | ||
| 360 | }; | ||
| 361 | |||
| 362 | static int __init register_htab_map(void) | ||
| 363 | { | ||
| 364 | bpf_register_map_type(&tl); | ||
| 365 | return 0; | ||
| 366 | } | ||
| 367 | late_initcall(register_htab_map); | ||
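htab_map_alloc() above rounds max_entries up to a power of two precisely so that select_bucket() can pick a bucket with a mask instead of a modulo. A standalone sketch of that equivalence (jhash() replaced by a fixed sample value, bucket count chosen arbitrarily):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t n_buckets = 1024;	/* assumed roundup_pow_of_two() result */
	uint32_t hash = 0x9e3779b9;	/* stand-in for a jhash() of some key */

	/* n_buckets is a power of two ... */
	assert((n_buckets & (n_buckets - 1)) == 0);

	/* ... so the mask used by select_bucket() picks the same bucket as
	 * a modulo would, without the division.
	 */
	assert((hash % n_buckets) == (hash & (n_buckets - 1)));

	printf("bucket index = %u\n", hash & (n_buckets - 1));
	return 0;
}
```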
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c new file mode 100644 index 000000000000..9e3414d85459 --- /dev/null +++ b/kernel/bpf/helpers.c | |||
| @@ -0,0 +1,89 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | ||
| 12 | #include <linux/bpf.h> | ||
| 13 | #include <linux/rcupdate.h> | ||
| 14 | |||
| 15 | /* If a kernel subsystem allows eBPF programs to call this function, | ||
| 16 | * its own verifier_ops->get_func_proto() callback should return | ||
| 17 | * bpf_map_lookup_elem_proto, so that the verifier can properly check the arguments | ||
| 18 | * | ||
| 19 | * Different map implementations will rely on rcu in map methods | ||
| 20 | * lookup/update/delete, therefore eBPF programs must run under rcu lock | ||
| 21 | * if program is allowed to access maps, so check rcu_read_lock_held in | ||
| 22 | * all three functions. | ||
| 23 | */ | ||
| 24 | static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 25 | { | ||
| 26 | /* verifier checked that R1 contains a valid pointer to bpf_map | ||
| 27 | * and R2 points to a program stack and map->key_size bytes were | ||
| 28 | * initialized | ||
| 29 | */ | ||
| 30 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 31 | void *key = (void *) (unsigned long) r2; | ||
| 32 | void *value; | ||
| 33 | |||
| 34 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 35 | |||
| 36 | value = map->ops->map_lookup_elem(map, key); | ||
| 37 | |||
| 38 | /* lookup() returns either pointer to element value or NULL | ||
| 39 | * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type | ||
| 40 | */ | ||
| 41 | return (unsigned long) value; | ||
| 42 | } | ||
| 43 | |||
| 44 | struct bpf_func_proto bpf_map_lookup_elem_proto = { | ||
| 45 | .func = bpf_map_lookup_elem, | ||
| 46 | .gpl_only = false, | ||
| 47 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | ||
| 48 | .arg1_type = ARG_CONST_MAP_PTR, | ||
| 49 | .arg2_type = ARG_PTR_TO_MAP_KEY, | ||
| 50 | }; | ||
| 51 | |||
| 52 | static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 53 | { | ||
| 54 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 55 | void *key = (void *) (unsigned long) r2; | ||
| 56 | void *value = (void *) (unsigned long) r3; | ||
| 57 | |||
| 58 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 59 | |||
| 60 | return map->ops->map_update_elem(map, key, value, r4); | ||
| 61 | } | ||
| 62 | |||
| 63 | struct bpf_func_proto bpf_map_update_elem_proto = { | ||
| 64 | .func = bpf_map_update_elem, | ||
| 65 | .gpl_only = false, | ||
| 66 | .ret_type = RET_INTEGER, | ||
| 67 | .arg1_type = ARG_CONST_MAP_PTR, | ||
| 68 | .arg2_type = ARG_PTR_TO_MAP_KEY, | ||
| 69 | .arg3_type = ARG_PTR_TO_MAP_VALUE, | ||
| 70 | .arg4_type = ARG_ANYTHING, | ||
| 71 | }; | ||
| 72 | |||
| 73 | static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 74 | { | ||
| 75 | struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 76 | void *key = (void *) (unsigned long) r2; | ||
| 77 | |||
| 78 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 79 | |||
| 80 | return map->ops->map_delete_elem(map, key); | ||
| 81 | } | ||
| 82 | |||
| 83 | struct bpf_func_proto bpf_map_delete_elem_proto = { | ||
| 84 | .func = bpf_map_delete_elem, | ||
| 85 | .gpl_only = false, | ||
| 86 | .ret_type = RET_INTEGER, | ||
| 87 | .arg1_type = ARG_CONST_MAP_PTR, | ||
| 88 | .arg2_type = ARG_PTR_TO_MAP_KEY, | ||
| 89 | }; | ||
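These three bpf_func_proto objects only become callable from a program once the owning subsystem hands them out through its verifier_ops->get_func_proto() callback, as the header comment says. A sketch of such a callback is below; the `example_` name is hypothetical, and the shape mirrors the test stub registered later in this series (kernel/bpf/test_stub.c):

```c
#include <linux/bpf.h>

/* Hypothetical subsystem callback: hand the verifier the prototypes of the
 * map helpers this subsystem is willing to let its eBPF programs call;
 * anything else is rejected by returning NULL.
 */
static const struct bpf_func_proto *
example_get_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	default:
		return NULL;
	}
}
```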
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..088ac0b1b106 --- /dev/null +++ b/kernel/bpf/syscall.c | |||
| @@ -0,0 +1,606 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | ||
| 12 | #include <linux/bpf.h> | ||
| 13 | #include <linux/syscalls.h> | ||
| 14 | #include <linux/slab.h> | ||
| 15 | #include <linux/anon_inodes.h> | ||
| 16 | #include <linux/file.h> | ||
| 17 | #include <linux/license.h> | ||
| 18 | #include <linux/filter.h> | ||
| 19 | |||
| 20 | static LIST_HEAD(bpf_map_types); | ||
| 21 | |||
| 22 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) | ||
| 23 | { | ||
| 24 | struct bpf_map_type_list *tl; | ||
| 25 | struct bpf_map *map; | ||
| 26 | |||
| 27 | list_for_each_entry(tl, &bpf_map_types, list_node) { | ||
| 28 | if (tl->type == attr->map_type) { | ||
| 29 | map = tl->ops->map_alloc(attr); | ||
| 30 | if (IS_ERR(map)) | ||
| 31 | return map; | ||
| 32 | map->ops = tl->ops; | ||
| 33 | map->map_type = attr->map_type; | ||
| 34 | return map; | ||
| 35 | } | ||
| 36 | } | ||
| 37 | return ERR_PTR(-EINVAL); | ||
| 38 | } | ||
| 39 | |||
| 40 | /* boot time registration of different map implementations */ | ||
| 41 | void bpf_register_map_type(struct bpf_map_type_list *tl) | ||
| 42 | { | ||
| 43 | list_add(&tl->list_node, &bpf_map_types); | ||
| 44 | } | ||
| 45 | |||
| 46 | /* called from workqueue */ | ||
| 47 | static void bpf_map_free_deferred(struct work_struct *work) | ||
| 48 | { | ||
| 49 | struct bpf_map *map = container_of(work, struct bpf_map, work); | ||
| 50 | |||
| 51 | /* implementation dependent freeing */ | ||
| 52 | map->ops->map_free(map); | ||
| 53 | } | ||
| 54 | |||
| 55 | /* decrement map refcnt and schedule it for freeing via workqueue | ||
| 56 | * (underlying map implementation ops->map_free() might sleep) | ||
| 57 | */ | ||
| 58 | void bpf_map_put(struct bpf_map *map) | ||
| 59 | { | ||
| 60 | if (atomic_dec_and_test(&map->refcnt)) { | ||
| 61 | INIT_WORK(&map->work, bpf_map_free_deferred); | ||
| 62 | schedule_work(&map->work); | ||
| 63 | } | ||
| 64 | } | ||
| 65 | |||
| 66 | static int bpf_map_release(struct inode *inode, struct file *filp) | ||
| 67 | { | ||
| 68 | struct bpf_map *map = filp->private_data; | ||
| 69 | |||
| 70 | bpf_map_put(map); | ||
| 71 | return 0; | ||
| 72 | } | ||
| 73 | |||
| 74 | static const struct file_operations bpf_map_fops = { | ||
| 75 | .release = bpf_map_release, | ||
| 76 | }; | ||
| 77 | |||
| 78 | /* helper macro to check that unused fields of 'union bpf_attr' are zero */ | ||
| 79 | #define CHECK_ATTR(CMD) \ | ||
| 80 | memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ | ||
| 81 | sizeof(attr->CMD##_LAST_FIELD), 0, \ | ||
| 82 | sizeof(*attr) - \ | ||
| 83 | offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ | ||
| 84 | sizeof(attr->CMD##_LAST_FIELD)) != NULL | ||
| 85 | |||
| 86 | #define BPF_MAP_CREATE_LAST_FIELD max_entries | ||
| 87 | /* called via syscall */ | ||
| 88 | static int map_create(union bpf_attr *attr) | ||
| 89 | { | ||
| 90 | struct bpf_map *map; | ||
| 91 | int err; | ||
| 92 | |||
| 93 | err = CHECK_ATTR(BPF_MAP_CREATE); | ||
| 94 | if (err) | ||
| 95 | return -EINVAL; | ||
| 96 | |||
| 97 | /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ | ||
| 98 | map = find_and_alloc_map(attr); | ||
| 99 | if (IS_ERR(map)) | ||
| 100 | return PTR_ERR(map); | ||
| 101 | |||
| 102 | atomic_set(&map->refcnt, 1); | ||
| 103 | |||
| 104 | err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); | ||
| 105 | |||
| 106 | if (err < 0) | ||
| 107 | /* failed to allocate fd */ | ||
| 108 | goto free_map; | ||
| 109 | |||
| 110 | return err; | ||
| 111 | |||
| 112 | free_map: | ||
| 113 | map->ops->map_free(map); | ||
| 114 | return err; | ||
| 115 | } | ||
| 116 | |||
| 117 | /* if error is returned, fd is released. | ||
| 118 | * On success caller should complete fd access with matching fdput() | ||
| 119 | */ | ||
| 120 | struct bpf_map *bpf_map_get(struct fd f) | ||
| 121 | { | ||
| 122 | struct bpf_map *map; | ||
| 123 | |||
| 124 | if (!f.file) | ||
| 125 | return ERR_PTR(-EBADF); | ||
| 126 | |||
| 127 | if (f.file->f_op != &bpf_map_fops) { | ||
| 128 | fdput(f); | ||
| 129 | return ERR_PTR(-EINVAL); | ||
| 130 | } | ||
| 131 | |||
| 132 | map = f.file->private_data; | ||
| 133 | |||
| 134 | return map; | ||
| 135 | } | ||
| 136 | |||
| 137 | /* helper to convert user pointers passed inside __aligned_u64 fields */ | ||
| 138 | static void __user *u64_to_ptr(__u64 val) | ||
| 139 | { | ||
| 140 | return (void __user *) (unsigned long) val; | ||
| 141 | } | ||
| 142 | |||
| 143 | /* last field in 'union bpf_attr' used by this command */ | ||
| 144 | #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value | ||
| 145 | |||
| 146 | static int map_lookup_elem(union bpf_attr *attr) | ||
| 147 | { | ||
| 148 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 149 | void __user *uvalue = u64_to_ptr(attr->value); | ||
| 150 | int ufd = attr->map_fd; | ||
| 151 | struct fd f = fdget(ufd); | ||
| 152 | struct bpf_map *map; | ||
| 153 | void *key, *value; | ||
| 154 | int err; | ||
| 155 | |||
| 156 | if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) | ||
| 157 | return -EINVAL; | ||
| 158 | |||
| 159 | map = bpf_map_get(f); | ||
| 160 | if (IS_ERR(map)) | ||
| 161 | return PTR_ERR(map); | ||
| 162 | |||
| 163 | err = -ENOMEM; | ||
| 164 | key = kmalloc(map->key_size, GFP_USER); | ||
| 165 | if (!key) | ||
| 166 | goto err_put; | ||
| 167 | |||
| 168 | err = -EFAULT; | ||
| 169 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 170 | goto free_key; | ||
| 171 | |||
| 172 | err = -ENOENT; | ||
| 173 | rcu_read_lock(); | ||
| 174 | value = map->ops->map_lookup_elem(map, key); | ||
| 175 | if (!value) | ||
| 176 | goto err_unlock; | ||
| 177 | |||
| 178 | err = -EFAULT; | ||
| 179 | if (copy_to_user(uvalue, value, map->value_size) != 0) | ||
| 180 | goto err_unlock; | ||
| 181 | |||
| 182 | err = 0; | ||
| 183 | |||
| 184 | err_unlock: | ||
| 185 | rcu_read_unlock(); | ||
| 186 | free_key: | ||
| 187 | kfree(key); | ||
| 188 | err_put: | ||
| 189 | fdput(f); | ||
| 190 | return err; | ||
| 191 | } | ||
| 192 | |||
| 193 | #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags | ||
| 194 | |||
| 195 | static int map_update_elem(union bpf_attr *attr) | ||
| 196 | { | ||
| 197 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 198 | void __user *uvalue = u64_to_ptr(attr->value); | ||
| 199 | int ufd = attr->map_fd; | ||
| 200 | struct fd f = fdget(ufd); | ||
| 201 | struct bpf_map *map; | ||
| 202 | void *key, *value; | ||
| 203 | int err; | ||
| 204 | |||
| 205 | if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) | ||
| 206 | return -EINVAL; | ||
| 207 | |||
| 208 | map = bpf_map_get(f); | ||
| 209 | if (IS_ERR(map)) | ||
| 210 | return PTR_ERR(map); | ||
| 211 | |||
| 212 | err = -ENOMEM; | ||
| 213 | key = kmalloc(map->key_size, GFP_USER); | ||
| 214 | if (!key) | ||
| 215 | goto err_put; | ||
| 216 | |||
| 217 | err = -EFAULT; | ||
| 218 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 219 | goto free_key; | ||
| 220 | |||
| 221 | err = -ENOMEM; | ||
| 222 | value = kmalloc(map->value_size, GFP_USER); | ||
| 223 | if (!value) | ||
| 224 | goto free_key; | ||
| 225 | |||
| 226 | err = -EFAULT; | ||
| 227 | if (copy_from_user(value, uvalue, map->value_size) != 0) | ||
| 228 | goto free_value; | ||
| 229 | |||
| 230 | /* eBPF programs that use maps are running under rcu_read_lock(), | ||
| 231 | * therefore all map accessors rely on this fact, so do the same here | ||
| 232 | */ | ||
| 233 | rcu_read_lock(); | ||
| 234 | err = map->ops->map_update_elem(map, key, value, attr->flags); | ||
| 235 | rcu_read_unlock(); | ||
| 236 | |||
| 237 | free_value: | ||
| 238 | kfree(value); | ||
| 239 | free_key: | ||
| 240 | kfree(key); | ||
| 241 | err_put: | ||
| 242 | fdput(f); | ||
| 243 | return err; | ||
| 244 | } | ||
| 245 | |||
| 246 | #define BPF_MAP_DELETE_ELEM_LAST_FIELD key | ||
| 247 | |||
| 248 | static int map_delete_elem(union bpf_attr *attr) | ||
| 249 | { | ||
| 250 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 251 | int ufd = attr->map_fd; | ||
| 252 | struct fd f = fdget(ufd); | ||
| 253 | struct bpf_map *map; | ||
| 254 | void *key; | ||
| 255 | int err; | ||
| 256 | |||
| 257 | if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) | ||
| 258 | return -EINVAL; | ||
| 259 | |||
| 260 | map = bpf_map_get(f); | ||
| 261 | if (IS_ERR(map)) | ||
| 262 | return PTR_ERR(map); | ||
| 263 | |||
| 264 | err = -ENOMEM; | ||
| 265 | key = kmalloc(map->key_size, GFP_USER); | ||
| 266 | if (!key) | ||
| 267 | goto err_put; | ||
| 268 | |||
| 269 | err = -EFAULT; | ||
| 270 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 271 | goto free_key; | ||
| 272 | |||
| 273 | rcu_read_lock(); | ||
| 274 | err = map->ops->map_delete_elem(map, key); | ||
| 275 | rcu_read_unlock(); | ||
| 276 | |||
| 277 | free_key: | ||
| 278 | kfree(key); | ||
| 279 | err_put: | ||
| 280 | fdput(f); | ||
| 281 | return err; | ||
| 282 | } | ||
| 283 | |||
| 284 | /* last field in 'union bpf_attr' used by this command */ | ||
| 285 | #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key | ||
| 286 | |||
| 287 | static int map_get_next_key(union bpf_attr *attr) | ||
| 288 | { | ||
| 289 | void __user *ukey = u64_to_ptr(attr->key); | ||
| 290 | void __user *unext_key = u64_to_ptr(attr->next_key); | ||
| 291 | int ufd = attr->map_fd; | ||
| 292 | struct fd f = fdget(ufd); | ||
| 293 | struct bpf_map *map; | ||
| 294 | void *key, *next_key; | ||
| 295 | int err; | ||
| 296 | |||
| 297 | if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) | ||
| 298 | return -EINVAL; | ||
| 299 | |||
| 300 | map = bpf_map_get(f); | ||
| 301 | if (IS_ERR(map)) | ||
| 302 | return PTR_ERR(map); | ||
| 303 | |||
| 304 | err = -ENOMEM; | ||
| 305 | key = kmalloc(map->key_size, GFP_USER); | ||
| 306 | if (!key) | ||
| 307 | goto err_put; | ||
| 308 | |||
| 309 | err = -EFAULT; | ||
| 310 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 311 | goto free_key; | ||
| 312 | |||
| 313 | err = -ENOMEM; | ||
| 314 | next_key = kmalloc(map->key_size, GFP_USER); | ||
| 315 | if (!next_key) | ||
| 316 | goto free_key; | ||
| 317 | |||
| 318 | rcu_read_lock(); | ||
| 319 | err = map->ops->map_get_next_key(map, key, next_key); | ||
| 320 | rcu_read_unlock(); | ||
| 321 | if (err) | ||
| 322 | goto free_next_key; | ||
| 323 | |||
| 324 | err = -EFAULT; | ||
| 325 | if (copy_to_user(unext_key, next_key, map->key_size) != 0) | ||
| 326 | goto free_next_key; | ||
| 327 | |||
| 328 | err = 0; | ||
| 329 | |||
| 330 | free_next_key: | ||
| 331 | kfree(next_key); | ||
| 332 | free_key: | ||
| 333 | kfree(key); | ||
| 334 | err_put: | ||
| 335 | fdput(f); | ||
| 336 | return err; | ||
| 337 | } | ||
| 338 | |||
| 339 | static LIST_HEAD(bpf_prog_types); | ||
| 340 | |||
| 341 | static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) | ||
| 342 | { | ||
| 343 | struct bpf_prog_type_list *tl; | ||
| 344 | |||
| 345 | list_for_each_entry(tl, &bpf_prog_types, list_node) { | ||
| 346 | if (tl->type == type) { | ||
| 347 | prog->aux->ops = tl->ops; | ||
| 348 | prog->aux->prog_type = type; | ||
| 349 | return 0; | ||
| 350 | } | ||
| 351 | } | ||
| 352 | return -EINVAL; | ||
| 353 | } | ||
| 354 | |||
| 355 | void bpf_register_prog_type(struct bpf_prog_type_list *tl) | ||
| 356 | { | ||
| 357 | list_add(&tl->list_node, &bpf_prog_types); | ||
| 358 | } | ||
| 359 | |||
| 360 | /* fixup insn->imm field of bpf_call instructions: | ||
| 361 | * if (insn->imm == BPF_FUNC_map_lookup_elem) | ||
| 362 | * insn->imm = bpf_map_lookup_elem - __bpf_call_base; | ||
| 363 | * else if (insn->imm == BPF_FUNC_map_update_elem) | ||
| 364 | * insn->imm = bpf_map_update_elem - __bpf_call_base; | ||
| 365 | * else ... | ||
| 366 | * | ||
| 367 | * this function is called after eBPF program passed verification | ||
| 368 | */ | ||
| 369 | static void fixup_bpf_calls(struct bpf_prog *prog) | ||
| 370 | { | ||
| 371 | const struct bpf_func_proto *fn; | ||
| 372 | int i; | ||
| 373 | |||
| 374 | for (i = 0; i < prog->len; i++) { | ||
| 375 | struct bpf_insn *insn = &prog->insnsi[i]; | ||
| 376 | |||
| 377 | if (insn->code == (BPF_JMP | BPF_CALL)) { | ||
| 378 | /* we reach here when the program has bpf_call instructions | ||
| 379 | * and it passed bpf_check(), which means that | ||
| 380 | * ops->get_func_proto must have been supplied; check it | ||
| 381 | */ | ||
| 382 | BUG_ON(!prog->aux->ops->get_func_proto); | ||
| 383 | |||
| 384 | fn = prog->aux->ops->get_func_proto(insn->imm); | ||
| 385 | /* all functions that have a prototype and that the verifier allowed | ||
| 386 | * programs to call must be real in-kernel functions | ||
| 387 | */ | ||
| 388 | BUG_ON(!fn->func); | ||
| 389 | insn->imm = fn->func - __bpf_call_base; | ||
| 390 | } | ||
| 391 | } | ||
| 392 | } | ||
| 393 | |||
| 394 | /* drop refcnt on maps used by eBPF program and free auxiliary data */ | ||
| 395 | static void free_used_maps(struct bpf_prog_aux *aux) | ||
| 396 | { | ||
| 397 | int i; | ||
| 398 | |||
| 399 | for (i = 0; i < aux->used_map_cnt; i++) | ||
| 400 | bpf_map_put(aux->used_maps[i]); | ||
| 401 | |||
| 402 | kfree(aux->used_maps); | ||
| 403 | } | ||
| 404 | |||
| 405 | void bpf_prog_put(struct bpf_prog *prog) | ||
| 406 | { | ||
| 407 | if (atomic_dec_and_test(&prog->aux->refcnt)) { | ||
| 408 | free_used_maps(prog->aux); | ||
| 409 | bpf_prog_free(prog); | ||
| 410 | } | ||
| 411 | } | ||
| 412 | |||
| 413 | static int bpf_prog_release(struct inode *inode, struct file *filp) | ||
| 414 | { | ||
| 415 | struct bpf_prog *prog = filp->private_data; | ||
| 416 | |||
| 417 | bpf_prog_put(prog); | ||
| 418 | return 0; | ||
| 419 | } | ||
| 420 | |||
| 421 | static const struct file_operations bpf_prog_fops = { | ||
| 422 | .release = bpf_prog_release, | ||
| 423 | }; | ||
| 424 | |||
| 425 | static struct bpf_prog *get_prog(struct fd f) | ||
| 426 | { | ||
| 427 | struct bpf_prog *prog; | ||
| 428 | |||
| 429 | if (!f.file) | ||
| 430 | return ERR_PTR(-EBADF); | ||
| 431 | |||
| 432 | if (f.file->f_op != &bpf_prog_fops) { | ||
| 433 | fdput(f); | ||
| 434 | return ERR_PTR(-EINVAL); | ||
| 435 | } | ||
| 436 | |||
| 437 | prog = f.file->private_data; | ||
| 438 | |||
| 439 | return prog; | ||
| 440 | } | ||
| 441 | |||
| 442 | /* called by sockets/tracing/seccomp before attaching program to an event | ||
| 443 | * pairs with bpf_prog_put() | ||
| 444 | */ | ||
| 445 | struct bpf_prog *bpf_prog_get(u32 ufd) | ||
| 446 | { | ||
| 447 | struct fd f = fdget(ufd); | ||
| 448 | struct bpf_prog *prog; | ||
| 449 | |||
| 450 | prog = get_prog(f); | ||
| 451 | |||
| 452 | if (IS_ERR(prog)) | ||
| 453 | return prog; | ||
| 454 | |||
| 455 | atomic_inc(&prog->aux->refcnt); | ||
| 456 | fdput(f); | ||
| 457 | return prog; | ||
| 458 | } | ||
| 459 | |||
| 460 | /* last field in 'union bpf_attr' used by this command */ | ||
| 461 | #define BPF_PROG_LOAD_LAST_FIELD log_buf | ||
| 462 | |||
| 463 | static int bpf_prog_load(union bpf_attr *attr) | ||
| 464 | { | ||
| 465 | enum bpf_prog_type type = attr->prog_type; | ||
| 466 | struct bpf_prog *prog; | ||
| 467 | int err; | ||
| 468 | char license[128]; | ||
| 469 | bool is_gpl; | ||
| 470 | |||
| 471 | if (CHECK_ATTR(BPF_PROG_LOAD)) | ||
| 472 | return -EINVAL; | ||
| 473 | |||
| 474 | /* copy eBPF program license from user space */ | ||
| 475 | if (strncpy_from_user(license, u64_to_ptr(attr->license), | ||
| 476 | sizeof(license) - 1) < 0) | ||
| 477 | return -EFAULT; | ||
| 478 | license[sizeof(license) - 1] = 0; | ||
| 479 | |||
| 480 | /* eBPF programs must be GPL compatible to use GPL-ed functions */ | ||
| 481 | is_gpl = license_is_gpl_compatible(license); | ||
| 482 | |||
| 483 | if (attr->insn_cnt >= BPF_MAXINSNS) | ||
| 484 | return -EINVAL; | ||
| 485 | |||
| 486 | /* plain bpf_prog allocation */ | ||
| 487 | prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); | ||
| 488 | if (!prog) | ||
| 489 | return -ENOMEM; | ||
| 490 | |||
| 491 | prog->len = attr->insn_cnt; | ||
| 492 | |||
| 493 | err = -EFAULT; | ||
| 494 | if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), | ||
| 495 | prog->len * sizeof(struct bpf_insn)) != 0) | ||
| 496 | goto free_prog; | ||
| 497 | |||
| 498 | prog->orig_prog = NULL; | ||
| 499 | prog->jited = false; | ||
| 500 | |||
| 501 | atomic_set(&prog->aux->refcnt, 1); | ||
| 502 | prog->aux->is_gpl_compatible = is_gpl; | ||
| 503 | |||
| 504 | /* find program type: socket_filter vs tracing_filter */ | ||
| 505 | err = find_prog_type(type, prog); | ||
| 506 | if (err < 0) | ||
| 507 | goto free_prog; | ||
| 508 | |||
| 509 | /* run eBPF verifier */ | ||
| 510 | err = bpf_check(prog, attr); | ||
| 511 | |||
| 512 | if (err < 0) | ||
| 513 | goto free_used_maps; | ||
| 514 | |||
| 515 | /* fixup BPF_CALL->imm field */ | ||
| 516 | fixup_bpf_calls(prog); | ||
| 517 | |||
| 518 | /* eBPF program is ready to be JITed */ | ||
| 519 | bpf_prog_select_runtime(prog); | ||
| 520 | |||
| 521 | err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); | ||
| 522 | |||
| 523 | if (err < 0) | ||
| 524 | /* failed to allocate fd */ | ||
| 525 | goto free_used_maps; | ||
| 526 | |||
| 527 | return err; | ||
| 528 | |||
| 529 | free_used_maps: | ||
| 530 | free_used_maps(prog->aux); | ||
| 531 | free_prog: | ||
| 532 | bpf_prog_free(prog); | ||
| 533 | return err; | ||
| 534 | } | ||
| 535 | |||
| 536 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) | ||
| 537 | { | ||
| 538 | union bpf_attr attr = {}; | ||
| 539 | int err; | ||
| 540 | |||
| 541 | /* the syscall is limited to root temporarily. This restriction will be | ||
| 542 | * lifted when security audit is clean. Note that eBPF+tracing must have | ||
| 543 | * this restriction, since it may pass kernel data to user space | ||
| 544 | */ | ||
| 545 | if (!capable(CAP_SYS_ADMIN)) | ||
| 546 | return -EPERM; | ||
| 547 | |||
| 548 | if (!access_ok(VERIFY_READ, uattr, 1)) | ||
| 549 | return -EFAULT; | ||
| 550 | |||
| 551 | if (size > PAGE_SIZE) /* silly large */ | ||
| 552 | return -E2BIG; | ||
| 553 | |||
| 554 | /* If we're handed a bigger struct than we know of, | ||
| 555 | * ensure all the unknown bits are 0 - i.e. new | ||
| 556 | * user-space does not rely on any kernel feature | ||
| 557 | * extensions we don't know about yet. | ||
| 558 | */ | ||
| 559 | if (size > sizeof(attr)) { | ||
| 560 | unsigned char __user *addr; | ||
| 561 | unsigned char __user *end; | ||
| 562 | unsigned char val; | ||
| 563 | |||
| 564 | addr = (void __user *)uattr + sizeof(attr); | ||
| 565 | end = (void __user *)uattr + size; | ||
| 566 | |||
| 567 | for (; addr < end; addr++) { | ||
| 568 | err = get_user(val, addr); | ||
| 569 | if (err) | ||
| 570 | return err; | ||
| 571 | if (val) | ||
| 572 | return -E2BIG; | ||
| 573 | } | ||
| 574 | size = sizeof(attr); | ||
| 575 | } | ||
| 576 | |||
| 577 | /* copy attributes from user space, may be less than sizeof(bpf_attr) */ | ||
| 578 | if (copy_from_user(&attr, uattr, size) != 0) | ||
| 579 | return -EFAULT; | ||
| 580 | |||
| 581 | switch (cmd) { | ||
| 582 | case BPF_MAP_CREATE: | ||
| 583 | err = map_create(&attr); | ||
| 584 | break; | ||
| 585 | case BPF_MAP_LOOKUP_ELEM: | ||
| 586 | err = map_lookup_elem(&attr); | ||
| 587 | break; | ||
| 588 | case BPF_MAP_UPDATE_ELEM: | ||
| 589 | err = map_update_elem(&attr); | ||
| 590 | break; | ||
| 591 | case BPF_MAP_DELETE_ELEM: | ||
| 592 | err = map_delete_elem(&attr); | ||
| 593 | break; | ||
| 594 | case BPF_MAP_GET_NEXT_KEY: | ||
| 595 | err = map_get_next_key(&attr); | ||
| 596 | break; | ||
| 597 | case BPF_PROG_LOAD: | ||
| 598 | err = bpf_prog_load(&attr); | ||
| 599 | break; | ||
| 600 | default: | ||
| 601 | err = -EINVAL; | ||
| 602 | break; | ||
| 603 | } | ||
| 604 | |||
| 605 | return err; | ||
| 606 | } | ||
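From user space, the command dispatch above is reached through the new bpf(2) syscall. A rough sketch of driving it to create a hash map and store one element follows; it assumes the exported uapi linux/bpf.h, an architecture that defines __NR_bpf, and (per the capable() check above) root privileges:

```c
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
	union bpf_attr attr;
	unsigned long long key = 1, value = 42, out = 0;
	int map_fd;

	/* BPF_MAP_CREATE: unused attr fields must stay zero (see CHECK_ATTR) */
	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_HASH;
	attr.key_size    = sizeof(key);
	attr.value_size  = sizeof(value);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
	if (map_fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	/* BPF_MAP_UPDATE_ELEM: key/value are passed as user pointers */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (unsigned long long)(unsigned long)&key;
	attr.value  = (unsigned long long)(unsigned long)&value;
	attr.flags  = BPF_ANY;		/* create the element or update it */
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr))
		perror("BPF_MAP_UPDATE_ELEM");

	/* BPF_MAP_LOOKUP_ELEM: the value is copied back into 'out' */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (unsigned long long)(unsigned long)&key;
	attr.value  = (unsigned long long)(unsigned long)&out;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr) == 0)
		printf("key %llu -> value %llu\n", key, out);

	close(map_fd);
	return 0;
}
```

Zeroing the whole attribute union before each command is what keeps the kernel-side CHECK_ATTR() test happy, since it verifies that every field past the command's last used one is zero.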
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c new file mode 100644 index 000000000000..0ceae1e6e8b5 --- /dev/null +++ b/kernel/bpf/test_stub.c | |||
| @@ -0,0 +1,78 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | ||
| 7 | #include <linux/kernel.h> | ||
| 8 | #include <linux/types.h> | ||
| 9 | #include <linux/slab.h> | ||
| 10 | #include <linux/err.h> | ||
| 11 | #include <linux/bpf.h> | ||
| 12 | |||
| 13 | /* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC | ||
| 14 | * to be used by user space verifier testsuite | ||
| 15 | */ | ||
| 16 | struct bpf_context { | ||
| 17 | u64 arg1; | ||
| 18 | u64 arg2; | ||
| 19 | }; | ||
| 20 | |||
| 21 | static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) | ||
| 22 | { | ||
| 23 | switch (func_id) { | ||
| 24 | case BPF_FUNC_map_lookup_elem: | ||
| 25 | return &bpf_map_lookup_elem_proto; | ||
| 26 | case BPF_FUNC_map_update_elem: | ||
| 27 | return &bpf_map_update_elem_proto; | ||
| 28 | case BPF_FUNC_map_delete_elem: | ||
| 29 | return &bpf_map_delete_elem_proto; | ||
| 30 | default: | ||
| 31 | return NULL; | ||
| 32 | } | ||
| 33 | } | ||
| 34 | |||
| 35 | static const struct bpf_context_access { | ||
| 36 | int size; | ||
| 37 | enum bpf_access_type type; | ||
| 38 | } test_ctx_access[] = { | ||
| 39 | [offsetof(struct bpf_context, arg1)] = { | ||
| 40 | FIELD_SIZEOF(struct bpf_context, arg1), | ||
| 41 | BPF_READ | ||
| 42 | }, | ||
| 43 | [offsetof(struct bpf_context, arg2)] = { | ||
| 44 | FIELD_SIZEOF(struct bpf_context, arg2), | ||
| 45 | BPF_READ | ||
| 46 | }, | ||
| 47 | }; | ||
| 48 | |||
| 49 | static bool test_is_valid_access(int off, int size, enum bpf_access_type type) | ||
| 50 | { | ||
| 51 | const struct bpf_context_access *access; | ||
| 52 | |||
| 53 | if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) | ||
| 54 | return false; | ||
| 55 | |||
| 56 | access = &test_ctx_access[off]; | ||
| 57 | if (access->size == size && (access->type & type)) | ||
| 58 | return true; | ||
| 59 | |||
| 60 | return false; | ||
| 61 | } | ||
| 62 | |||
| 63 | static struct bpf_verifier_ops test_ops = { | ||
| 64 | .get_func_proto = test_func_proto, | ||
| 65 | .is_valid_access = test_is_valid_access, | ||
| 66 | }; | ||
| 67 | |||
| 68 | static struct bpf_prog_type_list tl_prog = { | ||
| 69 | .ops = &test_ops, | ||
| 70 | .type = BPF_PROG_TYPE_UNSPEC, | ||
| 71 | }; | ||
| 72 | |||
| 73 | static int __init register_test_ops(void) | ||
| 74 | { | ||
| 75 | bpf_register_prog_type(&tl_prog); | ||
| 76 | return 0; | ||
| 77 | } | ||
| 78 | late_initcall(register_test_ops); | ||
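test_is_valid_access() above only admits full-width reads of the two bpf_context fields: the table is indexed by byte offset, so any unlisted offset hits a zeroed entry and fails the size/type comparison. The same logic, pulled out into a standalone program with hypothetical probe values:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct bpf_context { uint64_t arg1; uint64_t arg2; };

enum access_type { ACC_READ = 1, ACC_WRITE = 2 };

/* indexed by byte offset into struct bpf_context, like test_ctx_access */
static const struct { int size; int type; } allowed[] = {
	[offsetof(struct bpf_context, arg1)] = { sizeof(uint64_t), ACC_READ },
	[offsetof(struct bpf_context, arg2)] = { sizeof(uint64_t), ACC_READ },
};

static bool is_valid_access(int off, int size, int type)
{
	if (off < 0 || (size_t)off >= sizeof(allowed) / sizeof(allowed[0]))
		return false;
	/* unlisted offsets have size 0 / type 0 and therefore never match */
	return allowed[off].size == size && (allowed[off].type & type);
}

int main(void)
{
	printf("read  arg1 as u64: %d\n", is_valid_access(0, 8, ACC_READ));
	printf("read  arg2 as u32: %d\n", is_valid_access(8, 4, ACC_READ));
	printf("write arg1 as u64: %d\n", is_valid_access(0, 8, ACC_WRITE));
	return 0;
}
```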
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c new file mode 100644 index 000000000000..a28e09c7825d --- /dev/null +++ b/kernel/bpf/verifier.c | |||
| @@ -0,0 +1,2003 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | ||
| 12 | #include <linux/kernel.h> | ||
| 13 | #include <linux/types.h> | ||
| 14 | #include <linux/slab.h> | ||
| 15 | #include <linux/bpf.h> | ||
| 16 | #include <linux/filter.h> | ||
| 17 | #include <net/netlink.h> | ||
| 18 | #include <linux/file.h> | ||
| 19 | #include <linux/vmalloc.h> | ||
| 20 | |||
| 21 | /* bpf_check() is a static code analyzer that walks eBPF program | ||
| 22 | * instruction by instruction and updates register/stack state. | ||
| 23 | * All paths of conditional branches are analyzed until 'bpf_exit' insn. | ||
| 24 | * | ||
| 25 | * The first pass is depth-first-search to check that the program is a DAG. | ||
| 26 | * It rejects the following programs: | ||
| 27 | * - larger than BPF_MAXINSNS insns | ||
| 28 | * - if loop is present (detected via back-edge) | ||
| 29 | * - unreachable insns exist (shouldn't be a forest. program = one function) | ||
| 30 | * - out of bounds or malformed jumps | ||
| 31 | * The second pass is all possible path descent from the 1st insn. | ||
| 32 | * Since it's analyzing all paths through the program, the length of the | ||
| 33 | * analysis is limited to 32k insn, which may be hit even if total number of | ||
| 34 | * insn is less than 4K, but there are too many branches that change stack/regs. | ||
| 35 | * Number of 'branches to be analyzed' is limited to 1k | ||
| 36 | * | ||
| 37 | * On entry to each instruction, each register has a type, and the instruction | ||
| 38 | * changes the types of the registers depending on instruction semantics. | ||
| 39 | * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is | ||
| 40 | * copied to R1. | ||
| 41 | * | ||
| 42 | * All registers are 64-bit. | ||
| 43 | * R0 - return register | ||
| 44 | * R1-R5 argument passing registers | ||
| 45 | * R6-R9 callee saved registers | ||
| 46 | * R10 - frame pointer read-only | ||
| 47 | * | ||
| 48 | * At the start of BPF program the register R1 contains a pointer to bpf_context | ||
| 49 | * and has type PTR_TO_CTX. | ||
| 50 | * | ||
| 51 | * Verifier tracks arithmetic operations on pointers in case: | ||
| 52 | * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), | ||
| 53 | * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20), | ||
| 54 | * 1st insn copies R10 (which has FRAME_PTR) type into R1 | ||
| 55 | * and 2nd arithmetic instruction is pattern matched to recognize | ||
| 56 | * that it wants to construct a pointer to some element within stack. | ||
| 57 | * So after 2nd insn, the register R1 has type PTR_TO_STACK | ||
| 58 | * (and -20 constant is saved for further stack bounds checking). | ||
| 59 | * Meaning that this reg is a pointer to stack plus known immediate constant. | ||
| 60 | * | ||
| 61 | * Most of the time the registers have UNKNOWN_VALUE type, which | ||
| 62 | * means the register has some value, but it's not a valid pointer. | ||
| 63 | * (like pointer plus pointer becomes UNKNOWN_VALUE type) | ||
| 64 | * | ||
| 65 | * When verifier sees load or store instructions the type of base register | ||
| 66 | * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer | ||
| 67 | * types recognized by check_mem_access() function. | ||
| 68 | * | ||
| 69 | * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' | ||
| 70 | * and the range of [ptr, ptr + map's value_size) is accessible. | ||
| 71 | * | ||
| 72 | * registers used to pass values to function calls are checked against | ||
| 73 | * function argument constraints. | ||
| 74 | * | ||
| 75 | * ARG_PTR_TO_MAP_KEY is one of such argument constraints. | ||
| 76 | * It means that the register type passed to this function must be | ||
| 77 | * PTR_TO_STACK and it will be used inside the function as | ||
| 78 | * 'pointer to map element key' | ||
| 79 | * | ||
| 80 | * For example the argument constraints for bpf_map_lookup_elem(): | ||
| 81 | * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | ||
| 82 | * .arg1_type = ARG_CONST_MAP_PTR, | ||
| 83 | * .arg2_type = ARG_PTR_TO_MAP_KEY, | ||
| 84 | * | ||
| 85 | * ret_type says that this function returns 'pointer to map elem value or null' | ||
| 86 | * function expects 1st argument to be a const pointer to 'struct bpf_map' and | ||
| 87 | * 2nd argument should be a pointer to stack, which will be used inside | ||
| 88 | * the helper function as a pointer to map element key. | ||
| 89 | * | ||
| 90 | * On the kernel side the helper function looks like: | ||
| 91 | * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 92 | * { | ||
| 93 | * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; | ||
| 94 | * void *key = (void *) (unsigned long) r2; | ||
| 95 | * void *value; | ||
| 96 | * | ||
| 97 | * here kernel can access 'key' and 'map' pointers safely, knowing that | ||
| 98 | * [key, key + map->key_size) bytes are valid and were initialized on | ||
| 99 | * the stack of eBPF program. | ||
| 100 | * } | ||
| 101 | * | ||
| 102 | * Corresponding eBPF program may look like: | ||
| 103 | * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR | ||
| 104 | * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK | ||
| 105 | * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP | ||
| 106 | * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), | ||
| 107 | * here verifier looks at prototype of map_lookup_elem() and sees: | ||
| 108 | * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok, | ||
| 109 | * Now verifier knows that this map has key of R1->map_ptr->key_size bytes | ||
| 110 | * | ||
| 111 | * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far, | ||
| 112 | * Now verifier checks that [R2, R2 + map's key_size) are within stack limits | ||
| 113 | * and were initialized prior to this call. | ||
| 114 | * If it's ok, then verifier allows this BPF_CALL insn and looks at | ||
| 115 | * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets | ||
| 116 | * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function | ||
| 117 | * returns either a pointer to the map value or NULL. | ||
| 118 | * | ||
| 119 | * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' | ||
| 120 | * insn, the register holding that pointer in the true branch changes state to | ||
| 121 | * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false | ||
| 122 | * branch. See check_cond_jmp_op(). | ||
| 123 | * | ||
| 124 | * After the call R0 is set to return type of the function and registers R1-R5 | ||
| 125 | * are set to NOT_INIT to indicate that they are no longer readable. | ||
| 126 | */ | ||
| 127 | |||
| 128 | /* types of values stored in eBPF registers */ | ||
| 129 | enum bpf_reg_type { | ||
| 130 | NOT_INIT = 0, /* nothing was written into register */ | ||
| 131 | UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ | ||
| 132 | PTR_TO_CTX, /* reg points to bpf_context */ | ||
| 133 | CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ | ||
| 134 | PTR_TO_MAP_VALUE, /* reg points to map element value */ | ||
| 135 | PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ | ||
| 136 | FRAME_PTR, /* reg == frame_pointer */ | ||
| 137 | PTR_TO_STACK, /* reg == frame_pointer + imm */ | ||
| 138 | CONST_IMM, /* constant integer value */ | ||
| 139 | }; | ||
| 140 | |||
| 141 | struct reg_state { | ||
| 142 | enum bpf_reg_type type; | ||
| 143 | union { | ||
| 144 | /* valid when type == CONST_IMM | PTR_TO_STACK */ | ||
| 145 | int imm; | ||
| 146 | |||
| 147 | /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | | ||
| 148 | * PTR_TO_MAP_VALUE_OR_NULL | ||
| 149 | */ | ||
| 150 | struct bpf_map *map_ptr; | ||
| 151 | }; | ||
| 152 | }; | ||
| 153 | |||
| 154 | enum bpf_stack_slot_type { | ||
| 155 | STACK_INVALID, /* nothing was stored in this stack slot */ | ||
| 156 | STACK_SPILL, /* register spilled into stack */ | ||
| 157 | STACK_MISC /* BPF program wrote some data into this slot */ | ||
| 158 | }; | ||
| 159 | |||
| 160 | #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ | ||
| 161 | |||
| 162 | /* state of the program: | ||
| 163 | * type of all registers and stack info | ||
| 164 | */ | ||
| 165 | struct verifier_state { | ||
| 166 | struct reg_state regs[MAX_BPF_REG]; | ||
| 167 | u8 stack_slot_type[MAX_BPF_STACK]; | ||
| 168 | struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; | ||
| 169 | }; | ||
| 170 | |||
| 171 | /* linked list of verifier states used to prune search */ | ||
| 172 | struct verifier_state_list { | ||
| 173 | struct verifier_state state; | ||
| 174 | struct verifier_state_list *next; | ||
| 175 | }; | ||
| 176 | |||
| 177 | /* verifier_state + insn_idx are pushed to stack when branch is encountered */ | ||
| 178 | struct verifier_stack_elem { | ||
| 179 | /* verifier state is 'st' | ||
| 180 | * before processing instruction 'insn_idx' | ||
| 181 | * and after processing instruction 'prev_insn_idx' | ||
| 182 | */ | ||
| 183 | struct verifier_state st; | ||
| 184 | int insn_idx; | ||
| 185 | int prev_insn_idx; | ||
| 186 | struct verifier_stack_elem *next; | ||
| 187 | }; | ||
| 188 | |||
| 189 | #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ | ||
| 190 | |||
| 191 | /* single container for all structs | ||
| 192 | * one verifier_env per bpf_check() call | ||
| 193 | */ | ||
| 194 | struct verifier_env { | ||
| 195 | struct bpf_prog *prog; /* eBPF program being verified */ | ||
| 196 | struct verifier_stack_elem *head; /* stack of verifier states to be processed */ | ||
| 197 | int stack_size; /* number of states to be processed */ | ||
| 198 | struct verifier_state cur_state; /* current verifier state */ | ||
| 199 | struct verifier_state_list **explored_states; /* search pruning optimization */ | ||
| 200 | struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of maps used by eBPF program */ | ||
| 201 | u32 used_map_cnt; /* number of used maps */ | ||
| 202 | }; | ||
| 203 | |||
| 204 | /* verbose verifier prints what it's seeing | ||
| 205 | * bpf_check() is called under lock, so no race to access these global vars | ||
| 206 | */ | ||
| 207 | static u32 log_level, log_size, log_len; | ||
| 208 | static char *log_buf; | ||
| 209 | |||
| 210 | static DEFINE_MUTEX(bpf_verifier_lock); | ||
| 211 | |||
| 212 | /* log_level controls verbosity level of eBPF verifier. | ||
| 213 | * verbose() is used to dump the verification trace to the log, so the user | ||
| 214 | * can figure out what's wrong with the program | ||
| 215 | */ | ||
| 216 | static void verbose(const char *fmt, ...) | ||
| 217 | { | ||
| 218 | va_list args; | ||
| 219 | |||
| 220 | if (log_level == 0 || log_len >= log_size - 1) | ||
| 221 | return; | ||
| 222 | |||
| 223 | va_start(args, fmt); | ||
| 224 | log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); | ||
| 225 | va_end(args); | ||
| 226 | } | ||
| 227 | |||
| 228 | /* string representation of 'enum bpf_reg_type' */ | ||
| 229 | static const char * const reg_type_str[] = { | ||
| 230 | [NOT_INIT] = "?", | ||
| 231 | [UNKNOWN_VALUE] = "inv", | ||
| 232 | [PTR_TO_CTX] = "ctx", | ||
| 233 | [CONST_PTR_TO_MAP] = "map_ptr", | ||
| 234 | [PTR_TO_MAP_VALUE] = "map_value", | ||
| 235 | [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", | ||
| 236 | [FRAME_PTR] = "fp", | ||
| 237 | [PTR_TO_STACK] = "fp", | ||
| 238 | [CONST_IMM] = "imm", | ||
| 239 | }; | ||
| 240 | |||
| 241 | static void print_verifier_state(struct verifier_env *env) | ||
| 242 | { | ||
| 243 | enum bpf_reg_type t; | ||
| 244 | int i; | ||
| 245 | |||
| 246 | for (i = 0; i < MAX_BPF_REG; i++) { | ||
| 247 | t = env->cur_state.regs[i].type; | ||
| 248 | if (t == NOT_INIT) | ||
| 249 | continue; | ||
| 250 | verbose(" R%d=%s", i, reg_type_str[t]); | ||
| 251 | if (t == CONST_IMM || t == PTR_TO_STACK) | ||
| 252 | verbose("%d", env->cur_state.regs[i].imm); | ||
| 253 | else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || | ||
| 254 | t == PTR_TO_MAP_VALUE_OR_NULL) | ||
| 255 | verbose("(ks=%d,vs=%d)", | ||
| 256 | env->cur_state.regs[i].map_ptr->key_size, | ||
| 257 | env->cur_state.regs[i].map_ptr->value_size); | ||
| 258 | } | ||
| 259 | for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { | ||
| 260 | if (env->cur_state.stack_slot_type[i] == STACK_SPILL) | ||
| 261 | verbose(" fp%d=%s", -MAX_BPF_STACK + i, | ||
| 262 | reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); | ||
| 263 | } | ||
| 264 | verbose("\n"); | ||
| 265 | } | ||
| 266 | |||
| 267 | static const char *const bpf_class_string[] = { | ||
| 268 | [BPF_LD] = "ld", | ||
| 269 | [BPF_LDX] = "ldx", | ||
| 270 | [BPF_ST] = "st", | ||
| 271 | [BPF_STX] = "stx", | ||
| 272 | [BPF_ALU] = "alu", | ||
| 273 | [BPF_JMP] = "jmp", | ||
| 274 | [BPF_RET] = "BUG", | ||
| 275 | [BPF_ALU64] = "alu64", | ||
| 276 | }; | ||
| 277 | |||
| 278 | static const char *const bpf_alu_string[] = { | ||
| 279 | [BPF_ADD >> 4] = "+=", | ||
| 280 | [BPF_SUB >> 4] = "-=", | ||
| 281 | [BPF_MUL >> 4] = "*=", | ||
| 282 | [BPF_DIV >> 4] = "/=", | ||
| 283 | [BPF_OR >> 4] = "|=", | ||
| 284 | [BPF_AND >> 4] = "&=", | ||
| 285 | [BPF_LSH >> 4] = "<<=", | ||
| 286 | [BPF_RSH >> 4] = ">>=", | ||
| 287 | [BPF_NEG >> 4] = "neg", | ||
| 288 | [BPF_MOD >> 4] = "%=", | ||
| 289 | [BPF_XOR >> 4] = "^=", | ||
| 290 | [BPF_MOV >> 4] = "=", | ||
| 291 | [BPF_ARSH >> 4] = "s>>=", | ||
| 292 | [BPF_END >> 4] = "endian", | ||
| 293 | }; | ||
| 294 | |||
| 295 | static const char *const bpf_ldst_string[] = { | ||
| 296 | [BPF_W >> 3] = "u32", | ||
| 297 | [BPF_H >> 3] = "u16", | ||
| 298 | [BPF_B >> 3] = "u8", | ||
| 299 | [BPF_DW >> 3] = "u64", | ||
| 300 | }; | ||
| 301 | |||
| 302 | static const char *const bpf_jmp_string[] = { | ||
| 303 | [BPF_JA >> 4] = "jmp", | ||
| 304 | [BPF_JEQ >> 4] = "==", | ||
| 305 | [BPF_JGT >> 4] = ">", | ||
| 306 | [BPF_JGE >> 4] = ">=", | ||
| 307 | [BPF_JSET >> 4] = "&", | ||
| 308 | [BPF_JNE >> 4] = "!=", | ||
| 309 | [BPF_JSGT >> 4] = "s>", | ||
| 310 | [BPF_JSGE >> 4] = "s>=", | ||
| 311 | [BPF_CALL >> 4] = "call", | ||
| 312 | [BPF_EXIT >> 4] = "exit", | ||
| 313 | }; | ||
| 314 | |||
| 315 | static void print_bpf_insn(struct bpf_insn *insn) | ||
| 316 | { | ||
| 317 | u8 class = BPF_CLASS(insn->code); | ||
| 318 | |||
| 319 | if (class == BPF_ALU || class == BPF_ALU64) { | ||
| 320 | if (BPF_SRC(insn->code) == BPF_X) | ||
| 321 | verbose("(%02x) %sr%d %s %sr%d\n", | ||
| 322 | insn->code, class == BPF_ALU ? "(u32) " : "", | ||
| 323 | insn->dst_reg, | ||
| 324 | bpf_alu_string[BPF_OP(insn->code) >> 4], | ||
| 325 | class == BPF_ALU ? "(u32) " : "", | ||
| 326 | insn->src_reg); | ||
| 327 | else | ||
| 328 | verbose("(%02x) %sr%d %s %s%d\n", | ||
| 329 | insn->code, class == BPF_ALU ? "(u32) " : "", | ||
| 330 | insn->dst_reg, | ||
| 331 | bpf_alu_string[BPF_OP(insn->code) >> 4], | ||
| 332 | class == BPF_ALU ? "(u32) " : "", | ||
| 333 | insn->imm); | ||
| 334 | } else if (class == BPF_STX) { | ||
| 335 | if (BPF_MODE(insn->code) == BPF_MEM) | ||
| 336 | verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", | ||
| 337 | insn->code, | ||
| 338 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 339 | insn->dst_reg, | ||
| 340 | insn->off, insn->src_reg); | ||
| 341 | else if (BPF_MODE(insn->code) == BPF_XADD) | ||
| 342 | verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", | ||
| 343 | insn->code, | ||
| 344 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 345 | insn->dst_reg, insn->off, | ||
| 346 | insn->src_reg); | ||
| 347 | else | ||
| 348 | verbose("BUG_%02x\n", insn->code); | ||
| 349 | } else if (class == BPF_ST) { | ||
| 350 | if (BPF_MODE(insn->code) != BPF_MEM) { | ||
| 351 | verbose("BUG_st_%02x\n", insn->code); | ||
| 352 | return; | ||
| 353 | } | ||
| 354 | verbose("(%02x) *(%s *)(r%d %+d) = %d\n", | ||
| 355 | insn->code, | ||
| 356 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 357 | insn->dst_reg, | ||
| 358 | insn->off, insn->imm); | ||
| 359 | } else if (class == BPF_LDX) { | ||
| 360 | if (BPF_MODE(insn->code) != BPF_MEM) { | ||
| 361 | verbose("BUG_ldx_%02x\n", insn->code); | ||
| 362 | return; | ||
| 363 | } | ||
| 364 | verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", | ||
| 365 | insn->code, insn->dst_reg, | ||
| 366 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 367 | insn->src_reg, insn->off); | ||
| 368 | } else if (class == BPF_LD) { | ||
| 369 | if (BPF_MODE(insn->code) == BPF_ABS) { | ||
| 370 | verbose("(%02x) r0 = *(%s *)skb[%d]\n", | ||
| 371 | insn->code, | ||
| 372 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 373 | insn->imm); | ||
| 374 | } else if (BPF_MODE(insn->code) == BPF_IND) { | ||
| 375 | verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", | ||
| 376 | insn->code, | ||
| 377 | bpf_ldst_string[BPF_SIZE(insn->code) >> 3], | ||
| 378 | insn->src_reg, insn->imm); | ||
| 379 | } else if (BPF_MODE(insn->code) == BPF_IMM) { | ||
| 380 | verbose("(%02x) r%d = 0x%x\n", | ||
| 381 | insn->code, insn->dst_reg, insn->imm); | ||
| 382 | } else { | ||
| 383 | verbose("BUG_ld_%02x\n", insn->code); | ||
| 384 | return; | ||
| 385 | } | ||
| 386 | } else if (class == BPF_JMP) { | ||
| 387 | u8 opcode = BPF_OP(insn->code); | ||
| 388 | |||
| 389 | if (opcode == BPF_CALL) { | ||
| 390 | verbose("(%02x) call %d\n", insn->code, insn->imm); | ||
| 391 | } else if (insn->code == (BPF_JMP | BPF_JA)) { | ||
| 392 | verbose("(%02x) goto pc%+d\n", | ||
| 393 | insn->code, insn->off); | ||
| 394 | } else if (insn->code == (BPF_JMP | BPF_EXIT)) { | ||
| 395 | verbose("(%02x) exit\n", insn->code); | ||
| 396 | } else if (BPF_SRC(insn->code) == BPF_X) { | ||
| 397 | verbose("(%02x) if r%d %s r%d goto pc%+d\n", | ||
| 398 | insn->code, insn->dst_reg, | ||
| 399 | bpf_jmp_string[BPF_OP(insn->code) >> 4], | ||
| 400 | insn->src_reg, insn->off); | ||
| 401 | } else { | ||
| 402 | verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", | ||
| 403 | insn->code, insn->dst_reg, | ||
| 404 | bpf_jmp_string[BPF_OP(insn->code) >> 4], | ||
| 405 | insn->imm, insn->off); | ||
| 406 | } | ||
| 407 | } else { | ||
| 408 | verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); | ||
| 409 | } | ||
| 410 | } | ||
| 411 | |||
| 412 | static int pop_stack(struct verifier_env *env, int *prev_insn_idx) | ||
| 413 | { | ||
| 414 | struct verifier_stack_elem *elem; | ||
| 415 | int insn_idx; | ||
| 416 | |||
| 417 | if (env->head == NULL) | ||
| 418 | return -1; | ||
| 419 | |||
| 420 | memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); | ||
| 421 | insn_idx = env->head->insn_idx; | ||
| 422 | if (prev_insn_idx) | ||
| 423 | *prev_insn_idx = env->head->prev_insn_idx; | ||
| 424 | elem = env->head->next; | ||
| 425 | kfree(env->head); | ||
| 426 | env->head = elem; | ||
| 427 | env->stack_size--; | ||
| 428 | return insn_idx; | ||
| 429 | } | ||
| 430 | |||
| 431 | static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, | ||
| 432 | int prev_insn_idx) | ||
| 433 | { | ||
| 434 | struct verifier_stack_elem *elem; | ||
| 435 | |||
| 436 | elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); | ||
| 437 | if (!elem) | ||
| 438 | goto err; | ||
| 439 | |||
| 440 | memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); | ||
| 441 | elem->insn_idx = insn_idx; | ||
| 442 | elem->prev_insn_idx = prev_insn_idx; | ||
| 443 | elem->next = env->head; | ||
| 444 | env->head = elem; | ||
| 445 | env->stack_size++; | ||
| 446 | if (env->stack_size > 1024) { | ||
| 447 | verbose("BPF program is too complex\n"); | ||
| 448 | goto err; | ||
| 449 | } | ||
| 450 | return &elem->st; | ||
| 451 | err: | ||
| 452 | /* pop all elements and return */ | ||
| 453 | while (pop_stack(env, NULL) >= 0); | ||
| 454 | return NULL; | ||
| 455 | } | ||
| 456 | |||
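push_stack() above snapshots the entire verifier state onto a singly linked list before one side of a branch is explored, and pop_stack() later restores that snapshot so the other side can be walked from the same point. A minimal userspace sketch of the same discipline follows (the struct and helper names are invented for illustration and the state is reduced to one field; this is not the kernel API):

```c
/* Minimal userspace sketch of the branch-exploration stack used above.
 * "struct state" stands in for struct verifier_state; names are made up.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct state { int regs; };

struct stack_elem {
	struct state st;
	int insn_idx;
	struct stack_elem *next;
};

static struct stack_elem *head;

static struct state *push_state(struct state *cur, int insn_idx)
{
	struct stack_elem *elem = malloc(sizeof(*elem));

	if (!elem)
		return NULL;
	memcpy(&elem->st, cur, sizeof(*cur));	/* snapshot current state */
	elem->insn_idx = insn_idx;
	elem->next = head;
	head = elem;
	return &elem->st;
}

static int pop_state(struct state *cur)
{
	struct stack_elem *elem = head;
	int insn_idx;

	if (!elem)
		return -1;
	memcpy(cur, &elem->st, sizeof(*cur));	/* restore the snapshot */
	insn_idx = elem->insn_idx;
	head = elem->next;
	free(elem);
	return insn_idx;
}

int main(void)
{
	struct state cur = { .regs = 1 };
	int resume;

	push_state(&cur, 10);	/* remember the branch target */
	cur.regs = 2;		/* explore the fall-through path first */
	printf("fall-through regs=%d\n", cur.regs);
	resume = pop_state(&cur);
	printf("resume at insn %d, regs=%d\n", resume, cur.regs);
	return 0;
}
```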
| 457 | #define CALLER_SAVED_REGS 6 | ||
| 458 | static const int caller_saved[CALLER_SAVED_REGS] = { | ||
| 459 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 | ||
| 460 | }; | ||
| 461 | |||
| 462 | static void init_reg_state(struct reg_state *regs) | ||
| 463 | { | ||
| 464 | int i; | ||
| 465 | |||
| 466 | for (i = 0; i < MAX_BPF_REG; i++) { | ||
| 467 | regs[i].type = NOT_INIT; | ||
| 468 | regs[i].imm = 0; | ||
| 469 | regs[i].map_ptr = NULL; | ||
| 470 | } | ||
| 471 | |||
| 472 | /* frame pointer */ | ||
| 473 | regs[BPF_REG_FP].type = FRAME_PTR; | ||
| 474 | |||
| 475 | /* 1st arg to a function */ | ||
| 476 | regs[BPF_REG_1].type = PTR_TO_CTX; | ||
| 477 | } | ||
| 478 | |||
| 479 | static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) | ||
| 480 | { | ||
| 481 | BUG_ON(regno >= MAX_BPF_REG); | ||
| 482 | regs[regno].type = UNKNOWN_VALUE; | ||
| 483 | regs[regno].imm = 0; | ||
| 484 | regs[regno].map_ptr = NULL; | ||
| 485 | } | ||
| 486 | |||
| 487 | enum reg_arg_type { | ||
| 488 | SRC_OP, /* register is used as source operand */ | ||
| 489 | DST_OP, /* register is used as destination operand */ | ||
| 490 | DST_OP_NO_MARK /* same as above, check only, don't mark */ | ||
| 491 | }; | ||
| 492 | |||
| 493 | static int check_reg_arg(struct reg_state *regs, u32 regno, | ||
| 494 | enum reg_arg_type t) | ||
| 495 | { | ||
| 496 | if (regno >= MAX_BPF_REG) { | ||
| 497 | verbose("R%d is invalid\n", regno); | ||
| 498 | return -EINVAL; | ||
| 499 | } | ||
| 500 | |||
| 501 | if (t == SRC_OP) { | ||
| 502 | /* check whether register used as source operand can be read */ | ||
| 503 | if (regs[regno].type == NOT_INIT) { | ||
| 504 | verbose("R%d !read_ok\n", regno); | ||
| 505 | return -EACCES; | ||
| 506 | } | ||
| 507 | } else { | ||
| 508 | /* check whether register used as dest operand can be written to */ | ||
| 509 | if (regno == BPF_REG_FP) { | ||
| 510 | verbose("frame pointer is read only\n"); | ||
| 511 | return -EACCES; | ||
| 512 | } | ||
| 513 | if (t == DST_OP) | ||
| 514 | mark_reg_unknown_value(regs, regno); | ||
| 515 | } | ||
| 516 | return 0; | ||
| 517 | } | ||
| 518 | |||
| 519 | static int bpf_size_to_bytes(int bpf_size) | ||
| 520 | { | ||
| 521 | if (bpf_size == BPF_W) | ||
| 522 | return 4; | ||
| 523 | else if (bpf_size == BPF_H) | ||
| 524 | return 2; | ||
| 525 | else if (bpf_size == BPF_B) | ||
| 526 | return 1; | ||
| 527 | else if (bpf_size == BPF_DW) | ||
| 528 | return 8; | ||
| 529 | else | ||
| 530 | return -EINVAL; | ||
| 531 | } | ||
| 532 | |||
| 533 | /* check_stack_read/write functions track spill/fill of registers, | ||
| 534 | * stack boundary and alignment are checked in check_mem_access() | ||
| 535 | */ | ||
| 536 | static int check_stack_write(struct verifier_state *state, int off, int size, | ||
| 537 | int value_regno) | ||
| 538 | { | ||
| 539 | int i; | ||
| 540 | /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, | ||
| 541 | * so it's aligned access and [off, off + size) are within stack limits | ||
| 542 | */ | ||
| 543 | |||
| 544 | if (value_regno >= 0 && | ||
| 545 | (state->regs[value_regno].type == PTR_TO_MAP_VALUE || | ||
| 546 | state->regs[value_regno].type == PTR_TO_STACK || | ||
| 547 | state->regs[value_regno].type == PTR_TO_CTX)) { | ||
| 548 | |||
| 549 | /* register containing pointer is being spilled into stack */ | ||
| 550 | if (size != BPF_REG_SIZE) { | ||
| 551 | verbose("invalid size of register spill\n"); | ||
| 552 | return -EACCES; | ||
| 553 | } | ||
| 554 | |||
| 555 | /* save register state */ | ||
| 556 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = | ||
| 557 | state->regs[value_regno]; | ||
| 558 | |||
| 559 | for (i = 0; i < BPF_REG_SIZE; i++) | ||
| 560 | state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; | ||
| 561 | } else { | ||
| 562 | /* regular write of data into stack */ | ||
| 563 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = | ||
| 564 | (struct reg_state) {}; | ||
| 565 | |||
| 566 | for (i = 0; i < size; i++) | ||
| 567 | state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; | ||
| 568 | } | ||
| 569 | return 0; | ||
| 570 | } | ||
| 571 | |||
| 572 | static int check_stack_read(struct verifier_state *state, int off, int size, | ||
| 573 | int value_regno) | ||
| 574 | { | ||
| 575 | u8 *slot_type; | ||
| 576 | int i; | ||
| 577 | |||
| 578 | slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; | ||
| 579 | |||
| 580 | if (slot_type[0] == STACK_SPILL) { | ||
| 581 | if (size != BPF_REG_SIZE) { | ||
| 582 | verbose("invalid size of register spill\n"); | ||
| 583 | return -EACCES; | ||
| 584 | } | ||
| 585 | for (i = 1; i < BPF_REG_SIZE; i++) { | ||
| 586 | if (slot_type[i] != STACK_SPILL) { | ||
| 587 | verbose("corrupted spill memory\n"); | ||
| 588 | return -EACCES; | ||
| 589 | } | ||
| 590 | } | ||
| 591 | |||
| 592 | if (value_regno >= 0) | ||
| 593 | /* restore register state from stack */ | ||
| 594 | state->regs[value_regno] = | ||
| 595 | state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE]; | ||
| 596 | return 0; | ||
| 597 | } else { | ||
| 598 | for (i = 0; i < size; i++) { | ||
| 599 | if (slot_type[i] != STACK_MISC) { | ||
| 600 | verbose("invalid read from stack off %d+%d size %d\n", | ||
| 601 | off, i, size); | ||
| 602 | return -EACCES; | ||
| 603 | } | ||
| 604 | } | ||
| 605 | if (value_regno >= 0) | ||
| 606 | /* have read misc data from the stack */ | ||
| 607 | mark_reg_unknown_value(state->regs, value_regno); | ||
| 608 | return 0; | ||
| 609 | } | ||
| 610 | } | ||
| 611 | |||
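check_stack_write() and check_stack_read() above keep a byte-granular type for every stack slot: spilling a pointer register marks all BPF_REG_SIZE bytes as STACK_SPILL, a plain data store marks the written bytes STACK_MISC, and a register fill is only accepted when every byte of the slot is still STACK_SPILL. A hedged standalone sketch of that bookkeeping (constants mirror the code above, everything else is simplified; the real code also saves and clears the spilled register contents):

```c
/* Sketch of the per-byte stack slot bookkeeping used by
 * check_stack_write()/check_stack_read(); illustrative only.
 */
#include <stdio.h>

#define MAX_BPF_STACK 512
#define BPF_REG_SIZE  8

enum { STACK_INVALID, STACK_SPILL, STACK_MISC };

static unsigned char slot_type[MAX_BPF_STACK];

/* spilling a pointer register: mark all 8 bytes of the slot */
static void spill_reg(int off)
{
	for (int i = 0; i < BPF_REG_SIZE; i++)
		slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
}

/* writing plain data of 'size' bytes */
static void write_misc(int off, int size)
{
	for (int i = 0; i < size; i++)
		slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
}

/* reading back a spilled register is only valid if every byte is SPILL */
static int read_spill_ok(int off)
{
	for (int i = 0; i < BPF_REG_SIZE; i++)
		if (slot_type[MAX_BPF_STACK + off + i] != STACK_SPILL)
			return 0;
	return 1;
}

int main(void)
{
	spill_reg(-8);			/* *(u64 *)(fp - 8) = r1 (a pointer) */
	printf("fp-8 spill ok: %d\n", read_spill_ok(-8));
	write_misc(-8, 4);		/* overwrite half the slot with data */
	printf("fp-8 spill ok after partial write: %d\n", read_spill_ok(-8));
	return 0;
}
```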
| 612 | /* check read/write into map element returned by bpf_map_lookup_elem() */ | ||
| 613 | static int check_map_access(struct verifier_env *env, u32 regno, int off, | ||
| 614 | int size) | ||
| 615 | { | ||
| 616 | struct bpf_map *map = env->cur_state.regs[regno].map_ptr; | ||
| 617 | |||
| 618 | if (off < 0 || off + size > map->value_size) { | ||
| 619 | verbose("invalid access to map value, value_size=%d off=%d size=%d\n", | ||
| 620 | map->value_size, off, size); | ||
| 621 | return -EACCES; | ||
| 622 | } | ||
| 623 | return 0; | ||
| 624 | } | ||
| 625 | |||
| 626 | /* check access to 'struct bpf_context' fields */ | ||
| 627 | static int check_ctx_access(struct verifier_env *env, int off, int size, | ||
| 628 | enum bpf_access_type t) | ||
| 629 | { | ||
| 630 | if (env->prog->aux->ops->is_valid_access && | ||
| 631 | env->prog->aux->ops->is_valid_access(off, size, t)) | ||
| 632 | return 0; | ||
| 633 | |||
| 634 | verbose("invalid bpf_context access off=%d size=%d\n", off, size); | ||
| 635 | return -EACCES; | ||
| 636 | } | ||
| 637 | |||
| 638 | /* check whether memory at (regno + off) is accessible for t = (read | write) | ||
| 639 | * if t==write, value_regno is a register whose value is stored into memory | ||
| 640 | * if t==read, value_regno is a register which will receive the value from memory | ||
| 641 | * if t==write && value_regno==-1, some unknown value is stored into memory | ||
| 642 | * if t==read && value_regno==-1, don't care what we read from memory | ||
| 643 | */ | ||
| 644 | static int check_mem_access(struct verifier_env *env, u32 regno, int off, | ||
| 645 | int bpf_size, enum bpf_access_type t, | ||
| 646 | int value_regno) | ||
| 647 | { | ||
| 648 | struct verifier_state *state = &env->cur_state; | ||
| 649 | int size, err = 0; | ||
| 650 | |||
| 651 | size = bpf_size_to_bytes(bpf_size); | ||
| 652 | if (size < 0) | ||
| 653 | return size; | ||
| 654 | |||
| 655 | if (off % size != 0) { | ||
| 656 | verbose("misaligned access off %d size %d\n", off, size); | ||
| 657 | return -EACCES; | ||
| 658 | } | ||
| 659 | |||
| 660 | if (state->regs[regno].type == PTR_TO_MAP_VALUE) { | ||
| 661 | err = check_map_access(env, regno, off, size); | ||
| 662 | if (!err && t == BPF_READ && value_regno >= 0) | ||
| 663 | mark_reg_unknown_value(state->regs, value_regno); | ||
| 664 | |||
| 665 | } else if (state->regs[regno].type == PTR_TO_CTX) { | ||
| 666 | err = check_ctx_access(env, off, size, t); | ||
| 667 | if (!err && t == BPF_READ && value_regno >= 0) | ||
| 668 | mark_reg_unknown_value(state->regs, value_regno); | ||
| 669 | |||
| 670 | } else if (state->regs[regno].type == FRAME_PTR) { | ||
| 671 | if (off >= 0 || off < -MAX_BPF_STACK) { | ||
| 672 | verbose("invalid stack off=%d size=%d\n", off, size); | ||
| 673 | return -EACCES; | ||
| 674 | } | ||
| 675 | if (t == BPF_WRITE) | ||
| 676 | err = check_stack_write(state, off, size, value_regno); | ||
| 677 | else | ||
| 678 | err = check_stack_read(state, off, size, value_regno); | ||
| 679 | } else { | ||
| 680 | verbose("R%d invalid mem access '%s'\n", | ||
| 681 | regno, reg_type_str[state->regs[regno].type]); | ||
| 682 | return -EACCES; | ||
| 683 | } | ||
| 684 | return err; | ||
| 685 | } | ||
| 686 | |||
| 687 | static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) | ||
| 688 | { | ||
| 689 | struct reg_state *regs = env->cur_state.regs; | ||
| 690 | int err; | ||
| 691 | |||
| 692 | if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || | ||
| 693 | insn->imm != 0) { | ||
| 694 | verbose("BPF_XADD uses reserved fields\n"); | ||
| 695 | return -EINVAL; | ||
| 696 | } | ||
| 697 | |||
| 698 | /* check src1 operand */ | ||
| 699 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 700 | if (err) | ||
| 701 | return err; | ||
| 702 | |||
| 703 | /* check src2 operand */ | ||
| 704 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 705 | if (err) | ||
| 706 | return err; | ||
| 707 | |||
| 708 | /* check whether atomic_add can read the memory */ | ||
| 709 | err = check_mem_access(env, insn->dst_reg, insn->off, | ||
| 710 | BPF_SIZE(insn->code), BPF_READ, -1); | ||
| 711 | if (err) | ||
| 712 | return err; | ||
| 713 | |||
| 714 | /* check whether atomic_add can write into the same memory */ | ||
| 715 | return check_mem_access(env, insn->dst_reg, insn->off, | ||
| 716 | BPF_SIZE(insn->code), BPF_WRITE, -1); | ||
| 717 | } | ||
| 718 | |||
| 719 | /* when register 'regno' is passed into function that will read 'access_size' | ||
| 720 | * bytes from that pointer, make sure that it's within stack boundary | ||
| 721 | * and all elements of stack are initialized | ||
| 722 | */ | ||
| 723 | static int check_stack_boundary(struct verifier_env *env, | ||
| 724 | int regno, int access_size) | ||
| 725 | { | ||
| 726 | struct verifier_state *state = &env->cur_state; | ||
| 727 | struct reg_state *regs = state->regs; | ||
| 728 | int off, i; | ||
| 729 | |||
| 730 | if (regs[regno].type != PTR_TO_STACK) | ||
| 731 | return -EACCES; | ||
| 732 | |||
| 733 | off = regs[regno].imm; | ||
| 734 | if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || | ||
| 735 | access_size <= 0) { | ||
| 736 | verbose("invalid stack type R%d off=%d access_size=%d\n", | ||
| 737 | regno, off, access_size); | ||
| 738 | return -EACCES; | ||
| 739 | } | ||
| 740 | |||
| 741 | for (i = 0; i < access_size; i++) { | ||
| 742 | if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { | ||
| 743 | verbose("invalid indirect read from stack off %d+%d size %d\n", | ||
| 744 | off, i, access_size); | ||
| 745 | return -EACCES; | ||
| 746 | } | ||
| 747 | } | ||
| 748 | return 0; | ||
| 749 | } | ||
| 750 | |||
| 751 | static int check_func_arg(struct verifier_env *env, u32 regno, | ||
| 752 | enum bpf_arg_type arg_type, struct bpf_map **mapp) | ||
| 753 | { | ||
| 754 | struct reg_state *reg = env->cur_state.regs + regno; | ||
| 755 | enum bpf_reg_type expected_type; | ||
| 756 | int err = 0; | ||
| 757 | |||
| 758 | if (arg_type == ARG_ANYTHING) | ||
| 759 | return 0; | ||
| 760 | |||
| 761 | if (reg->type == NOT_INIT) { | ||
| 762 | verbose("R%d !read_ok\n", regno); | ||
| 763 | return -EACCES; | ||
| 764 | } | ||
| 765 | |||
| 766 | if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || | ||
| 767 | arg_type == ARG_PTR_TO_MAP_VALUE) { | ||
| 768 | expected_type = PTR_TO_STACK; | ||
| 769 | } else if (arg_type == ARG_CONST_STACK_SIZE) { | ||
| 770 | expected_type = CONST_IMM; | ||
| 771 | } else if (arg_type == ARG_CONST_MAP_PTR) { | ||
| 772 | expected_type = CONST_PTR_TO_MAP; | ||
| 773 | } else { | ||
| 774 | verbose("unsupported arg_type %d\n", arg_type); | ||
| 775 | return -EFAULT; | ||
| 776 | } | ||
| 777 | |||
| 778 | if (reg->type != expected_type) { | ||
| 779 | verbose("R%d type=%s expected=%s\n", regno, | ||
| 780 | reg_type_str[reg->type], reg_type_str[expected_type]); | ||
| 781 | return -EACCES; | ||
| 782 | } | ||
| 783 | |||
| 784 | if (arg_type == ARG_CONST_MAP_PTR) { | ||
| 785 | /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ | ||
| 786 | *mapp = reg->map_ptr; | ||
| 787 | |||
| 788 | } else if (arg_type == ARG_PTR_TO_MAP_KEY) { | ||
| 789 | /* bpf_map_xxx(..., map_ptr, ..., key) call: | ||
| 790 | * check that [key, key + map->key_size) are within | ||
| 791 | * stack limits and initialized | ||
| 792 | */ | ||
| 793 | if (!*mapp) { | ||
| 794 | /* in function declaration map_ptr must come before | ||
| 795 | * map_key, so that it's verified and known before | ||
| 796 | * we have to check map_key here. Otherwise it means | ||
| 797 | * that kernel subsystem misconfigured verifier | ||
| 798 | */ | ||
| 799 | verbose("invalid map_ptr to access map->key\n"); | ||
| 800 | return -EACCES; | ||
| 801 | } | ||
| 802 | err = check_stack_boundary(env, regno, (*mapp)->key_size); | ||
| 803 | |||
| 804 | } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { | ||
| 805 | /* bpf_map_xxx(..., map_ptr, ..., value) call: | ||
| 806 | * check [value, value + map->value_size) validity | ||
| 807 | */ | ||
| 808 | if (!*mapp) { | ||
| 809 | /* kernel subsystem misconfigured verifier */ | ||
| 810 | verbose("invalid map_ptr to access map->value\n"); | ||
| 811 | return -EACCES; | ||
| 812 | } | ||
| 813 | err = check_stack_boundary(env, regno, (*mapp)->value_size); | ||
| 814 | |||
| 815 | } else if (arg_type == ARG_CONST_STACK_SIZE) { | ||
| 816 | /* bpf_xxx(..., buf, len) call will access 'len' bytes | ||
| 817 | * from stack pointer 'buf'. Check it | ||
| 818 | * note: regno == len, regno - 1 == buf | ||
| 819 | */ | ||
| 820 | if (regno == 0) { | ||
| 821 | /* kernel subsystem misconfigured verifier */ | ||
| 822 | verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); | ||
| 823 | return -EACCES; | ||
| 824 | } | ||
| 825 | err = check_stack_boundary(env, regno - 1, reg->imm); | ||
| 826 | } | ||
| 827 | |||
| 828 | return err; | ||
| 829 | } | ||
| 830 | |||
| 831 | static int check_call(struct verifier_env *env, int func_id) | ||
| 832 | { | ||
| 833 | struct verifier_state *state = &env->cur_state; | ||
| 834 | const struct bpf_func_proto *fn = NULL; | ||
| 835 | struct reg_state *regs = state->regs; | ||
| 836 | struct bpf_map *map = NULL; | ||
| 837 | struct reg_state *reg; | ||
| 838 | int i, err; | ||
| 839 | |||
| 840 | /* find function prototype */ | ||
| 841 | if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { | ||
| 842 | verbose("invalid func %d\n", func_id); | ||
| 843 | return -EINVAL; | ||
| 844 | } | ||
| 845 | |||
| 846 | if (env->prog->aux->ops->get_func_proto) | ||
| 847 | fn = env->prog->aux->ops->get_func_proto(func_id); | ||
| 848 | |||
| 849 | if (!fn) { | ||
| 850 | verbose("unknown func %d\n", func_id); | ||
| 851 | return -EINVAL; | ||
| 852 | } | ||
| 853 | |||
| 854 | /* eBPF programs must be GPL compatible to use GPL-ed functions */ | ||
| 855 | if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { | ||
| 856 | verbose("cannot call GPL only function from proprietary program\n"); | ||
| 857 | return -EINVAL; | ||
| 858 | } | ||
| 859 | |||
| 860 | /* check args */ | ||
| 861 | err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); | ||
| 862 | if (err) | ||
| 863 | return err; | ||
| 864 | err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); | ||
| 865 | if (err) | ||
| 866 | return err; | ||
| 867 | err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); | ||
| 868 | if (err) | ||
| 869 | return err; | ||
| 870 | err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); | ||
| 871 | if (err) | ||
| 872 | return err; | ||
| 873 | err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); | ||
| 874 | if (err) | ||
| 875 | return err; | ||
| 876 | |||
| 877 | /* reset caller saved regs */ | ||
| 878 | for (i = 0; i < CALLER_SAVED_REGS; i++) { | ||
| 879 | reg = regs + caller_saved[i]; | ||
| 880 | reg->type = NOT_INIT; | ||
| 881 | reg->imm = 0; | ||
| 882 | } | ||
| 883 | |||
| 884 | /* update return register */ | ||
| 885 | if (fn->ret_type == RET_INTEGER) { | ||
| 886 | regs[BPF_REG_0].type = UNKNOWN_VALUE; | ||
| 887 | } else if (fn->ret_type == RET_VOID) { | ||
| 888 | regs[BPF_REG_0].type = NOT_INIT; | ||
| 889 | } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { | ||
| 890 | regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; | ||
| 891 | /* remember map_ptr, so that check_map_access() | ||
| 892 | * can check 'value_size' boundary of memory access | ||
| 893 | * to map element returned from bpf_map_lookup_elem() | ||
| 894 | */ | ||
| 895 | if (map == NULL) { | ||
| 896 | verbose("kernel subsystem misconfigured verifier\n"); | ||
| 897 | return -EINVAL; | ||
| 898 | } | ||
| 899 | regs[BPF_REG_0].map_ptr = map; | ||
| 900 | } else { | ||
| 901 | verbose("unknown return type %d of func %d\n", | ||
| 902 | fn->ret_type, func_id); | ||
| 903 | return -EINVAL; | ||
| 904 | } | ||
| 905 | return 0; | ||
| 906 | } | ||
| 907 | |||
| 908 | /* check validity of 32-bit and 64-bit arithmetic operations */ | ||
| 909 | static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) | ||
| 910 | { | ||
| 911 | u8 opcode = BPF_OP(insn->code); | ||
| 912 | int err; | ||
| 913 | |||
| 914 | if (opcode == BPF_END || opcode == BPF_NEG) { | ||
| 915 | if (opcode == BPF_NEG) { | ||
| 916 | if (BPF_SRC(insn->code) != 0 || | ||
| 917 | insn->src_reg != BPF_REG_0 || | ||
| 918 | insn->off != 0 || insn->imm != 0) { | ||
| 919 | verbose("BPF_NEG uses reserved fields\n"); | ||
| 920 | return -EINVAL; | ||
| 921 | } | ||
| 922 | } else { | ||
| 923 | if (insn->src_reg != BPF_REG_0 || insn->off != 0 || | ||
| 924 | (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { | ||
| 925 | verbose("BPF_END uses reserved fields\n"); | ||
| 926 | return -EINVAL; | ||
| 927 | } | ||
| 928 | } | ||
| 929 | |||
| 930 | /* check src operand */ | ||
| 931 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 932 | if (err) | ||
| 933 | return err; | ||
| 934 | |||
| 935 | /* check dest operand */ | ||
| 936 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); | ||
| 937 | if (err) | ||
| 938 | return err; | ||
| 939 | |||
| 940 | } else if (opcode == BPF_MOV) { | ||
| 941 | |||
| 942 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 943 | if (insn->imm != 0 || insn->off != 0) { | ||
| 944 | verbose("BPF_MOV uses reserved fields\n"); | ||
| 945 | return -EINVAL; | ||
| 946 | } | ||
| 947 | |||
| 948 | /* check src operand */ | ||
| 949 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 950 | if (err) | ||
| 951 | return err; | ||
| 952 | } else { | ||
| 953 | if (insn->src_reg != BPF_REG_0 || insn->off != 0) { | ||
| 954 | verbose("BPF_MOV uses reserved fields\n"); | ||
| 955 | return -EINVAL; | ||
| 956 | } | ||
| 957 | } | ||
| 958 | |||
| 959 | /* check dest operand */ | ||
| 960 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); | ||
| 961 | if (err) | ||
| 962 | return err; | ||
| 963 | |||
| 964 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 965 | if (BPF_CLASS(insn->code) == BPF_ALU64) { | ||
| 966 | /* case: R1 = R2 | ||
| 967 | * copy register state to dest reg | ||
| 968 | */ | ||
| 969 | regs[insn->dst_reg] = regs[insn->src_reg]; | ||
| 970 | } else { | ||
| 971 | regs[insn->dst_reg].type = UNKNOWN_VALUE; | ||
| 972 | regs[insn->dst_reg].map_ptr = NULL; | ||
| 973 | } | ||
| 974 | } else { | ||
| 975 | /* case: R = imm | ||
| 976 | * remember the value we stored into this reg | ||
| 977 | */ | ||
| 978 | regs[insn->dst_reg].type = CONST_IMM; | ||
| 979 | regs[insn->dst_reg].imm = insn->imm; | ||
| 980 | } | ||
| 981 | |||
| 982 | } else if (opcode > BPF_END) { | ||
| 983 | verbose("invalid BPF_ALU opcode %x\n", opcode); | ||
| 984 | return -EINVAL; | ||
| 985 | |||
| 986 | } else { /* all other ALU ops: and, sub, xor, add, ... */ | ||
| 987 | |||
| 988 | bool stack_relative = false; | ||
| 989 | |||
| 990 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 991 | if (insn->imm != 0 || insn->off != 0) { | ||
| 992 | verbose("BPF_ALU uses reserved fields\n"); | ||
| 993 | return -EINVAL; | ||
| 994 | } | ||
| 995 | /* check src1 operand */ | ||
| 996 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 997 | if (err) | ||
| 998 | return err; | ||
| 999 | } else { | ||
| 1000 | if (insn->src_reg != BPF_REG_0 || insn->off != 0) { | ||
| 1001 | verbose("BPF_ALU uses reserved fields\n"); | ||
| 1002 | return -EINVAL; | ||
| 1003 | } | ||
| 1004 | } | ||
| 1005 | |||
| 1006 | /* check src2 operand */ | ||
| 1007 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 1008 | if (err) | ||
| 1009 | return err; | ||
| 1010 | |||
| 1011 | if ((opcode == BPF_MOD || opcode == BPF_DIV) && | ||
| 1012 | BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { | ||
| 1013 | verbose("div by zero\n"); | ||
| 1014 | return -EINVAL; | ||
| 1015 | } | ||
| 1016 | |||
| 1017 | /* pattern match 'bpf_add Rx, imm' instruction */ | ||
| 1018 | if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && | ||
| 1019 | regs[insn->dst_reg].type == FRAME_PTR && | ||
| 1020 | BPF_SRC(insn->code) == BPF_K) | ||
| 1021 | stack_relative = true; | ||
| 1022 | |||
| 1023 | /* check dest operand */ | ||
| 1024 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); | ||
| 1025 | if (err) | ||
| 1026 | return err; | ||
| 1027 | |||
| 1028 | if (stack_relative) { | ||
| 1029 | regs[insn->dst_reg].type = PTR_TO_STACK; | ||
| 1030 | regs[insn->dst_reg].imm = insn->imm; | ||
| 1031 | } | ||
| 1032 | } | ||
| 1033 | |||
| 1034 | return 0; | ||
| 1035 | } | ||
| 1036 | |||
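The stack_relative special case above is what lets a program form a pointer into its own stack: copying the frame pointer and adding a constant turns the destination register into PTR_TO_STACK with the constant remembered in imm, which check_stack_boundary() later relies on. A toy model of just that transition (invented enum and struct, not the kernel's reg_state):

```c
/* Sketch of how the verifier turns "r2 = r10; r2 += -8" into a tracked
 * stack pointer.  Toy types only, not kernel code.
 */
#include <stdio.h>

enum reg_type { UNKNOWN_VALUE, FRAME_PTR, PTR_TO_STACK };

struct reg { enum reg_type type; int imm; };

/* dst += imm (64-bit add with a constant) */
static void alu64_add_imm(struct reg *dst, int imm)
{
	if (dst->type == FRAME_PTR) {
		dst->type = PTR_TO_STACK;	/* fp + const = pointer into stack */
		dst->imm = imm;
	} else {
		dst->type = UNKNOWN_VALUE;	/* anything else: value unknown */
		dst->imm = 0;
	}
}

int main(void)
{
	struct reg r10 = { FRAME_PTR, 0 };
	struct reg r2 = r10;		/* r2 = r10 (register copy) */

	alu64_add_imm(&r2, -8);		/* r2 += -8 */
	printf("r2: type=%d off=%d\n", r2.type, r2.imm);
	return 0;
}
```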
| 1037 | static int check_cond_jmp_op(struct verifier_env *env, | ||
| 1038 | struct bpf_insn *insn, int *insn_idx) | ||
| 1039 | { | ||
| 1040 | struct reg_state *regs = env->cur_state.regs; | ||
| 1041 | struct verifier_state *other_branch; | ||
| 1042 | u8 opcode = BPF_OP(insn->code); | ||
| 1043 | int err; | ||
| 1044 | |||
| 1045 | if (opcode > BPF_EXIT) { | ||
| 1046 | verbose("invalid BPF_JMP opcode %x\n", opcode); | ||
| 1047 | return -EINVAL; | ||
| 1048 | } | ||
| 1049 | |||
| 1050 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 1051 | if (insn->imm != 0) { | ||
| 1052 | verbose("BPF_JMP uses reserved fields\n"); | ||
| 1053 | return -EINVAL; | ||
| 1054 | } | ||
| 1055 | |||
| 1056 | /* check src1 operand */ | ||
| 1057 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 1058 | if (err) | ||
| 1059 | return err; | ||
| 1060 | } else { | ||
| 1061 | if (insn->src_reg != BPF_REG_0) { | ||
| 1062 | verbose("BPF_JMP uses reserved fields\n"); | ||
| 1063 | return -EINVAL; | ||
| 1064 | } | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | /* check src2 operand */ | ||
| 1068 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 1069 | if (err) | ||
| 1070 | return err; | ||
| 1071 | |||
| 1072 | /* detect if R == 0 where R was initialized to zero earlier */ | ||
| 1073 | if (BPF_SRC(insn->code) == BPF_K && | ||
| 1074 | (opcode == BPF_JEQ || opcode == BPF_JNE) && | ||
| 1075 | regs[insn->dst_reg].type == CONST_IMM && | ||
| 1076 | regs[insn->dst_reg].imm == insn->imm) { | ||
| 1077 | if (opcode == BPF_JEQ) { | ||
| 1078 | /* if (imm == imm) goto pc+off; | ||
| 1079 | * only follow the goto, ignore fall-through | ||
| 1080 | */ | ||
| 1081 | *insn_idx += insn->off; | ||
| 1082 | return 0; | ||
| 1083 | } else { | ||
| 1084 | /* if (imm != imm) goto pc+off; | ||
| 1085 | * only follow fall-through branch, since | ||
| 1086 | * that's where the program will go | ||
| 1087 | */ | ||
| 1088 | return 0; | ||
| 1089 | } | ||
| 1090 | } | ||
| 1091 | |||
| 1092 | other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); | ||
| 1093 | if (!other_branch) | ||
| 1094 | return -EFAULT; | ||
| 1095 | |||
| 1096 | /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ | ||
| 1097 | if (BPF_SRC(insn->code) == BPF_K && | ||
| 1098 | insn->imm == 0 && (opcode == BPF_JEQ || | ||
| 1099 | opcode == BPF_JNE) && | ||
| 1100 | regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { | ||
| 1101 | if (opcode == BPF_JEQ) { | ||
| 1102 | /* next fallthrough insn can access memory via | ||
| 1103 | * this register | ||
| 1104 | */ | ||
| 1105 | regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; | ||
| 1106 | /* branch target cannot access it, since reg == 0 */ | ||
| 1107 | other_branch->regs[insn->dst_reg].type = CONST_IMM; | ||
| 1108 | other_branch->regs[insn->dst_reg].imm = 0; | ||
| 1109 | } else { | ||
| 1110 | other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; | ||
| 1111 | regs[insn->dst_reg].type = CONST_IMM; | ||
| 1112 | regs[insn->dst_reg].imm = 0; | ||
| 1113 | } | ||
| 1114 | } else if (BPF_SRC(insn->code) == BPF_K && | ||
| 1115 | (opcode == BPF_JEQ || opcode == BPF_JNE)) { | ||
| 1116 | |||
| 1117 | if (opcode == BPF_JEQ) { | ||
| 1118 | /* detect if (R == imm) goto | ||
| 1119 | * and in the target state recognize that R = imm | ||
| 1120 | */ | ||
| 1121 | other_branch->regs[insn->dst_reg].type = CONST_IMM; | ||
| 1122 | other_branch->regs[insn->dst_reg].imm = insn->imm; | ||
| 1123 | } else { | ||
| 1124 | /* detect if (R != imm) goto | ||
| 1125 | * and in the fall-through state recognize that R = imm | ||
| 1126 | */ | ||
| 1127 | regs[insn->dst_reg].type = CONST_IMM; | ||
| 1128 | regs[insn->dst_reg].imm = insn->imm; | ||
| 1129 | } | ||
| 1130 | } | ||
| 1131 | if (log_level) | ||
| 1132 | print_verifier_state(env); | ||
| 1133 | return 0; | ||
| 1134 | } | ||
| 1135 | |||
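The first branch-specific refinement above handles the return value of bpf_map_lookup_elem(): comparing a PTR_TO_MAP_VALUE_OR_NULL register against zero lets the verifier give the two outgoing paths different, more precise types. A small illustrative model of that split for the `if (r == 0) goto` (BPF_JEQ) case, using toy types rather than kernel structures:

```c
/* Sketch of the NULL-check refinement: when a register holding
 * PTR_TO_MAP_VALUE_OR_NULL is compared against 0, the two resulting
 * paths get different, more precise types.  Illustrative only.
 */
#include <stdio.h>

enum reg_type { CONST_IMM, PTR_TO_MAP_VALUE, PTR_TO_MAP_VALUE_OR_NULL };

struct reg { enum reg_type type; int imm; };

/* model "if (r == 0) goto ...": fill in the state of both outgoing paths */
static void refine_on_null_check(struct reg r,
				 struct reg *fallthrough, struct reg *target)
{
	*fallthrough = r;
	*target = r;
	if (r.type != PTR_TO_MAP_VALUE_OR_NULL)
		return;
	fallthrough->type = PTR_TO_MAP_VALUE;	/* r != 0 on the fall-through */
	target->type = CONST_IMM;		/* r == 0 at the branch target */
	target->imm = 0;
}

int main(void)
{
	struct reg r0 = { PTR_TO_MAP_VALUE_OR_NULL, 0 }, ft, tgt;

	refine_on_null_check(r0, &ft, &tgt);
	printf("fall-through type=%d, branch-target type=%d imm=%d\n",
	       ft.type, tgt.type, tgt.imm);
	return 0;
}
```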
| 1136 | /* return the map pointer stored inside BPF_LD_IMM64 instruction */ | ||
| 1137 | static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) | ||
| 1138 | { | ||
| 1139 | u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; | ||
| 1140 | |||
| 1141 | return (struct bpf_map *) (unsigned long) imm64; | ||
| 1142 | } | ||
| 1143 | |||
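BPF_LD_IMM64 is the only 16-byte instruction: a 64-bit constant (here a map pointer) is split across the imm fields of two consecutive 8-byte instructions, written in by replace_map_fd_with_map_ptr() further down and reassembled by the helper above. A standalone round-trip sketch of that packing (toy_insn only carries the imm field; illustrative, not kernel code):

```c
/* Sketch of packing a 64-bit pointer into the two 32-bit 'imm' fields of a
 * BPF_LD_IMM64 instruction pair and recovering it again.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_insn { int32_t imm; };

static void store_ptr(struct toy_insn insn[2], void *ptr)
{
	uint64_t v = (uint64_t)(unsigned long)ptr;

	insn[0].imm = (uint32_t)v;		/* low 32 bits */
	insn[1].imm = (uint32_t)(v >> 32);	/* high 32 bits */
}

static void *load_ptr(const struct toy_insn insn[2])
{
	uint64_t v = (uint64_t)(uint32_t)insn[0].imm |
		     ((uint64_t)(uint32_t)insn[1].imm) << 32;

	return (void *)(unsigned long)v;
}

int main(void)
{
	static int dummy_map;
	struct toy_insn insn[2];

	store_ptr(insn, &dummy_map);
	printf("round-trip ok: %d\n", load_ptr(insn) == (void *)&dummy_map);
	return 0;
}
```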
| 1144 | /* verify BPF_LD_IMM64 instruction */ | ||
| 1145 | static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | ||
| 1146 | { | ||
| 1147 | struct reg_state *regs = env->cur_state.regs; | ||
| 1148 | int err; | ||
| 1149 | |||
| 1150 | if (BPF_SIZE(insn->code) != BPF_DW) { | ||
| 1151 | verbose("invalid BPF_LD_IMM insn\n"); | ||
| 1152 | return -EINVAL; | ||
| 1153 | } | ||
| 1154 | if (insn->off != 0) { | ||
| 1155 | verbose("BPF_LD_IMM64 uses reserved fields\n"); | ||
| 1156 | return -EINVAL; | ||
| 1157 | } | ||
| 1158 | |||
| 1159 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); | ||
| 1160 | if (err) | ||
| 1161 | return err; | ||
| 1162 | |||
| 1163 | if (insn->src_reg == 0) | ||
| 1164 | /* generic move 64-bit immediate into a register */ | ||
| 1165 | return 0; | ||
| 1166 | |||
| 1167 | /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ | ||
| 1168 | BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); | ||
| 1169 | |||
| 1170 | regs[insn->dst_reg].type = CONST_PTR_TO_MAP; | ||
| 1171 | regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); | ||
| 1172 | return 0; | ||
| 1173 | } | ||
| 1174 | |||
| 1175 | /* verify safety of LD_ABS|LD_IND instructions: | ||
| 1176 | * - they can only appear in the programs where ctx == skb | ||
| 1177 | * - since they are wrappers of function calls, they scratch R1-R5 registers, | ||
| 1178 | * preserve R6-R9, and store return value into R0 | ||
| 1179 | * | ||
| 1180 | * Implicit input: | ||
| 1181 | * ctx == skb == R6 == CTX | ||
| 1182 | * | ||
| 1183 | * Explicit input: | ||
| 1184 | * SRC == any register | ||
| 1185 | * IMM == 32-bit immediate | ||
| 1186 | * | ||
| 1187 | * Output: | ||
| 1188 | * R0 - 8/16/32-bit skb data converted to cpu endianness | ||
| 1189 | */ | ||
| 1190 | static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) | ||
| 1191 | { | ||
| 1192 | struct reg_state *regs = env->cur_state.regs; | ||
| 1193 | u8 mode = BPF_MODE(insn->code); | ||
| 1194 | struct reg_state *reg; | ||
| 1195 | int i, err; | ||
| 1196 | |||
| 1197 | if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { | ||
| 1198 | verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); | ||
| 1199 | return -EINVAL; | ||
| 1200 | } | ||
| 1201 | |||
| 1202 | if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || | ||
| 1203 | (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { | ||
| 1204 | verbose("BPF_LD_ABS uses reserved fields\n"); | ||
| 1205 | return -EINVAL; | ||
| 1206 | } | ||
| 1207 | |||
| 1208 | /* check whether implicit source operand (register R6) is readable */ | ||
| 1209 | err = check_reg_arg(regs, BPF_REG_6, SRC_OP); | ||
| 1210 | if (err) | ||
| 1211 | return err; | ||
| 1212 | |||
| 1213 | if (regs[BPF_REG_6].type != PTR_TO_CTX) { | ||
| 1214 | verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); | ||
| 1215 | return -EINVAL; | ||
| 1216 | } | ||
| 1217 | |||
| 1218 | if (mode == BPF_IND) { | ||
| 1219 | /* check explicit source operand */ | ||
| 1220 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 1221 | if (err) | ||
| 1222 | return err; | ||
| 1223 | } | ||
| 1224 | |||
| 1225 | /* reset caller saved regs to unreadable */ | ||
| 1226 | for (i = 0; i < CALLER_SAVED_REGS; i++) { | ||
| 1227 | reg = regs + caller_saved[i]; | ||
| 1228 | reg->type = NOT_INIT; | ||
| 1229 | reg->imm = 0; | ||
| 1230 | } | ||
| 1231 | |||
| 1232 | /* mark destination R0 register as readable, since it contains | ||
| 1233 | * the value fetched from the packet | ||
| 1234 | */ | ||
| 1235 | regs[BPF_REG_0].type = UNKNOWN_VALUE; | ||
| 1236 | return 0; | ||
| 1237 | } | ||
| 1238 | |||
| 1239 | /* non-recursive DFS pseudo code | ||
| 1240 | * 1 procedure DFS-iterative(G,v): | ||
| 1241 | * 2 label v as discovered | ||
| 1242 | * 3 let S be a stack | ||
| 1243 | * 4 S.push(v) | ||
| 1244 | * 5 while S is not empty | ||
| 1245 | * 6 t <- S.pop() | ||
| 1246 | * 7 if t is what we're looking for: | ||
| 1247 | * 8 return t | ||
| 1248 | * 9 for all edges e in G.adjacentEdges(t) do | ||
| 1249 | * 10 if edge e is already labelled | ||
| 1250 | * 11 continue with the next edge | ||
| 1251 | * 12 w <- G.adjacentVertex(t,e) | ||
| 1252 | * 13 if vertex w is not discovered and not explored | ||
| 1253 | * 14 label e as tree-edge | ||
| 1254 | * 15 label w as discovered | ||
| 1255 | * 16 S.push(w) | ||
| 1256 | * 17 continue at 5 | ||
| 1257 | * 18 else if vertex w is discovered | ||
| 1258 | * 19 label e as back-edge | ||
| 1259 | * 20 else | ||
| 1260 | * 21 // vertex w is explored | ||
| 1261 | * 22 label e as forward- or cross-edge | ||
| 1262 | * 23 label t as explored | ||
| 1263 | * 24 S.pop() | ||
| 1264 | * | ||
| 1265 | * convention: | ||
| 1266 | * 0x10 - discovered | ||
| 1267 | * 0x11 - discovered and fall-through edge labelled | ||
| 1268 | * 0x12 - discovered and fall-through and branch edges labelled | ||
| 1269 | * 0x20 - explored | ||
| 1270 | */ | ||
| 1271 | |||
| 1272 | enum { | ||
| 1273 | DISCOVERED = 0x10, | ||
| 1274 | EXPLORED = 0x20, | ||
| 1275 | FALLTHROUGH = 1, | ||
| 1276 | BRANCH = 2, | ||
| 1277 | }; | ||
| 1278 | |||
| 1279 | #define STATE_LIST_MARK ((struct verifier_state_list *) -1L) | ||
| 1280 | |||
| 1281 | static int *insn_stack; /* stack of insns to process */ | ||
| 1282 | static int cur_stack; /* current stack index */ | ||
| 1283 | static int *insn_state; | ||
| 1284 | |||
| 1285 | /* t, w, e - match pseudo-code above: | ||
| 1286 | * t - index of current instruction | ||
| 1287 | * w - next instruction | ||
| 1288 | * e - edge | ||
| 1289 | */ | ||
| 1290 | static int push_insn(int t, int w, int e, struct verifier_env *env) | ||
| 1291 | { | ||
| 1292 | if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) | ||
| 1293 | return 0; | ||
| 1294 | |||
| 1295 | if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) | ||
| 1296 | return 0; | ||
| 1297 | |||
| 1298 | if (w < 0 || w >= env->prog->len) { | ||
| 1299 | verbose("jump out of range from insn %d to %d\n", t, w); | ||
| 1300 | return -EINVAL; | ||
| 1301 | } | ||
| 1302 | |||
| 1303 | if (e == BRANCH) | ||
| 1304 | /* mark branch target for state pruning */ | ||
| 1305 | env->explored_states[w] = STATE_LIST_MARK; | ||
| 1306 | |||
| 1307 | if (insn_state[w] == 0) { | ||
| 1308 | /* tree-edge */ | ||
| 1309 | insn_state[t] = DISCOVERED | e; | ||
| 1310 | insn_state[w] = DISCOVERED; | ||
| 1311 | if (cur_stack >= env->prog->len) | ||
| 1312 | return -E2BIG; | ||
| 1313 | insn_stack[cur_stack++] = w; | ||
| 1314 | return 1; | ||
| 1315 | } else if ((insn_state[w] & 0xF0) == DISCOVERED) { | ||
| 1316 | verbose("back-edge from insn %d to %d\n", t, w); | ||
| 1317 | return -EINVAL; | ||
| 1318 | } else if (insn_state[w] == EXPLORED) { | ||
| 1319 | /* forward- or cross-edge */ | ||
| 1320 | insn_state[t] = DISCOVERED | e; | ||
| 1321 | } else { | ||
| 1322 | verbose("insn state internal bug\n"); | ||
| 1323 | return -EFAULT; | ||
| 1324 | } | ||
| 1325 | return 0; | ||
| 1326 | } | ||
| 1327 | |||
| 1328 | /* non-recursive depth-first-search to detect loops in BPF program | ||
| 1329 | * loop == back-edge in directed graph | ||
| 1330 | */ | ||
| 1331 | static int check_cfg(struct verifier_env *env) | ||
| 1332 | { | ||
| 1333 | struct bpf_insn *insns = env->prog->insnsi; | ||
| 1334 | int insn_cnt = env->prog->len; | ||
| 1335 | int ret = 0; | ||
| 1336 | int i, t; | ||
| 1337 | |||
| 1338 | insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); | ||
| 1339 | if (!insn_state) | ||
| 1340 | return -ENOMEM; | ||
| 1341 | |||
| 1342 | insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); | ||
| 1343 | if (!insn_stack) { | ||
| 1344 | kfree(insn_state); | ||
| 1345 | return -ENOMEM; | ||
| 1346 | } | ||
| 1347 | |||
| 1348 | insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ | ||
| 1349 | insn_stack[0] = 0; /* 0 is the first instruction */ | ||
| 1350 | cur_stack = 1; | ||
| 1351 | |||
| 1352 | peek_stack: | ||
| 1353 | if (cur_stack == 0) | ||
| 1354 | goto check_state; | ||
| 1355 | t = insn_stack[cur_stack - 1]; | ||
| 1356 | |||
| 1357 | if (BPF_CLASS(insns[t].code) == BPF_JMP) { | ||
| 1358 | u8 opcode = BPF_OP(insns[t].code); | ||
| 1359 | |||
| 1360 | if (opcode == BPF_EXIT) { | ||
| 1361 | goto mark_explored; | ||
| 1362 | } else if (opcode == BPF_CALL) { | ||
| 1363 | ret = push_insn(t, t + 1, FALLTHROUGH, env); | ||
| 1364 | if (ret == 1) | ||
| 1365 | goto peek_stack; | ||
| 1366 | else if (ret < 0) | ||
| 1367 | goto err_free; | ||
| 1368 | } else if (opcode == BPF_JA) { | ||
| 1369 | if (BPF_SRC(insns[t].code) != BPF_K) { | ||
| 1370 | ret = -EINVAL; | ||
| 1371 | goto err_free; | ||
| 1372 | } | ||
| 1373 | /* unconditional jump with single edge */ | ||
| 1374 | ret = push_insn(t, t + insns[t].off + 1, | ||
| 1375 | FALLTHROUGH, env); | ||
| 1376 | if (ret == 1) | ||
| 1377 | goto peek_stack; | ||
| 1378 | else if (ret < 0) | ||
| 1379 | goto err_free; | ||
| 1380 | /* tell verifier to check for equivalent states | ||
| 1381 | * after every call and jump | ||
| 1382 | */ | ||
| 1383 | env->explored_states[t + 1] = STATE_LIST_MARK; | ||
| 1384 | } else { | ||
| 1385 | /* conditional jump with two edges */ | ||
| 1386 | ret = push_insn(t, t + 1, FALLTHROUGH, env); | ||
| 1387 | if (ret == 1) | ||
| 1388 | goto peek_stack; | ||
| 1389 | else if (ret < 0) | ||
| 1390 | goto err_free; | ||
| 1391 | |||
| 1392 | ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); | ||
| 1393 | if (ret == 1) | ||
| 1394 | goto peek_stack; | ||
| 1395 | else if (ret < 0) | ||
| 1396 | goto err_free; | ||
| 1397 | } | ||
| 1398 | } else { | ||
| 1399 | /* all other non-branch instructions with single | ||
| 1400 | * fall-through edge | ||
| 1401 | */ | ||
| 1402 | ret = push_insn(t, t + 1, FALLTHROUGH, env); | ||
| 1403 | if (ret == 1) | ||
| 1404 | goto peek_stack; | ||
| 1405 | else if (ret < 0) | ||
| 1406 | goto err_free; | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | mark_explored: | ||
| 1410 | insn_state[t] = EXPLORED; | ||
| 1411 | if (cur_stack-- <= 0) { | ||
| 1412 | verbose("pop stack internal bug\n"); | ||
| 1413 | ret = -EFAULT; | ||
| 1414 | goto err_free; | ||
| 1415 | } | ||
| 1416 | goto peek_stack; | ||
| 1417 | |||
| 1418 | check_state: | ||
| 1419 | for (i = 0; i < insn_cnt; i++) { | ||
| 1420 | if (insn_state[i] != EXPLORED) { | ||
| 1421 | verbose("unreachable insn %d\n", i); | ||
| 1422 | ret = -EINVAL; | ||
| 1423 | goto err_free; | ||
| 1424 | } | ||
| 1425 | } | ||
| 1426 | ret = 0; /* cfg looks good */ | ||
| 1427 | |||
| 1428 | err_free: | ||
| 1429 | kfree(insn_state); | ||
| 1430 | kfree(insn_stack); | ||
| 1431 | return ret; | ||
| 1432 | } | ||
| 1433 | |||
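check_cfg() above rejects any program whose control-flow graph contains a back-edge, i.e. a loop, using the iterative DFS from the pseudo-code comment. The following standalone sketch runs the same DISCOVERED/EXPLORED labelling on a hard-coded successor table instead of real instructions (names and graph are invented for illustration):

```c
/* Standalone sketch of the loop check: an iterative DFS that labels nodes
 * DISCOVERED/EXPLORED and reports a back-edge (a loop).  The tiny "program"
 * below is just a successor table, not real BPF.
 */
#include <stdio.h>

#define N 4

enum { UNSEEN = 0, DISCOVERED = 1, EXPLORED = 2 };

/* succ[i][0..1]: up to two successors per node, -1 = none */
static const int succ[N][2] = {
	{ 1, -1 },	/* 0 -> 1 */
	{ 2,  3 },	/* 1 -> 2, 1 -> 3 */
	{ 1, -1 },	/* 2 -> 1  (back-edge: a loop) */
	{ -1, -1 },	/* 3: exit */
};

int main(void)
{
	int state[N] = { 0 }, next_edge[N] = { 0 };
	int stack[N], top = 0;

	state[0] = DISCOVERED;
	stack[top++] = 0;

	while (top > 0) {
		int t = stack[top - 1];

		if (next_edge[t] < 2 && succ[t][next_edge[t]] >= 0) {
			int w = succ[t][next_edge[t]++];

			if (state[w] == UNSEEN) {
				state[w] = DISCOVERED;	/* tree-edge */
				stack[top++] = w;
			} else if (state[w] == DISCOVERED) {
				printf("back-edge from %d to %d: loop rejected\n", t, w);
				return 1;
			}
			/* EXPLORED: forward- or cross-edge, nothing to do */
		} else {
			state[t] = EXPLORED;		/* no more edges */
			top--;
		}
	}
	printf("no loops, all reachable nodes explored\n");
	return 0;
}
```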
| 1434 | /* compare two verifier states | ||
| 1435 | * | ||
| 1436 | * all states stored in state_list are known to be valid, since | ||
| 1437 | * verifier reached 'bpf_exit' instruction through them | ||
| 1438 | * | ||
| 1439 | * this function is called when the verifier, while exploring different branches | ||
| 1440 | * of execution, pops a state from the state stack. If it sees an old state that | ||
| 1441 | * has more strict register state and more strict stack state, then this execution | ||
| 1442 | * branch doesn't need to be explored further, since verifier already | ||
| 1443 | * concluded that more strict state leads to valid finish. | ||
| 1444 | * | ||
| 1445 | * Therefore two states are equivalent if register state is more conservative | ||
| 1446 | * and explored stack state is more conservative than the current one. | ||
| 1447 | * Example: | ||
| 1448 | * explored current | ||
| 1449 | * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC) | ||
| 1450 | * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC) | ||
| 1451 | * | ||
| 1452 | * In other words if current stack state (one being explored) has more | ||
| 1453 | * valid slots than old one that already passed validation, it means | ||
| 1454 | * the verifier can stop exploring and conclude that current state is valid too | ||
| 1455 | * | ||
| 1456 | * Similarly with registers. If explored state has register type as invalid | ||
| 1457 | * whereas register type in current state is meaningful, it means that | ||
| 1458 | * the current state will reach 'bpf_exit' instruction safely | ||
| 1459 | */ | ||
| 1460 | static bool states_equal(struct verifier_state *old, struct verifier_state *cur) | ||
| 1461 | { | ||
| 1462 | int i; | ||
| 1463 | |||
| 1464 | for (i = 0; i < MAX_BPF_REG; i++) { | ||
| 1465 | if (memcmp(&old->regs[i], &cur->regs[i], | ||
| 1466 | sizeof(old->regs[0])) != 0) { | ||
| 1467 | if (old->regs[i].type == NOT_INIT || | ||
| 1468 | (old->regs[i].type == UNKNOWN_VALUE && | ||
| 1469 | cur->regs[i].type != NOT_INIT)) | ||
| 1470 | continue; | ||
| 1471 | return false; | ||
| 1472 | } | ||
| 1473 | } | ||
| 1474 | |||
| 1475 | for (i = 0; i < MAX_BPF_STACK; i++) { | ||
| 1476 | if (old->stack_slot_type[i] == STACK_INVALID) | ||
| 1477 | continue; | ||
| 1478 | if (old->stack_slot_type[i] != cur->stack_slot_type[i]) | ||
| 1479 | /* Ex: old explored (safe) state has STACK_SPILL in | ||
| 1480 | * this stack slot, but current has STACK_MISC -> | ||
| 1481 | * these verifier states are not equivalent, | ||
| 1482 | * return false to continue verification of this path | ||
| 1483 | */ | ||
| 1484 | return false; | ||
| 1485 | if (i % BPF_REG_SIZE) | ||
| 1486 | continue; | ||
| 1487 | if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], | ||
| 1488 | &cur->spilled_regs[i / BPF_REG_SIZE], | ||
| 1489 | sizeof(old->spilled_regs[0]))) | ||
| 1490 | /* when explored and current stack slot types are | ||
| 1491 | * the same, check that stored pointers types | ||
| 1492 | * are the same as well. | ||
| 1493 | * Ex: explored safe path could have stored | ||
| 1494 | * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} | ||
| 1495 | * but current path has stored: | ||
| 1496 | * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} | ||
| 1497 | * such verifier states are not equivalent. | ||
| 1498 | * return false to continue verification of this path | ||
| 1499 | */ | ||
| 1500 | return false; | ||
| 1501 | else | ||
| 1502 | continue; | ||
| 1503 | } | ||
| 1504 | return true; | ||
| 1505 | } | ||
| 1506 | |||
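The register half of states_equal() compares each register byte-for-byte but tolerates a mismatch when the old, already-verified state knew less: NOT_INIT matches anything, and UNKNOWN_VALUE matches anything that is at least initialized. A reduced sketch of just that rule (the real comparison is a memcmp that also covers imm and map_ptr; the enum below is a toy):

```c
/* Sketch of the register part of the pruning rule: an old (already verified)
 * register that was NOT_INIT or UNKNOWN_VALUE is "more conservative", so any
 * current value in that slot is still safe.  Toy types, illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

enum reg_type { NOT_INIT, UNKNOWN_VALUE, CONST_IMM, PTR_TO_CTX };

static bool reg_equivalent(enum reg_type old, enum reg_type cur)
{
	if (old == cur)
		return true;
	/* old knew less than cur does -> cur is at least as safe */
	if (old == NOT_INIT)
		return true;
	if (old == UNKNOWN_VALUE && cur != NOT_INIT)
		return true;
	return false;
}

int main(void)
{
	printf("%d\n", reg_equivalent(UNKNOWN_VALUE, CONST_IMM)); /* 1: prune */
	printf("%d\n", reg_equivalent(PTR_TO_CTX, UNKNOWN_VALUE)); /* 0: keep exploring */
	return 0;
}
```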
| 1507 | static int is_state_visited(struct verifier_env *env, int insn_idx) | ||
| 1508 | { | ||
| 1509 | struct verifier_state_list *new_sl; | ||
| 1510 | struct verifier_state_list *sl; | ||
| 1511 | |||
| 1512 | sl = env->explored_states[insn_idx]; | ||
| 1513 | if (!sl) | ||
| 1514 | /* this 'insn_idx' instruction wasn't marked, so we will not | ||
| 1515 | * be doing state search here | ||
| 1516 | */ | ||
| 1517 | return 0; | ||
| 1518 | |||
| 1519 | while (sl != STATE_LIST_MARK) { | ||
| 1520 | if (states_equal(&sl->state, &env->cur_state)) | ||
| 1521 | /* reached equivalent register/stack state, | ||
| 1522 | * prune the search | ||
| 1523 | */ | ||
| 1524 | return 1; | ||
| 1525 | sl = sl->next; | ||
| 1526 | } | ||
| 1527 | |||
| 1528 | /* there were no equivalent states, remember current one. | ||
| 1529 | * technically the current state is not proven to be safe yet, | ||
| 1530 | * but it will either reach bpf_exit (which means it's safe) or | ||
| 1531 | * it will be rejected. Since there are no loops, we won't be | ||
| 1532 | * seeing this 'insn_idx' instruction again on the way to bpf_exit | ||
| 1533 | */ | ||
| 1534 | new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); | ||
| 1535 | if (!new_sl) | ||
| 1536 | return -ENOMEM; | ||
| 1537 | |||
| 1538 | /* add new state to the head of linked list */ | ||
| 1539 | memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); | ||
| 1540 | new_sl->next = env->explored_states[insn_idx]; | ||
| 1541 | env->explored_states[insn_idx] = new_sl; | ||
| 1542 | return 0; | ||
| 1543 | } | ||
| 1544 | |||
| 1545 | static int do_check(struct verifier_env *env) | ||
| 1546 | { | ||
| 1547 | struct verifier_state *state = &env->cur_state; | ||
| 1548 | struct bpf_insn *insns = env->prog->insnsi; | ||
| 1549 | struct reg_state *regs = state->regs; | ||
| 1550 | int insn_cnt = env->prog->len; | ||
| 1551 | int insn_idx, prev_insn_idx = 0; | ||
| 1552 | int insn_processed = 0; | ||
| 1553 | bool do_print_state = false; | ||
| 1554 | |||
| 1555 | init_reg_state(regs); | ||
| 1556 | insn_idx = 0; | ||
| 1557 | for (;;) { | ||
| 1558 | struct bpf_insn *insn; | ||
| 1559 | u8 class; | ||
| 1560 | int err; | ||
| 1561 | |||
| 1562 | if (insn_idx >= insn_cnt) { | ||
| 1563 | verbose("invalid insn idx %d insn_cnt %d\n", | ||
| 1564 | insn_idx, insn_cnt); | ||
| 1565 | return -EFAULT; | ||
| 1566 | } | ||
| 1567 | |||
| 1568 | insn = &insns[insn_idx]; | ||
| 1569 | class = BPF_CLASS(insn->code); | ||
| 1570 | |||
| 1571 | if (++insn_processed > 32768) { | ||
| 1572 | verbose("BPF program is too large. Proccessed %d insn\n", | ||
| 1573 | insn_processed); | ||
| 1574 | return -E2BIG; | ||
| 1575 | } | ||
| 1576 | |||
| 1577 | err = is_state_visited(env, insn_idx); | ||
| 1578 | if (err < 0) | ||
| 1579 | return err; | ||
| 1580 | if (err == 1) { | ||
| 1581 | /* found equivalent state, can prune the search */ | ||
| 1582 | if (log_level) { | ||
| 1583 | if (do_print_state) | ||
| 1584 | verbose("\nfrom %d to %d: safe\n", | ||
| 1585 | prev_insn_idx, insn_idx); | ||
| 1586 | else | ||
| 1587 | verbose("%d: safe\n", insn_idx); | ||
| 1588 | } | ||
| 1589 | goto process_bpf_exit; | ||
| 1590 | } | ||
| 1591 | |||
| 1592 | if (log_level && do_print_state) { | ||
| 1593 | verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); | ||
| 1594 | print_verifier_state(env); | ||
| 1595 | do_print_state = false; | ||
| 1596 | } | ||
| 1597 | |||
| 1598 | if (log_level) { | ||
| 1599 | verbose("%d: ", insn_idx); | ||
| 1600 | print_bpf_insn(insn); | ||
| 1601 | } | ||
| 1602 | |||
| 1603 | if (class == BPF_ALU || class == BPF_ALU64) { | ||
| 1604 | err = check_alu_op(regs, insn); | ||
| 1605 | if (err) | ||
| 1606 | return err; | ||
| 1607 | |||
| 1608 | } else if (class == BPF_LDX) { | ||
| 1609 | if (BPF_MODE(insn->code) != BPF_MEM || | ||
| 1610 | insn->imm != 0) { | ||
| 1611 | verbose("BPF_LDX uses reserved fields\n"); | ||
| 1612 | return -EINVAL; | ||
| 1613 | } | ||
| 1614 | /* check src operand */ | ||
| 1615 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 1616 | if (err) | ||
| 1617 | return err; | ||
| 1618 | |||
| 1619 | err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); | ||
| 1620 | if (err) | ||
| 1621 | return err; | ||
| 1622 | |||
| 1623 | /* check that memory (src_reg + off) is readable, | ||
| 1624 | * the state of dst_reg will be updated by this func | ||
| 1625 | */ | ||
| 1626 | err = check_mem_access(env, insn->src_reg, insn->off, | ||
| 1627 | BPF_SIZE(insn->code), BPF_READ, | ||
| 1628 | insn->dst_reg); | ||
| 1629 | if (err) | ||
| 1630 | return err; | ||
| 1631 | |||
| 1632 | } else if (class == BPF_STX) { | ||
| 1633 | if (BPF_MODE(insn->code) == BPF_XADD) { | ||
| 1634 | err = check_xadd(env, insn); | ||
| 1635 | if (err) | ||
| 1636 | return err; | ||
| 1637 | insn_idx++; | ||
| 1638 | continue; | ||
| 1639 | } | ||
| 1640 | |||
| 1641 | if (BPF_MODE(insn->code) != BPF_MEM || | ||
| 1642 | insn->imm != 0) { | ||
| 1643 | verbose("BPF_STX uses reserved fields\n"); | ||
| 1644 | return -EINVAL; | ||
| 1645 | } | ||
| 1646 | /* check src1 operand */ | ||
| 1647 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | ||
| 1648 | if (err) | ||
| 1649 | return err; | ||
| 1650 | /* check src2 operand */ | ||
| 1651 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 1652 | if (err) | ||
| 1653 | return err; | ||
| 1654 | |||
| 1655 | /* check that memory (dst_reg + off) is writeable */ | ||
| 1656 | err = check_mem_access(env, insn->dst_reg, insn->off, | ||
| 1657 | BPF_SIZE(insn->code), BPF_WRITE, | ||
| 1658 | insn->src_reg); | ||
| 1659 | if (err) | ||
| 1660 | return err; | ||
| 1661 | |||
| 1662 | } else if (class == BPF_ST) { | ||
| 1663 | if (BPF_MODE(insn->code) != BPF_MEM || | ||
| 1664 | insn->src_reg != BPF_REG_0) { | ||
| 1665 | verbose("BPF_ST uses reserved fields\n"); | ||
| 1666 | return -EINVAL; | ||
| 1667 | } | ||
| 1668 | /* check src operand */ | ||
| 1669 | err = check_reg_arg(regs, insn->dst_reg, SRC_OP); | ||
| 1670 | if (err) | ||
| 1671 | return err; | ||
| 1672 | |||
| 1673 | /* check that memory (dst_reg + off) is writeable */ | ||
| 1674 | err = check_mem_access(env, insn->dst_reg, insn->off, | ||
| 1675 | BPF_SIZE(insn->code), BPF_WRITE, | ||
| 1676 | -1); | ||
| 1677 | if (err) | ||
| 1678 | return err; | ||
| 1679 | |||
| 1680 | } else if (class == BPF_JMP) { | ||
| 1681 | u8 opcode = BPF_OP(insn->code); | ||
| 1682 | |||
| 1683 | if (opcode == BPF_CALL) { | ||
| 1684 | if (BPF_SRC(insn->code) != BPF_K || | ||
| 1685 | insn->off != 0 || | ||
| 1686 | insn->src_reg != BPF_REG_0 || | ||
| 1687 | insn->dst_reg != BPF_REG_0) { | ||
| 1688 | verbose("BPF_CALL uses reserved fields\n"); | ||
| 1689 | return -EINVAL; | ||
| 1690 | } | ||
| 1691 | |||
| 1692 | err = check_call(env, insn->imm); | ||
| 1693 | if (err) | ||
| 1694 | return err; | ||
| 1695 | |||
| 1696 | } else if (opcode == BPF_JA) { | ||
| 1697 | if (BPF_SRC(insn->code) != BPF_K || | ||
| 1698 | insn->imm != 0 || | ||
| 1699 | insn->src_reg != BPF_REG_0 || | ||
| 1700 | insn->dst_reg != BPF_REG_0) { | ||
| 1701 | verbose("BPF_JA uses reserved fields\n"); | ||
| 1702 | return -EINVAL; | ||
| 1703 | } | ||
| 1704 | |||
| 1705 | insn_idx += insn->off + 1; | ||
| 1706 | continue; | ||
| 1707 | |||
| 1708 | } else if (opcode == BPF_EXIT) { | ||
| 1709 | if (BPF_SRC(insn->code) != BPF_K || | ||
| 1710 | insn->imm != 0 || | ||
| 1711 | insn->src_reg != BPF_REG_0 || | ||
| 1712 | insn->dst_reg != BPF_REG_0) { | ||
| 1713 | verbose("BPF_EXIT uses reserved fields\n"); | ||
| 1714 | return -EINVAL; | ||
| 1715 | } | ||
| 1716 | |||
| 1717 | /* eBPF calling convention is such that R0 is used | ||
| 1718 | * to return the value from eBPF program. | ||
| 1719 | * Make sure that it's readable at this time | ||
| 1720 | * of bpf_exit, which means that program wrote | ||
| 1721 | * something into it earlier | ||
| 1722 | */ | ||
| 1723 | err = check_reg_arg(regs, BPF_REG_0, SRC_OP); | ||
| 1724 | if (err) | ||
| 1725 | return err; | ||
| 1726 | |||
| 1727 | process_bpf_exit: | ||
| 1728 | insn_idx = pop_stack(env, &prev_insn_idx); | ||
| 1729 | if (insn_idx < 0) { | ||
| 1730 | break; | ||
| 1731 | } else { | ||
| 1732 | do_print_state = true; | ||
| 1733 | continue; | ||
| 1734 | } | ||
| 1735 | } else { | ||
| 1736 | err = check_cond_jmp_op(env, insn, &insn_idx); | ||
| 1737 | if (err) | ||
| 1738 | return err; | ||
| 1739 | } | ||
| 1740 | } else if (class == BPF_LD) { | ||
| 1741 | u8 mode = BPF_MODE(insn->code); | ||
| 1742 | |||
| 1743 | if (mode == BPF_ABS || mode == BPF_IND) { | ||
| 1744 | err = check_ld_abs(env, insn); | ||
| 1745 | if (err) | ||
| 1746 | return err; | ||
| 1747 | |||
| 1748 | } else if (mode == BPF_IMM) { | ||
| 1749 | err = check_ld_imm(env, insn); | ||
| 1750 | if (err) | ||
| 1751 | return err; | ||
| 1752 | |||
| 1753 | insn_idx++; | ||
| 1754 | } else { | ||
| 1755 | verbose("invalid BPF_LD mode\n"); | ||
| 1756 | return -EINVAL; | ||
| 1757 | } | ||
| 1758 | } else { | ||
| 1759 | verbose("unknown insn class %d\n", class); | ||
| 1760 | return -EINVAL; | ||
| 1761 | } | ||
| 1762 | |||
| 1763 | insn_idx++; | ||
| 1764 | } | ||
| 1765 | |||
| 1766 | return 0; | ||
| 1767 | } | ||
| 1768 | |||
| 1769 | /* look for pseudo eBPF instructions that access map FDs and | ||
| 1770 | * replace them with actual map pointers | ||
| 1771 | */ | ||
| 1772 | static int replace_map_fd_with_map_ptr(struct verifier_env *env) | ||
| 1773 | { | ||
| 1774 | struct bpf_insn *insn = env->prog->insnsi; | ||
| 1775 | int insn_cnt = env->prog->len; | ||
| 1776 | int i, j; | ||
| 1777 | |||
| 1778 | for (i = 0; i < insn_cnt; i++, insn++) { | ||
| 1779 | if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { | ||
| 1780 | struct bpf_map *map; | ||
| 1781 | struct fd f; | ||
| 1782 | |||
| 1783 | if (i == insn_cnt - 1 || insn[1].code != 0 || | ||
| 1784 | insn[1].dst_reg != 0 || insn[1].src_reg != 0 || | ||
| 1785 | insn[1].off != 0) { | ||
| 1786 | verbose("invalid bpf_ld_imm64 insn\n"); | ||
| 1787 | return -EINVAL; | ||
| 1788 | } | ||
| 1789 | |||
| 1790 | if (insn->src_reg == 0) | ||
| 1791 | /* valid generic load 64-bit imm */ | ||
| 1792 | goto next_insn; | ||
| 1793 | |||
| 1794 | if (insn->src_reg != BPF_PSEUDO_MAP_FD) { | ||
| 1795 | verbose("unrecognized bpf_ld_imm64 insn\n"); | ||
| 1796 | return -EINVAL; | ||
| 1797 | } | ||
| 1798 | |||
| 1799 | f = fdget(insn->imm); | ||
| 1800 | |||
| 1801 | map = bpf_map_get(f); | ||
| 1802 | if (IS_ERR(map)) { | ||
| 1803 | verbose("fd %d is not pointing to valid bpf_map\n", | ||
| 1804 | insn->imm); | ||
| 1805 | fdput(f); | ||
| 1806 | return PTR_ERR(map); | ||
| 1807 | } | ||
| 1808 | |||
| 1809 | /* store map pointer inside BPF_LD_IMM64 instruction */ | ||
| 1810 | insn[0].imm = (u32) (unsigned long) map; | ||
| 1811 | insn[1].imm = ((u64) (unsigned long) map) >> 32; | ||
| 1812 | |||
| 1813 | /* check whether we recorded this map already */ | ||
| 1814 | for (j = 0; j < env->used_map_cnt; j++) | ||
| 1815 | if (env->used_maps[j] == map) { | ||
| 1816 | fdput(f); | ||
| 1817 | goto next_insn; | ||
| 1818 | } | ||
| 1819 | |||
| 1820 | if (env->used_map_cnt >= MAX_USED_MAPS) { | ||
| 1821 | fdput(f); | ||
| 1822 | return -E2BIG; | ||
| 1823 | } | ||
| 1824 | |||
| 1825 | /* remember this map */ | ||
| 1826 | env->used_maps[env->used_map_cnt++] = map; | ||
| 1827 | |||
| 1828 | /* hold the map. If the program is rejected by verifier, | ||
| 1829 | * the map will be released by release_maps() or it | ||
| 1830 | * will be used by the valid program until it's unloaded | ||
| 1831 | * and all maps are released in free_bpf_prog_info() | ||
| 1832 | */ | ||
| 1833 | atomic_inc(&map->refcnt); | ||
| 1834 | |||
| 1835 | fdput(f); | ||
| 1836 | next_insn: | ||
| 1837 | insn++; | ||
| 1838 | i++; | ||
| 1839 | } | ||
| 1840 | } | ||
| 1841 | |||
| 1842 | /* now all pseudo BPF_LD_IMM64 instructions load valid | ||
| 1843 | * 'struct bpf_map *' into a register instead of user map_fd. | ||
| 1844 | * These pointers will be used later by verifier to validate map access. | ||
| 1845 | */ | ||
| 1846 | return 0; | ||
| 1847 | } | ||
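For reference, the pseudo instruction pair that replace_map_fd_with_map_ptr() resolves is emitted from userspace roughly as below. A minimal sketch assuming the uapi headers of this series; the helper name emit_ld_map_fd() and register choice are illustrative, not a kernel or library API.

```c
#include <linux/bpf.h>	/* struct bpf_insn, BPF_PSEUDO_MAP_FD, BPF_LD/BPF_IMM/BPF_DW */
#include <string.h>

/* Build the two-slot ld_imm64 pair: src_reg = BPF_PSEUDO_MAP_FD marks imm
 * as a map fd rather than a literal 64-bit constant, and the second slot
 * stays all-zero apart from imm, which is exactly what the validity check
 * above enforces. */
static void emit_ld_map_fd(struct bpf_insn insn[2], __u8 dst_reg, int map_fd)
{
	memset(insn, 0, 2 * sizeof(*insn));
	insn[0].code    = BPF_LD | BPF_IMM | BPF_DW;
	insn[0].dst_reg = dst_reg;
	insn[0].src_reg = BPF_PSEUDO_MAP_FD;
	insn[0].imm     = map_fd;	/* resolved to a map pointer at load time */
}
```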
| 1848 | |||
| 1849 | /* drop refcnt of maps used by the rejected program */ | ||
| 1850 | static void release_maps(struct verifier_env *env) | ||
| 1851 | { | ||
| 1852 | int i; | ||
| 1853 | |||
| 1854 | for (i = 0; i < env->used_map_cnt; i++) | ||
| 1855 | bpf_map_put(env->used_maps[i]); | ||
| 1856 | } | ||
| 1857 | |||
| 1858 | /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ | ||
| 1859 | static void convert_pseudo_ld_imm64(struct verifier_env *env) | ||
| 1860 | { | ||
| 1861 | struct bpf_insn *insn = env->prog->insnsi; | ||
| 1862 | int insn_cnt = env->prog->len; | ||
| 1863 | int i; | ||
| 1864 | |||
| 1865 | for (i = 0; i < insn_cnt; i++, insn++) | ||
| 1866 | if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) | ||
| 1867 | insn->src_reg = 0; | ||
| 1868 | } | ||
| 1869 | |||
| 1870 | static void free_states(struct verifier_env *env) | ||
| 1871 | { | ||
| 1872 | struct verifier_state_list *sl, *sln; | ||
| 1873 | int i; | ||
| 1874 | |||
| 1875 | if (!env->explored_states) | ||
| 1876 | return; | ||
| 1877 | |||
| 1878 | for (i = 0; i < env->prog->len; i++) { | ||
| 1879 | sl = env->explored_states[i]; | ||
| 1880 | |||
| 1881 | if (sl) | ||
| 1882 | while (sl != STATE_LIST_MARK) { | ||
| 1883 | sln = sl->next; | ||
| 1884 | kfree(sl); | ||
| 1885 | sl = sln; | ||
| 1886 | } | ||
| 1887 | } | ||
| 1888 | |||
| 1889 | kfree(env->explored_states); | ||
| 1890 | } | ||
| 1891 | |||
| 1892 | int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) | ||
| 1893 | { | ||
| 1894 | char __user *log_ubuf = NULL; | ||
| 1895 | struct verifier_env *env; | ||
| 1896 | int ret = -EINVAL; | ||
| 1897 | |||
| 1898 | if (prog->len <= 0 || prog->len > BPF_MAXINSNS) | ||
| 1899 | return -E2BIG; | ||
| 1900 | |||
| 1901 | /* 'struct verifier_env' can be global, but since it's not small, | ||
| 1902 | * allocate/free it every time bpf_check() is called | ||
| 1903 | */ | ||
| 1904 | env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); | ||
| 1905 | if (!env) | ||
| 1906 | return -ENOMEM; | ||
| 1907 | |||
| 1908 | env->prog = prog; | ||
| 1909 | |||
| 1910 | /* grab the mutex to protect few globals used by verifier */ | ||
| 1911 | mutex_lock(&bpf_verifier_lock); | ||
| 1912 | |||
| 1913 | if (attr->log_level || attr->log_buf || attr->log_size) { | ||
| 1914 | /* user requested verbose verifier output | ||
| 1915 | * and supplied buffer to store the verification trace | ||
| 1916 | */ | ||
| 1917 | log_level = attr->log_level; | ||
| 1918 | log_ubuf = (char __user *) (unsigned long) attr->log_buf; | ||
| 1919 | log_size = attr->log_size; | ||
| 1920 | log_len = 0; | ||
| 1921 | |||
| 1922 | ret = -EINVAL; | ||
| 1923 | /* log_* values have to be sane */ | ||
| 1924 | if (log_size < 128 || log_size > UINT_MAX >> 8 || | ||
| 1925 | log_level == 0 || log_ubuf == NULL) | ||
| 1926 | goto free_env; | ||
| 1927 | |||
| 1928 | ret = -ENOMEM; | ||
| 1929 | log_buf = vmalloc(log_size); | ||
| 1930 | if (!log_buf) | ||
| 1931 | goto free_env; | ||
| 1932 | } else { | ||
| 1933 | log_level = 0; | ||
| 1934 | } | ||
| 1935 | |||
| 1936 | ret = replace_map_fd_with_map_ptr(env); | ||
| 1937 | if (ret < 0) | ||
| 1938 | goto skip_full_check; | ||
| 1939 | |||
| 1940 | env->explored_states = kcalloc(prog->len, | ||
| 1941 | sizeof(struct verifier_state_list *), | ||
| 1942 | GFP_USER); | ||
| 1943 | ret = -ENOMEM; | ||
| 1944 | if (!env->explored_states) | ||
| 1945 | goto skip_full_check; | ||
| 1946 | |||
| 1947 | ret = check_cfg(env); | ||
| 1948 | if (ret < 0) | ||
| 1949 | goto skip_full_check; | ||
| 1950 | |||
| 1951 | ret = do_check(env); | ||
| 1952 | |||
| 1953 | skip_full_check: | ||
| 1954 | while (pop_stack(env, NULL) >= 0); | ||
| 1955 | free_states(env); | ||
| 1956 | |||
| 1957 | if (log_level && log_len >= log_size - 1) { | ||
| 1958 | BUG_ON(log_len >= log_size); | ||
| 1959 | /* verifier log exceeded user supplied buffer */ | ||
| 1960 | ret = -ENOSPC; | ||
| 1961 | /* fall through to return what was recorded */ | ||
| 1962 | } | ||
| 1963 | |||
| 1964 | /* copy verifier log back to user space including trailing zero */ | ||
| 1965 | if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { | ||
| 1966 | ret = -EFAULT; | ||
| 1967 | goto free_log_buf; | ||
| 1968 | } | ||
| 1969 | |||
| 1970 | if (ret == 0 && env->used_map_cnt) { | ||
| 1971 | /* if program passed verifier, update used_maps in bpf_prog_info */ | ||
| 1972 | prog->aux->used_maps = kmalloc_array(env->used_map_cnt, | ||
| 1973 | sizeof(env->used_maps[0]), | ||
| 1974 | GFP_KERNEL); | ||
| 1975 | |||
| 1976 | if (!prog->aux->used_maps) { | ||
| 1977 | ret = -ENOMEM; | ||
| 1978 | goto free_log_buf; | ||
| 1979 | } | ||
| 1980 | |||
| 1981 | memcpy(prog->aux->used_maps, env->used_maps, | ||
| 1982 | sizeof(env->used_maps[0]) * env->used_map_cnt); | ||
| 1983 | prog->aux->used_map_cnt = env->used_map_cnt; | ||
| 1984 | |||
| 1985 | /* program is valid. Convert pseudo bpf_ld_imm64 into generic | ||
| 1986 | * bpf_ld_imm64 instructions | ||
| 1987 | */ | ||
| 1988 | convert_pseudo_ld_imm64(env); | ||
| 1989 | } | ||
| 1990 | |||
| 1991 | free_log_buf: | ||
| 1992 | if (log_level) | ||
| 1993 | vfree(log_buf); | ||
| 1994 | free_env: | ||
| 1995 | if (!prog->aux->used_maps) | ||
| 1996 | /* if we didn't copy map pointers into bpf_prog_info, release | ||
| 1997 | * them now. Otherwise free_bpf_prog_info() will release them. | ||
| 1998 | */ | ||
| 1999 | release_maps(env); | ||
| 2000 | kfree(env); | ||
| 2001 | mutex_unlock(&bpf_verifier_lock); | ||
| 2002 | return ret; | ||
| 2003 | } | ||
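The log_level/log_buf/log_size handling above is driven entirely from the bpf(2) attributes. A hedged userspace sketch of a load that captures the verifier trace; the program type shown and the raw-syscall wrapper are assumptions, only the log_* constraints come from the code above.

```c
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static char verifier_log[65536];

/* Ask for a verbose verification trace: log_level must be non-zero and
 * log_size at least 128, or bpf_check() rejects the request outright. */
static int load_prog(const struct bpf_insn *insns, unsigned int insn_cnt)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;	/* assumed program type */
	attr.insns     = (__u64)(unsigned long)insns;
	attr.insn_cnt  = insn_cnt;
	attr.license   = (__u64)(unsigned long)"GPL";
	attr.log_level = 1;
	attr.log_buf   = (__u64)(unsigned long)verifier_log;
	attr.log_size  = sizeof(verifier_log);	/* -ENOSPC if the trace overflows */

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}
```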
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3a73f995a81e..bb263d0caab3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly; | |||
| 185 | static struct cftype cgroup_dfl_base_files[]; | 185 | static struct cftype cgroup_dfl_base_files[]; |
| 186 | static struct cftype cgroup_legacy_base_files[]; | 186 | static struct cftype cgroup_legacy_base_files[]; |
| 187 | 187 | ||
| 188 | static void cgroup_put(struct cgroup *cgrp); | ||
| 189 | static int rebind_subsystems(struct cgroup_root *dst_root, | 188 | static int rebind_subsystems(struct cgroup_root *dst_root, |
| 190 | unsigned int ss_mask); | 189 | unsigned int ss_mask); |
| 191 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 190 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
| @@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref); | |||
| 195 | static void kill_css(struct cgroup_subsys_state *css); | 194 | static void kill_css(struct cgroup_subsys_state *css); |
| 196 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 195 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], |
| 197 | bool is_add); | 196 | bool is_add); |
| 198 | static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); | ||
| 199 | 197 | ||
| 200 | /* IDR wrappers which synchronize using cgroup_idr_lock */ | 198 | /* IDR wrappers which synchronize using cgroup_idr_lock */ |
| 201 | static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, | 199 | static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, |
| @@ -279,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, | |||
| 279 | if (!(cgrp->root->subsys_mask & (1 << ss->id))) | 277 | if (!(cgrp->root->subsys_mask & (1 << ss->id))) |
| 280 | return NULL; | 278 | return NULL; |
| 281 | 279 | ||
| 280 | /* | ||
| 281 | * This function is used while updating css associations and thus | ||
| 282 | * can't test the csses directly. Use ->child_subsys_mask. | ||
| 283 | */ | ||
| 282 | while (cgroup_parent(cgrp) && | 284 | while (cgroup_parent(cgrp) && |
| 283 | !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) | 285 | !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) |
| 284 | cgrp = cgroup_parent(cgrp); | 286 | cgrp = cgroup_parent(cgrp); |
| @@ -286,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, | |||
| 286 | return cgroup_css(cgrp, ss); | 288 | return cgroup_css(cgrp, ss); |
| 287 | } | 289 | } |
| 288 | 290 | ||
| 291 | /** | ||
| 292 | * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem | ||
| 293 | * @cgrp: the cgroup of interest | ||
| 294 | * @ss: the subsystem of interest | ||
| 295 | * | ||
| 296 | * Find and get the effective css of @cgrp for @ss. The effective css is | ||
| 297 | * defined as the matching css of the nearest ancestor including self which | ||
| 298 | * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, | ||
| 299 | * the root css is returned, so this function always returns a valid css. | ||
| 300 | * The returned css must be put using css_put(). | ||
| 301 | */ | ||
| 302 | struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, | ||
| 303 | struct cgroup_subsys *ss) | ||
| 304 | { | ||
| 305 | struct cgroup_subsys_state *css; | ||
| 306 | |||
| 307 | rcu_read_lock(); | ||
| 308 | |||
| 309 | do { | ||
| 310 | css = cgroup_css(cgrp, ss); | ||
| 311 | |||
| 312 | if (css && css_tryget_online(css)) | ||
| 313 | goto out_unlock; | ||
| 314 | cgrp = cgroup_parent(cgrp); | ||
| 315 | } while (cgrp); | ||
| 316 | |||
| 317 | css = init_css_set.subsys[ss->id]; | ||
| 318 | css_get(css); | ||
| 319 | out_unlock: | ||
| 320 | rcu_read_unlock(); | ||
| 321 | return css; | ||
| 322 | } | ||
| 323 | |||
| 289 | /* convenient tests for these bits */ | 324 | /* convenient tests for these bits */ |
| 290 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) | 325 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) |
| 291 | { | 326 | { |
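A hedged sketch of how a controller might call the new cgroup_get_e_css(); the caller shown is illustrative and not part of this patch.

```c
#include <linux/cgroup.h>

/* Look up the effective memory css for @cgrp; the helper always returns a
 * valid css (falling back to the root css), so only css_put() is needed. */
static void inspect_effective_memcg(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css;

	css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
	pr_debug("effective memory css id %d\n", css->id);
	css_put(css);	/* pairs with the reference taken inside the helper */
}
```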
| @@ -331,14 +366,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) | |||
| 331 | return false; | 366 | return false; |
| 332 | } | 367 | } |
| 333 | 368 | ||
| 334 | static int cgroup_is_releasable(const struct cgroup *cgrp) | ||
| 335 | { | ||
| 336 | const int bits = | ||
| 337 | (1 << CGRP_RELEASABLE) | | ||
| 338 | (1 << CGRP_NOTIFY_ON_RELEASE); | ||
| 339 | return (cgrp->flags & bits) == bits; | ||
| 340 | } | ||
| 341 | |||
| 342 | static int notify_on_release(const struct cgroup *cgrp) | 369 | static int notify_on_release(const struct cgroup *cgrp) |
| 343 | { | 370 | { |
| 344 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 371 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
| @@ -394,12 +421,7 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
| 394 | ; \ | 421 | ; \ |
| 395 | else | 422 | else |
| 396 | 423 | ||
| 397 | /* the list of cgroups eligible for automatic release. Protected by | ||
| 398 | * release_list_lock */ | ||
| 399 | static LIST_HEAD(release_list); | ||
| 400 | static DEFINE_RAW_SPINLOCK(release_list_lock); | ||
| 401 | static void cgroup_release_agent(struct work_struct *work); | 424 | static void cgroup_release_agent(struct work_struct *work); |
| 402 | static DECLARE_WORK(release_agent_work, cgroup_release_agent); | ||
| 403 | static void check_for_release(struct cgroup *cgrp); | 425 | static void check_for_release(struct cgroup *cgrp); |
| 404 | 426 | ||
| 405 | /* | 427 | /* |
| @@ -498,7 +520,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) | |||
| 498 | return key; | 520 | return key; |
| 499 | } | 521 | } |
| 500 | 522 | ||
| 501 | static void put_css_set_locked(struct css_set *cset, bool taskexit) | 523 | static void put_css_set_locked(struct css_set *cset) |
| 502 | { | 524 | { |
| 503 | struct cgrp_cset_link *link, *tmp_link; | 525 | struct cgrp_cset_link *link, *tmp_link; |
| 504 | struct cgroup_subsys *ss; | 526 | struct cgroup_subsys *ss; |
| @@ -524,11 +546,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) | |||
| 524 | /* @cgrp can't go away while we're holding css_set_rwsem */ | 546 | /* @cgrp can't go away while we're holding css_set_rwsem */ |
| 525 | if (list_empty(&cgrp->cset_links)) { | 547 | if (list_empty(&cgrp->cset_links)) { |
| 526 | cgroup_update_populated(cgrp, false); | 548 | cgroup_update_populated(cgrp, false); |
| 527 | if (notify_on_release(cgrp)) { | 549 | check_for_release(cgrp); |
| 528 | if (taskexit) | ||
| 529 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | ||
| 530 | check_for_release(cgrp); | ||
| 531 | } | ||
| 532 | } | 550 | } |
| 533 | 551 | ||
| 534 | kfree(link); | 552 | kfree(link); |
| @@ -537,7 +555,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) | |||
| 537 | kfree_rcu(cset, rcu_head); | 555 | kfree_rcu(cset, rcu_head); |
| 538 | } | 556 | } |
| 539 | 557 | ||
| 540 | static void put_css_set(struct css_set *cset, bool taskexit) | 558 | static void put_css_set(struct css_set *cset) |
| 541 | { | 559 | { |
| 542 | /* | 560 | /* |
| 543 | * Ensure that the refcount doesn't hit zero while any readers | 561 | * Ensure that the refcount doesn't hit zero while any readers |
| @@ -548,7 +566,7 @@ static void put_css_set(struct css_set *cset, bool taskexit) | |||
| 548 | return; | 566 | return; |
| 549 | 567 | ||
| 550 | down_write(&css_set_rwsem); | 568 | down_write(&css_set_rwsem); |
| 551 | put_css_set_locked(cset, taskexit); | 569 | put_css_set_locked(cset); |
| 552 | up_write(&css_set_rwsem); | 570 | up_write(&css_set_rwsem); |
| 553 | } | 571 | } |
| 554 | 572 | ||
| @@ -969,14 +987,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
| 969 | * knows that the cgroup won't be removed, as cgroup_rmdir() | 987 | * knows that the cgroup won't be removed, as cgroup_rmdir() |
| 970 | * needs that mutex. | 988 | * needs that mutex. |
| 971 | * | 989 | * |
| 972 | * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't | ||
| 973 | * (usually) take cgroup_mutex. These are the two most performance | ||
| 974 | * critical pieces of code here. The exception occurs on cgroup_exit(), | ||
| 975 | * when a task in a notify_on_release cgroup exits. Then cgroup_mutex | ||
| 976 | * is taken, and if the cgroup count is zero, a usermode call made | ||
| 977 | * to the release agent with the name of the cgroup (path relative to | ||
| 978 | * the root of cgroup file system) as the argument. | ||
| 979 | * | ||
| 980 | * A cgroup can only be deleted if both its 'count' of using tasks | 990 | * A cgroup can only be deleted if both its 'count' of using tasks |
| 981 | * is zero, and its list of 'children' cgroups is empty. Since all | 991 | * is zero, and its list of 'children' cgroups is empty. Since all |
| 982 | * tasks in the system use _some_ cgroup, and since there is always at | 992 | * tasks in the system use _some_ cgroup, and since there is always at |
| @@ -1046,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp) | |||
| 1046 | } | 1056 | } |
| 1047 | 1057 | ||
| 1048 | /** | 1058 | /** |
| 1049 | * cgroup_refresh_child_subsys_mask - update child_subsys_mask | 1059 | * cgroup_calc_child_subsys_mask - calculate child_subsys_mask |
| 1050 | * @cgrp: the target cgroup | 1060 | * @cgrp: the target cgroup |
| 1061 | * @subtree_control: the new subtree_control mask to consider | ||
| 1051 | * | 1062 | * |
| 1052 | * On the default hierarchy, a subsystem may request other subsystems to be | 1063 | * On the default hierarchy, a subsystem may request other subsystems to be |
| 1053 | * enabled together through its ->depends_on mask. In such cases, more | 1064 | * enabled together through its ->depends_on mask. In such cases, more |
| 1054 | * subsystems than specified in "cgroup.subtree_control" may be enabled. | 1065 | * subsystems than specified in "cgroup.subtree_control" may be enabled. |
| 1055 | * | 1066 | * |
| 1056 | * This function determines which subsystems need to be enabled given the | 1067 | * This function calculates which subsystems need to be enabled if |
| 1057 | * current @cgrp->subtree_control and records it in | 1068 | * @subtree_control is to be applied to @cgrp. The returned mask is always |
| 1058 | * @cgrp->child_subsys_mask. The resulting mask is always a superset of | 1069 | * a superset of @subtree_control and follows the usual hierarchy rules. |
| 1059 | * @cgrp->subtree_control and follows the usual hierarchy rules. | ||
| 1060 | */ | 1070 | */ |
| 1061 | static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) | 1071 | static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, |
| 1072 | unsigned int subtree_control) | ||
| 1062 | { | 1073 | { |
| 1063 | struct cgroup *parent = cgroup_parent(cgrp); | 1074 | struct cgroup *parent = cgroup_parent(cgrp); |
| 1064 | unsigned int cur_ss_mask = cgrp->subtree_control; | 1075 | unsigned int cur_ss_mask = subtree_control; |
| 1065 | struct cgroup_subsys *ss; | 1076 | struct cgroup_subsys *ss; |
| 1066 | int ssid; | 1077 | int ssid; |
| 1067 | 1078 | ||
| 1068 | lockdep_assert_held(&cgroup_mutex); | 1079 | lockdep_assert_held(&cgroup_mutex); |
| 1069 | 1080 | ||
| 1070 | if (!cgroup_on_dfl(cgrp)) { | 1081 | if (!cgroup_on_dfl(cgrp)) |
| 1071 | cgrp->child_subsys_mask = cur_ss_mask; | 1082 | return cur_ss_mask; |
| 1072 | return; | ||
| 1073 | } | ||
| 1074 | 1083 | ||
| 1075 | while (true) { | 1084 | while (true) { |
| 1076 | unsigned int new_ss_mask = cur_ss_mask; | 1085 | unsigned int new_ss_mask = cur_ss_mask; |
| @@ -1094,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) | |||
| 1094 | cur_ss_mask = new_ss_mask; | 1103 | cur_ss_mask = new_ss_mask; |
| 1095 | } | 1104 | } |
| 1096 | 1105 | ||
| 1097 | cgrp->child_subsys_mask = cur_ss_mask; | 1106 | return cur_ss_mask; |
| 1107 | } | ||
| 1108 | |||
| 1109 | /** | ||
| 1110 | * cgroup_refresh_child_subsys_mask - update child_subsys_mask | ||
| 1111 | * @cgrp: the target cgroup | ||
| 1112 | * | ||
| 1113 | * Update @cgrp->child_subsys_mask according to the current | ||
| 1114 | * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). | ||
| 1115 | */ | ||
| 1116 | static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) | ||
| 1117 | { | ||
| 1118 | cgrp->child_subsys_mask = | ||
| 1119 | cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); | ||
| 1098 | } | 1120 | } |
| 1099 | 1121 | ||
| 1100 | /** | 1122 | /** |
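The dependency closure computed by cgroup_calc_child_subsys_mask() can be modelled outside the kernel as a simple fixed-point loop; a standalone sketch in plain C, where an array of masks stands in for the subsystem list:

```c
/* Keep OR-ing in ->depends_on of every enabled subsystem until the mask
 * stops growing; the result is always a superset of subtree_control. */
static unsigned int calc_child_subsys_mask(unsigned int subtree_control,
					   const unsigned int depends_on[],
					   unsigned int nr_subsys)
{
	unsigned int cur = subtree_control;

	for (;;) {
		unsigned int next = cur;
		unsigned int ssid;

		for (ssid = 0; ssid < nr_subsys; ssid++)
			if (cur & (1u << ssid))
				next |= depends_on[ssid];

		if (next == cur)
			return cur;	/* closure reached */
		cur = next;
	}
}
```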
| @@ -1587,7 +1609,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
| 1587 | INIT_LIST_HEAD(&cgrp->self.sibling); | 1609 | INIT_LIST_HEAD(&cgrp->self.sibling); |
| 1588 | INIT_LIST_HEAD(&cgrp->self.children); | 1610 | INIT_LIST_HEAD(&cgrp->self.children); |
| 1589 | INIT_LIST_HEAD(&cgrp->cset_links); | 1611 | INIT_LIST_HEAD(&cgrp->cset_links); |
| 1590 | INIT_LIST_HEAD(&cgrp->release_list); | ||
| 1591 | INIT_LIST_HEAD(&cgrp->pidlists); | 1612 | INIT_LIST_HEAD(&cgrp->pidlists); |
| 1592 | mutex_init(&cgrp->pidlist_mutex); | 1613 | mutex_init(&cgrp->pidlist_mutex); |
| 1593 | cgrp->self.cgroup = cgrp; | 1614 | cgrp->self.cgroup = cgrp; |
| @@ -1597,6 +1618,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
| 1597 | INIT_LIST_HEAD(&cgrp->e_csets[ssid]); | 1618 | INIT_LIST_HEAD(&cgrp->e_csets[ssid]); |
| 1598 | 1619 | ||
| 1599 | init_waitqueue_head(&cgrp->offline_waitq); | 1620 | init_waitqueue_head(&cgrp->offline_waitq); |
| 1621 | INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); | ||
| 1600 | } | 1622 | } |
| 1601 | 1623 | ||
| 1602 | static void init_cgroup_root(struct cgroup_root *root, | 1624 | static void init_cgroup_root(struct cgroup_root *root, |
| @@ -1634,7 +1656,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) | |||
| 1634 | goto out; | 1656 | goto out; |
| 1635 | root_cgrp->id = ret; | 1657 | root_cgrp->id = ret; |
| 1636 | 1658 | ||
| 1637 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); | 1659 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, |
| 1660 | GFP_KERNEL); | ||
| 1638 | if (ret) | 1661 | if (ret) |
| 1639 | goto out; | 1662 | goto out; |
| 1640 | 1663 | ||
| @@ -2052,8 +2075,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, | |||
| 2052 | * task. As trading it for new_cset is protected by cgroup_mutex, | 2075 | * task. As trading it for new_cset is protected by cgroup_mutex, |
| 2053 | * we're safe to drop it here; it will be freed under RCU. | 2076 | * we're safe to drop it here; it will be freed under RCU. |
| 2054 | */ | 2077 | */ |
| 2055 | set_bit(CGRP_RELEASABLE, &old_cgrp->flags); | 2078 | put_css_set_locked(old_cset); |
| 2056 | put_css_set_locked(old_cset, false); | ||
| 2057 | } | 2079 | } |
| 2058 | 2080 | ||
| 2059 | /** | 2081 | /** |
| @@ -2074,7 +2096,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) | |||
| 2074 | cset->mg_src_cgrp = NULL; | 2096 | cset->mg_src_cgrp = NULL; |
| 2075 | cset->mg_dst_cset = NULL; | 2097 | cset->mg_dst_cset = NULL; |
| 2076 | list_del_init(&cset->mg_preload_node); | 2098 | list_del_init(&cset->mg_preload_node); |
| 2077 | put_css_set_locked(cset, false); | 2099 | put_css_set_locked(cset); |
| 2078 | } | 2100 | } |
| 2079 | up_write(&css_set_rwsem); | 2101 | up_write(&css_set_rwsem); |
| 2080 | } | 2102 | } |
| @@ -2168,8 +2190,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, | |||
| 2168 | if (src_cset == dst_cset) { | 2190 | if (src_cset == dst_cset) { |
| 2169 | src_cset->mg_src_cgrp = NULL; | 2191 | src_cset->mg_src_cgrp = NULL; |
| 2170 | list_del_init(&src_cset->mg_preload_node); | 2192 | list_del_init(&src_cset->mg_preload_node); |
| 2171 | put_css_set(src_cset, false); | 2193 | put_css_set(src_cset); |
| 2172 | put_css_set(dst_cset, false); | 2194 | put_css_set(dst_cset); |
| 2173 | continue; | 2195 | continue; |
| 2174 | } | 2196 | } |
| 2175 | 2197 | ||
| @@ -2178,7 +2200,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, | |||
| 2178 | if (list_empty(&dst_cset->mg_preload_node)) | 2200 | if (list_empty(&dst_cset->mg_preload_node)) |
| 2179 | list_add(&dst_cset->mg_preload_node, &csets); | 2201 | list_add(&dst_cset->mg_preload_node, &csets); |
| 2180 | else | 2202 | else |
| 2181 | put_css_set(dst_cset, false); | 2203 | put_css_set(dst_cset); |
| 2182 | } | 2204 | } |
| 2183 | 2205 | ||
| 2184 | list_splice_tail(&csets, preloaded_csets); | 2206 | list_splice_tail(&csets, preloaded_csets); |
| @@ -2668,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
| 2668 | loff_t off) | 2690 | loff_t off) |
| 2669 | { | 2691 | { |
| 2670 | unsigned int enable = 0, disable = 0; | 2692 | unsigned int enable = 0, disable = 0; |
| 2671 | unsigned int css_enable, css_disable, old_ctrl, new_ctrl; | 2693 | unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; |
| 2672 | struct cgroup *cgrp, *child; | 2694 | struct cgroup *cgrp, *child; |
| 2673 | struct cgroup_subsys *ss; | 2695 | struct cgroup_subsys *ss; |
| 2674 | char *tok; | 2696 | char *tok; |
| @@ -2720,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
| 2720 | ret = -ENOENT; | 2742 | ret = -ENOENT; |
| 2721 | goto out_unlock; | 2743 | goto out_unlock; |
| 2722 | } | 2744 | } |
| 2723 | |||
| 2724 | /* | ||
| 2725 | * @ss is already enabled through dependency and | ||
| 2726 | * we'll just make it visible. Skip draining. | ||
| 2727 | */ | ||
| 2728 | if (cgrp->child_subsys_mask & (1 << ssid)) | ||
| 2729 | continue; | ||
| 2730 | |||
| 2731 | /* | ||
| 2732 | * Because css offlining is asynchronous, userland | ||
| 2733 | * might try to re-enable the same controller while | ||
| 2734 | * the previous instance is still around. In such | ||
| 2735 | * cases, wait till it's gone using offline_waitq. | ||
| 2736 | */ | ||
| 2737 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2738 | DEFINE_WAIT(wait); | ||
| 2739 | |||
| 2740 | if (!cgroup_css(child, ss)) | ||
| 2741 | continue; | ||
| 2742 | |||
| 2743 | cgroup_get(child); | ||
| 2744 | prepare_to_wait(&child->offline_waitq, &wait, | ||
| 2745 | TASK_UNINTERRUPTIBLE); | ||
| 2746 | cgroup_kn_unlock(of->kn); | ||
| 2747 | schedule(); | ||
| 2748 | finish_wait(&child->offline_waitq, &wait); | ||
| 2749 | cgroup_put(child); | ||
| 2750 | |||
| 2751 | return restart_syscall(); | ||
| 2752 | } | ||
| 2753 | } else if (disable & (1 << ssid)) { | 2745 | } else if (disable & (1 << ssid)) { |
| 2754 | if (!(cgrp->subtree_control & (1 << ssid))) { | 2746 | if (!(cgrp->subtree_control & (1 << ssid))) { |
| 2755 | disable &= ~(1 << ssid); | 2747 | disable &= ~(1 << ssid); |
| @@ -2785,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
| 2785 | * subsystems than specified may need to be enabled or disabled | 2777 | * subsystems than specified may need to be enabled or disabled |
| 2786 | * depending on subsystem dependencies. | 2778 | * depending on subsystem dependencies. |
| 2787 | */ | 2779 | */ |
| 2788 | cgrp->subtree_control |= enable; | 2780 | old_sc = cgrp->subtree_control; |
| 2789 | cgrp->subtree_control &= ~disable; | 2781 | old_ss = cgrp->child_subsys_mask; |
| 2790 | 2782 | new_sc = (old_sc | enable) & ~disable; | |
| 2791 | old_ctrl = cgrp->child_subsys_mask; | 2783 | new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); |
| 2792 | cgroup_refresh_child_subsys_mask(cgrp); | ||
| 2793 | new_ctrl = cgrp->child_subsys_mask; | ||
| 2794 | 2784 | ||
| 2795 | css_enable = ~old_ctrl & new_ctrl; | 2785 | css_enable = ~old_ss & new_ss; |
| 2796 | css_disable = old_ctrl & ~new_ctrl; | 2786 | css_disable = old_ss & ~new_ss; |
| 2797 | enable |= css_enable; | 2787 | enable |= css_enable; |
| 2798 | disable |= css_disable; | 2788 | disable |= css_disable; |
| 2799 | 2789 | ||
| 2800 | /* | 2790 | /* |
| 2791 | * Because css offlining is asynchronous, userland might try to | ||
| 2792 | * re-enable the same controller while the previous instance is | ||
| 2793 | * still around. In such cases, wait till it's gone using | ||
| 2794 | * offline_waitq. | ||
| 2795 | */ | ||
| 2796 | for_each_subsys(ss, ssid) { | ||
| 2797 | if (!(css_enable & (1 << ssid))) | ||
| 2798 | continue; | ||
| 2799 | |||
| 2800 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2801 | DEFINE_WAIT(wait); | ||
| 2802 | |||
| 2803 | if (!cgroup_css(child, ss)) | ||
| 2804 | continue; | ||
| 2805 | |||
| 2806 | cgroup_get(child); | ||
| 2807 | prepare_to_wait(&child->offline_waitq, &wait, | ||
| 2808 | TASK_UNINTERRUPTIBLE); | ||
| 2809 | cgroup_kn_unlock(of->kn); | ||
| 2810 | schedule(); | ||
| 2811 | finish_wait(&child->offline_waitq, &wait); | ||
| 2812 | cgroup_put(child); | ||
| 2813 | |||
| 2814 | return restart_syscall(); | ||
| 2815 | } | ||
| 2816 | } | ||
| 2817 | |||
| 2818 | cgrp->subtree_control = new_sc; | ||
| 2819 | cgrp->child_subsys_mask = new_ss; | ||
| 2820 | |||
| 2821 | /* | ||
| 2801 | * Create new csses or make the existing ones visible. A css is | 2822 | * Create new csses or make the existing ones visible. A css is |
| 2802 | * created invisible if it's being implicitly enabled through | 2823 | * created invisible if it's being implicitly enabled through |
| 2803 | * dependency. An invisible css is made visible when the userland | 2824 | * dependency. An invisible css is made visible when the userland |
| @@ -2852,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
| 2852 | } | 2873 | } |
| 2853 | } | 2874 | } |
| 2854 | 2875 | ||
| 2876 | /* | ||
| 2877 | * The effective csses of all the descendants (excluding @cgrp) may | ||
| 2878 | * have changed. Subsystems can optionally subscribe to this event | ||
| 2879 | * by implementing ->css_e_css_changed() which is invoked if any of | ||
| 2880 | * the effective csses seen from the css's cgroup may have changed. | ||
| 2881 | */ | ||
| 2882 | for_each_subsys(ss, ssid) { | ||
| 2883 | struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); | ||
| 2884 | struct cgroup_subsys_state *css; | ||
| 2885 | |||
| 2886 | if (!ss->css_e_css_changed || !this_css) | ||
| 2887 | continue; | ||
| 2888 | |||
| 2889 | css_for_each_descendant_pre(css, this_css) | ||
| 2890 | if (css != this_css) | ||
| 2891 | ss->css_e_css_changed(css); | ||
| 2892 | } | ||
| 2893 | |||
| 2855 | kernfs_activate(cgrp->kn); | 2894 | kernfs_activate(cgrp->kn); |
| 2856 | ret = 0; | 2895 | ret = 0; |
| 2857 | out_unlock: | 2896 | out_unlock: |
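A controller that wants the new ->css_e_css_changed notification would hook it roughly as below; the subsystem and its names are hypothetical, only the callback itself comes from this patch.

```c
#include <linux/cgroup.h>

/* Invoked when an effective css visible from css->cgroup may have changed;
 * a real controller would invalidate whatever it caches about them. */
static void example_css_e_css_changed(struct cgroup_subsys_state *css)
{
	pr_debug("effective css changed under cgroup %d\n", css->cgroup->id);
}

struct cgroup_subsys example_cgrp_subsys = {
	.css_e_css_changed	= example_css_e_css_changed,
	/* .css_alloc, .css_free and the rest elided */
};
```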
| @@ -2859,9 +2898,8 @@ out_unlock: | |||
| 2859 | return ret ?: nbytes; | 2898 | return ret ?: nbytes; |
| 2860 | 2899 | ||
| 2861 | err_undo_css: | 2900 | err_undo_css: |
| 2862 | cgrp->subtree_control &= ~enable; | 2901 | cgrp->subtree_control = old_sc; |
| 2863 | cgrp->subtree_control |= disable; | 2902 | cgrp->child_subsys_mask = old_ss; |
| 2864 | cgroup_refresh_child_subsys_mask(cgrp); | ||
| 2865 | 2903 | ||
| 2866 | for_each_subsys(ss, ssid) { | 2904 | for_each_subsys(ss, ssid) { |
| 2867 | if (!(enable & (1 << ssid))) | 2905 | if (!(enable & (1 << ssid))) |
| @@ -4173,7 +4211,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | |||
| 4173 | static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, | 4211 | static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, |
| 4174 | struct cftype *cft, u64 val) | 4212 | struct cftype *cft, u64 val) |
| 4175 | { | 4213 | { |
| 4176 | clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); | ||
| 4177 | if (val) | 4214 | if (val) |
| 4178 | set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); | 4215 | set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); |
| 4179 | else | 4216 | else |
| @@ -4351,6 +4388,7 @@ static void css_free_work_fn(struct work_struct *work) | |||
| 4351 | /* cgroup free path */ | 4388 | /* cgroup free path */ |
| 4352 | atomic_dec(&cgrp->root->nr_cgrps); | 4389 | atomic_dec(&cgrp->root->nr_cgrps); |
| 4353 | cgroup_pidlist_destroy_all(cgrp); | 4390 | cgroup_pidlist_destroy_all(cgrp); |
| 4391 | cancel_work_sync(&cgrp->release_agent_work); | ||
| 4354 | 4392 | ||
| 4355 | if (cgroup_parent(cgrp)) { | 4393 | if (cgroup_parent(cgrp)) { |
| 4356 | /* | 4394 | /* |
| @@ -4397,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work) | |||
| 4397 | if (ss) { | 4435 | if (ss) { |
| 4398 | /* css release path */ | 4436 | /* css release path */ |
| 4399 | cgroup_idr_remove(&ss->css_idr, css->id); | 4437 | cgroup_idr_remove(&ss->css_idr, css->id); |
| 4438 | if (ss->css_released) | ||
| 4439 | ss->css_released(css); | ||
| 4400 | } else { | 4440 | } else { |
| 4401 | /* cgroup release path */ | 4441 | /* cgroup release path */ |
| 4402 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | 4442 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
| @@ -4510,7 +4550,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | |||
| 4510 | 4550 | ||
| 4511 | init_and_link_css(css, ss, cgrp); | 4551 | init_and_link_css(css, ss, cgrp); |
| 4512 | 4552 | ||
| 4513 | err = percpu_ref_init(&css->refcnt, css_release); | 4553 | err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); |
| 4514 | if (err) | 4554 | if (err) |
| 4515 | goto err_free_css; | 4555 | goto err_free_css; |
| 4516 | 4556 | ||
| @@ -4583,7 +4623,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
| 4583 | goto out_unlock; | 4623 | goto out_unlock; |
| 4584 | } | 4624 | } |
| 4585 | 4625 | ||
| 4586 | ret = percpu_ref_init(&cgrp->self.refcnt, css_release); | 4626 | ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); |
| 4587 | if (ret) | 4627 | if (ret) |
| 4588 | goto out_free_cgrp; | 4628 | goto out_free_cgrp; |
| 4589 | 4629 | ||
| @@ -4813,19 +4853,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 4813 | for_each_css(css, ssid, cgrp) | 4853 | for_each_css(css, ssid, cgrp) |
| 4814 | kill_css(css); | 4854 | kill_css(css); |
| 4815 | 4855 | ||
| 4816 | /* CSS_ONLINE is clear, remove from ->release_list for the last time */ | ||
| 4817 | raw_spin_lock(&release_list_lock); | ||
| 4818 | if (!list_empty(&cgrp->release_list)) | ||
| 4819 | list_del_init(&cgrp->release_list); | ||
| 4820 | raw_spin_unlock(&release_list_lock); | ||
| 4821 | |||
| 4822 | /* | 4856 | /* |
| 4823 | * Remove @cgrp directory along with the base files. @cgrp has an | 4857 | * Remove @cgrp directory along with the base files. @cgrp has an |
| 4824 | * extra ref on its kn. | 4858 | * extra ref on its kn. |
| 4825 | */ | 4859 | */ |
| 4826 | kernfs_remove(cgrp->kn); | 4860 | kernfs_remove(cgrp->kn); |
| 4827 | 4861 | ||
| 4828 | set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); | ||
| 4829 | check_for_release(cgroup_parent(cgrp)); | 4862 | check_for_release(cgroup_parent(cgrp)); |
| 4830 | 4863 | ||
| 4831 | /* put the base reference */ | 4864 | /* put the base reference */ |
| @@ -4842,13 +4875,10 @@ static int cgroup_rmdir(struct kernfs_node *kn) | |||
| 4842 | cgrp = cgroup_kn_lock_live(kn); | 4875 | cgrp = cgroup_kn_lock_live(kn); |
| 4843 | if (!cgrp) | 4876 | if (!cgrp) |
| 4844 | return 0; | 4877 | return 0; |
| 4845 | cgroup_get(cgrp); /* for @kn->priv clearing */ | ||
| 4846 | 4878 | ||
| 4847 | ret = cgroup_destroy_locked(cgrp); | 4879 | ret = cgroup_destroy_locked(cgrp); |
| 4848 | 4880 | ||
| 4849 | cgroup_kn_unlock(kn); | 4881 | cgroup_kn_unlock(kn); |
| 4850 | |||
| 4851 | cgroup_put(cgrp); | ||
| 4852 | return ret; | 4882 | return ret; |
| 4853 | } | 4883 | } |
| 4854 | 4884 | ||
| @@ -5052,12 +5082,9 @@ core_initcall(cgroup_wq_init); | |||
| 5052 | * - Print task's cgroup paths into seq_file, one line for each hierarchy | 5082 | * - Print task's cgroup paths into seq_file, one line for each hierarchy |
| 5053 | * - Used for /proc/<pid>/cgroup. | 5083 | * - Used for /proc/<pid>/cgroup. |
| 5054 | */ | 5084 | */ |
| 5055 | 5085 | int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |
| 5056 | /* TODO: Use a proper seq_file iterator */ | 5086 | struct pid *pid, struct task_struct *tsk) |
| 5057 | int proc_cgroup_show(struct seq_file *m, void *v) | ||
| 5058 | { | 5087 | { |
| 5059 | struct pid *pid; | ||
| 5060 | struct task_struct *tsk; | ||
| 5061 | char *buf, *path; | 5088 | char *buf, *path; |
| 5062 | int retval; | 5089 | int retval; |
| 5063 | struct cgroup_root *root; | 5090 | struct cgroup_root *root; |
| @@ -5067,14 +5094,6 @@ int proc_cgroup_show(struct seq_file *m, void *v) | |||
| 5067 | if (!buf) | 5094 | if (!buf) |
| 5068 | goto out; | 5095 | goto out; |
| 5069 | 5096 | ||
| 5070 | retval = -ESRCH; | ||
| 5071 | pid = m->private; | ||
| 5072 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
| 5073 | if (!tsk) | ||
| 5074 | goto out_free; | ||
| 5075 | |||
| 5076 | retval = 0; | ||
| 5077 | |||
| 5078 | mutex_lock(&cgroup_mutex); | 5097 | mutex_lock(&cgroup_mutex); |
| 5079 | down_read(&css_set_rwsem); | 5098 | down_read(&css_set_rwsem); |
| 5080 | 5099 | ||
| @@ -5104,11 +5123,10 @@ int proc_cgroup_show(struct seq_file *m, void *v) | |||
| 5104 | seq_putc(m, '\n'); | 5123 | seq_putc(m, '\n'); |
| 5105 | } | 5124 | } |
| 5106 | 5125 | ||
| 5126 | retval = 0; | ||
| 5107 | out_unlock: | 5127 | out_unlock: |
| 5108 | up_read(&css_set_rwsem); | 5128 | up_read(&css_set_rwsem); |
| 5109 | mutex_unlock(&cgroup_mutex); | 5129 | mutex_unlock(&cgroup_mutex); |
| 5110 | put_task_struct(tsk); | ||
| 5111 | out_free: | ||
| 5112 | kfree(buf); | 5130 | kfree(buf); |
| 5113 | out: | 5131 | out: |
| 5114 | return retval; | 5132 | return retval; |
| @@ -5179,7 +5197,7 @@ void cgroup_post_fork(struct task_struct *child) | |||
| 5179 | int i; | 5197 | int i; |
| 5180 | 5198 | ||
| 5181 | /* | 5199 | /* |
| 5182 | * This may race against cgroup_enable_task_cg_links(). As that | 5200 | * This may race against cgroup_enable_task_cg_lists(). As that |
| 5183 | * function sets use_task_css_set_links before grabbing | 5201 | * function sets use_task_css_set_links before grabbing |
| 5184 | * tasklist_lock and we just went through tasklist_lock to add | 5202 | * tasklist_lock and we just went through tasklist_lock to add |
| 5185 | * @child, it's guaranteed that either we see the set | 5203 | * @child, it's guaranteed that either we see the set |
| @@ -5194,7 +5212,7 @@ void cgroup_post_fork(struct task_struct *child) | |||
| 5194 | * when implementing operations which need to migrate all tasks of | 5212 | * when implementing operations which need to migrate all tasks of |
| 5195 | * a cgroup to another. | 5213 | * a cgroup to another. |
| 5196 | * | 5214 | * |
| 5197 | * Note that if we lose to cgroup_enable_task_cg_links(), @child | 5215 | * Note that if we lose to cgroup_enable_task_cg_lists(), @child |
| 5198 | * will remain in init_css_set. This is safe because all tasks are | 5216 | * will remain in init_css_set. This is safe because all tasks are |
| 5199 | * in the init_css_set before cg_links is enabled and there's no | 5217 | * in the init_css_set before cg_links is enabled and there's no |
| 5200 | * operation which transfers all tasks out of init_css_set. | 5218 | * operation which transfers all tasks out of init_css_set. |
| @@ -5278,30 +5296,14 @@ void cgroup_exit(struct task_struct *tsk) | |||
| 5278 | } | 5296 | } |
| 5279 | 5297 | ||
| 5280 | if (put_cset) | 5298 | if (put_cset) |
| 5281 | put_css_set(cset, true); | 5299 | put_css_set(cset); |
| 5282 | } | 5300 | } |
| 5283 | 5301 | ||
| 5284 | static void check_for_release(struct cgroup *cgrp) | 5302 | static void check_for_release(struct cgroup *cgrp) |
| 5285 | { | 5303 | { |
| 5286 | if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && | 5304 | if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && |
| 5287 | !css_has_online_children(&cgrp->self)) { | 5305 | !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) |
| 5288 | /* | 5306 | schedule_work(&cgrp->release_agent_work); |
| 5289 | * Control Group is currently removeable. If it's not | ||
| 5290 | * already queued for a userspace notification, queue | ||
| 5291 | * it now | ||
| 5292 | */ | ||
| 5293 | int need_schedule_work = 0; | ||
| 5294 | |||
| 5295 | raw_spin_lock(&release_list_lock); | ||
| 5296 | if (!cgroup_is_dead(cgrp) && | ||
| 5297 | list_empty(&cgrp->release_list)) { | ||
| 5298 | list_add(&cgrp->release_list, &release_list); | ||
| 5299 | need_schedule_work = 1; | ||
| 5300 | } | ||
| 5301 | raw_spin_unlock(&release_list_lock); | ||
| 5302 | if (need_schedule_work) | ||
| 5303 | schedule_work(&release_agent_work); | ||
| 5304 | } | ||
| 5305 | } | 5307 | } |
| 5306 | 5308 | ||
| 5307 | /* | 5309 | /* |
| @@ -5329,52 +5331,39 @@ static void check_for_release(struct cgroup *cgrp) | |||
| 5329 | */ | 5331 | */ |
| 5330 | static void cgroup_release_agent(struct work_struct *work) | 5332 | static void cgroup_release_agent(struct work_struct *work) |
| 5331 | { | 5333 | { |
| 5332 | BUG_ON(work != &release_agent_work); | 5334 | struct cgroup *cgrp = |
| 5335 | container_of(work, struct cgroup, release_agent_work); | ||
| 5336 | char *pathbuf = NULL, *agentbuf = NULL, *path; | ||
| 5337 | char *argv[3], *envp[3]; | ||
| 5338 | |||
| 5333 | mutex_lock(&cgroup_mutex); | 5339 | mutex_lock(&cgroup_mutex); |
| 5334 | raw_spin_lock(&release_list_lock); | 5340 | |
| 5335 | while (!list_empty(&release_list)) { | 5341 | pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); |
| 5336 | char *argv[3], *envp[3]; | 5342 | agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); |
| 5337 | int i; | 5343 | if (!pathbuf || !agentbuf) |
| 5338 | char *pathbuf = NULL, *agentbuf = NULL, *path; | 5344 | goto out; |
| 5339 | struct cgroup *cgrp = list_entry(release_list.next, | 5345 | |
| 5340 | struct cgroup, | 5346 | path = cgroup_path(cgrp, pathbuf, PATH_MAX); |
| 5341 | release_list); | 5347 | if (!path) |
| 5342 | list_del_init(&cgrp->release_list); | 5348 | goto out; |
| 5343 | raw_spin_unlock(&release_list_lock); | 5349 | |
| 5344 | pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); | 5350 | argv[0] = agentbuf; |
| 5345 | if (!pathbuf) | 5351 | argv[1] = path; |
| 5346 | goto continue_free; | 5352 | argv[2] = NULL; |
| 5347 | path = cgroup_path(cgrp, pathbuf, PATH_MAX); | 5353 | |
| 5348 | if (!path) | 5354 | /* minimal command environment */ |
| 5349 | goto continue_free; | 5355 | envp[0] = "HOME=/"; |
| 5350 | agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); | 5356 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; |
| 5351 | if (!agentbuf) | 5357 | envp[2] = NULL; |
| 5352 | goto continue_free; | 5358 | |
| 5353 | |||
| 5354 | i = 0; | ||
| 5355 | argv[i++] = agentbuf; | ||
| 5356 | argv[i++] = path; | ||
| 5357 | argv[i] = NULL; | ||
| 5358 | |||
| 5359 | i = 0; | ||
| 5360 | /* minimal command environment */ | ||
| 5361 | envp[i++] = "HOME=/"; | ||
| 5362 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
| 5363 | envp[i] = NULL; | ||
| 5364 | |||
| 5365 | /* Drop the lock while we invoke the usermode helper, | ||
| 5366 | * since the exec could involve hitting disk and hence | ||
| 5367 | * be a slow process */ | ||
| 5368 | mutex_unlock(&cgroup_mutex); | ||
| 5369 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | ||
| 5370 | mutex_lock(&cgroup_mutex); | ||
| 5371 | continue_free: | ||
| 5372 | kfree(pathbuf); | ||
| 5373 | kfree(agentbuf); | ||
| 5374 | raw_spin_lock(&release_list_lock); | ||
| 5375 | } | ||
| 5376 | raw_spin_unlock(&release_list_lock); | ||
| 5377 | mutex_unlock(&cgroup_mutex); | 5359 | mutex_unlock(&cgroup_mutex); |
| 5360 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | ||
| 5361 | goto out_free; | ||
| 5362 | out: | ||
| 5363 | mutex_unlock(&cgroup_mutex); | ||
| 5364 | out_free: | ||
| 5365 | kfree(agentbuf); | ||
| 5366 | kfree(pathbuf); | ||
| 5378 | } | 5367 | } |
| 5379 | 5368 | ||
| 5380 | static int __init cgroup_disable(char *str) | 5369 | static int __init cgroup_disable(char *str) |
| @@ -5562,7 +5551,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) | |||
| 5562 | 5551 | ||
| 5563 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) | 5552 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) |
| 5564 | { | 5553 | { |
| 5565 | return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); | 5554 | return (!cgroup_has_tasks(css->cgroup) && |
| 5555 | !css_has_online_children(&css->cgroup->self)); | ||
| 5566 | } | 5556 | } |
| 5567 | 5557 | ||
| 5568 | static struct cftype debug_files[] = { | 5558 | static struct cftype debug_files[] = { |
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config new file mode 100644 index 000000000000..c2de56ab0fce --- /dev/null +++ b/kernel/configs/tiny.config | |||
| @@ -0,0 +1,4 @@ | |||
| 1 | CONFIG_CC_OPTIMIZE_FOR_SIZE=y | ||
| 2 | CONFIG_KERNEL_XZ=y | ||
| 3 | CONFIG_OPTIMIZE_INLINING=y | ||
| 4 | CONFIG_SLOB=y | ||
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5664985c46a0..937ecdfdf258 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
| @@ -107,46 +107,6 @@ void context_tracking_user_enter(void) | |||
| 107 | } | 107 | } |
| 108 | NOKPROBE_SYMBOL(context_tracking_user_enter); | 108 | NOKPROBE_SYMBOL(context_tracking_user_enter); |
| 109 | 109 | ||
| 110 | #ifdef CONFIG_PREEMPT | ||
| 111 | /** | ||
| 112 | * preempt_schedule_context - preempt_schedule called by tracing | ||
| 113 | * | ||
| 114 | * The tracing infrastructure uses preempt_enable_notrace to prevent | ||
| 115 | * recursion and tracing preempt enabling caused by the tracing | ||
| 116 | * infrastructure itself. But as tracing can happen in areas coming | ||
| 117 | * from userspace or just about to enter userspace, a preempt enable | ||
| 118 | * can occur before user_exit() is called. This will cause the scheduler | ||
| 119 | * to be called when the system is still in usermode. | ||
| 120 | * | ||
| 121 | * To prevent this, the preempt_enable_notrace will use this function | ||
| 122 | * instead of preempt_schedule() to exit user context if needed before | ||
| 123 | * calling the scheduler. | ||
| 124 | */ | ||
| 125 | asmlinkage __visible void __sched notrace preempt_schedule_context(void) | ||
| 126 | { | ||
| 127 | enum ctx_state prev_ctx; | ||
| 128 | |||
| 129 | if (likely(!preemptible())) | ||
| 130 | return; | ||
| 131 | |||
| 132 | /* | ||
| 133 | * Need to disable preemption in case user_exit() is traced | ||
| 134 | * and the tracer calls preempt_enable_notrace() causing | ||
| 135 | * an infinite recursion. | ||
| 136 | */ | ||
| 137 | preempt_disable_notrace(); | ||
| 138 | prev_ctx = exception_enter(); | ||
| 139 | preempt_enable_no_resched_notrace(); | ||
| 140 | |||
| 141 | preempt_schedule(); | ||
| 142 | |||
| 143 | preempt_disable_notrace(); | ||
| 144 | exception_exit(prev_ctx); | ||
| 145 | preempt_enable_notrace(); | ||
| 146 | } | ||
| 147 | EXPORT_SYMBOL_GPL(preempt_schedule_context); | ||
| 148 | #endif /* CONFIG_PREEMPT */ | ||
| 149 | |||
| 150 | /** | 110 | /** |
| 151 | * context_tracking_user_exit - Inform the context tracking that the CPU is | 111 | * context_tracking_user_exit - Inform the context tracking that the CPU is |
| 152 | * exiting userspace mode and entering the kernel. | 112 | * exiting userspace mode and entering the kernel. |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 81e2a388a0f6..5d220234b3ca 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -64,6 +64,8 @@ static struct { | |||
| 64 | * an ongoing cpu hotplug operation. | 64 | * an ongoing cpu hotplug operation. |
| 65 | */ | 65 | */ |
| 66 | int refcount; | 66 | int refcount; |
| 67 | /* And allows lockless put_online_cpus(). */ | ||
| 68 | atomic_t puts_pending; | ||
| 67 | 69 | ||
| 68 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 70 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 69 | struct lockdep_map dep_map; | 71 | struct lockdep_map dep_map; |
| @@ -79,9 +81,21 @@ static struct { | |||
| 79 | 81 | ||
| 80 | /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ | 82 | /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ |
| 81 | #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) | 83 | #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) |
| 84 | #define cpuhp_lock_acquire_tryread() \ | ||
| 85 | lock_map_acquire_tryread(&cpu_hotplug.dep_map) | ||
| 82 | #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) | 86 | #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) |
| 83 | #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) | 87 | #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) |
| 84 | 88 | ||
| 89 | static void apply_puts_pending(int max) | ||
| 90 | { | ||
| 91 | int delta; | ||
| 92 | |||
| 93 | if (atomic_read(&cpu_hotplug.puts_pending) >= max) { | ||
| 94 | delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); | ||
| 95 | cpu_hotplug.refcount -= delta; | ||
| 96 | } | ||
| 97 | } | ||
| 98 | |||
| 85 | void get_online_cpus(void) | 99 | void get_online_cpus(void) |
| 86 | { | 100 | { |
| 87 | might_sleep(); | 101 | might_sleep(); |
| @@ -89,17 +103,35 @@ void get_online_cpus(void) | |||
| 89 | return; | 103 | return; |
| 90 | cpuhp_lock_acquire_read(); | 104 | cpuhp_lock_acquire_read(); |
| 91 | mutex_lock(&cpu_hotplug.lock); | 105 | mutex_lock(&cpu_hotplug.lock); |
| 106 | apply_puts_pending(65536); | ||
| 92 | cpu_hotplug.refcount++; | 107 | cpu_hotplug.refcount++; |
| 93 | mutex_unlock(&cpu_hotplug.lock); | 108 | mutex_unlock(&cpu_hotplug.lock); |
| 94 | |||
| 95 | } | 109 | } |
| 96 | EXPORT_SYMBOL_GPL(get_online_cpus); | 110 | EXPORT_SYMBOL_GPL(get_online_cpus); |
| 97 | 111 | ||
| 112 | bool try_get_online_cpus(void) | ||
| 113 | { | ||
| 114 | if (cpu_hotplug.active_writer == current) | ||
| 115 | return true; | ||
| 116 | if (!mutex_trylock(&cpu_hotplug.lock)) | ||
| 117 | return false; | ||
| 118 | cpuhp_lock_acquire_tryread(); | ||
| 119 | apply_puts_pending(65536); | ||
| 120 | cpu_hotplug.refcount++; | ||
| 121 | mutex_unlock(&cpu_hotplug.lock); | ||
| 122 | return true; | ||
| 123 | } | ||
| 124 | EXPORT_SYMBOL_GPL(try_get_online_cpus); | ||
| 125 | |||
| 98 | void put_online_cpus(void) | 126 | void put_online_cpus(void) |
| 99 | { | 127 | { |
| 100 | if (cpu_hotplug.active_writer == current) | 128 | if (cpu_hotplug.active_writer == current) |
| 101 | return; | 129 | return; |
| 102 | mutex_lock(&cpu_hotplug.lock); | 130 | if (!mutex_trylock(&cpu_hotplug.lock)) { |
| 131 | atomic_inc(&cpu_hotplug.puts_pending); | ||
| 132 | cpuhp_lock_release(); | ||
| 133 | return; | ||
| 134 | } | ||
| 103 | 135 | ||
| 104 | if (WARN_ON(!cpu_hotplug.refcount)) | 136 | if (WARN_ON(!cpu_hotplug.refcount)) |
| 105 | cpu_hotplug.refcount++; /* try to fix things up */ | 137 | cpu_hotplug.refcount++; /* try to fix things up */ |
| @@ -141,6 +173,7 @@ void cpu_hotplug_begin(void) | |||
| 141 | cpuhp_lock_acquire(); | 173 | cpuhp_lock_acquire(); |
| 142 | for (;;) { | 174 | for (;;) { |
| 143 | mutex_lock(&cpu_hotplug.lock); | 175 | mutex_lock(&cpu_hotplug.lock); |
| 176 | apply_puts_pending(1); | ||
| 144 | if (likely(!cpu_hotplug.refcount)) | 177 | if (likely(!cpu_hotplug.refcount)) |
| 145 | break; | 178 | break; |
| 146 | __set_current_state(TASK_UNINTERRUPTIBLE); | 179 | __set_current_state(TASK_UNINTERRUPTIBLE); |
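Callers that cannot block on the hotplug mutex get a new escape hatch with try_get_online_cpus(); a hedged sketch of the intended calling pattern (the function shown is illustrative):

```c
#include <linux/cpu.h>

/* Bail out instead of sleeping when a hotplug writer holds the lock. */
static void poke_online_cpus(void)
{
	int cpu;

	if (!try_get_online_cpus())
		return;			/* hotplug in progress, retry later */

	for_each_online_cpu(cpu)
		pr_debug("cpu %d is online\n", cpu);

	/* may merely bump puts_pending if the lock is contended, see above */
	put_online_cpus();
}
```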
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 52cb04c993b7..64b257f6bca2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -248,34 +248,34 @@ static struct cpuset top_cpuset = { | |||
| 248 | if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) | 248 | if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) |
| 249 | 249 | ||
| 250 | /* | 250 | /* |
| 251 | * There are two global mutexes guarding cpuset structures - cpuset_mutex | 251 | * There are two global locks guarding cpuset structures - cpuset_mutex and |
| 252 | * and callback_mutex. The latter may nest inside the former. We also | 252 | * callback_lock. We also require taking task_lock() when dereferencing a |
| 253 | * require taking task_lock() when dereferencing a task's cpuset pointer. | 253 | * task's cpuset pointer. See "The task_lock() exception", at the end of this |
| 254 | * See "The task_lock() exception", at the end of this comment. | 254 | * comment. |
| 255 | * | 255 | * |
| 256 | * A task must hold both mutexes to modify cpusets. If a task holds | 256 | * A task must hold both locks to modify cpusets. If a task holds |
| 257 | * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it | 257 | * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it |
| 258 | * is the only task able to also acquire callback_mutex and be able to | 258 | * is the only task able to also acquire callback_lock and be able to |
| 259 | * modify cpusets. It can perform various checks on the cpuset structure | 259 | * modify cpusets. It can perform various checks on the cpuset structure |
| 260 | * first, knowing nothing will change. It can also allocate memory while | 260 | * first, knowing nothing will change. It can also allocate memory while |
| 261 | * just holding cpuset_mutex. While it is performing these checks, various | 261 | * just holding cpuset_mutex. While it is performing these checks, various |
| 262 | * callback routines can briefly acquire callback_mutex to query cpusets. | 262 | * callback routines can briefly acquire callback_lock to query cpusets. |
| 263 | * Once it is ready to make the changes, it takes callback_mutex, blocking | 263 | * Once it is ready to make the changes, it takes callback_lock, blocking |
| 264 | * everyone else. | 264 | * everyone else. |
| 265 | * | 265 | * |
| 266 | * Calls to the kernel memory allocator can not be made while holding | 266 | * Calls to the kernel memory allocator can not be made while holding |
| 267 | * callback_mutex, as that would risk double tripping on callback_mutex | 267 | * callback_lock, as that would risk double tripping on callback_lock |
| 268 | * from one of the callbacks into the cpuset code from within | 268 | * from one of the callbacks into the cpuset code from within |
| 269 | * __alloc_pages(). | 269 | * __alloc_pages(). |
| 270 | * | 270 | * |
| 271 | * If a task is only holding callback_mutex, then it has read-only | 271 | * If a task is only holding callback_lock, then it has read-only |
| 272 | * access to cpusets. | 272 | * access to cpusets. |
| 273 | * | 273 | * |
| 274 | * Now, the task_struct fields mems_allowed and mempolicy may be changed | 274 | * Now, the task_struct fields mems_allowed and mempolicy may be changed |
| 275 | * by other task, we use alloc_lock in the task_struct fields to protect | 275 | * by other task, we use alloc_lock in the task_struct fields to protect |
| 276 | * them. | 276 | * them. |
| 277 | * | 277 | * |
| 278 | * The cpuset_common_file_read() handlers only hold callback_mutex across | 278 | * The cpuset_common_file_read() handlers only hold callback_lock across |
| 279 | * small pieces of code, such as when reading out possibly multi-word | 279 | * small pieces of code, such as when reading out possibly multi-word |
| 280 | * cpumasks and nodemasks. | 280 | * cpumasks and nodemasks. |
| 281 | * | 281 | * |
| @@ -284,7 +284,7 @@ static struct cpuset top_cpuset = { | |||
| 284 | */ | 284 | */ |
| 285 | 285 | ||
| 286 | static DEFINE_MUTEX(cpuset_mutex); | 286 | static DEFINE_MUTEX(cpuset_mutex); |
| 287 | static DEFINE_MUTEX(callback_mutex); | 287 | static DEFINE_SPINLOCK(callback_lock); |
| 288 | 288 | ||
| 289 | /* | 289 | /* |
| 290 | * CPU / memory hotplug is handled asynchronously. | 290 | * CPU / memory hotplug is handled asynchronously. |
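With callback_mutex turned into a spinlock, readers take short IRQ-safe critical sections; a sketch of the reader pattern as it would sit inside kernel/cpuset.c, where callback_lock and struct cpuset are file-local (the helper name is illustrative):

```c
/* Copy a mask out under callback_lock; nothing that can sleep or allocate
 * is allowed inside the critical section. */
static void snapshot_effective_cpus(struct cpuset *cs, struct cpumask *out)
{
	unsigned long flags;

	spin_lock_irqsave(&callback_lock, flags);
	cpumask_copy(out, cs->effective_cpus);
	spin_unlock_irqrestore(&callback_lock, flags);
}
```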
| @@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = { | |||
| 329 | * One way or another, we guarantee to return some non-empty subset | 329 | * One way or another, we guarantee to return some non-empty subset |
| 330 | * of cpu_online_mask. | 330 | * of cpu_online_mask. |
| 331 | * | 331 | * |
| 332 | * Call with callback_mutex held. | 332 | * Call with callback_lock or cpuset_mutex held. |
| 333 | */ | 333 | */ |
| 334 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) | 334 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) |
| 335 | { | 335 | { |
| @@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) | |||
| 347 | * One way or another, we guarantee to return some non-empty subset | 347 | * One way or another, we guarantee to return some non-empty subset |
| 348 | * of node_states[N_MEMORY]. | 348 | * of node_states[N_MEMORY]. |
| 349 | * | 349 | * |
| 350 | * Call with callback_mutex held. | 350 | * Call with callback_lock or cpuset_mutex held. |
| 351 | */ | 351 | */ |
| 352 | static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) | 352 | static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) |
| 353 | { | 353 | { |
| @@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) | |||
| 359 | /* | 359 | /* |
| 360 | * update task's spread flag if cpuset's page/slab spread flag is set | 360 | * update task's spread flag if cpuset's page/slab spread flag is set |
| 361 | * | 361 | * |
| 362 | * Called with callback_mutex/cpuset_mutex held | 362 | * Call with callback_lock or cpuset_mutex held. |
| 363 | */ | 363 | */ |
| 364 | static void cpuset_update_task_spread_flag(struct cpuset *cs, | 364 | static void cpuset_update_task_spread_flag(struct cpuset *cs, |
| 365 | struct task_struct *tsk) | 365 | struct task_struct *tsk) |
| @@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) | |||
| 506 | goto out; | 506 | goto out; |
| 507 | } | 507 | } |
| 508 | 508 | ||
| 509 | /* | ||
| 510 | * We can't shrink if we won't have enough room for SCHED_DEADLINE | ||
| 511 | * tasks. | ||
| 512 | */ | ||
| 513 | ret = -EBUSY; | ||
| 514 | if (is_cpu_exclusive(cur) && | ||
| 515 | !cpuset_cpumask_can_shrink(cur->cpus_allowed, | ||
| 516 | trial->cpus_allowed)) | ||
| 517 | goto out; | ||
| 518 | |||
| 509 | ret = 0; | 519 | ret = 0; |
| 510 | out: | 520 | out: |
| 511 | rcu_read_unlock(); | 521 | rcu_read_unlock(); |
| @@ -876,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) | |||
| 876 | continue; | 886 | continue; |
| 877 | rcu_read_unlock(); | 887 | rcu_read_unlock(); |
| 878 | 888 | ||
| 879 | mutex_lock(&callback_mutex); | 889 | spin_lock_irq(&callback_lock); |
| 880 | cpumask_copy(cp->effective_cpus, new_cpus); | 890 | cpumask_copy(cp->effective_cpus, new_cpus); |
| 881 | mutex_unlock(&callback_mutex); | 891 | spin_unlock_irq(&callback_lock); |
| 882 | 892 | ||
| 883 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && | 893 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && |
| 884 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); | 894 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); |
| @@ -943,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
| 943 | if (retval < 0) | 953 | if (retval < 0) |
| 944 | return retval; | 954 | return retval; |
| 945 | 955 | ||
| 946 | mutex_lock(&callback_mutex); | 956 | spin_lock_irq(&callback_lock); |
| 947 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); | 957 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); |
| 948 | mutex_unlock(&callback_mutex); | 958 | spin_unlock_irq(&callback_lock); |
| 949 | 959 | ||
| 950 | /* use trialcs->cpus_allowed as a temp variable */ | 960 | /* use trialcs->cpus_allowed as a temp variable */ |
| 951 | update_cpumasks_hier(cs, trialcs->cpus_allowed); | 961 | update_cpumasks_hier(cs, trialcs->cpus_allowed); |
| @@ -1132,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) | |||
| 1132 | continue; | 1142 | continue; |
| 1133 | rcu_read_unlock(); | 1143 | rcu_read_unlock(); |
| 1134 | 1144 | ||
| 1135 | mutex_lock(&callback_mutex); | 1145 | spin_lock_irq(&callback_lock); |
| 1136 | cp->effective_mems = *new_mems; | 1146 | cp->effective_mems = *new_mems; |
| 1137 | mutex_unlock(&callback_mutex); | 1147 | spin_unlock_irq(&callback_lock); |
| 1138 | 1148 | ||
| 1139 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && | 1149 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && |
| 1140 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); | 1150 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); |
| @@ -1155,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) | |||
| 1155 | * mempolicies and if the cpuset is marked 'memory_migrate', | 1165 | * mempolicies and if the cpuset is marked 'memory_migrate', |
| 1156 | * migrate the tasks pages to the new memory. | 1166 | * migrate the tasks pages to the new memory. |
| 1157 | * | 1167 | * |
| 1158 | * Call with cpuset_mutex held. May take callback_mutex during call. | 1168 | * Call with cpuset_mutex held. May take callback_lock during call. |
| 1159 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 1169 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
| 1160 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | 1170 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind |
| 1161 | * their mempolicies to the cpusets new mems_allowed. | 1171 | * their mempolicies to the cpusets new mems_allowed. |
| @@ -1202,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
| 1202 | if (retval < 0) | 1212 | if (retval < 0) |
| 1203 | goto done; | 1213 | goto done; |
| 1204 | 1214 | ||
| 1205 | mutex_lock(&callback_mutex); | 1215 | spin_lock_irq(&callback_lock); |
| 1206 | cs->mems_allowed = trialcs->mems_allowed; | 1216 | cs->mems_allowed = trialcs->mems_allowed; |
| 1207 | mutex_unlock(&callback_mutex); | 1217 | spin_unlock_irq(&callback_lock); |
| 1208 | 1218 | ||
| 1209 | /* use trialcs->mems_allowed as a temp variable */ | 1219 | /* use trialcs->mems_allowed as a temp variable */ |
| 1210 | update_nodemasks_hier(cs, &cs->mems_allowed); | 1220 | update_nodemasks_hier(cs, &cs->mems_allowed); |
| @@ -1295,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
| 1295 | spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) | 1305 | spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) |
| 1296 | || (is_spread_page(cs) != is_spread_page(trialcs))); | 1306 | || (is_spread_page(cs) != is_spread_page(trialcs))); |
| 1297 | 1307 | ||
| 1298 | mutex_lock(&callback_mutex); | 1308 | spin_lock_irq(&callback_lock); |
| 1299 | cs->flags = trialcs->flags; | 1309 | cs->flags = trialcs->flags; |
| 1300 | mutex_unlock(&callback_mutex); | 1310 | spin_unlock_irq(&callback_lock); |
| 1301 | 1311 | ||
| 1302 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) | 1312 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) |
| 1303 | rebuild_sched_domains_locked(); | 1313 | rebuild_sched_domains_locked(); |
| @@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, | |||
| 1429 | goto out_unlock; | 1439 | goto out_unlock; |
| 1430 | 1440 | ||
| 1431 | cgroup_taskset_for_each(task, tset) { | 1441 | cgroup_taskset_for_each(task, tset) { |
| 1432 | /* | 1442 | ret = task_can_attach(task, cs->cpus_allowed); |
| 1433 | * Kthreads which disallow setaffinity shouldn't be moved | 1443 | if (ret) |
| 1434 | * to a new cpuset; we don't want to change their cpu | ||
| 1435 | * affinity and isolating such threads by their set of | ||
| 1436 | * allowed nodes is unnecessary. Thus, cpusets are not | ||
| 1437 | * applicable for such threads. This prevents checking for | ||
| 1438 | * success of set_cpus_allowed_ptr() on all attached tasks | ||
| 1439 | * before cpus_allowed may be changed. | ||
| 1440 | */ | ||
| 1441 | ret = -EINVAL; | ||
| 1442 | if (task->flags & PF_NO_SETAFFINITY) | ||
| 1443 | goto out_unlock; | 1444 | goto out_unlock; |
| 1444 | ret = security_task_setscheduler(task); | 1445 | ret = security_task_setscheduler(task); |
| 1445 | if (ret) | 1446 | if (ret) |
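
cpuset_can_attach() folds the open-coded PF_NO_SETAFFINITY rejection into task_can_attach(), which also gives the scheduler a chance to veto the move (for instance when a SCHED_DEADLINE task would not fit on the destination cpuset's CPUs). A hedged sketch of the shape such a helper takes; the deadline part is paraphrased, and the stub stands in for the real bandwidth test:

```c
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

/* Stand-in for the scheduler's deadline-bandwidth admission test. */
static bool dl_would_overflow(struct task_struct *p,
			      const struct cpumask *cs_cpus_allowed)
{
	return false;	/* illustration only */
}

static int task_can_attach_sketch(struct task_struct *p,
				  const struct cpumask *cs_cpus_allowed)
{
	/* Unmovable kthreads must never be pulled into a cpuset. */
	if (p->flags & PF_NO_SETAFFINITY)
		return -EINVAL;

	/* A deadline task needs admission room on the new CPUs. */
	if (dl_would_overflow(p, cs_cpus_allowed))
		return -EBUSY;

	return 0;
}
```
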
| @@ -1713,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) | |||
| 1713 | count = seq_get_buf(sf, &buf); | 1714 | count = seq_get_buf(sf, &buf); |
| 1714 | s = buf; | 1715 | s = buf; |
| 1715 | 1716 | ||
| 1716 | mutex_lock(&callback_mutex); | 1717 | spin_lock_irq(&callback_lock); |
| 1717 | 1718 | ||
| 1718 | switch (type) { | 1719 | switch (type) { |
| 1719 | case FILE_CPULIST: | 1720 | case FILE_CPULIST: |
| @@ -1740,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) | |||
| 1740 | seq_commit(sf, -1); | 1741 | seq_commit(sf, -1); |
| 1741 | } | 1742 | } |
| 1742 | out_unlock: | 1743 | out_unlock: |
| 1743 | mutex_unlock(&callback_mutex); | 1744 | spin_unlock_irq(&callback_lock); |
| 1744 | return ret; | 1745 | return ret; |
| 1745 | } | 1746 | } |
| 1746 | 1747 | ||
| @@ -1957,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) | |||
| 1957 | 1958 | ||
| 1958 | cpuset_inc(); | 1959 | cpuset_inc(); |
| 1959 | 1960 | ||
| 1960 | mutex_lock(&callback_mutex); | 1961 | spin_lock_irq(&callback_lock); |
| 1961 | if (cgroup_on_dfl(cs->css.cgroup)) { | 1962 | if (cgroup_on_dfl(cs->css.cgroup)) { |
| 1962 | cpumask_copy(cs->effective_cpus, parent->effective_cpus); | 1963 | cpumask_copy(cs->effective_cpus, parent->effective_cpus); |
| 1963 | cs->effective_mems = parent->effective_mems; | 1964 | cs->effective_mems = parent->effective_mems; |
| 1964 | } | 1965 | } |
| 1965 | mutex_unlock(&callback_mutex); | 1966 | spin_unlock_irq(&callback_lock); |
| 1966 | 1967 | ||
| 1967 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) | 1968 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
| 1968 | goto out_unlock; | 1969 | goto out_unlock; |
| @@ -1989,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) | |||
| 1989 | } | 1990 | } |
| 1990 | rcu_read_unlock(); | 1991 | rcu_read_unlock(); |
| 1991 | 1992 | ||
| 1992 | mutex_lock(&callback_mutex); | 1993 | spin_lock_irq(&callback_lock); |
| 1993 | cs->mems_allowed = parent->mems_allowed; | 1994 | cs->mems_allowed = parent->mems_allowed; |
| 1994 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); | 1995 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); |
| 1995 | mutex_unlock(&callback_mutex); | 1996 | spin_unlock_irq(&callback_lock); |
| 1996 | out_unlock: | 1997 | out_unlock: |
| 1997 | mutex_unlock(&cpuset_mutex); | 1998 | mutex_unlock(&cpuset_mutex); |
| 1998 | return 0; | 1999 | return 0; |
| @@ -2031,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) | |||
| 2031 | static void cpuset_bind(struct cgroup_subsys_state *root_css) | 2032 | static void cpuset_bind(struct cgroup_subsys_state *root_css) |
| 2032 | { | 2033 | { |
| 2033 | mutex_lock(&cpuset_mutex); | 2034 | mutex_lock(&cpuset_mutex); |
| 2034 | mutex_lock(&callback_mutex); | 2035 | spin_lock_irq(&callback_lock); |
| 2035 | 2036 | ||
| 2036 | if (cgroup_on_dfl(root_css->cgroup)) { | 2037 | if (cgroup_on_dfl(root_css->cgroup)) { |
| 2037 | cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); | 2038 | cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); |
| @@ -2042,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) | |||
| 2042 | top_cpuset.mems_allowed = top_cpuset.effective_mems; | 2043 | top_cpuset.mems_allowed = top_cpuset.effective_mems; |
| 2043 | } | 2044 | } |
| 2044 | 2045 | ||
| 2045 | mutex_unlock(&callback_mutex); | 2046 | spin_unlock_irq(&callback_lock); |
| 2046 | mutex_unlock(&cpuset_mutex); | 2047 | mutex_unlock(&cpuset_mutex); |
| 2047 | } | 2048 | } |
| 2048 | 2049 | ||
| @@ -2127,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs, | |||
| 2127 | { | 2128 | { |
| 2128 | bool is_empty; | 2129 | bool is_empty; |
| 2129 | 2130 | ||
| 2130 | mutex_lock(&callback_mutex); | 2131 | spin_lock_irq(&callback_lock); |
| 2131 | cpumask_copy(cs->cpus_allowed, new_cpus); | 2132 | cpumask_copy(cs->cpus_allowed, new_cpus); |
| 2132 | cpumask_copy(cs->effective_cpus, new_cpus); | 2133 | cpumask_copy(cs->effective_cpus, new_cpus); |
| 2133 | cs->mems_allowed = *new_mems; | 2134 | cs->mems_allowed = *new_mems; |
| 2134 | cs->effective_mems = *new_mems; | 2135 | cs->effective_mems = *new_mems; |
| 2135 | mutex_unlock(&callback_mutex); | 2136 | spin_unlock_irq(&callback_lock); |
| 2136 | 2137 | ||
| 2137 | /* | 2138 | /* |
| 2138 | * Don't call update_tasks_cpumask() if the cpuset becomes empty, | 2139 | * Don't call update_tasks_cpumask() if the cpuset becomes empty, |
| @@ -2169,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs, | |||
| 2169 | if (nodes_empty(*new_mems)) | 2170 | if (nodes_empty(*new_mems)) |
| 2170 | *new_mems = parent_cs(cs)->effective_mems; | 2171 | *new_mems = parent_cs(cs)->effective_mems; |
| 2171 | 2172 | ||
| 2172 | mutex_lock(&callback_mutex); | 2173 | spin_lock_irq(&callback_lock); |
| 2173 | cpumask_copy(cs->effective_cpus, new_cpus); | 2174 | cpumask_copy(cs->effective_cpus, new_cpus); |
| 2174 | cs->effective_mems = *new_mems; | 2175 | cs->effective_mems = *new_mems; |
| 2175 | mutex_unlock(&callback_mutex); | 2176 | spin_unlock_irq(&callback_lock); |
| 2176 | 2177 | ||
| 2177 | if (cpus_updated) | 2178 | if (cpus_updated) |
| 2178 | update_tasks_cpumask(cs); | 2179 | update_tasks_cpumask(cs); |
| @@ -2258,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
| 2258 | 2259 | ||
| 2259 | /* synchronize cpus_allowed to cpu_active_mask */ | 2260 | /* synchronize cpus_allowed to cpu_active_mask */ |
| 2260 | if (cpus_updated) { | 2261 | if (cpus_updated) { |
| 2261 | mutex_lock(&callback_mutex); | 2262 | spin_lock_irq(&callback_lock); |
| 2262 | if (!on_dfl) | 2263 | if (!on_dfl) |
| 2263 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); | 2264 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); |
| 2264 | cpumask_copy(top_cpuset.effective_cpus, &new_cpus); | 2265 | cpumask_copy(top_cpuset.effective_cpus, &new_cpus); |
| 2265 | mutex_unlock(&callback_mutex); | 2266 | spin_unlock_irq(&callback_lock); |
| 2266 | /* we don't mess with cpumasks of tasks in top_cpuset */ | 2267 | /* we don't mess with cpumasks of tasks in top_cpuset */ |
| 2267 | } | 2268 | } |
| 2268 | 2269 | ||
| 2269 | /* synchronize mems_allowed to N_MEMORY */ | 2270 | /* synchronize mems_allowed to N_MEMORY */ |
| 2270 | if (mems_updated) { | 2271 | if (mems_updated) { |
| 2271 | mutex_lock(&callback_mutex); | 2272 | spin_lock_irq(&callback_lock); |
| 2272 | if (!on_dfl) | 2273 | if (!on_dfl) |
| 2273 | top_cpuset.mems_allowed = new_mems; | 2274 | top_cpuset.mems_allowed = new_mems; |
| 2274 | top_cpuset.effective_mems = new_mems; | 2275 | top_cpuset.effective_mems = new_mems; |
| 2275 | mutex_unlock(&callback_mutex); | 2276 | spin_unlock_irq(&callback_lock); |
| 2276 | update_tasks_nodemask(&top_cpuset); | 2277 | update_tasks_nodemask(&top_cpuset); |
| 2277 | } | 2278 | } |
| 2278 | 2279 | ||
| @@ -2365,11 +2366,13 @@ void __init cpuset_init_smp(void) | |||
| 2365 | 2366 | ||
| 2366 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | 2367 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) |
| 2367 | { | 2368 | { |
| 2368 | mutex_lock(&callback_mutex); | 2369 | unsigned long flags; |
| 2370 | |||
| 2371 | spin_lock_irqsave(&callback_lock, flags); | ||
| 2369 | rcu_read_lock(); | 2372 | rcu_read_lock(); |
| 2370 | guarantee_online_cpus(task_cs(tsk), pmask); | 2373 | guarantee_online_cpus(task_cs(tsk), pmask); |
| 2371 | rcu_read_unlock(); | 2374 | rcu_read_unlock(); |
| 2372 | mutex_unlock(&callback_mutex); | 2375 | spin_unlock_irqrestore(&callback_lock, flags); |
| 2373 | } | 2376 | } |
| 2374 | 2377 | ||
| 2375 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 2378 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
| @@ -2415,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void) | |||
| 2415 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) | 2418 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) |
| 2416 | { | 2419 | { |
| 2417 | nodemask_t mask; | 2420 | nodemask_t mask; |
| 2421 | unsigned long flags; | ||
| 2418 | 2422 | ||
| 2419 | mutex_lock(&callback_mutex); | 2423 | spin_lock_irqsave(&callback_lock, flags); |
| 2420 | rcu_read_lock(); | 2424 | rcu_read_lock(); |
| 2421 | guarantee_online_mems(task_cs(tsk), &mask); | 2425 | guarantee_online_mems(task_cs(tsk), &mask); |
| 2422 | rcu_read_unlock(); | 2426 | rcu_read_unlock(); |
| 2423 | mutex_unlock(&callback_mutex); | 2427 | spin_unlock_irqrestore(&callback_lock, flags); |
| 2424 | 2428 | ||
| 2425 | return mask; | 2429 | return mask; |
| 2426 | } | 2430 | } |
| @@ -2439,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) | |||
| 2439 | /* | 2443 | /* |
| 2440 | * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or | 2444 | * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or |
| 2441 | * mem_hardwall ancestor to the specified cpuset. Call holding | 2445 | * mem_hardwall ancestor to the specified cpuset. Call holding |
| 2442 | * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall | 2446 | * callback_lock. If no ancestor is mem_exclusive or mem_hardwall |
| 2443 | * (an unusual configuration), then returns the root cpuset. | 2447 | * (an unusual configuration), then returns the root cpuset. |
| 2444 | */ | 2448 | */ |
| 2445 | static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | 2449 | static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) |
| @@ -2450,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
| 2450 | } | 2454 | } |
| 2451 | 2455 | ||
| 2452 | /** | 2456 | /** |
| 2453 | * cpuset_node_allowed_softwall - Can we allocate on a memory node? | 2457 | * cpuset_node_allowed - Can we allocate on a memory node? |
| 2454 | * @node: is this an allowed node? | 2458 | * @node: is this an allowed node? |
| 2455 | * @gfp_mask: memory allocation flags | 2459 | * @gfp_mask: memory allocation flags |
| 2456 | * | 2460 | * |
| @@ -2462,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
| 2462 | * flag, yes. | 2466 | * flag, yes. |
| 2463 | * Otherwise, no. | 2467 | * Otherwise, no. |
| 2464 | * | 2468 | * |
| 2465 | * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to | ||
| 2466 | * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() | ||
| 2467 | * might sleep, and might allow a node from an enclosing cpuset. | ||
| 2468 | * | ||
| 2469 | * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall | ||
| 2470 | * cpusets, and never sleeps. | ||
| 2471 | * | ||
| 2472 | * The __GFP_THISNODE placement logic is really handled elsewhere, | 2469 | * The __GFP_THISNODE placement logic is really handled elsewhere, |
| 2473 | * by forcibly using a zonelist starting at a specified node, and by | 2470 | * by forcibly using a zonelist starting at a specified node, and by |
| 2474 | * (in get_page_from_freelist()) refusing to consider the zones for | 2471 | * (in get_page_from_freelist()) refusing to consider the zones for |
| @@ -2481,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
| 2481 | * GFP_KERNEL allocations are not so marked, so can escape to the | 2478 | * GFP_KERNEL allocations are not so marked, so can escape to the |
| 2482 | * nearest enclosing hardwalled ancestor cpuset. | 2479 | * nearest enclosing hardwalled ancestor cpuset. |
| 2483 | * | 2480 | * |
| 2484 | * Scanning up parent cpusets requires callback_mutex. The | 2481 | * Scanning up parent cpusets requires callback_lock. The |
| 2485 | * __alloc_pages() routine only calls here with __GFP_HARDWALL bit | 2482 | * __alloc_pages() routine only calls here with __GFP_HARDWALL bit |
| 2486 | * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the | 2483 | * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the |
| 2487 | * current tasks mems_allowed came up empty on the first pass over | 2484 | * current tasks mems_allowed came up empty on the first pass over |
| 2488 | * the zonelist. So only GFP_KERNEL allocations, if all nodes in the | 2485 | * the zonelist. So only GFP_KERNEL allocations, if all nodes in the |
| 2489 | * cpuset are short of memory, might require taking the callback_mutex | 2486 | * cpuset are short of memory, might require taking the callback_lock. |
| 2490 | * mutex. | ||
| 2491 | * | 2487 | * |
| 2492 | * The first call here from mm/page_alloc:get_page_from_freelist() | 2488 | * The first call here from mm/page_alloc:get_page_from_freelist() |
| 2493 | * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, | 2489 | * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, |
| @@ -2504,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
| 2504 | * TIF_MEMDIE - any node ok | 2500 | * TIF_MEMDIE - any node ok |
| 2505 | * GFP_KERNEL - any node in enclosing hardwalled cpuset ok | 2501 | * GFP_KERNEL - any node in enclosing hardwalled cpuset ok |
| 2506 | * GFP_USER - only nodes in current tasks mems allowed ok. | 2502 | * GFP_USER - only nodes in current tasks mems allowed ok. |
| 2507 | * | ||
| 2508 | * Rule: | ||
| 2509 | * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you | ||
| 2510 | * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables | ||
| 2511 | * the code that might scan up ancestor cpusets and sleep. | ||
| 2512 | */ | 2503 | */ |
| 2513 | int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) | 2504 | int __cpuset_node_allowed(int node, gfp_t gfp_mask) |
| 2514 | { | 2505 | { |
| 2515 | struct cpuset *cs; /* current cpuset ancestors */ | 2506 | struct cpuset *cs; /* current cpuset ancestors */ |
| 2516 | int allowed; /* is allocation in zone z allowed? */ | 2507 | int allowed; /* is allocation in zone z allowed? */ |
| 2508 | unsigned long flags; | ||
| 2517 | 2509 | ||
| 2518 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2510 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) |
| 2519 | return 1; | 2511 | return 1; |
| 2520 | might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); | ||
| 2521 | if (node_isset(node, current->mems_allowed)) | 2512 | if (node_isset(node, current->mems_allowed)) |
| 2522 | return 1; | 2513 | return 1; |
| 2523 | /* | 2514 | /* |
| @@ -2533,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) | |||
| 2533 | return 1; | 2524 | return 1; |
| 2534 | 2525 | ||
| 2535 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 2526 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
| 2536 | mutex_lock(&callback_mutex); | 2527 | spin_lock_irqsave(&callback_lock, flags); |
| 2537 | 2528 | ||
| 2538 | rcu_read_lock(); | 2529 | rcu_read_lock(); |
| 2539 | cs = nearest_hardwall_ancestor(task_cs(current)); | 2530 | cs = nearest_hardwall_ancestor(task_cs(current)); |
| 2540 | allowed = node_isset(node, cs->mems_allowed); | 2531 | allowed = node_isset(node, cs->mems_allowed); |
| 2541 | rcu_read_unlock(); | 2532 | rcu_read_unlock(); |
| 2542 | 2533 | ||
| 2543 | mutex_unlock(&callback_mutex); | 2534 | spin_unlock_irqrestore(&callback_lock, flags); |
| 2544 | return allowed; | 2535 | return allowed; |
| 2545 | } | 2536 | } |
| 2546 | 2537 | ||
| 2547 | /* | ||
| 2548 | * cpuset_node_allowed_hardwall - Can we allocate on a memory node? | ||
| 2549 | * @node: is this an allowed node? | ||
| 2550 | * @gfp_mask: memory allocation flags | ||
| 2551 | * | ||
| 2552 | * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is | ||
| 2553 | * set, yes, we can always allocate. If node is in our task's mems_allowed, | ||
| 2554 | * yes. If the task has been OOM killed and has access to memory reserves as | ||
| 2555 | * specified by the TIF_MEMDIE flag, yes. | ||
| 2556 | * Otherwise, no. | ||
| 2557 | * | ||
| 2558 | * The __GFP_THISNODE placement logic is really handled elsewhere, | ||
| 2559 | * by forcibly using a zonelist starting at a specified node, and by | ||
| 2560 | * (in get_page_from_freelist()) refusing to consider the zones for | ||
| 2561 | * any node on the zonelist except the first. By the time any such | ||
| 2562 | * calls get to this routine, we should just shut up and say 'yes'. | ||
| 2563 | * | ||
| 2564 | * Unlike the cpuset_node_allowed_softwall() variant, above, | ||
| 2565 | * this variant requires that the node be in the current task's | ||
| 2566 | * mems_allowed or that we're in interrupt. It does not scan up the | ||
| 2567 | * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. | ||
| 2568 | * It never sleeps. | ||
| 2569 | */ | ||
| 2570 | int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) | ||
| 2571 | { | ||
| 2572 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | ||
| 2573 | return 1; | ||
| 2574 | if (node_isset(node, current->mems_allowed)) | ||
| 2575 | return 1; | ||
| 2576 | /* | ||
| 2577 | * Allow tasks that have access to memory reserves because they have | ||
| 2578 | * been OOM killed to get memory anywhere. | ||
| 2579 | */ | ||
| 2580 | if (unlikely(test_thread_flag(TIF_MEMDIE))) | ||
| 2581 | return 1; | ||
| 2582 | return 0; | ||
| 2583 | } | ||
| 2584 | |||
| 2585 | /** | 2538 | /** |
| 2586 | * cpuset_mem_spread_node() - On which node to begin search for a file page | 2539 | * cpuset_mem_spread_node() - On which node to begin search for a file page |
| 2587 | * cpuset_slab_spread_node() - On which node to begin search for a slab page | 2540 | * cpuset_slab_spread_node() - On which node to begin search for a slab page |
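
With callback_lock being a spinlock the allocation-time check can no longer sleep, so the softwall/hardwall pair collapses into a single __cpuset_node_allowed(): hardwall behaviour is now selected purely by __GFP_HARDWALL in the mask. An illustrative caller follows; the wrapper name and two-pass fallback policy are a sketch, not quoted from mm/page_alloc.c:

```c
#include <linux/gfp.h>
#include <linux/cpuset.h>

static bool may_alloc_on_node(int nid, gfp_t gfp_mask)
{
	/* First pass: stay inside the task's own mems_allowed (hardwall). */
	if (__cpuset_node_allowed(nid, gfp_mask | __GFP_HARDWALL))
		return true;

	/*
	 * Requests without __GFP_HARDWALL (GFP_KERNEL-style) may fall back
	 * to the nearest hardwalled ancestor cpuset on a second pass.
	 */
	return !(gfp_mask & __GFP_HARDWALL) &&
	       __cpuset_node_allowed(nid, gfp_mask);
}
```
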
| @@ -2730,10 +2683,9 @@ void __cpuset_memory_pressure_bump(void) | |||
| 2730 | * and we take cpuset_mutex, keeping cpuset_attach() from changing it | 2683 | * and we take cpuset_mutex, keeping cpuset_attach() from changing it |
| 2731 | * anyway. | 2684 | * anyway. |
| 2732 | */ | 2685 | */ |
| 2733 | int proc_cpuset_show(struct seq_file *m, void *unused_v) | 2686 | int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, |
| 2687 | struct pid *pid, struct task_struct *tsk) | ||
| 2734 | { | 2688 | { |
| 2735 | struct pid *pid; | ||
| 2736 | struct task_struct *tsk; | ||
| 2737 | char *buf, *p; | 2689 | char *buf, *p; |
| 2738 | struct cgroup_subsys_state *css; | 2690 | struct cgroup_subsys_state *css; |
| 2739 | int retval; | 2691 | int retval; |
| @@ -2743,24 +2695,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) | |||
| 2743 | if (!buf) | 2695 | if (!buf) |
| 2744 | goto out; | 2696 | goto out; |
| 2745 | 2697 | ||
| 2746 | retval = -ESRCH; | ||
| 2747 | pid = m->private; | ||
| 2748 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
| 2749 | if (!tsk) | ||
| 2750 | goto out_free; | ||
| 2751 | |||
| 2752 | retval = -ENAMETOOLONG; | 2698 | retval = -ENAMETOOLONG; |
| 2753 | rcu_read_lock(); | 2699 | rcu_read_lock(); |
| 2754 | css = task_css(tsk, cpuset_cgrp_id); | 2700 | css = task_css(tsk, cpuset_cgrp_id); |
| 2755 | p = cgroup_path(css->cgroup, buf, PATH_MAX); | 2701 | p = cgroup_path(css->cgroup, buf, PATH_MAX); |
| 2756 | rcu_read_unlock(); | 2702 | rcu_read_unlock(); |
| 2757 | if (!p) | 2703 | if (!p) |
| 2758 | goto out_put_task; | 2704 | goto out_free; |
| 2759 | seq_puts(m, p); | 2705 | seq_puts(m, p); |
| 2760 | seq_putc(m, '\n'); | 2706 | seq_putc(m, '\n'); |
| 2761 | retval = 0; | 2707 | retval = 0; |
| 2762 | out_put_task: | ||
| 2763 | put_task_struct(tsk); | ||
| 2764 | out_free: | 2708 | out_free: |
| 2765 | kfree(buf); | 2709 | kfree(buf); |
| 2766 | out: | 2710 | out: |
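
proc_cpuset_show() drops its own pid-to-task lookup: with the new signature the generic /proc single-show path resolves the task, holds the reference, and passes it in. The sketch below shows the caller shape that signature implies; the container struct and function name are hypothetical, paraphrased from the fs/proc side rather than quoted from this patch:

```c
#include <linux/seq_file.h>
#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/sched.h>
#include <linux/errno.h>

/* Hypothetical bundle the generic /proc code keeps in m->private. */
struct pid_show_args {
	struct pid_namespace *ns;
	struct pid *pid;
	int (*show)(struct seq_file *, struct pid_namespace *,
		    struct pid *, struct task_struct *);
};

static int proc_single_show_sketch(struct seq_file *m, void *v)
{
	struct pid_show_args *a = m->private;
	struct task_struct *task = get_pid_task(a->pid, PIDTYPE_PID);
	int ret;

	if (!task)
		return -ESRCH;
	ret = a->show(m, a->ns, a->pid, task);	/* e.g. proc_cpuset_show() */
	put_task_struct(task);
	return ret;
}
```
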
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index c766ee54c0b1..b64e238b553b 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c | |||
| @@ -18,6 +18,7 @@ unsigned long saved_max_pfn; | |||
| 18 | * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. | 18 | * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. |
| 19 | */ | 19 | */ |
| 20 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; | 20 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; |
| 21 | EXPORT_SYMBOL_GPL(elfcorehdr_addr); | ||
| 21 | 22 | ||
| 22 | /* | 23 | /* |
| 23 | * stores the size of elf header of crash image | 24 | * stores the size of elf header of crash image |
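
Exporting elfcorehdr_addr (GPL-only) lets modular code detect whether it is running in a kdump capture kernel, for example through the is_kdump_kernel() helper, which compares the address against ELFCORE_ADDR_MAX. A minimal module sketch:

```c
#include <linux/module.h>
#include <linux/crash_dump.h>

static int __init kdump_aware_init(void)
{
	/* Skip expensive setup when running as the crash capture kernel. */
	if (is_kdump_kernel())
		pr_info("capture kernel detected, running in reduced mode\n");
	return 0;
}
module_init(kdump_aware_init);

MODULE_LICENSE("GPL");	/* required: the symbol is EXPORT_SYMBOL_GPL */
```
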
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1adf62b39b96..07ce18ca71e0 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
| @@ -27,6 +27,9 @@ | |||
| 27 | * version 2. This program is licensed "as is" without any warranty of any | 27 | * version 2. This program is licensed "as is" without any warranty of any |
| 28 | * kind, whether express or implied. | 28 | * kind, whether express or implied. |
| 29 | */ | 29 | */ |
| 30 | |||
| 31 | #define pr_fmt(fmt) "KGDB: " fmt | ||
| 32 | |||
| 30 | #include <linux/pid_namespace.h> | 33 | #include <linux/pid_namespace.h> |
| 31 | #include <linux/clocksource.h> | 34 | #include <linux/clocksource.h> |
| 32 | #include <linux/serial_core.h> | 35 | #include <linux/serial_core.h> |
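
debug_core.c switches from hand-written "KGDB: " prefixes to the pr_fmt() convention: the macro is defined before any printk-using headers, so every pr_err()/pr_info()/pr_crit() in the file picks up the prefix automatically, which is what makes the shorter format strings in the hunks below equivalent to the old messages. A standalone sketch of the pattern:

```c
/* Must be defined before <linux/printk.h> is pulled in. */
#define pr_fmt(fmt) "KGDB: " fmt

#include <linux/printk.h>

static void report_bad_breakpoint(unsigned long addr)
{
	/* Emits: "KGDB: BP install failed: <addr>" */
	pr_info("BP install failed: %lx\n", addr);
}
```
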
| @@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr) | |||
| 196 | return err; | 199 | return err; |
| 197 | err = kgdb_arch_remove_breakpoint(&tmp); | 200 | err = kgdb_arch_remove_breakpoint(&tmp); |
| 198 | if (err) | 201 | if (err) |
| 199 | printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " | 202 | pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n", |
| 200 | "memory destroyed at: %lx", addr); | 203 | addr); |
| 201 | return err; | 204 | return err; |
| 202 | } | 205 | } |
| 203 | 206 | ||
| @@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void) | |||
| 256 | error = kgdb_arch_set_breakpoint(&kgdb_break[i]); | 259 | error = kgdb_arch_set_breakpoint(&kgdb_break[i]); |
| 257 | if (error) { | 260 | if (error) { |
| 258 | ret = error; | 261 | ret = error; |
| 259 | printk(KERN_INFO "KGDB: BP install failed: %lx", | 262 | pr_info("BP install failed: %lx\n", |
| 260 | kgdb_break[i].bpt_addr); | 263 | kgdb_break[i].bpt_addr); |
| 261 | continue; | 264 | continue; |
| 262 | } | 265 | } |
| 263 | 266 | ||
| @@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void) | |||
| 319 | continue; | 322 | continue; |
| 320 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); | 323 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); |
| 321 | if (error) { | 324 | if (error) { |
| 322 | printk(KERN_INFO "KGDB: BP remove failed: %lx\n", | 325 | pr_info("BP remove failed: %lx\n", |
| 323 | kgdb_break[i].bpt_addr); | 326 | kgdb_break[i].bpt_addr); |
| 324 | ret = error; | 327 | ret = error; |
| 325 | } | 328 | } |
| 326 | 329 | ||
| @@ -367,7 +370,7 @@ int dbg_remove_all_break(void) | |||
| 367 | goto setundefined; | 370 | goto setundefined; |
| 368 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); | 371 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); |
| 369 | if (error) | 372 | if (error) |
| 370 | printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", | 373 | pr_err("breakpoint remove failed: %lx\n", |
| 371 | kgdb_break[i].bpt_addr); | 374 | kgdb_break[i].bpt_addr); |
| 372 | setundefined: | 375 | setundefined: |
| 373 | kgdb_break[i].state = BP_UNDEFINED; | 376 | kgdb_break[i].state = BP_UNDEFINED; |
| @@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait) | |||
| 400 | if (print_wait) { | 403 | if (print_wait) { |
| 401 | #ifdef CONFIG_KGDB_KDB | 404 | #ifdef CONFIG_KGDB_KDB |
| 402 | if (!dbg_kdb_mode) | 405 | if (!dbg_kdb_mode) |
| 403 | printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); | 406 | pr_crit("waiting... or $3#33 for KDB\n"); |
| 404 | #else | 407 | #else |
| 405 | printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); | 408 | pr_crit("Waiting for remote debugger\n"); |
| 406 | #endif | 409 | #endif |
| 407 | } | 410 | } |
| 408 | return 1; | 411 | return 1; |
| @@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) | |||
| 430 | exception_level = 0; | 433 | exception_level = 0; |
| 431 | kgdb_skipexception(ks->ex_vector, ks->linux_regs); | 434 | kgdb_skipexception(ks->ex_vector, ks->linux_regs); |
| 432 | dbg_activate_sw_breakpoints(); | 435 | dbg_activate_sw_breakpoints(); |
| 433 | printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", | 436 | pr_crit("re-enter error: breakpoint removed %lx\n", addr); |
| 434 | addr); | ||
| 435 | WARN_ON_ONCE(1); | 437 | WARN_ON_ONCE(1); |
| 436 | 438 | ||
| 437 | return 1; | 439 | return 1; |
| @@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) | |||
| 444 | panic("Recursive entry to debugger"); | 446 | panic("Recursive entry to debugger"); |
| 445 | } | 447 | } |
| 446 | 448 | ||
| 447 | printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); | 449 | pr_crit("re-enter exception: ALL breakpoints killed\n"); |
| 448 | #ifdef CONFIG_KGDB_KDB | 450 | #ifdef CONFIG_KGDB_KDB |
| 449 | /* Allow kdb to debug itself one level */ | 451 | /* Allow kdb to debug itself one level */ |
| 450 | return 0; | 452 | return 0; |
| @@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, | |||
| 471 | int cpu; | 473 | int cpu; |
| 472 | int trace_on = 0; | 474 | int trace_on = 0; |
| 473 | int online_cpus = num_online_cpus(); | 475 | int online_cpus = num_online_cpus(); |
| 476 | u64 time_left; | ||
| 474 | 477 | ||
| 475 | kgdb_info[ks->cpu].enter_kgdb++; | 478 | kgdb_info[ks->cpu].enter_kgdb++; |
| 476 | kgdb_info[ks->cpu].exception_state |= exception_state; | 479 | kgdb_info[ks->cpu].exception_state |= exception_state; |
| @@ -595,9 +598,13 @@ return_normal: | |||
| 595 | /* | 598 | /* |
| 596 | * Wait for the other CPUs to be notified and be waiting for us: | 599 | * Wait for the other CPUs to be notified and be waiting for us: |
| 597 | */ | 600 | */ |
| 598 | while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + | 601 | time_left = loops_per_jiffy * HZ; |
| 599 | atomic_read(&slaves_in_kgdb)) != online_cpus) | 602 | while (kgdb_do_roundup && --time_left && |
| 603 | (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != | ||
| 604 | online_cpus) | ||
| 600 | cpu_relax(); | 605 | cpu_relax(); |
| 606 | if (!time_left) | ||
| 607 | pr_crit("KGDB: Timed out waiting for secondary CPUs.\n"); | ||
| 601 | 608 | ||
| 602 | /* | 609 | /* |
| 603 | * At this point the primary processor is completely | 610 | * At this point the primary processor is completely |
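
The CPU round-up wait is now bounded: instead of spinning forever for secondary CPUs that never answer, the master gives up after a budget of roughly loops_per_jiffy * HZ iterations and reports the stall. A generic sketch of that bounded-wait shape (the budget value is illustrative):

```c
#include <linux/atomic.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/printk.h>
#include <asm/processor.h>

static bool wait_for_cpus(atomic_t *cpus_in, int expected)
{
	u64 budget = loops_per_jiffy * HZ;	/* on the order of a second of spinning */

	while (--budget && atomic_read(cpus_in) != expected)
		cpu_relax();

	if (!budget)
		pr_crit("timed out waiting for secondary CPUs\n");
	return budget != 0;
}
```
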
| @@ -795,15 +802,15 @@ static struct console kgdbcons = { | |||
| 795 | static void sysrq_handle_dbg(int key) | 802 | static void sysrq_handle_dbg(int key) |
| 796 | { | 803 | { |
| 797 | if (!dbg_io_ops) { | 804 | if (!dbg_io_ops) { |
| 798 | printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); | 805 | pr_crit("ERROR: No KGDB I/O module available\n"); |
| 799 | return; | 806 | return; |
| 800 | } | 807 | } |
| 801 | if (!kgdb_connected) { | 808 | if (!kgdb_connected) { |
| 802 | #ifdef CONFIG_KGDB_KDB | 809 | #ifdef CONFIG_KGDB_KDB |
| 803 | if (!dbg_kdb_mode) | 810 | if (!dbg_kdb_mode) |
| 804 | printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); | 811 | pr_crit("KGDB or $3#33 for KDB\n"); |
| 805 | #else | 812 | #else |
| 806 | printk(KERN_CRIT "Entering KGDB\n"); | 813 | pr_crit("Entering KGDB\n"); |
| 807 | #endif | 814 | #endif |
| 808 | } | 815 | } |
| 809 | 816 | ||
| @@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void) | |||
| 945 | { | 952 | { |
| 946 | kgdb_break_asap = 0; | 953 | kgdb_break_asap = 0; |
| 947 | 954 | ||
| 948 | printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); | 955 | pr_crit("Waiting for connection from remote gdb...\n"); |
| 949 | kgdb_breakpoint(); | 956 | kgdb_breakpoint(); |
| 950 | } | 957 | } |
| 951 | 958 | ||
| @@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) | |||
| 964 | if (dbg_io_ops) { | 971 | if (dbg_io_ops) { |
| 965 | spin_unlock(&kgdb_registration_lock); | 972 | spin_unlock(&kgdb_registration_lock); |
| 966 | 973 | ||
| 967 | printk(KERN_ERR "kgdb: Another I/O driver is already " | 974 | pr_err("Another I/O driver is already registered with KGDB\n"); |
| 968 | "registered with KGDB.\n"); | ||
| 969 | return -EBUSY; | 975 | return -EBUSY; |
| 970 | } | 976 | } |
| 971 | 977 | ||
| @@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) | |||
| 981 | 987 | ||
| 982 | spin_unlock(&kgdb_registration_lock); | 988 | spin_unlock(&kgdb_registration_lock); |
| 983 | 989 | ||
| 984 | printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", | 990 | pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); |
| 985 | new_dbg_io_ops->name); | ||
| 986 | 991 | ||
| 987 | /* Arm KGDB now. */ | 992 | /* Arm KGDB now. */ |
| 988 | kgdb_register_callbacks(); | 993 | kgdb_register_callbacks(); |
| @@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) | |||
| 1017 | 1022 | ||
| 1018 | spin_unlock(&kgdb_registration_lock); | 1023 | spin_unlock(&kgdb_registration_lock); |
| 1019 | 1024 | ||
| 1020 | printk(KERN_INFO | 1025 | pr_info("Unregistered I/O driver %s, debugger disabled\n", |
| 1021 | "kgdb: Unregistered I/O driver %s, debugger disabled.\n", | ||
| 1022 | old_dbg_io_ops->name); | 1026 | old_dbg_io_ops->name); |
| 1023 | } | 1027 | } |
| 1024 | EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); | 1028 | EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); |
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 70a504601dc3..e1dbf4a2c69e 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c | |||
| @@ -52,11 +52,11 @@ static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp) | |||
| 52 | 52 | ||
| 53 | bp->bph_length = 1; | 53 | bp->bph_length = 1; |
| 54 | if ((argc + 1) != nextarg) { | 54 | if ((argc + 1) != nextarg) { |
| 55 | if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) | 55 | if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0) |
| 56 | bp->bp_type = BP_ACCESS_WATCHPOINT; | 56 | bp->bp_type = BP_ACCESS_WATCHPOINT; |
| 57 | else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) | 57 | else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) |
| 58 | bp->bp_type = BP_WRITE_WATCHPOINT; | 58 | bp->bp_type = BP_WRITE_WATCHPOINT; |
| 59 | else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) | 59 | else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0) |
| 60 | bp->bp_type = BP_HARDWARE_BREAKPOINT; | 60 | bp->bp_type = BP_HARDWARE_BREAKPOINT; |
| 61 | else | 61 | else |
| 62 | return KDB_ARGCOUNT; | 62 | return KDB_ARGCOUNT; |
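
strnicmp() is retired in favour of the standard strncasecmp() name; behaviour is unchanged. Note that passing sizeof("datar") as the length includes the terminating NUL, so only an exact (case-insensitive) match of the whole word passes, never a longer prefix:

```c
#include <linux/string.h>

static bool is_datar_keyword(const char *arg)
{
	/* sizeof("datar") == 6: the NUL is compared too, so "datarx" fails. */
	return strncasecmp(arg, "datar", sizeof("datar")) == 0;
}
```
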
| @@ -531,22 +531,29 @@ void __init kdb_initbptab(void) | |||
| 531 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) | 531 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) |
| 532 | bp->bp_free = 1; | 532 | bp->bp_free = 1; |
| 533 | 533 | ||
| 534 | kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", | 534 | kdb_register_flags("bp", kdb_bp, "[<vaddr>]", |
| 535 | "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); | 535 | "Set/Display breakpoints", 0, |
| 536 | kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", | 536 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); |
| 537 | "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); | 537 | kdb_register_flags("bl", kdb_bp, "[<vaddr>]", |
| 538 | "Display breakpoints", 0, | ||
| 539 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); | ||
| 538 | if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) | 540 | if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) |
| 539 | kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", | 541 | kdb_register_flags("bph", kdb_bp, "[<vaddr>]", |
| 540 | "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); | 542 | "[datar [length]|dataw [length]] Set hw brk", 0, |
| 541 | kdb_register_repeat("bc", kdb_bc, "<bpnum>", | 543 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); |
| 542 | "Clear Breakpoint", 0, KDB_REPEAT_NONE); | 544 | kdb_register_flags("bc", kdb_bc, "<bpnum>", |
| 543 | kdb_register_repeat("be", kdb_bc, "<bpnum>", | 545 | "Clear Breakpoint", 0, |
| 544 | "Enable Breakpoint", 0, KDB_REPEAT_NONE); | 546 | KDB_ENABLE_FLOW_CTRL); |
| 545 | kdb_register_repeat("bd", kdb_bc, "<bpnum>", | 547 | kdb_register_flags("be", kdb_bc, "<bpnum>", |
| 546 | "Disable Breakpoint", 0, KDB_REPEAT_NONE); | 548 | "Enable Breakpoint", 0, |
| 547 | 549 | KDB_ENABLE_FLOW_CTRL); | |
| 548 | kdb_register_repeat("ss", kdb_ss, "", | 550 | kdb_register_flags("bd", kdb_bc, "<bpnum>", |
| 549 | "Single Step", 1, KDB_REPEAT_NO_ARGS); | 551 | "Disable Breakpoint", 0, |
| 552 | KDB_ENABLE_FLOW_CTRL); | ||
| 553 | |||
| 554 | kdb_register_flags("ss", kdb_ss, "", | ||
| 555 | "Single Step", 1, | ||
| 556 | KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS); | ||
| 550 | /* | 557 | /* |
| 551 | * Architecture dependent initialization. | 558 | * Architecture dependent initialization. |
| 552 | */ | 559 | */ |
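
kdb_register_repeat() is replaced by kdb_register_flags(), whose final argument carries both a permission class (KDB_ENABLE_*) and the repeat behaviour in a single kdb_cmdflags_t. A hedged sketch of registering a hypothetical, read-only diagnostic command with the new API:

```c
#include <linux/kdb.h>
#include <linux/init.h>

/* Hypothetical command: prints a driver counter, touches no memory or state. */
static int kdb_widgets(int argc, const char **argv)
{
	kdb_printf("widgets processed: %lu\n", 42UL);
	return 0;
}

static int __init widgets_kdb_init(void)
{
	return kdb_register_flags("widgets", kdb_widgets, "",
				  "Show widget statistics", 0,
				  KDB_ENABLE_INSPECT | KDB_REPEAT_NO_ARGS);
}
late_initcall(widgets_kdb_init);
```
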
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8859ca34dcfe..15e1a7af5dd0 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c | |||
| @@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks) | |||
| 129 | ks->pass_exception = 1; | 129 | ks->pass_exception = 1; |
| 130 | KDB_FLAG_SET(CATASTROPHIC); | 130 | KDB_FLAG_SET(CATASTROPHIC); |
| 131 | } | 131 | } |
| 132 | /* set CATASTROPHIC if the system contains unresponsive processors */ | ||
| 133 | for_each_online_cpu(i) | ||
| 134 | if (!kgdb_info[i].enter_kgdb) | ||
| 135 | KDB_FLAG_SET(CATASTROPHIC); | ||
| 132 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { | 136 | if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { |
| 133 | KDB_STATE_CLEAR(SSBPT); | 137 | KDB_STATE_CLEAR(SSBPT); |
| 134 | KDB_STATE_CLEAR(DOING_SS); | 138 | KDB_STATE_CLEAR(DOING_SS); |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 379650b984f8..f191bddf64b8 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| 14 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
| 15 | #include <linux/types.h> | ||
| 15 | #include <linux/string.h> | 16 | #include <linux/string.h> |
| 16 | #include <linux/kernel.h> | 17 | #include <linux/kernel.h> |
| 17 | #include <linux/kmsg_dump.h> | 18 | #include <linux/kmsg_dump.h> |
| @@ -23,6 +24,7 @@ | |||
| 23 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
| 24 | #include <linux/atomic.h> | 25 | #include <linux/atomic.h> |
| 25 | #include <linux/module.h> | 26 | #include <linux/module.h> |
| 27 | #include <linux/moduleparam.h> | ||
| 26 | #include <linux/mm.h> | 28 | #include <linux/mm.h> |
| 27 | #include <linux/init.h> | 29 | #include <linux/init.h> |
| 28 | #include <linux/kallsyms.h> | 30 | #include <linux/kallsyms.h> |
| @@ -42,6 +44,12 @@ | |||
| 42 | #include <linux/slab.h> | 44 | #include <linux/slab.h> |
| 43 | #include "kdb_private.h" | 45 | #include "kdb_private.h" |
| 44 | 46 | ||
| 47 | #undef MODULE_PARAM_PREFIX | ||
| 48 | #define MODULE_PARAM_PREFIX "kdb." | ||
| 49 | |||
| 50 | static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE; | ||
| 51 | module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600); | ||
| 52 | |||
| 45 | #define GREP_LEN 256 | 53 | #define GREP_LEN 256 |
| 46 | char kdb_grep_string[GREP_LEN]; | 54 | char kdb_grep_string[GREP_LEN]; |
| 47 | int kdb_grepping_flag; | 55 | int kdb_grepping_flag; |
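
kdb_main.c gains a runtime knob for the command whitelist. Overriding MODULE_PARAM_PREFIX places the built-in parameter under the "kdb." namespace, so it can be set as kdb.cmd_enable=... on the kernel command line or via /sys/module/kdb/parameters/cmd_enable. The same pattern in isolation (the parameter name below is an example, not one added by this patch):

```c
#include <linux/moduleparam.h>

/*
 * Built-in code normally inherits KBUILD_MODNAME "." as its prefix;
 * overriding it groups the knob under a chosen namespace instead.
 */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "kdb."

static int example_enable;
module_param_named(example_enable, example_enable, int, 0600);
/* -> boot parameter kdb.example_enable, sysfs /sys/module/kdb/parameters/example_enable */
```
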
| @@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = { | |||
| 121 | KDBMSG(BADLENGTH, "Invalid length field"), | 129 | KDBMSG(BADLENGTH, "Invalid length field"), |
| 122 | KDBMSG(NOBP, "No Breakpoint exists"), | 130 | KDBMSG(NOBP, "No Breakpoint exists"), |
| 123 | KDBMSG(BADADDR, "Invalid address"), | 131 | KDBMSG(BADADDR, "Invalid address"), |
| 132 | KDBMSG(NOPERM, "Permission denied"), | ||
| 124 | }; | 133 | }; |
| 125 | #undef KDBMSG | 134 | #undef KDBMSG |
| 126 | 135 | ||
| @@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu) | |||
| 188 | } | 197 | } |
| 189 | 198 | ||
| 190 | /* | 199 | /* |
| 200 | * Check whether the flags of the current command and the permissions | ||
| 201 | * of the kdb console allow a command to be run. | ||

| 202 | */ | ||
| 203 | static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, | ||
| 204 | bool no_args) | ||
| 205 | { | ||
| 206 | /* permissions comes from userspace so needs massaging slightly */ | ||
| 207 | permissions &= KDB_ENABLE_MASK; | ||
| 208 | permissions |= KDB_ENABLE_ALWAYS_SAFE; | ||
| 209 | |||
| 210 | /* some commands change group when launched with no arguments */ | ||
| 211 | if (no_args) | ||
| 212 | permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT; | ||
| 213 | |||
| 214 | flags |= KDB_ENABLE_ALL; | ||
| 215 | |||
| 216 | return permissions & flags; | ||
| 217 | } | ||
| 218 | |||
| 219 | /* | ||
| 191 | * kdbgetenv - This function will return the character string value of | 220 | * kdbgetenv - This function will return the character string value of |
| 192 | * an environment variable. | 221 | * an environment variable. |
| 193 | * Parameters: | 222 | * Parameters: |
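
kdb_check_flags() is the gatekeeper used throughout the rest of this patch: the console permission mask (set via the new kdb.cmd_enable parameter) is sanitised, KDB_ENABLE_ALWAYS_SAFE is always granted, and when a command is invoked without arguments the permissions are widened by the NO_ARGS shift, so commands registered with *_NO_ARGS bits are easier to authorise when run bare. A small usage sketch that reuses the helper defined above:

```c
/* Would "md <vaddr>" be allowed under the current console mask? */
static bool md_is_allowed(int console_mask)
{
	/* "md" is registered with KDB_ENABLE_MEM_READ later in this patch. */
	return kdb_check_flags(KDB_ENABLE_MEM_READ, console_mask, false);
}
```
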
| @@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg, | |||
| 476 | kdb_symtab_t symtab; | 505 | kdb_symtab_t symtab; |
| 477 | 506 | ||
| 478 | /* | 507 | /* |
| 508 | * If the enable flags prohibit both arbitrary memory access | ||
| 509 | * and flow control then there are no reasonable grounds to | ||
| 510 | * provide symbol lookup. | ||
| 511 | */ | ||
| 512 | if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL, | ||
| 513 | kdb_cmd_enabled, false)) | ||
| 514 | return KDB_NOPERM; | ||
| 515 | |||
| 516 | /* | ||
| 479 | * Process arguments which follow the following syntax: | 517 | * Process arguments which follow the following syntax: |
| 480 | * | 518 | * |
| 481 | * symbol | numeric-address [+/- numeric-offset] | 519 | * symbol | numeric-address [+/- numeric-offset] |
| @@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) | |||
| 641 | if (!s->count) | 679 | if (!s->count) |
| 642 | s->usable = 0; | 680 | s->usable = 0; |
| 643 | if (s->usable) | 681 | if (s->usable) |
| 644 | kdb_register(s->name, kdb_exec_defcmd, | 682 | /* macros are always safe because when executed each |
| 645 | s->usage, s->help, 0); | 683 | * internal command re-enters kdb_parse() and is |
| 684 | * safety checked individually. | ||
| 685 | */ | ||
| 686 | kdb_register_flags(s->name, kdb_exec_defcmd, s->usage, | ||
| 687 | s->help, 0, | ||
| 688 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 646 | return 0; | 689 | return 0; |
| 647 | } | 690 | } |
| 648 | if (!s->usable) | 691 | if (!s->usable) |
| @@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr) | |||
| 1003 | 1046 | ||
| 1004 | if (i < kdb_max_commands) { | 1047 | if (i < kdb_max_commands) { |
| 1005 | int result; | 1048 | int result; |
| 1049 | |||
| 1050 | if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1)) | ||
| 1051 | return KDB_NOPERM; | ||
| 1052 | |||
| 1006 | KDB_STATE_SET(CMD); | 1053 | KDB_STATE_SET(CMD); |
| 1007 | result = (*tp->cmd_func)(argc-1, (const char **)argv); | 1054 | result = (*tp->cmd_func)(argc-1, (const char **)argv); |
| 1008 | if (result && ignore_errors && result > KDB_CMD_GO) | 1055 | if (result && ignore_errors && result > KDB_CMD_GO) |
| 1009 | result = 0; | 1056 | result = 0; |
| 1010 | KDB_STATE_CLEAR(CMD); | 1057 | KDB_STATE_CLEAR(CMD); |
| 1011 | switch (tp->cmd_repeat) { | 1058 | |
| 1012 | case KDB_REPEAT_NONE: | 1059 | if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS) |
| 1013 | argc = 0; | 1060 | return result; |
| 1014 | if (argv[0]) | 1061 | |
| 1015 | *(argv[0]) = '\0'; | 1062 | argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0; |
| 1016 | break; | 1063 | if (argv[argc]) |
| 1017 | case KDB_REPEAT_NO_ARGS: | 1064 | *(argv[argc]) = '\0'; |
| 1018 | argc = 1; | ||
| 1019 | if (argv[1]) | ||
| 1020 | *(argv[1]) = '\0'; | ||
| 1021 | break; | ||
| 1022 | case KDB_REPEAT_WITH_ARGS: | ||
| 1023 | break; | ||
| 1024 | } | ||
| 1025 | return result; | 1065 | return result; |
| 1026 | } | 1066 | } |
| 1027 | 1067 | ||
| @@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv) | |||
| 1921 | */ | 1961 | */ |
| 1922 | static int kdb_sr(int argc, const char **argv) | 1962 | static int kdb_sr(int argc, const char **argv) |
| 1923 | { | 1963 | { |
| 1964 | bool check_mask = | ||
| 1965 | !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false); | ||
| 1966 | |||
| 1924 | if (argc != 1) | 1967 | if (argc != 1) |
| 1925 | return KDB_ARGCOUNT; | 1968 | return KDB_ARGCOUNT; |
| 1969 | |||
| 1926 | kdb_trap_printk++; | 1970 | kdb_trap_printk++; |
| 1927 | __handle_sysrq(*argv[1], false); | 1971 | __handle_sysrq(*argv[1], check_mask); |
| 1928 | kdb_trap_printk--; | 1972 | kdb_trap_printk--; |
| 1929 | 1973 | ||
| 1930 | return 0; | 1974 | return 0; |
| @@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void) | |||
| 2157 | for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { | 2201 | for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { |
| 2158 | if (!cpu_online(i)) { | 2202 | if (!cpu_online(i)) { |
| 2159 | state = 'F'; /* cpu is offline */ | 2203 | state = 'F'; /* cpu is offline */ |
| 2204 | } else if (!kgdb_info[i].enter_kgdb) { | ||
| 2205 | state = 'D'; /* cpu is online but unresponsive */ | ||
| 2160 | } else { | 2206 | } else { |
| 2161 | state = ' '; /* cpu is responding to kdb */ | 2207 | state = ' '; /* cpu is responding to kdb */ |
| 2162 | if (kdb_task_state_char(KDB_TSK(i)) == 'I') | 2208 | if (kdb_task_state_char(KDB_TSK(i)) == 'I') |
| @@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv) | |||
| 2210 | /* | 2256 | /* |
| 2211 | * Validate cpunum | 2257 | * Validate cpunum |
| 2212 | */ | 2258 | */ |
| 2213 | if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) | 2259 | if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) |
| 2214 | return KDB_BADCPUNUM; | 2260 | return KDB_BADCPUNUM; |
| 2215 | 2261 | ||
| 2216 | dbg_switch_cpu = cpunum; | 2262 | dbg_switch_cpu = cpunum; |
| @@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv) | |||
| 2375 | return 0; | 2421 | return 0; |
| 2376 | if (!kt->cmd_name) | 2422 | if (!kt->cmd_name) |
| 2377 | continue; | 2423 | continue; |
| 2424 | if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true)) | ||
| 2425 | continue; | ||
| 2378 | if (strlen(kt->cmd_usage) > 20) | 2426 | if (strlen(kt->cmd_usage) > 20) |
| 2379 | space = "\n "; | 2427 | space = "\n "; |
| 2380 | kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, | 2428 | kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, |
| @@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv) | |||
| 2629 | } | 2677 | } |
| 2630 | 2678 | ||
| 2631 | /* | 2679 | /* |
| 2632 | * kdb_register_repeat - This function is used to register a kernel | 2680 | * kdb_register_flags - This function is used to register a kernel |
| 2633 | * debugger command. | 2681 | * debugger command. |
| 2634 | * Inputs: | 2682 | * Inputs: |
| 2635 | * cmd Command name | 2683 | * cmd Command name |
| @@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv) | |||
| 2641 | * zero for success, one if a duplicate command. | 2689 | * zero for success, one if a duplicate command. |
| 2642 | */ | 2690 | */ |
| 2643 | #define kdb_command_extend 50 /* arbitrary */ | 2691 | #define kdb_command_extend 50 /* arbitrary */ |
| 2644 | int kdb_register_repeat(char *cmd, | 2692 | int kdb_register_flags(char *cmd, |
| 2645 | kdb_func_t func, | 2693 | kdb_func_t func, |
| 2646 | char *usage, | 2694 | char *usage, |
| 2647 | char *help, | 2695 | char *help, |
| 2648 | short minlen, | 2696 | short minlen, |
| 2649 | kdb_repeat_t repeat) | 2697 | kdb_cmdflags_t flags) |
| 2650 | { | 2698 | { |
| 2651 | int i; | 2699 | int i; |
| 2652 | kdbtab_t *kp; | 2700 | kdbtab_t *kp; |
| @@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd, | |||
| 2694 | kp->cmd_func = func; | 2742 | kp->cmd_func = func; |
| 2695 | kp->cmd_usage = usage; | 2743 | kp->cmd_usage = usage; |
| 2696 | kp->cmd_help = help; | 2744 | kp->cmd_help = help; |
| 2697 | kp->cmd_flags = 0; | ||
| 2698 | kp->cmd_minlen = minlen; | 2745 | kp->cmd_minlen = minlen; |
| 2699 | kp->cmd_repeat = repeat; | 2746 | kp->cmd_flags = flags; |
| 2700 | 2747 | ||
| 2701 | return 0; | 2748 | return 0; |
| 2702 | } | 2749 | } |
| 2703 | EXPORT_SYMBOL_GPL(kdb_register_repeat); | 2750 | EXPORT_SYMBOL_GPL(kdb_register_flags); |
| 2704 | 2751 | ||
| 2705 | 2752 | ||
| 2706 | /* | 2753 | /* |
| 2707 | * kdb_register - Compatibility register function for commands that do | 2754 | * kdb_register - Compatibility register function for commands that do |
| 2708 | * not need to specify a repeat state. Equivalent to | 2755 | * not need to specify a repeat state. Equivalent to |
| 2709 | * kdb_register_repeat with KDB_REPEAT_NONE. | 2756 | * kdb_register_flags with flags set to 0. |
| 2710 | * Inputs: | 2757 | * Inputs: |
| 2711 | * cmd Command name | 2758 | * cmd Command name |
| 2712 | * func Function to execute the command | 2759 | * func Function to execute the command |
| @@ -2721,8 +2768,7 @@ int kdb_register(char *cmd, | |||
| 2721 | char *help, | 2768 | char *help, |
| 2722 | short minlen) | 2769 | short minlen) |
| 2723 | { | 2770 | { |
| 2724 | return kdb_register_repeat(cmd, func, usage, help, minlen, | 2771 | return kdb_register_flags(cmd, func, usage, help, minlen, 0); |
| 2725 | KDB_REPEAT_NONE); | ||
| 2726 | } | 2772 | } |
| 2727 | EXPORT_SYMBOL_GPL(kdb_register); | 2773 | EXPORT_SYMBOL_GPL(kdb_register); |
| 2728 | 2774 | ||
| @@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void) | |||
| 2764 | for_each_kdbcmd(kp, i) | 2810 | for_each_kdbcmd(kp, i) |
| 2765 | kp->cmd_name = NULL; | 2811 | kp->cmd_name = NULL; |
| 2766 | 2812 | ||
| 2767 | kdb_register_repeat("md", kdb_md, "<vaddr>", | 2813 | kdb_register_flags("md", kdb_md, "<vaddr>", |
| 2768 | "Display Memory Contents, also mdWcN, e.g. md8c1", 1, | 2814 | "Display Memory Contents, also mdWcN, e.g. md8c1", 1, |
| 2769 | KDB_REPEAT_NO_ARGS); | 2815 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2770 | kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", | 2816 | kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>", |
| 2771 | "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); | 2817 | "Display Raw Memory", 0, |
| 2772 | kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", | 2818 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2773 | "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); | 2819 | kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>", |
| 2774 | kdb_register_repeat("mds", kdb_md, "<vaddr>", | 2820 | "Display Physical Memory", 0, |
| 2775 | "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); | 2821 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2776 | kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", | 2822 | kdb_register_flags("mds", kdb_md, "<vaddr>", |
| 2777 | "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); | 2823 | "Display Memory Symbolically", 0, |
| 2778 | kdb_register_repeat("go", kdb_go, "[<vaddr>]", | 2824 | KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS); |
| 2779 | "Continue Execution", 1, KDB_REPEAT_NONE); | 2825 | kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>", |
| 2780 | kdb_register_repeat("rd", kdb_rd, "", | 2826 | "Modify Memory Contents", 0, |
| 2781 | "Display Registers", 0, KDB_REPEAT_NONE); | 2827 | KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS); |
| 2782 | kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", | 2828 | kdb_register_flags("go", kdb_go, "[<vaddr>]", |
| 2783 | "Modify Registers", 0, KDB_REPEAT_NONE); | 2829 | "Continue Execution", 1, |
| 2784 | kdb_register_repeat("ef", kdb_ef, "<vaddr>", | 2830 | KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); |
| 2785 | "Display exception frame", 0, KDB_REPEAT_NONE); | 2831 | kdb_register_flags("rd", kdb_rd, "", |
| 2786 | kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", | 2832 | "Display Registers", 0, |
| 2787 | "Stack traceback", 1, KDB_REPEAT_NONE); | 2833 | KDB_ENABLE_REG_READ); |
| 2788 | kdb_register_repeat("btp", kdb_bt, "<pid>", | 2834 | kdb_register_flags("rm", kdb_rm, "<reg> <contents>", |
| 2789 | "Display stack for process <pid>", 0, KDB_REPEAT_NONE); | 2835 | "Modify Registers", 0, |
| 2790 | kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", | 2836 | KDB_ENABLE_REG_WRITE); |
| 2791 | "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); | 2837 | kdb_register_flags("ef", kdb_ef, "<vaddr>", |
| 2792 | kdb_register_repeat("btc", kdb_bt, "", | 2838 | "Display exception frame", 0, |
| 2793 | "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); | 2839 | KDB_ENABLE_MEM_READ); |
| 2794 | kdb_register_repeat("btt", kdb_bt, "<vaddr>", | 2840 | kdb_register_flags("bt", kdb_bt, "[<vaddr>]", |
| 2841 | "Stack traceback", 1, | ||
| 2842 | KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); | ||
| 2843 | kdb_register_flags("btp", kdb_bt, "<pid>", | ||
| 2844 | "Display stack for process <pid>", 0, | ||
| 2845 | KDB_ENABLE_INSPECT); | ||
| 2846 | kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", | ||
| 2847 | "Backtrace all processes matching state flag", 0, | ||
| 2848 | KDB_ENABLE_INSPECT); | ||
| 2849 | kdb_register_flags("btc", kdb_bt, "", | ||
| 2850 | "Backtrace current process on each cpu", 0, | ||
| 2851 | KDB_ENABLE_INSPECT); | ||
| 2852 | kdb_register_flags("btt", kdb_bt, "<vaddr>", | ||
| 2795 | "Backtrace process given its struct task address", 0, | 2853 | "Backtrace process given its struct task address", 0, |
| 2796 | KDB_REPEAT_NONE); | 2854 | KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS); |
| 2797 | kdb_register_repeat("env", kdb_env, "", | 2855 | kdb_register_flags("env", kdb_env, "", |
| 2798 | "Show environment variables", 0, KDB_REPEAT_NONE); | 2856 | "Show environment variables", 0, |
| 2799 | kdb_register_repeat("set", kdb_set, "", | 2857 | KDB_ENABLE_ALWAYS_SAFE); |
| 2800 | "Set environment variables", 0, KDB_REPEAT_NONE); | 2858 | kdb_register_flags("set", kdb_set, "", |
| 2801 | kdb_register_repeat("help", kdb_help, "", | 2859 | "Set environment variables", 0, |
| 2802 | "Display Help Message", 1, KDB_REPEAT_NONE); | 2860 | KDB_ENABLE_ALWAYS_SAFE); |
| 2803 | kdb_register_repeat("?", kdb_help, "", | 2861 | kdb_register_flags("help", kdb_help, "", |
| 2804 | "Display Help Message", 0, KDB_REPEAT_NONE); | 2862 | "Display Help Message", 1, |
| 2805 | kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", | 2863 | KDB_ENABLE_ALWAYS_SAFE); |
| 2806 | "Switch to new cpu", 0, KDB_REPEAT_NONE); | 2864 | kdb_register_flags("?", kdb_help, "", |
| 2807 | kdb_register_repeat("kgdb", kdb_kgdb, "", | 2865 | "Display Help Message", 0, |
| 2808 | "Enter kgdb mode", 0, KDB_REPEAT_NONE); | 2866 | KDB_ENABLE_ALWAYS_SAFE); |
| 2809 | kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", | 2867 | kdb_register_flags("cpu", kdb_cpu, "<cpunum>", |
| 2810 | "Display active task list", 0, KDB_REPEAT_NONE); | 2868 | "Switch to new cpu", 0, |
| 2811 | kdb_register_repeat("pid", kdb_pid, "<pidnum>", | 2869 | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS); |
| 2812 | "Switch to another task", 0, KDB_REPEAT_NONE); | 2870 | kdb_register_flags("kgdb", kdb_kgdb, "", |
| 2813 | kdb_register_repeat("reboot", kdb_reboot, "", | 2871 | "Enter kgdb mode", 0, 0); |
| 2814 | "Reboot the machine immediately", 0, KDB_REPEAT_NONE); | 2872 | kdb_register_flags("ps", kdb_ps, "[<flags>|A]", |
| 2873 | "Display active task list", 0, | ||
| 2874 | KDB_ENABLE_INSPECT); | ||
| 2875 | kdb_register_flags("pid", kdb_pid, "<pidnum>", | ||
| 2876 | "Switch to another task", 0, | ||
| 2877 | KDB_ENABLE_INSPECT); | ||
| 2878 | kdb_register_flags("reboot", kdb_reboot, "", | ||
| 2879 | "Reboot the machine immediately", 0, | ||
| 2880 | KDB_ENABLE_REBOOT); | ||
| 2815 | #if defined(CONFIG_MODULES) | 2881 | #if defined(CONFIG_MODULES) |
| 2816 | kdb_register_repeat("lsmod", kdb_lsmod, "", | 2882 | kdb_register_flags("lsmod", kdb_lsmod, "", |
| 2817 | "List loaded kernel modules", 0, KDB_REPEAT_NONE); | 2883 | "List loaded kernel modules", 0, |
| 2884 | KDB_ENABLE_INSPECT); | ||
| 2818 | #endif | 2885 | #endif |
| 2819 | #if defined(CONFIG_MAGIC_SYSRQ) | 2886 | #if defined(CONFIG_MAGIC_SYSRQ) |
| 2820 | kdb_register_repeat("sr", kdb_sr, "<key>", | 2887 | kdb_register_flags("sr", kdb_sr, "<key>", |
| 2821 | "Magic SysRq key", 0, KDB_REPEAT_NONE); | 2888 | "Magic SysRq key", 0, |
| 2889 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 2822 | #endif | 2890 | #endif |
| 2823 | #if defined(CONFIG_PRINTK) | 2891 | #if defined(CONFIG_PRINTK) |
| 2824 | kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", | 2892 | kdb_register_flags("dmesg", kdb_dmesg, "[lines]", |
| 2825 | "Display syslog buffer", 0, KDB_REPEAT_NONE); | 2893 | "Display syslog buffer", 0, |
| 2894 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 2826 | #endif | 2895 | #endif |
| 2827 | if (arch_kgdb_ops.enable_nmi) { | 2896 | if (arch_kgdb_ops.enable_nmi) { |
| 2828 | kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", | 2897 | kdb_register_flags("disable_nmi", kdb_disable_nmi, "", |
| 2829 | "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); | 2898 | "Disable NMI entry to KDB", 0, |
| 2830 | } | 2899 | KDB_ENABLE_ALWAYS_SAFE); |
| 2831 | kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", | 2900 | } |
| 2832 | "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); | 2901 | kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"", |
| 2833 | kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", | 2902 | "Define a set of commands, down to endefcmd", 0, |
| 2834 | "Send a signal to a process", 0, KDB_REPEAT_NONE); | 2903 | KDB_ENABLE_ALWAYS_SAFE); |
| 2835 | kdb_register_repeat("summary", kdb_summary, "", | 2904 | kdb_register_flags("kill", kdb_kill, "<-signal> <pid>", |
| 2836 | "Summarize the system", 4, KDB_REPEAT_NONE); | 2905 | "Send a signal to a process", 0, |
| 2837 | kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", | 2906 | KDB_ENABLE_SIGNAL); |
| 2838 | "Display per_cpu variables", 3, KDB_REPEAT_NONE); | 2907 | kdb_register_flags("summary", kdb_summary, "", |
| 2839 | kdb_register_repeat("grephelp", kdb_grep_help, "", | 2908 | "Summarize the system", 4, |
| 2840 | "Display help on | grep", 0, KDB_REPEAT_NONE); | 2909 | KDB_ENABLE_ALWAYS_SAFE); |
| 2910 | kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", | ||
| 2911 | "Display per_cpu variables", 3, | ||
| 2912 | KDB_ENABLE_MEM_READ); | ||
| 2913 | kdb_register_flags("grephelp", kdb_grep_help, "", | ||
| 2914 | "Display help on | grep", 0, | ||
| 2915 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 2841 | } | 2916 | } |
| 2842 | 2917 | ||
| 2843 | /* Execute any commands defined in kdb_cmds. */ | 2918 | /* Execute any commands defined in kdb_cmds. */ |
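The hunk above converts every built-in kdb command from kdb_register_repeat()/KDB_REPEAT_* to kdb_register_flags(), which takes a KDB_ENABLE_* permission mask instead of a repeat hint. As a minimal sketch only, here is how a hypothetical out-of-tree command could be registered under the new interface; the "hello" command and its handler are invented for illustration and assume a kernel built with kdb support:

```c
#include <linux/kdb.h>
#include <linux/module.h>

/* Hypothetical handler; kdb command functions take (argc, argv). */
static int kdb_hello(int argc, const char **argv)
{
	kdb_printf("hello from kdb\n");
	return 0;
}

static int __init kdb_hello_init(void)
{
	/*
	 * Read-only and side-effect free, so mark it always safe,
	 * mirroring how "env" and "summary" are registered above.
	 */
	kdb_register_flags("hello", kdb_hello, "",
			   "Print a test message", 0,
			   KDB_ENABLE_ALWAYS_SAFE);
	return 0;
}
module_init(kdb_hello_init);
MODULE_LICENSE("GPL");
```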
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 7afd3c8c41d5..eaacd1693954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
| @@ -172,10 +172,9 @@ typedef struct _kdbtab { | |||
| 172 | kdb_func_t cmd_func; /* Function to execute command */ | 172 | kdb_func_t cmd_func; /* Function to execute command */ |
| 173 | char *cmd_usage; /* Usage String for this command */ | 173 | char *cmd_usage; /* Usage String for this command */ |
| 174 | char *cmd_help; /* Help message for this command */ | 174 | char *cmd_help; /* Help message for this command */ |
| 175 | short cmd_flags; /* Parsing flags */ | ||
| 176 | short cmd_minlen; /* Minimum legal # command | 175 | short cmd_minlen; /* Minimum legal # command |
| 177 | * chars required */ | 176 | * chars required */ |
| 178 | kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ | 177 | kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ |
| 179 | } kdbtab_t; | 178 | } kdbtab_t; |
| 180 | 179 | ||
| 181 | extern int kdb_bt(int, const char **); /* KDB display back trace */ | 180 | extern int kdb_bt(int, const char **); /* KDB display back trace */ |
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 97b67df8fbfe..d659487254d5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c | |||
| @@ -52,7 +52,7 @@ static void release_callchain_buffers(void) | |||
| 52 | struct callchain_cpus_entries *entries; | 52 | struct callchain_cpus_entries *entries; |
| 53 | 53 | ||
| 54 | entries = callchain_cpus_entries; | 54 | entries = callchain_cpus_entries; |
| 55 | rcu_assign_pointer(callchain_cpus_entries, NULL); | 55 | RCU_INIT_POINTER(callchain_cpus_entries, NULL); |
| 56 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | 56 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); |
| 57 | } | 57 | } |
| 58 | 58 | ||
| @@ -137,7 +137,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) | |||
| 137 | int cpu; | 137 | int cpu; |
| 138 | struct callchain_cpus_entries *entries; | 138 | struct callchain_cpus_entries *entries; |
| 139 | 139 | ||
| 140 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | 140 | *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); |
| 141 | if (*rctx == -1) | 141 | if (*rctx == -1) |
| 142 | return NULL; | 142 | return NULL; |
| 143 | 143 | ||
| @@ -153,7 +153,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) | |||
| 153 | static void | 153 | static void |
| 154 | put_callchain_entry(int rctx) | 154 | put_callchain_entry(int rctx) |
| 155 | { | 155 | { |
| 156 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | 156 | put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); |
| 157 | } | 157 | } |
| 158 | 158 | ||
| 159 | struct perf_callchain_entry * | 159 | struct perf_callchain_entry * |
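Two mechanical conversions start here and continue through the perf core below: __get_cpu_var()-style accesses become this_cpu_ptr()/__this_cpu_read()/__this_cpu_write(), and rcu_assign_pointer(p, NULL) becomes RCU_INIT_POINTER(p, NULL), since storing NULL publishes no data that needs the assign-side barrier. A small illustrative sketch of both idioms on an invented per-CPU structure (not code from this patch):

```c
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

/* Invented per-CPU aggregate, purely for illustration. */
struct hello_stats {
	u64 samples;
};

static DEFINE_PER_CPU(struct hello_stats, hello_stats);
static struct hello_stats __rcu *hello_shared;

/* Assumes the caller runs with preemption disabled, as the perf code does. */
static void hello_update(u64 len)
{
	/*
	 * this_cpu_ptr(&var) returns this CPU's instance; the older
	 * &__get_cpu_var(var) form is what this series removes.
	 */
	struct hello_stats *st = this_cpu_ptr(&hello_stats);

	st->samples += len;

	/*
	 * Publishing NULL carries no pointed-to data to order against,
	 * so RCU_INIT_POINTER() is enough where rcu_assign_pointer()
	 * was previously used with a NULL argument.
	 */
	RCU_INIT_POINTER(hello_shared, NULL);
}
```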
diff --git a/kernel/events/core.c b/kernel/events/core.c index 963bf139e2b2..882f835a0d85 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -47,6 +47,8 @@ | |||
| 47 | 47 | ||
| 48 | #include <asm/irq_regs.h> | 48 | #include <asm/irq_regs.h> |
| 49 | 49 | ||
| 50 | static struct workqueue_struct *perf_wq; | ||
| 51 | |||
| 50 | struct remote_function_call { | 52 | struct remote_function_call { |
| 51 | struct task_struct *p; | 53 | struct task_struct *p; |
| 52 | int (*func)(void *info); | 54 | int (*func)(void *info); |
| @@ -120,6 +122,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | |||
| 120 | return data.ret; | 122 | return data.ret; |
| 121 | } | 123 | } |
| 122 | 124 | ||
| 125 | #define EVENT_OWNER_KERNEL ((void *) -1) | ||
| 126 | |||
| 127 | static bool is_kernel_event(struct perf_event *event) | ||
| 128 | { | ||
| 129 | return event->owner == EVENT_OWNER_KERNEL; | ||
| 130 | } | ||
| 131 | |||
| 123 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | 132 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ |
| 124 | PERF_FLAG_FD_OUTPUT |\ | 133 | PERF_FLAG_FD_OUTPUT |\ |
| 125 | PERF_FLAG_PID_CGROUP |\ | 134 | PERF_FLAG_PID_CGROUP |\ |
| @@ -240,7 +249,7 @@ static void perf_duration_warn(struct irq_work *w) | |||
| 240 | u64 avg_local_sample_len; | 249 | u64 avg_local_sample_len; |
| 241 | u64 local_samples_len; | 250 | u64 local_samples_len; |
| 242 | 251 | ||
| 243 | local_samples_len = __get_cpu_var(running_sample_length); | 252 | local_samples_len = __this_cpu_read(running_sample_length); |
| 244 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; | 253 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; |
| 245 | 254 | ||
| 246 | printk_ratelimited(KERN_WARNING | 255 | printk_ratelimited(KERN_WARNING |
| @@ -262,10 +271,10 @@ void perf_sample_event_took(u64 sample_len_ns) | |||
| 262 | return; | 271 | return; |
| 263 | 272 | ||
| 264 | /* decay the counter by 1 average sample */ | 273 | /* decay the counter by 1 average sample */ |
| 265 | local_samples_len = __get_cpu_var(running_sample_length); | 274 | local_samples_len = __this_cpu_read(running_sample_length); |
| 266 | local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; | 275 | local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; |
| 267 | local_samples_len += sample_len_ns; | 276 | local_samples_len += sample_len_ns; |
| 268 | __get_cpu_var(running_sample_length) = local_samples_len; | 277 | __this_cpu_write(running_sample_length, local_samples_len); |
| 269 | 278 | ||
| 270 | /* | 279 | /* |
| 271 | * note: this will be biased artifically low until we have | 280 | * note: this will be biased artifically low until we have |
| @@ -392,14 +401,9 @@ perf_cgroup_match(struct perf_event *event) | |||
| 392 | event->cgrp->css.cgroup); | 401 | event->cgrp->css.cgroup); |
| 393 | } | 402 | } |
| 394 | 403 | ||
| 395 | static inline void perf_put_cgroup(struct perf_event *event) | ||
| 396 | { | ||
| 397 | css_put(&event->cgrp->css); | ||
| 398 | } | ||
| 399 | |||
| 400 | static inline void perf_detach_cgroup(struct perf_event *event) | 404 | static inline void perf_detach_cgroup(struct perf_event *event) |
| 401 | { | 405 | { |
| 402 | perf_put_cgroup(event); | 406 | css_put(&event->cgrp->css); |
| 403 | event->cgrp = NULL; | 407 | event->cgrp = NULL; |
| 404 | } | 408 | } |
| 405 | 409 | ||
| @@ -610,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
| 610 | if (!f.file) | 614 | if (!f.file) |
| 611 | return -EBADF; | 615 | return -EBADF; |
| 612 | 616 | ||
| 613 | css = css_tryget_online_from_dir(f.file->f_dentry, | 617 | css = css_tryget_online_from_dir(f.file->f_path.dentry, |
| 614 | &perf_event_cgrp_subsys); | 618 | &perf_event_cgrp_subsys); |
| 615 | if (IS_ERR(css)) { | 619 | if (IS_ERR(css)) { |
| 616 | ret = PTR_ERR(css); | 620 | ret = PTR_ERR(css); |
| @@ -878,7 +882,7 @@ static DEFINE_PER_CPU(struct list_head, rotation_list); | |||
| 878 | static void perf_pmu_rotate_start(struct pmu *pmu) | 882 | static void perf_pmu_rotate_start(struct pmu *pmu) |
| 879 | { | 883 | { |
| 880 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | 884 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
| 881 | struct list_head *head = &__get_cpu_var(rotation_list); | 885 | struct list_head *head = this_cpu_ptr(&rotation_list); |
| 882 | 886 | ||
| 883 | WARN_ON(!irqs_disabled()); | 887 | WARN_ON(!irqs_disabled()); |
| 884 | 888 | ||
| @@ -902,13 +906,23 @@ static void put_ctx(struct perf_event_context *ctx) | |||
| 902 | } | 906 | } |
| 903 | } | 907 | } |
| 904 | 908 | ||
| 905 | static void unclone_ctx(struct perf_event_context *ctx) | 909 | /* |
| 910 | * This must be done under the ctx->lock, such as to serialize against | ||
| 911 | * context_equiv(), therefore we cannot call put_ctx() since that might end up | ||
| 912 | * calling scheduler related locks and ctx->lock nests inside those. | ||
| 913 | */ | ||
| 914 | static __must_check struct perf_event_context * | ||
| 915 | unclone_ctx(struct perf_event_context *ctx) | ||
| 906 | { | 916 | { |
| 907 | if (ctx->parent_ctx) { | 917 | struct perf_event_context *parent_ctx = ctx->parent_ctx; |
| 908 | put_ctx(ctx->parent_ctx); | 918 | |
| 919 | lockdep_assert_held(&ctx->lock); | ||
| 920 | |||
| 921 | if (parent_ctx) | ||
| 909 | ctx->parent_ctx = NULL; | 922 | ctx->parent_ctx = NULL; |
| 910 | } | ||
| 911 | ctx->generation++; | 923 | ctx->generation++; |
| 924 | |||
| 925 | return parent_ctx; | ||
| 912 | } | 926 | } |
| 913 | 927 | ||
| 914 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | 928 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) |
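unclone_ctx() above no longer drops the parent reference itself: as the new comment explains, put_ctx() can end up taking scheduler-related locks while ctx->lock nests inside them, so the function now returns the detached parent and the callers (find_get_context() and perf_event_exit_task_context() further down) call put_ctx() only after releasing ctx->lock. The same detach-under-lock, release-after-unlock pattern in isolation, on an invented structure:

```c
#include <linux/spinlock.h>

/* Invented object; stands in for perf_event_context and its parent_ctx. */
struct obj {
	raw_spinlock_t	lock;
	struct obj	*parent;	/* counted reference, may be NULL */
};

/*
 * Stand-in for put_ctx(): may take locks that nest outside obj->lock,
 * so it must never be called with obj->lock held.
 */
static void obj_put(struct obj *o)
{
	/* drop a reference; body omitted in this sketch */
}

/*
 * Detach and return the parent while the caller holds o->lock;
 * ownership of the reference moves to the caller.
 */
static struct obj *obj_unclone(struct obj *o)
{
	struct obj *parent = o->parent;

	if (parent)
		o->parent = NULL;
	return parent;
}

static void obj_example(struct obj *o)
{
	struct obj *parent;
	unsigned long flags;

	raw_spin_lock_irqsave(&o->lock, flags);
	parent = obj_unclone(o);
	raw_spin_unlock_irqrestore(&o->lock, flags);

	if (parent)
		obj_put(parent);	/* safe: o->lock is no longer held */
}
```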
| @@ -1375,6 +1389,45 @@ out: | |||
| 1375 | perf_event__header_size(tmp); | 1389 | perf_event__header_size(tmp); |
| 1376 | } | 1390 | } |
| 1377 | 1391 | ||
| 1392 | /* | ||
| 1393 | * User event without the task. | ||
| 1394 | */ | ||
| 1395 | static bool is_orphaned_event(struct perf_event *event) | ||
| 1396 | { | ||
| 1397 | return event && !is_kernel_event(event) && !event->owner; | ||
| 1398 | } | ||
| 1399 | |||
| 1400 | /* | ||
| 1401 | * Event has a parent but parent's task finished and it's | ||
| 1402 | * alive only because of children holding reference. | ||
| 1403 | */ | ||
| 1404 | static bool is_orphaned_child(struct perf_event *event) | ||
| 1405 | { | ||
| 1406 | return is_orphaned_event(event->parent); | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | static void orphans_remove_work(struct work_struct *work); | ||
| 1410 | |||
| 1411 | static void schedule_orphans_remove(struct perf_event_context *ctx) | ||
| 1412 | { | ||
| 1413 | if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) | ||
| 1414 | return; | ||
| 1415 | |||
| 1416 | if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { | ||
| 1417 | get_ctx(ctx); | ||
| 1418 | ctx->orphans_remove_sched = true; | ||
| 1419 | } | ||
| 1420 | } | ||
| 1421 | |||
| 1422 | static int __init perf_workqueue_init(void) | ||
| 1423 | { | ||
| 1424 | perf_wq = create_singlethread_workqueue("perf"); | ||
| 1425 | WARN(!perf_wq, "failed to create perf workqueue\n"); | ||
| 1426 | return perf_wq ? 0 : -1; | ||
| 1427 | } | ||
| 1428 | |||
| 1429 | core_initcall(perf_workqueue_init); | ||
| 1430 | |||
| 1378 | static inline int | 1431 | static inline int |
| 1379 | event_filter_match(struct perf_event *event) | 1432 | event_filter_match(struct perf_event *event) |
| 1380 | { | 1433 | { |
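The block above adds deferred cleanup of orphaned events: schedule_orphans_remove() queues the context's orphans_remove delayed work on a dedicated singlethread "perf" workqueue, takes a context reference and sets orphans_remove_sched only when the work was actually queued, and orphans_remove_work() (added further down) clears the flag under ctx->lock and drops the reference. A stripped-down sketch of that deferral pattern on an invented structure; the serialization the real code gets from ctx->lock is left out here:

```c
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

/* Invented container; stands in for perf_event_context and its fields. */
struct box {
	struct delayed_work	cleanup;
	bool			cleanup_sched;
};

static struct workqueue_struct *box_wq;

static void box_cleanup_work(struct work_struct *work)
{
	struct box *b = container_of(work, struct box, cleanup.work);

	/* ... drop stale entries ... */
	b->cleanup_sched = false;
	/* the reference taken when scheduling would be dropped here */
}

static void box_init(struct box *b)
{
	INIT_DELAYED_WORK(&b->cleanup, box_cleanup_work);
	b->cleanup_sched = false;
}

static void box_schedule_cleanup(struct box *b)
{
	if (b->cleanup_sched || !box_wq)
		return;

	/*
	 * queue_delayed_work() returns false if the work is already
	 * queued, so the object is pinned (get_ctx() in the patch)
	 * only when this call really queued it.
	 */
	if (queue_delayed_work(box_wq, &b->cleanup, 1))
		b->cleanup_sched = true;
}

static int __init box_wq_init(void)
{
	box_wq = create_singlethread_workqueue("box");
	return box_wq ? 0 : -1;
}
core_initcall(box_wq_init);
```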
| @@ -1424,6 +1477,9 @@ event_sched_out(struct perf_event *event, | |||
| 1424 | if (event->attr.exclusive || !cpuctx->active_oncpu) | 1477 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
| 1425 | cpuctx->exclusive = 0; | 1478 | cpuctx->exclusive = 0; |
| 1426 | 1479 | ||
| 1480 | if (is_orphaned_child(event)) | ||
| 1481 | schedule_orphans_remove(ctx); | ||
| 1482 | |||
| 1427 | perf_pmu_enable(event->pmu); | 1483 | perf_pmu_enable(event->pmu); |
| 1428 | } | 1484 | } |
| 1429 | 1485 | ||
| @@ -1506,8 +1562,10 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group | |||
| 1506 | 1562 | ||
| 1507 | if (!task) { | 1563 | if (!task) { |
| 1508 | /* | 1564 | /* |
| 1509 | * Per cpu events are removed via an smp call and | 1565 | * Per cpu events are removed via an smp call. The removal can |
| 1510 | * the removal is always successful. | 1566 | * fail if the CPU is currently offline, but in that case we |
| 1567 | * already called __perf_remove_from_context from | ||
| 1568 | * perf_event_exit_cpu. | ||
| 1511 | */ | 1569 | */ |
| 1512 | cpu_function_call(event->cpu, __perf_remove_from_context, &re); | 1570 | cpu_function_call(event->cpu, __perf_remove_from_context, &re); |
| 1513 | return; | 1571 | return; |
| @@ -1731,6 +1789,9 @@ event_sched_in(struct perf_event *event, | |||
| 1731 | if (event->attr.exclusive) | 1789 | if (event->attr.exclusive) |
| 1732 | cpuctx->exclusive = 1; | 1790 | cpuctx->exclusive = 1; |
| 1733 | 1791 | ||
| 1792 | if (is_orphaned_child(event)) | ||
| 1793 | schedule_orphans_remove(ctx); | ||
| 1794 | |||
| 1734 | out: | 1795 | out: |
| 1735 | perf_pmu_enable(event->pmu); | 1796 | perf_pmu_enable(event->pmu); |
| 1736 | 1797 | ||
| @@ -2210,6 +2271,9 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
| 2210 | static int context_equiv(struct perf_event_context *ctx1, | 2271 | static int context_equiv(struct perf_event_context *ctx1, |
| 2211 | struct perf_event_context *ctx2) | 2272 | struct perf_event_context *ctx2) |
| 2212 | { | 2273 | { |
| 2274 | lockdep_assert_held(&ctx1->lock); | ||
| 2275 | lockdep_assert_held(&ctx2->lock); | ||
| 2276 | |||
| 2213 | /* Pinning disables the swap optimization */ | 2277 | /* Pinning disables the swap optimization */ |
| 2214 | if (ctx1->pin_count || ctx2->pin_count) | 2278 | if (ctx1->pin_count || ctx2->pin_count) |
| 2215 | return 0; | 2279 | return 0; |
| @@ -2331,7 +2395,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2331 | next_parent = rcu_dereference(next_ctx->parent_ctx); | 2395 | next_parent = rcu_dereference(next_ctx->parent_ctx); |
| 2332 | 2396 | ||
| 2333 | /* If neither context have a parent context; they cannot be clones. */ | 2397 | /* If neither context have a parent context; they cannot be clones. */ |
| 2334 | if (!parent || !next_parent) | 2398 | if (!parent && !next_parent) |
| 2335 | goto unlock; | 2399 | goto unlock; |
| 2336 | 2400 | ||
| 2337 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { | 2401 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { |
| @@ -2400,7 +2464,7 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
| 2400 | * to check if we have to switch out PMU state. | 2464 | * to check if we have to switch out PMU state. |
| 2401 | * cgroup event are system-wide mode only | 2465 | * cgroup event are system-wide mode only |
| 2402 | */ | 2466 | */ |
| 2403 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | 2467 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) |
| 2404 | perf_cgroup_sched_out(task, next); | 2468 | perf_cgroup_sched_out(task, next); |
| 2405 | } | 2469 | } |
| 2406 | 2470 | ||
| @@ -2643,11 +2707,11 @@ void __perf_event_task_sched_in(struct task_struct *prev, | |||
| 2643 | * to check if we have to switch in PMU state. | 2707 | * to check if we have to switch in PMU state. |
| 2644 | * cgroup event are system-wide mode only | 2708 | * cgroup event are system-wide mode only |
| 2645 | */ | 2709 | */ |
| 2646 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | 2710 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) |
| 2647 | perf_cgroup_sched_in(prev, task); | 2711 | perf_cgroup_sched_in(prev, task); |
| 2648 | 2712 | ||
| 2649 | /* check for system-wide branch_stack events */ | 2713 | /* check for system-wide branch_stack events */ |
| 2650 | if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) | 2714 | if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) |
| 2651 | perf_branch_stack_sched_in(prev, task); | 2715 | perf_branch_stack_sched_in(prev, task); |
| 2652 | } | 2716 | } |
| 2653 | 2717 | ||
| @@ -2902,7 +2966,7 @@ bool perf_event_can_stop_tick(void) | |||
| 2902 | 2966 | ||
| 2903 | void perf_event_task_tick(void) | 2967 | void perf_event_task_tick(void) |
| 2904 | { | 2968 | { |
| 2905 | struct list_head *head = &__get_cpu_var(rotation_list); | 2969 | struct list_head *head = this_cpu_ptr(&rotation_list); |
| 2906 | struct perf_cpu_context *cpuctx, *tmp; | 2970 | struct perf_cpu_context *cpuctx, *tmp; |
| 2907 | struct perf_event_context *ctx; | 2971 | struct perf_event_context *ctx; |
| 2908 | int throttled; | 2972 | int throttled; |
| @@ -2943,6 +3007,7 @@ static int event_enable_on_exec(struct perf_event *event, | |||
| 2943 | */ | 3007 | */ |
| 2944 | static void perf_event_enable_on_exec(struct perf_event_context *ctx) | 3008 | static void perf_event_enable_on_exec(struct perf_event_context *ctx) |
| 2945 | { | 3009 | { |
| 3010 | struct perf_event_context *clone_ctx = NULL; | ||
| 2946 | struct perf_event *event; | 3011 | struct perf_event *event; |
| 2947 | unsigned long flags; | 3012 | unsigned long flags; |
| 2948 | int enabled = 0; | 3013 | int enabled = 0; |
| @@ -2974,7 +3039,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
| 2974 | * Unclone this context if we enabled any event. | 3039 | * Unclone this context if we enabled any event. |
| 2975 | */ | 3040 | */ |
| 2976 | if (enabled) | 3041 | if (enabled) |
| 2977 | unclone_ctx(ctx); | 3042 | clone_ctx = unclone_ctx(ctx); |
| 2978 | 3043 | ||
| 2979 | raw_spin_unlock(&ctx->lock); | 3044 | raw_spin_unlock(&ctx->lock); |
| 2980 | 3045 | ||
| @@ -2984,6 +3049,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
| 2984 | perf_event_context_sched_in(ctx, ctx->task); | 3049 | perf_event_context_sched_in(ctx, ctx->task); |
| 2985 | out: | 3050 | out: |
| 2986 | local_irq_restore(flags); | 3051 | local_irq_restore(flags); |
| 3052 | |||
| 3053 | if (clone_ctx) | ||
| 3054 | put_ctx(clone_ctx); | ||
| 2987 | } | 3055 | } |
| 2988 | 3056 | ||
| 2989 | void perf_event_exec(void) | 3057 | void perf_event_exec(void) |
| @@ -3078,6 +3146,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) | |||
| 3078 | INIT_LIST_HEAD(&ctx->flexible_groups); | 3146 | INIT_LIST_HEAD(&ctx->flexible_groups); |
| 3079 | INIT_LIST_HEAD(&ctx->event_list); | 3147 | INIT_LIST_HEAD(&ctx->event_list); |
| 3080 | atomic_set(&ctx->refcount, 1); | 3148 | atomic_set(&ctx->refcount, 1); |
| 3149 | INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); | ||
| 3081 | } | 3150 | } |
| 3082 | 3151 | ||
| 3083 | static struct perf_event_context * | 3152 | static struct perf_event_context * |
| @@ -3135,7 +3204,7 @@ errout: | |||
| 3135 | static struct perf_event_context * | 3204 | static struct perf_event_context * |
| 3136 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | 3205 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) |
| 3137 | { | 3206 | { |
| 3138 | struct perf_event_context *ctx; | 3207 | struct perf_event_context *ctx, *clone_ctx = NULL; |
| 3139 | struct perf_cpu_context *cpuctx; | 3208 | struct perf_cpu_context *cpuctx; |
| 3140 | unsigned long flags; | 3209 | unsigned long flags; |
| 3141 | int ctxn, err; | 3210 | int ctxn, err; |
| @@ -3169,9 +3238,12 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
| 3169 | retry: | 3238 | retry: |
| 3170 | ctx = perf_lock_task_context(task, ctxn, &flags); | 3239 | ctx = perf_lock_task_context(task, ctxn, &flags); |
| 3171 | if (ctx) { | 3240 | if (ctx) { |
| 3172 | unclone_ctx(ctx); | 3241 | clone_ctx = unclone_ctx(ctx); |
| 3173 | ++ctx->pin_count; | 3242 | ++ctx->pin_count; |
| 3174 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 3243 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
| 3244 | |||
| 3245 | if (clone_ctx) | ||
| 3246 | put_ctx(clone_ctx); | ||
| 3175 | } else { | 3247 | } else { |
| 3176 | ctx = alloc_perf_context(pmu, task); | 3248 | ctx = alloc_perf_context(pmu, task); |
| 3177 | err = -ENOMEM; | 3249 | err = -ENOMEM; |
| @@ -3323,16 +3395,12 @@ static void free_event(struct perf_event *event) | |||
| 3323 | } | 3395 | } |
| 3324 | 3396 | ||
| 3325 | /* | 3397 | /* |
| 3326 | * Called when the last reference to the file is gone. | 3398 | * Remove user event from the owner task. |
| 3327 | */ | 3399 | */ |
| 3328 | static void put_event(struct perf_event *event) | 3400 | static void perf_remove_from_owner(struct perf_event *event) |
| 3329 | { | 3401 | { |
| 3330 | struct perf_event_context *ctx = event->ctx; | ||
| 3331 | struct task_struct *owner; | 3402 | struct task_struct *owner; |
| 3332 | 3403 | ||
| 3333 | if (!atomic_long_dec_and_test(&event->refcount)) | ||
| 3334 | return; | ||
| 3335 | |||
| 3336 | rcu_read_lock(); | 3404 | rcu_read_lock(); |
| 3337 | owner = ACCESS_ONCE(event->owner); | 3405 | owner = ACCESS_ONCE(event->owner); |
| 3338 | /* | 3406 | /* |
| @@ -3365,6 +3433,20 @@ static void put_event(struct perf_event *event) | |||
| 3365 | mutex_unlock(&owner->perf_event_mutex); | 3433 | mutex_unlock(&owner->perf_event_mutex); |
| 3366 | put_task_struct(owner); | 3434 | put_task_struct(owner); |
| 3367 | } | 3435 | } |
| 3436 | } | ||
| 3437 | |||
| 3438 | /* | ||
| 3439 | * Called when the last reference to the file is gone. | ||
| 3440 | */ | ||
| 3441 | static void put_event(struct perf_event *event) | ||
| 3442 | { | ||
| 3443 | struct perf_event_context *ctx = event->ctx; | ||
| 3444 | |||
| 3445 | if (!atomic_long_dec_and_test(&event->refcount)) | ||
| 3446 | return; | ||
| 3447 | |||
| 3448 | if (!is_kernel_event(event)) | ||
| 3449 | perf_remove_from_owner(event); | ||
| 3368 | 3450 | ||
| 3369 | WARN_ON_ONCE(ctx->parent_ctx); | 3451 | WARN_ON_ONCE(ctx->parent_ctx); |
| 3370 | /* | 3452 | /* |
| @@ -3399,6 +3481,42 @@ static int perf_release(struct inode *inode, struct file *file) | |||
| 3399 | return 0; | 3481 | return 0; |
| 3400 | } | 3482 | } |
| 3401 | 3483 | ||
| 3484 | /* | ||
| 3485 | * Remove all orphaned events from the context. | ||
| 3486 | */ | ||
| 3487 | static void orphans_remove_work(struct work_struct *work) | ||
| 3488 | { | ||
| 3489 | struct perf_event_context *ctx; | ||
| 3490 | struct perf_event *event, *tmp; | ||
| 3491 | |||
| 3492 | ctx = container_of(work, struct perf_event_context, | ||
| 3493 | orphans_remove.work); | ||
| 3494 | |||
| 3495 | mutex_lock(&ctx->mutex); | ||
| 3496 | list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { | ||
| 3497 | struct perf_event *parent_event = event->parent; | ||
| 3498 | |||
| 3499 | if (!is_orphaned_child(event)) | ||
| 3500 | continue; | ||
| 3501 | |||
| 3502 | perf_remove_from_context(event, true); | ||
| 3503 | |||
| 3504 | mutex_lock(&parent_event->child_mutex); | ||
| 3505 | list_del_init(&event->child_list); | ||
| 3506 | mutex_unlock(&parent_event->child_mutex); | ||
| 3507 | |||
| 3508 | free_event(event); | ||
| 3509 | put_event(parent_event); | ||
| 3510 | } | ||
| 3511 | |||
| 3512 | raw_spin_lock_irq(&ctx->lock); | ||
| 3513 | ctx->orphans_remove_sched = false; | ||
| 3514 | raw_spin_unlock_irq(&ctx->lock); | ||
| 3515 | mutex_unlock(&ctx->mutex); | ||
| 3516 | |||
| 3517 | put_ctx(ctx); | ||
| 3518 | } | ||
| 3519 | |||
| 3402 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | 3520 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) |
| 3403 | { | 3521 | { |
| 3404 | struct perf_event *child; | 3522 | struct perf_event *child; |
| @@ -3496,6 +3614,19 @@ static int perf_event_read_one(struct perf_event *event, | |||
| 3496 | return n * sizeof(u64); | 3614 | return n * sizeof(u64); |
| 3497 | } | 3615 | } |
| 3498 | 3616 | ||
| 3617 | static bool is_event_hup(struct perf_event *event) | ||
| 3618 | { | ||
| 3619 | bool no_children; | ||
| 3620 | |||
| 3621 | if (event->state != PERF_EVENT_STATE_EXIT) | ||
| 3622 | return false; | ||
| 3623 | |||
| 3624 | mutex_lock(&event->child_mutex); | ||
| 3625 | no_children = list_empty(&event->child_list); | ||
| 3626 | mutex_unlock(&event->child_mutex); | ||
| 3627 | return no_children; | ||
| 3628 | } | ||
| 3629 | |||
| 3499 | /* | 3630 | /* |
| 3500 | * Read the performance event - simple non blocking version for now | 3631 | * Read the performance event - simple non blocking version for now |
| 3501 | */ | 3632 | */ |
| @@ -3537,7 +3668,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
| 3537 | { | 3668 | { |
| 3538 | struct perf_event *event = file->private_data; | 3669 | struct perf_event *event = file->private_data; |
| 3539 | struct ring_buffer *rb; | 3670 | struct ring_buffer *rb; |
| 3540 | unsigned int events = POLL_HUP; | 3671 | unsigned int events = POLLHUP; |
| 3672 | |||
| 3673 | poll_wait(file, &event->waitq, wait); | ||
| 3674 | |||
| 3675 | if (is_event_hup(event)) | ||
| 3676 | return events; | ||
| 3541 | 3677 | ||
| 3542 | /* | 3678 | /* |
| 3543 | * Pin the event->rb by taking event->mmap_mutex; otherwise | 3679 | * Pin the event->rb by taking event->mmap_mutex; otherwise |
| @@ -3548,9 +3684,6 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
| 3548 | if (rb) | 3684 | if (rb) |
| 3549 | events = atomic_xchg(&rb->poll, 0); | 3685 | events = atomic_xchg(&rb->poll, 0); |
| 3550 | mutex_unlock(&event->mmap_mutex); | 3686 | mutex_unlock(&event->mmap_mutex); |
| 3551 | |||
| 3552 | poll_wait(file, &event->waitq, wait); | ||
| 3553 | |||
| 3554 | return events; | 3687 | return events; |
| 3555 | } | 3688 | } |
| 3556 | 3689 | ||
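The perf_poll() change above fixes the initial event mask (POLLHUP, the poll event bit, rather than the POLL_HUP signal code) and moves poll_wait() to the very top, so the caller is registered on the wait queue before any early return and a wakeup cannot be lost when the handler bails out early. A generic sketch of that ordering, with an invented device state standing in for the perf event:

```c
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/wait.h>

/* Invented device: stands in for struct perf_event and its waitq. */
struct dev_state {
	wait_queue_head_t	waitq;
	bool			hung_up;
	bool			readable;
};

static unsigned int dev_poll(struct file *file, poll_table *wait)
{
	struct dev_state *d = file->private_data;
	unsigned int events = 0;

	/*
	 * Register on the wait queue first, so a wakeup between the
	 * checks below and the caller going to sleep is never missed,
	 * even on the early-return path - the reordering perf_poll()
	 * now does.
	 */
	poll_wait(file, &d->waitq, wait);

	if (d->hung_up)
		return POLLHUP;

	if (d->readable)
		events |= POLLIN | POLLRDNORM;

	return events;
}
```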
| @@ -4327,22 +4460,29 @@ perf_output_sample_regs(struct perf_output_handle *handle, | |||
| 4327 | } | 4460 | } |
| 4328 | } | 4461 | } |
| 4329 | 4462 | ||
| 4330 | static void perf_sample_regs_user(struct perf_regs_user *regs_user, | 4463 | static void perf_sample_regs_user(struct perf_regs *regs_user, |
| 4331 | struct pt_regs *regs) | 4464 | struct pt_regs *regs, |
| 4465 | struct pt_regs *regs_user_copy) | ||
| 4332 | { | 4466 | { |
| 4333 | if (!user_mode(regs)) { | 4467 | if (user_mode(regs)) { |
| 4334 | if (current->mm) | 4468 | regs_user->abi = perf_reg_abi(current); |
| 4335 | regs = task_pt_regs(current); | ||
| 4336 | else | ||
| 4337 | regs = NULL; | ||
| 4338 | } | ||
| 4339 | |||
| 4340 | if (regs) { | ||
| 4341 | regs_user->regs = regs; | 4469 | regs_user->regs = regs; |
| 4342 | regs_user->abi = perf_reg_abi(current); | 4470 | } else if (current->mm) { |
| 4471 | perf_get_regs_user(regs_user, regs, regs_user_copy); | ||
| 4472 | } else { | ||
| 4473 | regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; | ||
| 4474 | regs_user->regs = NULL; | ||
| 4343 | } | 4475 | } |
| 4344 | } | 4476 | } |
| 4345 | 4477 | ||
| 4478 | static void perf_sample_regs_intr(struct perf_regs *regs_intr, | ||
| 4479 | struct pt_regs *regs) | ||
| 4480 | { | ||
| 4481 | regs_intr->regs = regs; | ||
| 4482 | regs_intr->abi = perf_reg_abi(current); | ||
| 4483 | } | ||
| 4484 | |||
| 4485 | |||
| 4346 | /* | 4486 | /* |
| 4347 | * Get remaining task size from user stack pointer. | 4487 | * Get remaining task size from user stack pointer. |
| 4348 | * | 4488 | * |
| @@ -4724,6 +4864,23 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
| 4724 | if (sample_type & PERF_SAMPLE_TRANSACTION) | 4864 | if (sample_type & PERF_SAMPLE_TRANSACTION) |
| 4725 | perf_output_put(handle, data->txn); | 4865 | perf_output_put(handle, data->txn); |
| 4726 | 4866 | ||
| 4867 | if (sample_type & PERF_SAMPLE_REGS_INTR) { | ||
| 4868 | u64 abi = data->regs_intr.abi; | ||
| 4869 | /* | ||
| 4870 | * If there are no regs to dump, notice it through | ||
| 4871 | * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). | ||
| 4872 | */ | ||
| 4873 | perf_output_put(handle, abi); | ||
| 4874 | |||
| 4875 | if (abi) { | ||
| 4876 | u64 mask = event->attr.sample_regs_intr; | ||
| 4877 | |||
| 4878 | perf_output_sample_regs(handle, | ||
| 4879 | data->regs_intr.regs, | ||
| 4880 | mask); | ||
| 4881 | } | ||
| 4882 | } | ||
| 4883 | |||
| 4727 | if (!event->attr.watermark) { | 4884 | if (!event->attr.watermark) { |
| 4728 | int wakeup_events = event->attr.wakeup_events; | 4885 | int wakeup_events = event->attr.wakeup_events; |
| 4729 | 4886 | ||
| @@ -4789,12 +4946,14 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
| 4789 | header->size += size; | 4946 | header->size += size; |
| 4790 | } | 4947 | } |
| 4791 | 4948 | ||
| 4949 | if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) | ||
| 4950 | perf_sample_regs_user(&data->regs_user, regs, | ||
| 4951 | &data->regs_user_copy); | ||
| 4952 | |||
| 4792 | if (sample_type & PERF_SAMPLE_REGS_USER) { | 4953 | if (sample_type & PERF_SAMPLE_REGS_USER) { |
| 4793 | /* regs dump ABI info */ | 4954 | /* regs dump ABI info */ |
| 4794 | int size = sizeof(u64); | 4955 | int size = sizeof(u64); |
| 4795 | 4956 | ||
| 4796 | perf_sample_regs_user(&data->regs_user, regs); | ||
| 4797 | |||
| 4798 | if (data->regs_user.regs) { | 4957 | if (data->regs_user.regs) { |
| 4799 | u64 mask = event->attr.sample_regs_user; | 4958 | u64 mask = event->attr.sample_regs_user; |
| 4800 | size += hweight64(mask) * sizeof(u64); | 4959 | size += hweight64(mask) * sizeof(u64); |
| @@ -4810,15 +4969,11 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
| 4810 | * in case new sample type is added, because we could eat | 4969 | * in case new sample type is added, because we could eat |
| 4811 | * up the rest of the sample size. | 4970 | * up the rest of the sample size. |
| 4812 | */ | 4971 | */ |
| 4813 | struct perf_regs_user *uregs = &data->regs_user; | ||
| 4814 | u16 stack_size = event->attr.sample_stack_user; | 4972 | u16 stack_size = event->attr.sample_stack_user; |
| 4815 | u16 size = sizeof(u64); | 4973 | u16 size = sizeof(u64); |
| 4816 | 4974 | ||
| 4817 | if (!uregs->abi) | ||
| 4818 | perf_sample_regs_user(uregs, regs); | ||
| 4819 | |||
| 4820 | stack_size = perf_sample_ustack_size(stack_size, header->size, | 4975 | stack_size = perf_sample_ustack_size(stack_size, header->size, |
| 4821 | uregs->regs); | 4976 | data->regs_user.regs); |
| 4822 | 4977 | ||
| 4823 | /* | 4978 | /* |
| 4824 | * If there is something to dump, add space for the dump | 4979 | * If there is something to dump, add space for the dump |
| @@ -4831,6 +4986,21 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
| 4831 | data->stack_user_size = stack_size; | 4986 | data->stack_user_size = stack_size; |
| 4832 | header->size += size; | 4987 | header->size += size; |
| 4833 | } | 4988 | } |
| 4989 | |||
| 4990 | if (sample_type & PERF_SAMPLE_REGS_INTR) { | ||
| 4991 | /* regs dump ABI info */ | ||
| 4992 | int size = sizeof(u64); | ||
| 4993 | |||
| 4994 | perf_sample_regs_intr(&data->regs_intr, regs); | ||
| 4995 | |||
| 4996 | if (data->regs_intr.regs) { | ||
| 4997 | u64 mask = event->attr.sample_regs_intr; | ||
| 4998 | |||
| 4999 | size += hweight64(mask) * sizeof(u64); | ||
| 5000 | } | ||
| 5001 | |||
| 5002 | header->size += size; | ||
| 5003 | } | ||
| 4834 | } | 5004 | } |
| 4835 | 5005 | ||
| 4836 | static void perf_event_output(struct perf_event *event, | 5006 | static void perf_event_output(struct perf_event *event, |
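PERF_SAMPLE_REGS_INTR, wired up above, appends the interrupted context's registers to each sample; the ABI word is emitted first so a PERF_SAMPLE_REGS_ABI_NONE (zero) marks an empty dump, and perf_copy_attr() validates attr.sample_regs_intr via perf_reg_validate(). A rough userspace sketch of requesting it, assuming kernel headers that already define the new bit; the register mask value below is purely illustrative and architecture specific:

```c
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long open_cycles_with_intr_regs(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_REGS_INTR;
	/*
	 * Hypothetical mask: which registers exist is per-architecture;
	 * invalid bits are rejected by perf_reg_validate() as added in
	 * the perf_copy_attr() hunk further down.
	 */
	attr.sample_regs_intr = 0x1f;

	/* perf_event_open() has no glibc wrapper; current task, any CPU. */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}
```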
| @@ -5702,7 +5872,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
| 5702 | struct perf_sample_data *data, | 5872 | struct perf_sample_data *data, |
| 5703 | struct pt_regs *regs) | 5873 | struct pt_regs *regs) |
| 5704 | { | 5874 | { |
| 5705 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 5875 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
| 5706 | struct perf_event *event; | 5876 | struct perf_event *event; |
| 5707 | struct hlist_head *head; | 5877 | struct hlist_head *head; |
| 5708 | 5878 | ||
| @@ -5721,7 +5891,7 @@ end: | |||
| 5721 | 5891 | ||
| 5722 | int perf_swevent_get_recursion_context(void) | 5892 | int perf_swevent_get_recursion_context(void) |
| 5723 | { | 5893 | { |
| 5724 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 5894 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
| 5725 | 5895 | ||
| 5726 | return get_recursion_context(swhash->recursion); | 5896 | return get_recursion_context(swhash->recursion); |
| 5727 | } | 5897 | } |
| @@ -5729,7 +5899,7 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | |||
| 5729 | 5899 | ||
| 5730 | inline void perf_swevent_put_recursion_context(int rctx) | 5900 | inline void perf_swevent_put_recursion_context(int rctx) |
| 5731 | { | 5901 | { |
| 5732 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 5902 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
| 5733 | 5903 | ||
| 5734 | put_recursion_context(swhash->recursion, rctx); | 5904 | put_recursion_context(swhash->recursion, rctx); |
| 5735 | } | 5905 | } |
| @@ -5758,7 +5928,7 @@ static void perf_swevent_read(struct perf_event *event) | |||
| 5758 | 5928 | ||
| 5759 | static int perf_swevent_add(struct perf_event *event, int flags) | 5929 | static int perf_swevent_add(struct perf_event *event, int flags) |
| 5760 | { | 5930 | { |
| 5761 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 5931 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
| 5762 | struct hw_perf_event *hwc = &event->hw; | 5932 | struct hw_perf_event *hwc = &event->hw; |
| 5763 | struct hlist_head *head; | 5933 | struct hlist_head *head; |
| 5764 | 5934 | ||
| @@ -5814,7 +5984,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) | |||
| 5814 | if (!hlist) | 5984 | if (!hlist) |
| 5815 | return; | 5985 | return; |
| 5816 | 5986 | ||
| 5817 | rcu_assign_pointer(swhash->swevent_hlist, NULL); | 5987 | RCU_INIT_POINTER(swhash->swevent_hlist, NULL); |
| 5818 | kfree_rcu(hlist, rcu_head); | 5988 | kfree_rcu(hlist, rcu_head); |
| 5819 | } | 5989 | } |
| 5820 | 5990 | ||
| @@ -5940,11 +6110,6 @@ static int perf_swevent_init(struct perf_event *event) | |||
| 5940 | return 0; | 6110 | return 0; |
| 5941 | } | 6111 | } |
| 5942 | 6112 | ||
| 5943 | static int perf_swevent_event_idx(struct perf_event *event) | ||
| 5944 | { | ||
| 5945 | return 0; | ||
| 5946 | } | ||
| 5947 | |||
| 5948 | static struct pmu perf_swevent = { | 6113 | static struct pmu perf_swevent = { |
| 5949 | .task_ctx_nr = perf_sw_context, | 6114 | .task_ctx_nr = perf_sw_context, |
| 5950 | 6115 | ||
| @@ -5954,8 +6119,6 @@ static struct pmu perf_swevent = { | |||
| 5954 | .start = perf_swevent_start, | 6119 | .start = perf_swevent_start, |
| 5955 | .stop = perf_swevent_stop, | 6120 | .stop = perf_swevent_stop, |
| 5956 | .read = perf_swevent_read, | 6121 | .read = perf_swevent_read, |
| 5957 | |||
| 5958 | .event_idx = perf_swevent_event_idx, | ||
| 5959 | }; | 6122 | }; |
| 5960 | 6123 | ||
| 5961 | #ifdef CONFIG_EVENT_TRACING | 6124 | #ifdef CONFIG_EVENT_TRACING |
| @@ -6073,8 +6236,6 @@ static struct pmu perf_tracepoint = { | |||
| 6073 | .start = perf_swevent_start, | 6236 | .start = perf_swevent_start, |
| 6074 | .stop = perf_swevent_stop, | 6237 | .stop = perf_swevent_stop, |
| 6075 | .read = perf_swevent_read, | 6238 | .read = perf_swevent_read, |
| 6076 | |||
| 6077 | .event_idx = perf_swevent_event_idx, | ||
| 6078 | }; | 6239 | }; |
| 6079 | 6240 | ||
| 6080 | static inline void perf_tp_register(void) | 6241 | static inline void perf_tp_register(void) |
| @@ -6300,8 +6461,6 @@ static struct pmu perf_cpu_clock = { | |||
| 6300 | .start = cpu_clock_event_start, | 6461 | .start = cpu_clock_event_start, |
| 6301 | .stop = cpu_clock_event_stop, | 6462 | .stop = cpu_clock_event_stop, |
| 6302 | .read = cpu_clock_event_read, | 6463 | .read = cpu_clock_event_read, |
| 6303 | |||
| 6304 | .event_idx = perf_swevent_event_idx, | ||
| 6305 | }; | 6464 | }; |
| 6306 | 6465 | ||
| 6307 | /* | 6466 | /* |
| @@ -6380,8 +6539,6 @@ static struct pmu perf_task_clock = { | |||
| 6380 | .start = task_clock_event_start, | 6539 | .start = task_clock_event_start, |
| 6381 | .stop = task_clock_event_stop, | 6540 | .stop = task_clock_event_stop, |
| 6382 | .read = task_clock_event_read, | 6541 | .read = task_clock_event_read, |
| 6383 | |||
| 6384 | .event_idx = perf_swevent_event_idx, | ||
| 6385 | }; | 6542 | }; |
| 6386 | 6543 | ||
| 6387 | static void perf_pmu_nop_void(struct pmu *pmu) | 6544 | static void perf_pmu_nop_void(struct pmu *pmu) |
| @@ -6411,7 +6568,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) | |||
| 6411 | 6568 | ||
| 6412 | static int perf_event_idx_default(struct perf_event *event) | 6569 | static int perf_event_idx_default(struct perf_event *event) |
| 6413 | { | 6570 | { |
| 6414 | return event->hw.idx + 1; | 6571 | return 0; |
| 6415 | } | 6572 | } |
| 6416 | 6573 | ||
| 6417 | /* | 6574 | /* |
| @@ -7031,6 +7188,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
| 7031 | ret = -EINVAL; | 7188 | ret = -EINVAL; |
| 7032 | } | 7189 | } |
| 7033 | 7190 | ||
| 7191 | if (attr->sample_type & PERF_SAMPLE_REGS_INTR) | ||
| 7192 | ret = perf_reg_validate(attr->sample_regs_intr); | ||
| 7034 | out: | 7193 | out: |
| 7035 | return ret; | 7194 | return ret; |
| 7036 | 7195 | ||
| @@ -7315,11 +7474,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7315 | 7474 | ||
| 7316 | if (move_group) { | 7475 | if (move_group) { |
| 7317 | synchronize_rcu(); | 7476 | synchronize_rcu(); |
| 7318 | perf_install_in_context(ctx, group_leader, event->cpu); | 7477 | perf_install_in_context(ctx, group_leader, group_leader->cpu); |
| 7319 | get_ctx(ctx); | 7478 | get_ctx(ctx); |
| 7320 | list_for_each_entry(sibling, &group_leader->sibling_list, | 7479 | list_for_each_entry(sibling, &group_leader->sibling_list, |
| 7321 | group_entry) { | 7480 | group_entry) { |
| 7322 | perf_install_in_context(ctx, sibling, event->cpu); | 7481 | perf_install_in_context(ctx, sibling, sibling->cpu); |
| 7323 | get_ctx(ctx); | 7482 | get_ctx(ctx); |
| 7324 | } | 7483 | } |
| 7325 | } | 7484 | } |
| @@ -7397,6 +7556,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7397 | goto err; | 7556 | goto err; |
| 7398 | } | 7557 | } |
| 7399 | 7558 | ||
| 7559 | /* Mark owner so we could distinguish it from user events. */ | ||
| 7560 | event->owner = EVENT_OWNER_KERNEL; | ||
| 7561 | |||
| 7400 | account_event(event); | 7562 | account_event(event); |
| 7401 | 7563 | ||
| 7402 | ctx = find_get_context(event->pmu, task, cpu); | 7564 | ctx = find_get_context(event->pmu, task, cpu); |
| @@ -7484,6 +7646,12 @@ static void sync_child_event(struct perf_event *child_event, | |||
| 7484 | mutex_unlock(&parent_event->child_mutex); | 7646 | mutex_unlock(&parent_event->child_mutex); |
| 7485 | 7647 | ||
| 7486 | /* | 7648 | /* |
| 7649 | * Make sure user/parent get notified, that we just | ||
| 7650 | * lost one event. | ||
| 7651 | */ | ||
| 7652 | perf_event_wakeup(parent_event); | ||
| 7653 | |||
| 7654 | /* | ||
| 7487 | * Release the parent event, if this was the last | 7655 | * Release the parent event, if this was the last |
| 7488 | * reference to it. | 7656 | * reference to it. |
| 7489 | */ | 7657 | */ |
| @@ -7517,13 +7685,16 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
| 7517 | if (child_event->parent) { | 7685 | if (child_event->parent) { |
| 7518 | sync_child_event(child_event, child); | 7686 | sync_child_event(child_event, child); |
| 7519 | free_event(child_event); | 7687 | free_event(child_event); |
| 7688 | } else { | ||
| 7689 | child_event->state = PERF_EVENT_STATE_EXIT; | ||
| 7690 | perf_event_wakeup(child_event); | ||
| 7520 | } | 7691 | } |
| 7521 | } | 7692 | } |
| 7522 | 7693 | ||
| 7523 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | 7694 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
| 7524 | { | 7695 | { |
| 7525 | struct perf_event *child_event, *next; | 7696 | struct perf_event *child_event, *next; |
| 7526 | struct perf_event_context *child_ctx, *parent_ctx; | 7697 | struct perf_event_context *child_ctx, *clone_ctx = NULL; |
| 7527 | unsigned long flags; | 7698 | unsigned long flags; |
| 7528 | 7699 | ||
| 7529 | if (likely(!child->perf_event_ctxp[ctxn])) { | 7700 | if (likely(!child->perf_event_ctxp[ctxn])) { |
| @@ -7550,28 +7721,16 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
| 7550 | child->perf_event_ctxp[ctxn] = NULL; | 7721 | child->perf_event_ctxp[ctxn] = NULL; |
| 7551 | 7722 | ||
| 7552 | /* | 7723 | /* |
| 7553 | * In order to avoid freeing: child_ctx->parent_ctx->task | ||
| 7554 | * under perf_event_context::lock, grab another reference. | ||
| 7555 | */ | ||
| 7556 | parent_ctx = child_ctx->parent_ctx; | ||
| 7557 | if (parent_ctx) | ||
| 7558 | get_ctx(parent_ctx); | ||
| 7559 | |||
| 7560 | /* | ||
| 7561 | * If this context is a clone; unclone it so it can't get | 7724 | * If this context is a clone; unclone it so it can't get |
| 7562 | * swapped to another process while we're removing all | 7725 | * swapped to another process while we're removing all |
| 7563 | * the events from it. | 7726 | * the events from it. |
| 7564 | */ | 7727 | */ |
| 7565 | unclone_ctx(child_ctx); | 7728 | clone_ctx = unclone_ctx(child_ctx); |
| 7566 | update_context_time(child_ctx); | 7729 | update_context_time(child_ctx); |
| 7567 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | 7730 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); |
| 7568 | 7731 | ||
| 7569 | /* | 7732 | if (clone_ctx) |
| 7570 | * Now that we no longer hold perf_event_context::lock, drop | 7733 | put_ctx(clone_ctx); |
| 7571 | * our extra child_ctx->parent_ctx reference. | ||
| 7572 | */ | ||
| 7573 | if (parent_ctx) | ||
| 7574 | put_ctx(parent_ctx); | ||
| 7575 | 7734 | ||
| 7576 | /* | 7735 | /* |
| 7577 | * Report the task dead after unscheduling the events so that we | 7736 | * Report the task dead after unscheduling the events so that we |
| @@ -7700,6 +7859,7 @@ inherit_event(struct perf_event *parent_event, | |||
| 7700 | struct perf_event *group_leader, | 7859 | struct perf_event *group_leader, |
| 7701 | struct perf_event_context *child_ctx) | 7860 | struct perf_event_context *child_ctx) |
| 7702 | { | 7861 | { |
| 7862 | enum perf_event_active_state parent_state = parent_event->state; | ||
| 7703 | struct perf_event *child_event; | 7863 | struct perf_event *child_event; |
| 7704 | unsigned long flags; | 7864 | unsigned long flags; |
| 7705 | 7865 | ||
| @@ -7720,7 +7880,8 @@ inherit_event(struct perf_event *parent_event, | |||
| 7720 | if (IS_ERR(child_event)) | 7880 | if (IS_ERR(child_event)) |
| 7721 | return child_event; | 7881 | return child_event; |
| 7722 | 7882 | ||
| 7723 | if (!atomic_long_inc_not_zero(&parent_event->refcount)) { | 7883 | if (is_orphaned_event(parent_event) || |
| 7884 | !atomic_long_inc_not_zero(&parent_event->refcount)) { | ||
| 7724 | free_event(child_event); | 7885 | free_event(child_event); |
| 7725 | return NULL; | 7886 | return NULL; |
| 7726 | } | 7887 | } |
| @@ -7732,7 +7893,7 @@ inherit_event(struct perf_event *parent_event, | |||
| 7732 | * not its attr.disabled bit. We hold the parent's mutex, | 7893 | * not its attr.disabled bit. We hold the parent's mutex, |
| 7733 | * so we won't race with perf_event_{en, dis}able_family. | 7894 | * so we won't race with perf_event_{en, dis}able_family. |
| 7734 | */ | 7895 | */ |
| 7735 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | 7896 | if (parent_state >= PERF_EVENT_STATE_INACTIVE) |
| 7736 | child_event->state = PERF_EVENT_STATE_INACTIVE; | 7897 | child_event->state = PERF_EVENT_STATE_INACTIVE; |
| 7737 | else | 7898 | else |
| 7738 | child_event->state = PERF_EVENT_STATE_OFF; | 7899 | child_event->state = PERF_EVENT_STATE_OFF; |
| @@ -7997,7 +8158,7 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) | |||
| 7997 | 8158 | ||
| 7998 | static void __perf_event_exit_context(void *__info) | 8159 | static void __perf_event_exit_context(void *__info) |
| 7999 | { | 8160 | { |
| 8000 | struct remove_event re = { .detach_group = false }; | 8161 | struct remove_event re = { .detach_group = true }; |
| 8001 | struct perf_event_context *ctx = __info; | 8162 | struct perf_event_context *ctx = __info; |
| 8002 | 8163 | ||
| 8003 | perf_pmu_rotate_stop(ctx->pmu); | 8164 | perf_pmu_rotate_stop(ctx->pmu); |
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 1559fb0b9296..9803a6600d49 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
| @@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) | |||
| 605 | bp->hw.state = PERF_HES_STOPPED; | 605 | bp->hw.state = PERF_HES_STOPPED; |
| 606 | } | 606 | } |
| 607 | 607 | ||
| 608 | static int hw_breakpoint_event_idx(struct perf_event *bp) | ||
| 609 | { | ||
| 610 | return 0; | ||
| 611 | } | ||
| 612 | |||
| 613 | static struct pmu perf_breakpoint = { | 608 | static struct pmu perf_breakpoint = { |
| 614 | .task_ctx_nr = perf_sw_context, /* could eventually get its own */ | 609 | .task_ctx_nr = perf_sw_context, /* could eventually get its own */ |
| 615 | 610 | ||
| @@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = { | |||
| 619 | .start = hw_breakpoint_start, | 614 | .start = hw_breakpoint_start, |
| 620 | .stop = hw_breakpoint_stop, | 615 | .stop = hw_breakpoint_stop, |
| 621 | .read = hw_breakpoint_pmu_read, | 616 | .read = hw_breakpoint_pmu_read, |
| 622 | |||
| 623 | .event_idx = hw_breakpoint_event_idx, | ||
| 624 | }; | 617 | }; |
| 625 | 618 | ||
| 626 | int __init init_hw_breakpoint(void) | 619 | int __init init_hw_breakpoint(void) |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1d0af8a2c646..cb346f26a22d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 193 | } | 193 | } |
| 194 | 194 | ||
| 195 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 195 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
| 196 | ptep_clear_flush(vma, addr, ptep); | 196 | ptep_clear_flush_notify(vma, addr, ptep); |
| 197 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | 197 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
| 198 | 198 | ||
| 199 | page_remove_rmap(page); | 199 | page_remove_rmap(page); |
| @@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
| 724 | int more = 0; | 724 | int more = 0; |
| 725 | 725 | ||
| 726 | again: | 726 | again: |
| 727 | mutex_lock(&mapping->i_mmap_mutex); | 727 | i_mmap_lock_read(mapping); |
| 728 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 728 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
| 729 | if (!valid_vma(vma, is_register)) | 729 | if (!valid_vma(vma, is_register)) |
| 730 | continue; | 730 | continue; |
| 731 | 731 | ||
| 732 | if (!prev && !more) { | 732 | if (!prev && !more) { |
| 733 | /* | 733 | /* |
| 734 | * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through | 734 | * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through |
| 735 | * reclaim. This is optimistic, no harm done if it fails. | 735 | * reclaim. This is optimistic, no harm done if it fails. |
| 736 | */ | 736 | */ |
| 737 | prev = kmalloc(sizeof(struct map_info), | 737 | prev = kmalloc(sizeof(struct map_info), |
| @@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
| 755 | info->mm = vma->vm_mm; | 755 | info->mm = vma->vm_mm; |
| 756 | info->vaddr = offset_to_vaddr(vma, offset); | 756 | info->vaddr = offset_to_vaddr(vma, offset); |
| 757 | } | 757 | } |
| 758 | mutex_unlock(&mapping->i_mmap_mutex); | 758 | i_mmap_unlock_read(mapping); |
| 759 | 759 | ||
| 760 | if (!more) | 760 | if (!more) |
| 761 | goto out; | 761 | goto out; |
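build_map_info() above switches from mapping->i_mmap_mutex to the read side of the new i_mmap_rwsem via i_mmap_lock_read()/i_mmap_unlock_read(), so concurrent walkers of the file's VMA interval tree no longer exclude one another. A minimal sketch of that read-side walk; the function below is invented for illustration and only mirrors the calls visible in the hunk:

```c
#include <linux/fs.h>
#include <linux/mm.h>

/* Walk every VMA that maps the page at pgoff of the given file mapping. */
static void for_each_mapping_vma(struct address_space *mapping, pgoff_t pgoff)
{
	struct vm_area_struct *vma;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		/*
		 * Inspect vma here; as the comment in the hunk notes,
		 * allocations under this lock need GFP_NOWAIT to avoid
		 * i_mmap_rwsem recursion through reclaim.
		 */
	}
	i_mmap_unlock_read(mapping);
}
```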
| @@ -1640,7 +1640,6 @@ bool uprobe_deny_signal(void) | |||
| 1640 | if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { | 1640 | if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { |
| 1641 | utask->state = UTASK_SSTEP_TRAPPED; | 1641 | utask->state = UTASK_SSTEP_TRAPPED; |
| 1642 | set_tsk_thread_flag(t, TIF_UPROBE); | 1642 | set_tsk_thread_flag(t, TIF_UPROBE); |
| 1643 | set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); | ||
| 1644 | } | 1643 | } |
| 1645 | } | 1644 | } |
| 1646 | 1645 | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 32c58f7433a3..6806c55475ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -115,32 +115,30 @@ static void __exit_signal(struct task_struct *tsk) | |||
| 115 | 115 | ||
| 116 | if (tsk == sig->curr_target) | 116 | if (tsk == sig->curr_target) |
| 117 | sig->curr_target = next_thread(tsk); | 117 | sig->curr_target = next_thread(tsk); |
| 118 | /* | ||
| 119 | * Accumulate here the counters for all threads but the | ||
| 120 | * group leader as they die, so they can be added into | ||
| 121 | * the process-wide totals when those are taken. | ||
| 122 | * The group leader stays around as a zombie as long | ||
| 123 | * as there are other threads. When it gets reaped, | ||
| 124 | * the exit.c code will add its counts into these totals. | ||
| 125 | * We won't ever get here for the group leader, since it | ||
| 126 | * will have been the last reference on the signal_struct. | ||
| 127 | */ | ||
| 128 | task_cputime(tsk, &utime, &stime); | ||
| 129 | sig->utime += utime; | ||
| 130 | sig->stime += stime; | ||
| 131 | sig->gtime += task_gtime(tsk); | ||
| 132 | sig->min_flt += tsk->min_flt; | ||
| 133 | sig->maj_flt += tsk->maj_flt; | ||
| 134 | sig->nvcsw += tsk->nvcsw; | ||
| 135 | sig->nivcsw += tsk->nivcsw; | ||
| 136 | sig->inblock += task_io_get_inblock(tsk); | ||
| 137 | sig->oublock += task_io_get_oublock(tsk); | ||
| 138 | task_io_accounting_add(&sig->ioac, &tsk->ioac); | ||
| 139 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | ||
| 140 | } | 118 | } |
| 141 | 119 | ||
| 120 | /* | ||
| 121 | * Accumulate here the counters for all threads as they die. We could | ||
| 122 | * skip the group leader because it is the last user of signal_struct, | ||
| 123 | * but we want to avoid the race with thread_group_cputime() which can | ||
| 124 | * see the empty ->thread_head list. | ||
| 125 | */ | ||
| 126 | task_cputime(tsk, &utime, &stime); | ||
| 127 | write_seqlock(&sig->stats_lock); | ||
| 128 | sig->utime += utime; | ||
| 129 | sig->stime += stime; | ||
| 130 | sig->gtime += task_gtime(tsk); | ||
| 131 | sig->min_flt += tsk->min_flt; | ||
| 132 | sig->maj_flt += tsk->maj_flt; | ||
| 133 | sig->nvcsw += tsk->nvcsw; | ||
| 134 | sig->nivcsw += tsk->nivcsw; | ||
| 135 | sig->inblock += task_io_get_inblock(tsk); | ||
| 136 | sig->oublock += task_io_get_oublock(tsk); | ||
| 137 | task_io_accounting_add(&sig->ioac, &tsk->ioac); | ||
| 138 | sig->sum_sched_runtime += tsk->se.sum_exec_runtime; | ||
| 142 | sig->nr_threads--; | 139 | sig->nr_threads--; |
| 143 | __unhash_process(tsk, group_dead); | 140 | __unhash_process(tsk, group_dead); |
| 141 | write_sequnlock(&sig->stats_lock); | ||
| 144 | 142 | ||
| 145 | /* | 143 | /* |
| 146 | * Do this under ->siglock, we can race with another thread | 144 | * Do this under ->siglock, we can race with another thread |
| @@ -214,27 +212,6 @@ repeat: | |||
| 214 | } | 212 | } |
| 215 | 213 | ||
| 216 | /* | 214 | /* |
| 217 | * This checks not only the pgrp, but falls back on the pid if no | ||
| 218 | * satisfactory pgrp is found. I dunno - gdb doesn't work correctly | ||
| 219 | * without this... | ||
| 220 | * | ||
| 221 | * The caller must hold rcu lock or the tasklist lock. | ||
| 222 | */ | ||
| 223 | struct pid *session_of_pgrp(struct pid *pgrp) | ||
| 224 | { | ||
| 225 | struct task_struct *p; | ||
| 226 | struct pid *sid = NULL; | ||
| 227 | |||
| 228 | p = pid_task(pgrp, PIDTYPE_PGID); | ||
| 229 | if (p == NULL) | ||
| 230 | p = pid_task(pgrp, PIDTYPE_PID); | ||
| 231 | if (p != NULL) | ||
| 232 | sid = task_session(p); | ||
| 233 | |||
| 234 | return sid; | ||
| 235 | } | ||
| 236 | |||
| 237 | /* | ||
| 238 | * Determine if a process group is "orphaned", according to the POSIX | 215 | * Determine if a process group is "orphaned", according to the POSIX |
| 239 | * definition in 2.2.2.52. Orphaned process groups are not to be affected | 216 | * definition in 2.2.2.52. Orphaned process groups are not to be affected |
| 240 | * by terminal-generated stop signals. Newly orphaned process groups are | 217 | * by terminal-generated stop signals. Newly orphaned process groups are |
| @@ -461,6 +438,44 @@ static void exit_mm(struct task_struct *tsk) | |||
| 461 | clear_thread_flag(TIF_MEMDIE); | 438 | clear_thread_flag(TIF_MEMDIE); |
| 462 | } | 439 | } |
| 463 | 440 | ||
| 441 | static struct task_struct *find_alive_thread(struct task_struct *p) | ||
| 442 | { | ||
| 443 | struct task_struct *t; | ||
| 444 | |||
| 445 | for_each_thread(p, t) { | ||
| 446 | if (!(t->flags & PF_EXITING)) | ||
| 447 | return t; | ||
| 448 | } | ||
| 449 | return NULL; | ||
| 450 | } | ||
| 451 | |||
| 452 | static struct task_struct *find_child_reaper(struct task_struct *father) | ||
| 453 | __releases(&tasklist_lock) | ||
| 454 | __acquires(&tasklist_lock) | ||
| 455 | { | ||
| 456 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | ||
| 457 | struct task_struct *reaper = pid_ns->child_reaper; | ||
| 458 | |||
| 459 | if (likely(reaper != father)) | ||
| 460 | return reaper; | ||
| 461 | |||
| 462 | reaper = find_alive_thread(father); | ||
| 463 | if (reaper) { | ||
| 464 | pid_ns->child_reaper = reaper; | ||
| 465 | return reaper; | ||
| 466 | } | ||
| 467 | |||
| 468 | write_unlock_irq(&tasklist_lock); | ||
| 469 | if (unlikely(pid_ns == &init_pid_ns)) { | ||
| 470 | panic("Attempted to kill init! exitcode=0x%08x\n", | ||
| 471 | father->signal->group_exit_code ?: father->exit_code); | ||
| 472 | } | ||
| 473 | zap_pid_ns_processes(pid_ns); | ||
| 474 | write_lock_irq(&tasklist_lock); | ||
| 475 | |||
| 476 | return father; | ||
| 477 | } | ||
| 478 | |||
| 464 | /* | 479 | /* |
| 465 | * When we die, we re-parent all our children, and try to: | 480 | * When we die, we re-parent all our children, and try to: |
| 466 | * 1. give them to another thread in our thread group, if such a member exists | 481 | * 1. give them to another thread in our thread group, if such a member exists |
| @@ -468,58 +483,36 @@ static void exit_mm(struct task_struct *tsk) | |||
| 468 | * child_subreaper for its children (like a service manager) | 483 | * child_subreaper for its children (like a service manager) |
| 469 | * 3. give it to the init process (PID 1) in our pid namespace | 484 | * 3. give it to the init process (PID 1) in our pid namespace |
| 470 | */ | 485 | */ |
| 471 | static struct task_struct *find_new_reaper(struct task_struct *father) | 486 | static struct task_struct *find_new_reaper(struct task_struct *father, |
| 472 | __releases(&tasklist_lock) | 487 | struct task_struct *child_reaper) |
| 473 | __acquires(&tasklist_lock) | ||
| 474 | { | 488 | { |
| 475 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | 489 | struct task_struct *thread, *reaper; |
| 476 | struct task_struct *thread; | ||
| 477 | 490 | ||
| 478 | thread = father; | 491 | thread = find_alive_thread(father); |
| 479 | while_each_thread(father, thread) { | 492 | if (thread) |
| 480 | if (thread->flags & PF_EXITING) | ||
| 481 | continue; | ||
| 482 | if (unlikely(pid_ns->child_reaper == father)) | ||
| 483 | pid_ns->child_reaper = thread; | ||
| 484 | return thread; | 493 | return thread; |
| 485 | } | ||
| 486 | |||
| 487 | if (unlikely(pid_ns->child_reaper == father)) { | ||
| 488 | write_unlock_irq(&tasklist_lock); | ||
| 489 | if (unlikely(pid_ns == &init_pid_ns)) { | ||
| 490 | panic("Attempted to kill init! exitcode=0x%08x\n", | ||
| 491 | father->signal->group_exit_code ?: | ||
| 492 | father->exit_code); | ||
| 493 | } | ||
| 494 | |||
| 495 | zap_pid_ns_processes(pid_ns); | ||
| 496 | write_lock_irq(&tasklist_lock); | ||
| 497 | } else if (father->signal->has_child_subreaper) { | ||
| 498 | struct task_struct *reaper; | ||
| 499 | 494 | ||
| 495 | if (father->signal->has_child_subreaper) { | ||
| 500 | /* | 496 | /* |
| 501 | * Find the first ancestor marked as child_subreaper. | 497 | * Find the first ->is_child_subreaper ancestor in our pid_ns. |
| 502 | * Note that the code below checks same_thread_group(reaper, | 498 | * We start from father to ensure we cannot look into another |
| 503 | * pid_ns->child_reaper). This is what we need to DTRT in a | 499 | * namespace; this is safe because all its threads are dead. |
| 504 | * PID namespace. However we still need the check above, see | ||
| 505 | * http://marc.info/?l=linux-kernel&m=131385460420380 | ||
| 506 | */ | 500 | */ |
| 507 | for (reaper = father->real_parent; | 501 | for (reaper = father; |
| 508 | reaper != &init_task; | 502 | !same_thread_group(reaper, child_reaper); |
| 509 | reaper = reaper->real_parent) { | 503 | reaper = reaper->real_parent) { |
| 510 | if (same_thread_group(reaper, pid_ns->child_reaper)) | 504 | /* call_usermodehelper() descendants need this check */ |
| 505 | if (reaper == &init_task) | ||
| 511 | break; | 506 | break; |
| 512 | if (!reaper->signal->is_child_subreaper) | 507 | if (!reaper->signal->is_child_subreaper) |
| 513 | continue; | 508 | continue; |
| 514 | thread = reaper; | 509 | thread = find_alive_thread(reaper); |
| 515 | do { | 510 | if (thread) |
| 516 | if (!(thread->flags & PF_EXITING)) | 511 | return thread; |
| 517 | return reaper; | ||
| 518 | } while_each_thread(reaper, thread); | ||
| 519 | } | 512 | } |
| 520 | } | 513 | } |
| 521 | 514 | ||
| 522 | return pid_ns->child_reaper; | 515 | return child_reaper; |
| 523 | } | 516 | } |
| 524 | 517 | ||
| 525 | /* | 518 | /* |
| @@ -528,15 +521,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) | |||
| 528 | static void reparent_leader(struct task_struct *father, struct task_struct *p, | 521 | static void reparent_leader(struct task_struct *father, struct task_struct *p, |
| 529 | struct list_head *dead) | 522 | struct list_head *dead) |
| 530 | { | 523 | { |
| 531 | list_move_tail(&p->sibling, &p->real_parent->children); | 524 | if (unlikely(p->exit_state == EXIT_DEAD)) |
| 532 | |||
| 533 | if (p->exit_state == EXIT_DEAD) | ||
| 534 | return; | ||
| 535 | /* | ||
| 536 | * If this is a threaded reparent there is no need to | ||
| 537 | * notify anyone anything has happened. | ||
| 538 | */ | ||
| 539 | if (same_thread_group(p->real_parent, father)) | ||
| 540 | return; | 525 | return; |
| 541 | 526 | ||
| 542 | /* We don't want people slaying init. */ | 527 | /* We don't want people slaying init. */ |
| @@ -547,49 +532,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, | |||
| 547 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { | 532 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { |
| 548 | if (do_notify_parent(p, p->exit_signal)) { | 533 | if (do_notify_parent(p, p->exit_signal)) { |
| 549 | p->exit_state = EXIT_DEAD; | 534 | p->exit_state = EXIT_DEAD; |
| 550 | list_move_tail(&p->sibling, dead); | 535 | list_add(&p->ptrace_entry, dead); |
| 551 | } | 536 | } |
| 552 | } | 537 | } |
| 553 | 538 | ||
| 554 | kill_orphaned_pgrp(p, father); | 539 | kill_orphaned_pgrp(p, father); |
| 555 | } | 540 | } |
| 556 | 541 | ||
| 557 | static void forget_original_parent(struct task_struct *father) | 542 | /* |
| 543 | * This does two things: | ||
| 544 | * | ||
| 545 | * A. Make init inherit all the child processes | ||
| 546 | * B. Check to see if any process groups have become orphaned | ||
| 547 | * as a result of our exiting, and if they have any stopped | ||
| 548 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | ||
| 549 | */ | ||
| 550 | static void forget_original_parent(struct task_struct *father, | ||
| 551 | struct list_head *dead) | ||
| 558 | { | 552 | { |
| 559 | struct task_struct *p, *n, *reaper; | 553 | struct task_struct *p, *t, *reaper; |
| 560 | LIST_HEAD(dead_children); | ||
| 561 | 554 | ||
| 562 | write_lock_irq(&tasklist_lock); | 555 | if (unlikely(!list_empty(&father->ptraced))) |
| 563 | /* | 556 | exit_ptrace(father, dead); |
| 564 | * Note that exit_ptrace() and find_new_reaper() might | ||
| 565 | * drop tasklist_lock and reacquire it. | ||
| 566 | */ | ||
| 567 | exit_ptrace(father); | ||
| 568 | reaper = find_new_reaper(father); | ||
| 569 | 557 | ||
| 570 | list_for_each_entry_safe(p, n, &father->children, sibling) { | 558 | /* Can drop and reacquire tasklist_lock */ |
| 571 | struct task_struct *t = p; | 559 | reaper = find_child_reaper(father); |
| 560 | if (list_empty(&father->children)) | ||
| 561 | return; | ||
| 572 | 562 | ||
| 573 | do { | 563 | reaper = find_new_reaper(father, reaper); |
| 564 | list_for_each_entry(p, &father->children, sibling) { | ||
| 565 | for_each_thread(p, t) { | ||
| 574 | t->real_parent = reaper; | 566 | t->real_parent = reaper; |
| 575 | if (t->parent == father) { | 567 | BUG_ON((!t->ptrace) != (t->parent == father)); |
| 576 | BUG_ON(t->ptrace); | 568 | if (likely(!t->ptrace)) |
| 577 | t->parent = t->real_parent; | 569 | t->parent = t->real_parent; |
| 578 | } | ||
| 579 | if (t->pdeath_signal) | 570 | if (t->pdeath_signal) |
| 580 | group_send_sig_info(t->pdeath_signal, | 571 | group_send_sig_info(t->pdeath_signal, |
| 581 | SEND_SIG_NOINFO, t); | 572 | SEND_SIG_NOINFO, t); |
| 582 | } while_each_thread(p, t); | 573 | } |
| 583 | reparent_leader(father, p, &dead_children); | 574 | /* |
| 584 | } | 575 | * If this is a threaded reparent there is no need to |
| 585 | write_unlock_irq(&tasklist_lock); | 576 | * notify anyone anything has happened. |
| 586 | 577 | */ | |
| 587 | BUG_ON(!list_empty(&father->children)); | 578 | if (!same_thread_group(reaper, father)) |
| 588 | 579 | reparent_leader(father, p, dead); | |
| 589 | list_for_each_entry_safe(p, n, &dead_children, sibling) { | ||
| 590 | list_del_init(&p->sibling); | ||
| 591 | release_task(p); | ||
| 592 | } | 580 | } |
| 581 | list_splice_tail_init(&father->children, &reaper->children); | ||
| 593 | } | 582 | } |
| 594 | 583 | ||
| 595 | /* | 584 | /* |
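The reworked reparenting above hinges on find_alive_thread(): scan a task's thread group and return the first thread that is not exiting, or NULL if none remains. The following user-space sketch models that scan over a plain array; struct task_sketch and PF_EXITING_FLAG are illustrative stand-ins, not kernel types.

```c
#include <stddef.h>
#include <stdio.h>

#define PF_EXITING_FLAG 0x4          /* stand-in for the kernel's PF_EXITING */

struct task_sketch {
    const char *comm;
    unsigned int flags;
};

/* Return the first thread that is not exiting, or NULL if none is left. */
static struct task_sketch *find_alive_thread(struct task_sketch *threads,
                                             size_t nr)
{
    size_t i;

    for (i = 0; i < nr; i++) {
        if (!(threads[i].flags & PF_EXITING_FLAG))
            return &threads[i];
    }
    return NULL;
}

int main(void)
{
    struct task_sketch group[] = {
        { "leader",  PF_EXITING_FLAG },
        { "worker1", PF_EXITING_FLAG },
        { "worker2", 0 },            /* still alive: becomes the reaper */
    };
    struct task_sketch *reaper = find_alive_thread(group, 3);

    printf("new reaper: %s\n", reaper ? reaper->comm : "none");
    return 0;
}
```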
| @@ -599,18 +588,12 @@ static void forget_original_parent(struct task_struct *father) | |||
| 599 | static void exit_notify(struct task_struct *tsk, int group_dead) | 588 | static void exit_notify(struct task_struct *tsk, int group_dead) |
| 600 | { | 589 | { |
| 601 | bool autoreap; | 590 | bool autoreap; |
| 602 | 591 | struct task_struct *p, *n; | |
| 603 | /* | 592 | LIST_HEAD(dead); |
| 604 | * This does two things: | ||
| 605 | * | ||
| 606 | * A. Make init inherit all the child processes | ||
| 607 | * B. Check to see if any process groups have become orphaned | ||
| 608 | * as a result of our exiting, and if they have any stopped | ||
| 609 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | ||
| 610 | */ | ||
| 611 | forget_original_parent(tsk); | ||
| 612 | 593 | ||
| 613 | write_lock_irq(&tasklist_lock); | 594 | write_lock_irq(&tasklist_lock); |
| 595 | forget_original_parent(tsk, &dead); | ||
| 596 | |||
| 614 | if (group_dead) | 597 | if (group_dead) |
| 615 | kill_orphaned_pgrp(tsk->group_leader, NULL); | 598 | kill_orphaned_pgrp(tsk->group_leader, NULL); |
| 616 | 599 | ||
| @@ -628,15 +611,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
| 628 | } | 611 | } |
| 629 | 612 | ||
| 630 | tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; | 613 | tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; |
| 614 | if (tsk->exit_state == EXIT_DEAD) | ||
| 615 | list_add(&tsk->ptrace_entry, &dead); | ||
| 631 | 616 | ||
| 632 | /* mt-exec, de_thread() is waiting for group leader */ | 617 | /* mt-exec, de_thread() is waiting for group leader */ |
| 633 | if (unlikely(tsk->signal->notify_count < 0)) | 618 | if (unlikely(tsk->signal->notify_count < 0)) |
| 634 | wake_up_process(tsk->signal->group_exit_task); | 619 | wake_up_process(tsk->signal->group_exit_task); |
| 635 | write_unlock_irq(&tasklist_lock); | 620 | write_unlock_irq(&tasklist_lock); |
| 636 | 621 | ||
| 637 | /* If the process is dead, release it - nobody will wait for it */ | 622 | list_for_each_entry_safe(p, n, &dead, ptrace_entry) { |
| 638 | if (autoreap) | 623 | list_del_init(&p->ptrace_entry); |
| 639 | release_task(tsk); | 624 | release_task(p); |
| 625 | } | ||
| 640 | } | 626 | } |
| 641 | 627 | ||
| 642 | #ifdef CONFIG_DEBUG_STACK_USAGE | 628 | #ifdef CONFIG_DEBUG_STACK_USAGE |
| @@ -667,6 +653,7 @@ void do_exit(long code) | |||
| 667 | { | 653 | { |
| 668 | struct task_struct *tsk = current; | 654 | struct task_struct *tsk = current; |
| 669 | int group_dead; | 655 | int group_dead; |
| 656 | TASKS_RCU(int tasks_rcu_i); | ||
| 670 | 657 | ||
| 671 | profile_task_exit(tsk); | 658 | profile_task_exit(tsk); |
| 672 | 659 | ||
| @@ -775,6 +762,7 @@ void do_exit(long code) | |||
| 775 | */ | 762 | */ |
| 776 | flush_ptrace_hw_breakpoint(tsk); | 763 | flush_ptrace_hw_breakpoint(tsk); |
| 777 | 764 | ||
| 765 | TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); | ||
| 778 | exit_notify(tsk, group_dead); | 766 | exit_notify(tsk, group_dead); |
| 779 | proc_exit_connector(tsk); | 767 | proc_exit_connector(tsk); |
| 780 | #ifdef CONFIG_NUMA | 768 | #ifdef CONFIG_NUMA |
| @@ -814,6 +802,7 @@ void do_exit(long code) | |||
| 814 | if (tsk->nr_dirtied) | 802 | if (tsk->nr_dirtied) |
| 815 | __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); | 803 | __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); |
| 816 | exit_rcu(); | 804 | exit_rcu(); |
| 805 | TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); | ||
| 817 | 806 | ||
| 818 | /* | 807 | /* |
| 819 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed | 808 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed |
| @@ -978,8 +967,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, | |||
| 978 | */ | 967 | */ |
| 979 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | 968 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) |
| 980 | { | 969 | { |
| 981 | unsigned long state; | 970 | int state, retval, status; |
| 982 | int retval, status, traced; | ||
| 983 | pid_t pid = task_pid_vnr(p); | 971 | pid_t pid = task_pid_vnr(p); |
| 984 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); | 972 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
| 985 | struct siginfo __user *infop; | 973 | struct siginfo __user *infop; |
| @@ -993,6 +981,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 993 | 981 | ||
| 994 | get_task_struct(p); | 982 | get_task_struct(p); |
| 995 | read_unlock(&tasklist_lock); | 983 | read_unlock(&tasklist_lock); |
| 984 | sched_annotate_sleep(); | ||
| 985 | |||
| 996 | if ((exit_code & 0x7f) == 0) { | 986 | if ((exit_code & 0x7f) == 0) { |
| 997 | why = CLD_EXITED; | 987 | why = CLD_EXITED; |
| 998 | status = exit_code >> 8; | 988 | status = exit_code >> 8; |
| @@ -1002,21 +992,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1002 | } | 992 | } |
| 1003 | return wait_noreap_copyout(wo, p, pid, uid, why, status); | 993 | return wait_noreap_copyout(wo, p, pid, uid, why, status); |
| 1004 | } | 994 | } |
| 1005 | |||
| 1006 | traced = ptrace_reparented(p); | ||
| 1007 | /* | 995 | /* |
| 1008 | * Move the task's state to DEAD/TRACE, only one thread can do this. | 996 | * Move the task's state to DEAD/TRACE, only one thread can do this. |
| 1009 | */ | 997 | */ |
| 1010 | state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; | 998 | state = (ptrace_reparented(p) && thread_group_leader(p)) ? |
| 999 | EXIT_TRACE : EXIT_DEAD; | ||
| 1011 | if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) | 1000 | if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) |
| 1012 | return 0; | 1001 | return 0; |
| 1013 | /* | 1002 | /* |
| 1014 | * It can be ptraced but not reparented, check | 1003 | * We own this thread, nobody else can reap it. |
| 1015 | * thread_group_leader() to filter out sub-threads. | ||
| 1016 | */ | 1004 | */ |
| 1017 | if (likely(!traced) && thread_group_leader(p)) { | 1005 | read_unlock(&tasklist_lock); |
| 1018 | struct signal_struct *psig; | 1006 | sched_annotate_sleep(); |
| 1019 | struct signal_struct *sig; | 1007 | |
| 1008 | /* | ||
| 1009 | * Check thread_group_leader() to exclude the traced sub-threads. | ||
| 1010 | */ | ||
| 1011 | if (state == EXIT_DEAD && thread_group_leader(p)) { | ||
| 1012 | struct signal_struct *sig = p->signal; | ||
| 1013 | struct signal_struct *psig = current->signal; | ||
| 1020 | unsigned long maxrss; | 1014 | unsigned long maxrss; |
| 1021 | cputime_t tgutime, tgstime; | 1015 | cputime_t tgutime, tgstime; |
| 1022 | 1016 | ||
| @@ -1028,21 +1022,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1028 | * accumulate in the parent's signal_struct c* fields. | 1022 | * accumulate in the parent's signal_struct c* fields. |
| 1029 | * | 1023 | * |
| 1030 | * We don't bother to take a lock here to protect these | 1024 | * We don't bother to take a lock here to protect these |
| 1031 | * p->signal fields, because they are only touched by | 1025 | * p->signal fields because the whole thread group is dead |
| 1032 | * __exit_signal, which runs with tasklist_lock | 1026 | * and nobody can change them. |
| 1033 | * write-locked anyway, and so is excluded here. We do | 1027 | * |
| 1034 | * need to protect the access to parent->signal fields, | 1028 | * psig->stats_lock also protects us from our sub-threads |
| 1035 | * as other threads in the parent group can be right | 1029 | * which can reap other children at the same time. Until |
| 1036 | * here reaping other children at the same time. | 1030 | * we change k_getrusage()-like users to rely on this lock |
| 1031 | * we have to take ->siglock as well. | ||
| 1037 | * | 1032 | * |
| 1038 | * We use thread_group_cputime_adjusted() to get times for | 1033 | * We use thread_group_cputime_adjusted() to get times for |
| 1039 | * the thread group, which consolidates times for all threads | 1034 | * the thread group, which consolidates times for all threads |
| 1040 | * in the group including the group leader. | 1035 | * in the group including the group leader. |
| 1041 | */ | 1036 | */ |
| 1042 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); | 1037 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
| 1043 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1038 | spin_lock_irq(¤t->sighand->siglock); |
| 1044 | psig = p->real_parent->signal; | 1039 | write_seqlock(&psig->stats_lock); |
| 1045 | sig = p->signal; | ||
| 1046 | psig->cutime += tgutime + sig->cutime; | 1040 | psig->cutime += tgutime + sig->cutime; |
| 1047 | psig->cstime += tgstime + sig->cstime; | 1041 | psig->cstime += tgstime + sig->cstime; |
| 1048 | psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; | 1042 | psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; |
| @@ -1065,15 +1059,10 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1065 | psig->cmaxrss = maxrss; | 1059 | psig->cmaxrss = maxrss; |
| 1066 | task_io_accounting_add(&psig->ioac, &p->ioac); | 1060 | task_io_accounting_add(&psig->ioac, &p->ioac); |
| 1067 | task_io_accounting_add(&psig->ioac, &sig->ioac); | 1061 | task_io_accounting_add(&psig->ioac, &sig->ioac); |
| 1068 | spin_unlock_irq(&p->real_parent->sighand->siglock); | 1062 | write_sequnlock(&psig->stats_lock); |
| 1063 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 1069 | } | 1064 | } |
| 1070 | 1065 | ||
| 1071 | /* | ||
| 1072 | * Now we are sure this task is interesting, and no other | ||
| 1073 | * thread can reap it because we its state == DEAD/TRACE. | ||
| 1074 | */ | ||
| 1075 | read_unlock(&tasklist_lock); | ||
| 1076 | |||
| 1077 | retval = wo->wo_rusage | 1066 | retval = wo->wo_rusage |
| 1078 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1067 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; |
| 1079 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) | 1068 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) |
| @@ -1204,6 +1193,7 @@ unlock_sig: | |||
| 1204 | pid = task_pid_vnr(p); | 1193 | pid = task_pid_vnr(p); |
| 1205 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; | 1194 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; |
| 1206 | read_unlock(&tasklist_lock); | 1195 | read_unlock(&tasklist_lock); |
| 1196 | sched_annotate_sleep(); | ||
| 1207 | 1197 | ||
| 1208 | if (unlikely(wo->wo_flags & WNOWAIT)) | 1198 | if (unlikely(wo->wo_flags & WNOWAIT)) |
| 1209 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); | 1199 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); |
| @@ -1266,6 +1256,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
| 1266 | pid = task_pid_vnr(p); | 1256 | pid = task_pid_vnr(p); |
| 1267 | get_task_struct(p); | 1257 | get_task_struct(p); |
| 1268 | read_unlock(&tasklist_lock); | 1258 | read_unlock(&tasklist_lock); |
| 1259 | sched_annotate_sleep(); | ||
| 1269 | 1260 | ||
| 1270 | if (!wo->wo_info) { | 1261 | if (!wo->wo_info) { |
| 1271 | retval = wo->wo_rusage | 1262 | retval = wo->wo_rusage |
| @@ -1296,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
| 1296 | static int wait_consider_task(struct wait_opts *wo, int ptrace, | 1287 | static int wait_consider_task(struct wait_opts *wo, int ptrace, |
| 1297 | struct task_struct *p) | 1288 | struct task_struct *p) |
| 1298 | { | 1289 | { |
| 1290 | /* | ||
| 1291 | * We can race with wait_task_zombie() from another thread. | ||
| 1292 | * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition | ||
| 1293 | * can't confuse the checks below. | ||
| 1294 | */ | ||
| 1295 | int exit_state = ACCESS_ONCE(p->exit_state); | ||
| 1299 | int ret; | 1296 | int ret; |
| 1300 | 1297 | ||
| 1301 | if (unlikely(p->exit_state == EXIT_DEAD)) | 1298 | if (unlikely(exit_state == EXIT_DEAD)) |
| 1302 | return 0; | 1299 | return 0; |
| 1303 | 1300 | ||
| 1304 | ret = eligible_child(wo, p); | 1301 | ret = eligible_child(wo, p); |
| @@ -1319,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
| 1319 | return 0; | 1316 | return 0; |
| 1320 | } | 1317 | } |
| 1321 | 1318 | ||
| 1322 | if (unlikely(p->exit_state == EXIT_TRACE)) { | 1319 | if (unlikely(exit_state == EXIT_TRACE)) { |
| 1323 | /* | 1320 | /* |
| 1324 | * ptrace == 0 means we are the natural parent. In this case | 1321 | * ptrace == 0 means we are the natural parent. In this case |
| 1325 | * we should clear notask_error, debugger will notify us. | 1322 | * we should clear notask_error, debugger will notify us. |
| @@ -1346,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
| 1346 | } | 1343 | } |
| 1347 | 1344 | ||
| 1348 | /* slay zombie? */ | 1345 | /* slay zombie? */ |
| 1349 | if (p->exit_state == EXIT_ZOMBIE) { | 1346 | if (exit_state == EXIT_ZOMBIE) { |
| 1350 | /* we don't reap group leaders with subthreads */ | 1347 | /* we don't reap group leaders with subthreads */ |
| 1351 | if (!delay_group_leader(p)) { | 1348 | if (!delay_group_leader(p)) { |
| 1352 | /* | 1349 | /* |
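A recurring pattern in the reworked wait_task_zombie() above is claiming the task with a single compare-and-exchange on exit_state, so only one waiter ever performs the reap. Below is a hedged user-space sketch of that claim step using C11 atomics rather than the kernel's cmpxchg(); struct fake_task and the state values are illustrative only.

```c
#include <stdatomic.h>
#include <stdio.h>

enum { EXIT_ZOMBIE = 16, EXIT_DEAD = 32, EXIT_TRACE = 64 };  /* illustrative values */

struct fake_task {
    _Atomic int exit_state;
};

/* Returns 1 if this caller won the EXIT_ZOMBIE -> new_state transition. */
static int try_claim(struct fake_task *p, int new_state)
{
    int expected = EXIT_ZOMBIE;

    return atomic_compare_exchange_strong(&p->exit_state, &expected, new_state);
}

int main(void)
{
    struct fake_task t = { .exit_state = EXIT_ZOMBIE };

    if (try_claim(&t, EXIT_DEAD))
        printf("first waiter owns the reap step\n");
    if (!try_claim(&t, EXIT_TRACE))
        printf("second waiter backs off, state is already %d\n",
               atomic_load(&t.exit_state));
    return 0;
}
```

Whichever caller loses the exchange simply returns, which is why the later accounting code can touch p->signal without further locking.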
diff --git a/kernel/extable.c b/kernel/extable.c index d8a6446adbcb..c98f926277a8 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/ftrace.h> | 18 | #include <linux/ftrace.h> |
| 19 | #include <linux/memory.h> | 19 | #include <linux/memory.h> |
| 20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
| 21 | #include <linux/ftrace.h> | ||
| 21 | #include <linux/mutex.h> | 22 | #include <linux/mutex.h> |
| 22 | #include <linux/init.h> | 23 | #include <linux/init.h> |
| 23 | 24 | ||
| @@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr) | |||
| 102 | return 1; | 103 | return 1; |
| 103 | if (is_module_text_address(addr)) | 104 | if (is_module_text_address(addr)) |
| 104 | return 1; | 105 | return 1; |
| 106 | if (is_ftrace_trampoline(addr)) | ||
| 107 | return 1; | ||
| 105 | /* | 108 | /* |
| 106 | * There might be init symbols in saved stacktraces. | 109 | * There might be init symbols in saved stacktraces. |
| 107 | * Give those symbols a chance to be printed in | 110 | * Give those symbols a chance to be printed in |
| @@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr) | |||
| 119 | { | 122 | { |
| 120 | if (core_kernel_text(addr)) | 123 | if (core_kernel_text(addr)) |
| 121 | return 1; | 124 | return 1; |
| 122 | return is_module_text_address(addr); | 125 | if (is_module_text_address(addr)) |
| 126 | return 1; | ||
| 127 | return is_ftrace_trampoline(addr); | ||
| 123 | } | 128 | } |
| 124 | 129 | ||
| 125 | /* | 130 | /* |
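The extable.c hunks extend __kernel_text_address() and kernel_text_address() so that ftrace trampoline addresses also count as kernel text. The sketch below mirrors that chain of predicates in user space with made-up address ranges; it is not the kernel implementation.

```c
#include <stdio.h>
#include <stdbool.h>

/* Made-up address ranges, purely for illustration. */
static bool core_kernel_text(unsigned long a)       { return a >= 0x1000 && a < 0x2000; }
static bool is_module_text_address(unsigned long a) { return a >= 0x3000 && a < 0x4000; }
static bool is_ftrace_trampoline(unsigned long a)   { return a >= 0x5000 && a < 0x5100; }

/* Mirrors the post-patch kernel_text_address(): any predicate hit wins. */
static int kernel_text_address(unsigned long addr)
{
    if (core_kernel_text(addr))
        return 1;
    if (is_module_text_address(addr))
        return 1;
    return is_ftrace_trampoline(addr);   /* new case added by this hunk */
}

int main(void)
{
    printf("%d %d %d\n", kernel_text_address(0x1500),
           kernel_text_address(0x5050), kernel_text_address(0x9000));
    return 0;
}
```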
diff --git a/kernel/fork.c b/kernel/fork.c index a91e47d86de2..4dc2ddade9f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst, | |||
| 294 | return 0; | 294 | return 0; |
| 295 | } | 295 | } |
| 296 | 296 | ||
| 297 | void set_task_stack_end_magic(struct task_struct *tsk) | ||
| 298 | { | ||
| 299 | unsigned long *stackend; | ||
| 300 | |||
| 301 | stackend = end_of_stack(tsk); | ||
| 302 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | ||
| 303 | } | ||
| 304 | |||
| 297 | static struct task_struct *dup_task_struct(struct task_struct *orig) | 305 | static struct task_struct *dup_task_struct(struct task_struct *orig) |
| 298 | { | 306 | { |
| 299 | struct task_struct *tsk; | 307 | struct task_struct *tsk; |
| 300 | struct thread_info *ti; | 308 | struct thread_info *ti; |
| 301 | unsigned long *stackend; | ||
| 302 | int node = tsk_fork_get_node(orig); | 309 | int node = tsk_fork_get_node(orig); |
| 303 | int err; | 310 | int err; |
| 304 | 311 | ||
| @@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 328 | setup_thread_stack(tsk, orig); | 335 | setup_thread_stack(tsk, orig); |
| 329 | clear_user_return_notifier(tsk); | 336 | clear_user_return_notifier(tsk); |
| 330 | clear_tsk_need_resched(tsk); | 337 | clear_tsk_need_resched(tsk); |
| 331 | stackend = end_of_stack(tsk); | 338 | set_task_stack_end_magic(tsk); |
| 332 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | ||
| 333 | 339 | ||
| 334 | #ifdef CONFIG_CC_STACKPROTECTOR | 340 | #ifdef CONFIG_CC_STACKPROTECTOR |
| 335 | tsk->stack_canary = get_random_int(); | 341 | tsk->stack_canary = get_random_int(); |
| @@ -427,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 427 | get_file(file); | 433 | get_file(file); |
| 428 | if (tmp->vm_flags & VM_DENYWRITE) | 434 | if (tmp->vm_flags & VM_DENYWRITE) |
| 429 | atomic_dec(&inode->i_writecount); | 435 | atomic_dec(&inode->i_writecount); |
| 430 | mutex_lock(&mapping->i_mmap_mutex); | 436 | i_mmap_lock_write(mapping); |
| 431 | if (tmp->vm_flags & VM_SHARED) | 437 | if (tmp->vm_flags & VM_SHARED) |
| 432 | atomic_inc(&mapping->i_mmap_writable); | 438 | atomic_inc(&mapping->i_mmap_writable); |
| 433 | flush_dcache_mmap_lock(mapping); | 439 | flush_dcache_mmap_lock(mapping); |
| @@ -439,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 439 | vma_interval_tree_insert_after(tmp, mpnt, | 445 | vma_interval_tree_insert_after(tmp, mpnt, |
| 440 | &mapping->i_mmap); | 446 | &mapping->i_mmap); |
| 441 | flush_dcache_mmap_unlock(mapping); | 447 | flush_dcache_mmap_unlock(mapping); |
| 442 | mutex_unlock(&mapping->i_mmap_mutex); | 448 | i_mmap_unlock_write(mapping); |
| 443 | } | 449 | } |
| 444 | 450 | ||
| 445 | /* | 451 | /* |
| @@ -601,9 +607,8 @@ static void check_mm(struct mm_struct *mm) | |||
| 601 | printk(KERN_ALERT "BUG: Bad rss-counter state " | 607 | printk(KERN_ALERT "BUG: Bad rss-counter state " |
| 602 | "mm:%p idx:%d val:%ld\n", mm, i, x); | 608 | "mm:%p idx:%d val:%ld\n", mm, i, x); |
| 603 | } | 609 | } |
| 604 | |||
| 605 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | 610 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
| 606 | VM_BUG_ON(mm->pmd_huge_pte); | 611 | VM_BUG_ON_MM(mm->pmd_huge_pte, mm); |
| 607 | #endif | 612 | #endif |
| 608 | } | 613 | } |
| 609 | 614 | ||
| @@ -1017,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) | |||
| 1017 | { | 1022 | { |
| 1018 | if (atomic_dec_and_test(&sighand->count)) { | 1023 | if (atomic_dec_and_test(&sighand->count)) { |
| 1019 | signalfd_cleanup(sighand); | 1024 | signalfd_cleanup(sighand); |
| 1025 | /* | ||
| 1026 | * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it | ||
| 1027 | * without an RCU grace period, see __lock_task_sighand(). | ||
| 1028 | */ | ||
| 1020 | kmem_cache_free(sighand_cachep, sighand); | 1029 | kmem_cache_free(sighand_cachep, sighand); |
| 1021 | } | 1030 | } |
| 1022 | } | 1031 | } |
| 1023 | 1032 | ||
| 1024 | |||
| 1025 | /* | 1033 | /* |
| 1026 | * Initialize POSIX timer handling for a thread group. | 1034 | * Initialize POSIX timer handling for a thread group. |
| 1027 | */ | 1035 | */ |
| @@ -1068,6 +1076,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
| 1068 | sig->curr_target = tsk; | 1076 | sig->curr_target = tsk; |
| 1069 | init_sigpending(&sig->shared_pending); | 1077 | init_sigpending(&sig->shared_pending); |
| 1070 | INIT_LIST_HEAD(&sig->posix_timers); | 1078 | INIT_LIST_HEAD(&sig->posix_timers); |
| 1079 | seqlock_init(&sig->stats_lock); | ||
| 1071 | 1080 | ||
| 1072 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1081 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 1073 | sig->real_timer.function = it_real_fn; | 1082 | sig->real_timer.function = it_real_fn; |
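The seqlock_init(&sig->stats_lock) added above pairs with the write_seqlock()/write_sequnlock() sections in exit.c: a writer bumps a sequence count around its updates, and readers retry if they saw an odd count or the count changed underneath them. The model below is a simplified single-threaded illustration, not the kernel's seqlock_t, and it glosses over the memory-ordering care a concurrent implementation needs.

```c
#include <stdatomic.h>
#include <stdio.h>

struct sig_stats {
    _Atomic unsigned int seq;      /* even: stable, odd: write in progress */
    unsigned long cutime, cstime;
};

static void add_stats(struct sig_stats *s, unsigned long ut, unsigned long st)
{
    atomic_fetch_add(&s->seq, 1);  /* count becomes odd */
    s->cutime += ut;
    s->cstime += st;
    atomic_fetch_add(&s->seq, 1);  /* count becomes even again */
}

static void read_stats(struct sig_stats *s, unsigned long *ut, unsigned long *st)
{
    unsigned int start;

    do {
        start = atomic_load(&s->seq);
        *ut = s->cutime;
        *st = s->cstime;
        /* retry if a writer was active or slipped in meanwhile */
    } while ((start & 1) || start != atomic_load(&s->seq));
}

int main(void)
{
    struct sig_stats s = { 0 };
    unsigned long ut, st;

    add_stats(&s, 5, 7);
    read_stats(&s, &ut, &st);
    printf("cutime=%lu cstime=%lu\n", ut, st);
    return 0;
}
```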
diff --git a/kernel/freezer.c b/kernel/freezer.c index aa6a8aadb911..a8900a3bc27a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
| @@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p) | |||
| 42 | if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) | 42 | if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) |
| 43 | return false; | 43 | return false; |
| 44 | 44 | ||
| 45 | if (test_thread_flag(TIF_MEMDIE)) | ||
| 46 | return false; | ||
| 47 | |||
| 45 | if (pm_nosig_freezing || cgroup_freezing(p)) | 48 | if (pm_nosig_freezing || cgroup_freezing(p)) |
| 46 | return true; | 49 | return true; |
| 47 | 50 | ||
| @@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p) | |||
| 147 | { | 150 | { |
| 148 | unsigned long flags; | 151 | unsigned long flags; |
| 149 | 152 | ||
| 150 | /* | ||
| 151 | * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to | ||
| 152 | * be visible to @p as waking up implies wmb. Waking up inside | ||
| 153 | * freezer_lock also prevents wakeups from leaking outside | ||
| 154 | * refrigerator. | ||
| 155 | */ | ||
| 156 | spin_lock_irqsave(&freezer_lock, flags); | 153 | spin_lock_irqsave(&freezer_lock, flags); |
| 157 | if (frozen(p)) | 154 | if (frozen(p)) |
| 158 | wake_up_process(p); | 155 | wake_up_process(p); |
diff --git a/kernel/futex.c b/kernel/futex.c index 815d7af2ffe8..63678b573d61 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -143,9 +143,8 @@ | |||
| 143 | * | 143 | * |
| 144 | * Where (A) orders the waiters increment and the futex value read through | 144 | * Where (A) orders the waiters increment and the futex value read through |
| 145 | * atomic operations (see hb_waiters_inc) and where (B) orders the write | 145 | * atomic operations (see hb_waiters_inc) and where (B) orders the write |
| 146 | * to futex and the waiters read -- this is done by the barriers in | 146 | * to futex and the waiters read -- this is done by the barriers for both |
| 147 | * get_futex_key_refs(), through either ihold or atomic_inc, depending on the | 147 | * shared and private futexes in get_futex_key_refs(). |
| 148 | * futex type. | ||
| 149 | * | 148 | * |
| 150 | * This yields the following case (where X:=waiters, Y:=futex): | 149 | * This yields the following case (where X:=waiters, Y:=futex): |
| 151 | * | 150 | * |
| @@ -343,12 +342,21 @@ static void get_futex_key_refs(union futex_key *key) | |||
| 343 | case FUT_OFF_MMSHARED: | 342 | case FUT_OFF_MMSHARED: |
| 344 | futex_get_mm(key); /* implies MB (B) */ | 343 | futex_get_mm(key); /* implies MB (B) */ |
| 345 | break; | 344 | break; |
| 345 | default: | ||
| 346 | /* | ||
| 347 | * Private futexes do not hold reference on an inode or | ||
| 348 | * mm, therefore the only purpose of calling get_futex_key_refs | ||
| 349 | * is because we need the barrier for the lockless waiter check. | ||
| 350 | */ | ||
| 351 | smp_mb(); /* explicit MB (B) */ | ||
| 346 | } | 352 | } |
| 347 | } | 353 | } |
| 348 | 354 | ||
| 349 | /* | 355 | /* |
| 350 | * Drop a reference to the resource addressed by a key. | 356 | * Drop a reference to the resource addressed by a key. |
| 351 | * The hash bucket spinlock must not be held. | 357 | * The hash bucket spinlock must not be held. This is |
| 358 | * a no-op for private futexes, see comment in the get | ||
| 359 | * counterpart. | ||
| 352 | */ | 360 | */ |
| 353 | static void drop_futex_key_refs(union futex_key *key) | 361 | static void drop_futex_key_refs(union futex_key *key) |
| 354 | { | 362 | { |
| @@ -639,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void) | |||
| 639 | return pi_state; | 647 | return pi_state; |
| 640 | } | 648 | } |
| 641 | 649 | ||
| 650 | /* | ||
| 651 | * Must be called with the hb lock held. | ||
| 652 | */ | ||
| 642 | static void free_pi_state(struct futex_pi_state *pi_state) | 653 | static void free_pi_state(struct futex_pi_state *pi_state) |
| 643 | { | 654 | { |
| 655 | if (!pi_state) | ||
| 656 | return; | ||
| 657 | |||
| 644 | if (!atomic_dec_and_test(&pi_state->refcount)) | 658 | if (!atomic_dec_and_test(&pi_state->refcount)) |
| 645 | return; | 659 | return; |
| 646 | 660 | ||
| @@ -1519,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | |||
| 1519 | } | 1533 | } |
| 1520 | 1534 | ||
| 1521 | retry: | 1535 | retry: |
| 1522 | if (pi_state != NULL) { | ||
| 1523 | /* | ||
| 1524 | * We will have to lookup the pi_state again, so free this one | ||
| 1525 | * to keep the accounting correct. | ||
| 1526 | */ | ||
| 1527 | free_pi_state(pi_state); | ||
| 1528 | pi_state = NULL; | ||
| 1529 | } | ||
| 1530 | |||
| 1531 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); | 1536 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); |
| 1532 | if (unlikely(ret != 0)) | 1537 | if (unlikely(ret != 0)) |
| 1533 | goto out; | 1538 | goto out; |
| @@ -1617,6 +1622,8 @@ retry_private: | |||
| 1617 | case 0: | 1622 | case 0: |
| 1618 | break; | 1623 | break; |
| 1619 | case -EFAULT: | 1624 | case -EFAULT: |
| 1625 | free_pi_state(pi_state); | ||
| 1626 | pi_state = NULL; | ||
| 1620 | double_unlock_hb(hb1, hb2); | 1627 | double_unlock_hb(hb1, hb2); |
| 1621 | hb_waiters_dec(hb2); | 1628 | hb_waiters_dec(hb2); |
| 1622 | put_futex_key(&key2); | 1629 | put_futex_key(&key2); |
| @@ -1632,6 +1639,8 @@ retry_private: | |||
| 1632 | * exit to complete. | 1639 | * exit to complete. |
| 1633 | * - The user space value changed. | 1640 | * - The user space value changed. |
| 1634 | */ | 1641 | */ |
| 1642 | free_pi_state(pi_state); | ||
| 1643 | pi_state = NULL; | ||
| 1635 | double_unlock_hb(hb1, hb2); | 1644 | double_unlock_hb(hb1, hb2); |
| 1636 | hb_waiters_dec(hb2); | 1645 | hb_waiters_dec(hb2); |
| 1637 | put_futex_key(&key2); | 1646 | put_futex_key(&key2); |
| @@ -1708,6 +1717,7 @@ retry_private: | |||
| 1708 | } | 1717 | } |
| 1709 | 1718 | ||
| 1710 | out_unlock: | 1719 | out_unlock: |
| 1720 | free_pi_state(pi_state); | ||
| 1711 | double_unlock_hb(hb1, hb2); | 1721 | double_unlock_hb(hb1, hb2); |
| 1712 | hb_waiters_dec(hb2); | 1722 | hb_waiters_dec(hb2); |
| 1713 | 1723 | ||
| @@ -1725,8 +1735,6 @@ out_put_keys: | |||
| 1725 | out_put_key1: | 1735 | out_put_key1: |
| 1726 | put_futex_key(&key1); | 1736 | put_futex_key(&key1); |
| 1727 | out: | 1737 | out: |
| 1728 | if (pi_state != NULL) | ||
| 1729 | free_pi_state(pi_state); | ||
| 1730 | return ret ? ret : task_count; | 1738 | return ret ? ret : task_count; |
| 1731 | } | 1739 | } |
| 1732 | 1740 | ||
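The futex changes make free_pi_state() NULL-tolerant so every unwind path in futex_requeue() can call it unconditionally, much like kfree(NULL). A small sketch of that convention follows; struct pi_state here is a plain user-space stand-in with a trivial refcount.

```c
#include <stdio.h>
#include <stdlib.h>

struct pi_state {
    int refcount;
};

/* NULL-tolerant release, mirroring the patched free_pi_state(). */
static void free_pi_state(struct pi_state *pi_state)
{
    if (!pi_state)          /* new: error paths may pass NULL freely */
        return;
    if (--pi_state->refcount)
        return;
    free(pi_state);
    printf("pi_state released\n");
}

int main(void)
{
    struct pi_state *pi = malloc(sizeof(*pi));

    if (!pi)
        return 1;
    pi->refcount = 1;
    free_pi_state(NULL);    /* harmless, like kfree(NULL) */
    free_pi_state(pi);      /* drops the last reference and frees */
    return 0;
}
```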
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index d04ce8ac4399..c92e44855ddd 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
| @@ -32,10 +32,13 @@ config GCOV_KERNEL | |||
| 32 | Note that the debugfs filesystem has to be mounted to access | 32 | Note that the debugfs filesystem has to be mounted to access |
| 33 | profiling data. | 33 | profiling data. |
| 34 | 34 | ||
| 35 | config ARCH_HAS_GCOV_PROFILE_ALL | ||
| 36 | def_bool n | ||
| 37 | |||
| 35 | config GCOV_PROFILE_ALL | 38 | config GCOV_PROFILE_ALL |
| 36 | bool "Profile entire Kernel" | 39 | bool "Profile entire Kernel" |
| 37 | depends on GCOV_KERNEL | 40 | depends on GCOV_KERNEL |
| 38 | depends on SUPERH || S390 || X86 || PPC || MICROBLAZE | 41 | depends on ARCH_HAS_GCOV_PROFILE_ALL |
| 39 | default n | 42 | default n |
| 40 | ---help--- | 43 | ---help--- |
| 41 | This option activates profiling for the entire kernel. | 44 | This option activates profiling for the entire kernel. |
diff --git a/kernel/groups.c b/kernel/groups.c index 451698f86cfa..664411f171b5 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/slab.h> | 6 | #include <linux/slab.h> |
| 7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
| 8 | #include <linux/syscalls.h> | 8 | #include <linux/syscalls.h> |
| 9 | #include <linux/user_namespace.h> | ||
| 9 | #include <asm/uaccess.h> | 10 | #include <asm/uaccess.h> |
| 10 | 11 | ||
| 11 | /* init to 2 - one for init_task, one to ensure it is never freed */ | 12 | /* init to 2 - one for init_task, one to ensure it is never freed */ |
| @@ -213,6 +214,14 @@ out: | |||
| 213 | return i; | 214 | return i; |
| 214 | } | 215 | } |
| 215 | 216 | ||
| 217 | bool may_setgroups(void) | ||
| 218 | { | ||
| 219 | struct user_namespace *user_ns = current_user_ns(); | ||
| 220 | |||
| 221 | return ns_capable(user_ns, CAP_SETGID) && | ||
| 222 | userns_may_setgroups(user_ns); | ||
| 223 | } | ||
| 224 | |||
| 216 | /* | 225 | /* |
| 217 | * SMP: Our groups are copy-on-write. We can set them safely | 226 | * SMP: Our groups are copy-on-write. We can set them safely |
| 218 | * without another task interfering. | 227 | * without another task interfering. |
| @@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) | |||
| 223 | struct group_info *group_info; | 232 | struct group_info *group_info; |
| 224 | int retval; | 233 | int retval; |
| 225 | 234 | ||
| 226 | if (!ns_capable(current_user_ns(), CAP_SETGID)) | 235 | if (!may_setgroups()) |
| 227 | return -EPERM; | 236 | return -EPERM; |
| 228 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 237 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
| 229 | return -EINVAL; | 238 | return -EINVAL; |
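may_setgroups() now gates setgroups() on two independent conditions: the CAP_SETGID capability check and a per-user-namespace policy check. The sketch below mirrors that conjunction with stand-in predicates; the namespace layout and policy field are invented for illustration and do not match the real struct user_namespace.

```c
#include <stdbool.h>
#include <stdio.h>

/* Illustrative namespace object; the real struct user_namespace differs. */
struct user_namespace_sketch {
    bool setgroups_allowed;
};

static bool ns_capable_setgid(const struct user_namespace_sketch *ns)
{
    (void)ns;
    return true;            /* pretend the caller holds CAP_SETGID */
}

static bool userns_may_setgroups(const struct user_namespace_sketch *ns)
{
    return ns->setgroups_allowed;
}

/* Both checks must pass, exactly as in the new may_setgroups(). */
static bool may_setgroups(const struct user_namespace_sketch *ns)
{
    return ns_capable_setgid(ns) && userns_may_setgroups(ns);
}

int main(void)
{
    struct user_namespace_sketch ns = { .setgroups_allowed = false };

    printf("setgroups permitted: %s\n", may_setgroups(&ns) ? "yes" : "no");
    return 0;
}
```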
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d269cecdfbf0..9a76e3beda54 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
| @@ -55,6 +55,24 @@ config GENERIC_IRQ_CHIP | |||
| 55 | config IRQ_DOMAIN | 55 | config IRQ_DOMAIN |
| 56 | bool | 56 | bool |
| 57 | 57 | ||
| 58 | # Support for hierarchical irq domains | ||
| 59 | config IRQ_DOMAIN_HIERARCHY | ||
| 60 | bool | ||
| 61 | select IRQ_DOMAIN | ||
| 62 | |||
| 63 | # Generic MSI interrupt support | ||
| 64 | config GENERIC_MSI_IRQ | ||
| 65 | bool | ||
| 66 | |||
| 67 | # Generic MSI hierarchical interrupt domain support | ||
| 68 | config GENERIC_MSI_IRQ_DOMAIN | ||
| 69 | bool | ||
| 70 | select IRQ_DOMAIN_HIERARCHY | ||
| 71 | select GENERIC_MSI_IRQ | ||
| 72 | |||
| 73 | config HANDLE_DOMAIN_IRQ | ||
| 74 | bool | ||
| 75 | |||
| 58 | config IRQ_DOMAIN_DEBUG | 76 | config IRQ_DOMAIN_DEBUG |
| 59 | bool "Expose hardware/virtual IRQ mapping via debugfs" | 77 | bool "Expose hardware/virtual IRQ mapping via debugfs" |
| 60 | depends on IRQ_DOMAIN && DEBUG_FS | 78 | depends on IRQ_DOMAIN && DEBUG_FS |
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index fff17381f0af..d12123526e2b 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
| @@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o | |||
| 6 | obj-$(CONFIG_PROC_FS) += proc.o | 6 | obj-$(CONFIG_PROC_FS) += proc.o |
| 7 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 7 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
| 8 | obj-$(CONFIG_PM_SLEEP) += pm.o | 8 | obj-$(CONFIG_PM_SLEEP) += pm.o |
| 9 | obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o | ||
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6223fab9a9d2..6f1c7a566b95 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
| 16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
| 17 | #include <linux/kernel_stat.h> | 17 | #include <linux/kernel_stat.h> |
| 18 | #include <linux/irqdomain.h> | ||
| 18 | 19 | ||
| 19 | #include <trace/events/irq.h> | 20 | #include <trace/events/irq.h> |
| 20 | 21 | ||
| @@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend) | |||
| 178 | irq_state_clr_disabled(desc); | 179 | irq_state_clr_disabled(desc); |
| 179 | desc->depth = 0; | 180 | desc->depth = 0; |
| 180 | 181 | ||
| 182 | irq_domain_activate_irq(&desc->irq_data); | ||
| 181 | if (desc->irq_data.chip->irq_startup) { | 183 | if (desc->irq_data.chip->irq_startup) { |
| 182 | ret = desc->irq_data.chip->irq_startup(&desc->irq_data); | 184 | ret = desc->irq_data.chip->irq_startup(&desc->irq_data); |
| 183 | irq_state_clr_masked(desc); | 185 | irq_state_clr_masked(desc); |
| @@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc) | |||
| 199 | desc->irq_data.chip->irq_disable(&desc->irq_data); | 201 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
| 200 | else | 202 | else |
| 201 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 203 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
| 204 | irq_domain_deactivate_irq(&desc->irq_data); | ||
| 202 | irq_state_set_masked(desc); | 205 | irq_state_set_masked(desc); |
| 203 | } | 206 | } |
| 204 | 207 | ||
| @@ -342,6 +345,31 @@ static bool irq_check_poll(struct irq_desc *desc) | |||
| 342 | return irq_wait_for_poll(desc); | 345 | return irq_wait_for_poll(desc); |
| 343 | } | 346 | } |
| 344 | 347 | ||
| 348 | static bool irq_may_run(struct irq_desc *desc) | ||
| 349 | { | ||
| 350 | unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; | ||
| 351 | |||
| 352 | /* | ||
| 353 | * If the interrupt is not in progress and is not an armed | ||
| 354 | * wakeup interrupt, proceed. | ||
| 355 | */ | ||
| 356 | if (!irqd_has_set(&desc->irq_data, mask)) | ||
| 357 | return true; | ||
| 358 | |||
| 359 | /* | ||
| 360 | * If the interrupt is an armed wakeup source, mark it pending | ||
| 361 | * and suspended, disable it and notify the pm core about the | ||
| 362 | * event. | ||
| 363 | */ | ||
| 364 | if (irq_pm_check_wakeup(desc)) | ||
| 365 | return false; | ||
| 366 | |||
| 367 | /* | ||
| 368 | * Handle a potential concurrent poll on a different core. | ||
| 369 | */ | ||
| 370 | return irq_check_poll(desc); | ||
| 371 | } | ||
| 372 | |||
| 345 | /** | 373 | /** |
| 346 | * handle_simple_irq - Simple and software-decoded IRQs. | 374 | * handle_simple_irq - Simple and software-decoded IRQs. |
| 347 | * @irq: the interrupt number | 375 | * @irq: the interrupt number |
| @@ -359,9 +387,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
| 359 | { | 387 | { |
| 360 | raw_spin_lock(&desc->lock); | 388 | raw_spin_lock(&desc->lock); |
| 361 | 389 | ||
| 362 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) | 390 | if (!irq_may_run(desc)) |
| 363 | if (!irq_check_poll(desc)) | 391 | goto out_unlock; |
| 364 | goto out_unlock; | ||
| 365 | 392 | ||
| 366 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 393 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 367 | kstat_incr_irqs_this_cpu(irq, desc); | 394 | kstat_incr_irqs_this_cpu(irq, desc); |
| @@ -412,9 +439,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
| 412 | raw_spin_lock(&desc->lock); | 439 | raw_spin_lock(&desc->lock); |
| 413 | mask_ack_irq(desc); | 440 | mask_ack_irq(desc); |
| 414 | 441 | ||
| 415 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) | 442 | if (!irq_may_run(desc)) |
| 416 | if (!irq_check_poll(desc)) | 443 | goto out_unlock; |
| 417 | goto out_unlock; | ||
| 418 | 444 | ||
| 419 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 445 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 420 | kstat_incr_irqs_this_cpu(irq, desc); | 446 | kstat_incr_irqs_this_cpu(irq, desc); |
| @@ -485,9 +511,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
| 485 | 511 | ||
| 486 | raw_spin_lock(&desc->lock); | 512 | raw_spin_lock(&desc->lock); |
| 487 | 513 | ||
| 488 | if (unlikely(irqd_irq_inprogress(&desc->irq_data))) | 514 | if (!irq_may_run(desc)) |
| 489 | if (!irq_check_poll(desc)) | 515 | goto out; |
| 490 | goto out; | ||
| 491 | 516 | ||
| 492 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 517 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 493 | kstat_incr_irqs_this_cpu(irq, desc); | 518 | kstat_incr_irqs_this_cpu(irq, desc); |
| @@ -541,19 +566,23 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
| 541 | raw_spin_lock(&desc->lock); | 566 | raw_spin_lock(&desc->lock); |
| 542 | 567 | ||
| 543 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 568 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 569 | |||
| 570 | if (!irq_may_run(desc)) { | ||
| 571 | desc->istate |= IRQS_PENDING; | ||
| 572 | mask_ack_irq(desc); | ||
| 573 | goto out_unlock; | ||
| 574 | } | ||
| 575 | |||
| 544 | /* | 576 | /* |
| 545 | * If we're currently running this IRQ, or its disabled, | 577 | * If its disabled or no action available then mask it and get |
| 546 | * we shouldn't process the IRQ. Mark it pending, handle | 578 | * out of here. |
| 547 | * the necessary masking and go out | ||
| 548 | */ | 579 | */ |
| 549 | if (unlikely(irqd_irq_disabled(&desc->irq_data) || | 580 | if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { |
| 550 | irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { | 581 | desc->istate |= IRQS_PENDING; |
| 551 | if (!irq_check_poll(desc)) { | 582 | mask_ack_irq(desc); |
| 552 | desc->istate |= IRQS_PENDING; | 583 | goto out_unlock; |
| 553 | mask_ack_irq(desc); | ||
| 554 | goto out_unlock; | ||
| 555 | } | ||
| 556 | } | 584 | } |
| 585 | |||
| 557 | kstat_incr_irqs_this_cpu(irq, desc); | 586 | kstat_incr_irqs_this_cpu(irq, desc); |
| 558 | 587 | ||
| 559 | /* Start handling the irq */ | 588 | /* Start handling the irq */ |
| @@ -602,18 +631,21 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) | |||
| 602 | raw_spin_lock(&desc->lock); | 631 | raw_spin_lock(&desc->lock); |
| 603 | 632 | ||
| 604 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 633 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
| 634 | |||
| 635 | if (!irq_may_run(desc)) { | ||
| 636 | desc->istate |= IRQS_PENDING; | ||
| 637 | goto out_eoi; | ||
| 638 | } | ||
| 639 | |||
| 605 | /* | 640 | /* |
| 606 | * If we're currently running this IRQ, or its disabled, | 641 | * If its disabled or no action available then mask it and get |
| 607 | * we shouldn't process the IRQ. Mark it pending, handle | 642 | * out of here. |
| 608 | * the necessary masking and go out | ||
| 609 | */ | 643 | */ |
| 610 | if (unlikely(irqd_irq_disabled(&desc->irq_data) || | 644 | if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { |
| 611 | irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { | 645 | desc->istate |= IRQS_PENDING; |
| 612 | if (!irq_check_poll(desc)) { | 646 | goto out_eoi; |
| 613 | desc->istate |= IRQS_PENDING; | ||
| 614 | goto out_eoi; | ||
| 615 | } | ||
| 616 | } | 647 | } |
| 648 | |||
| 617 | kstat_incr_irqs_this_cpu(irq, desc); | 649 | kstat_incr_irqs_this_cpu(irq, desc); |
| 618 | 650 | ||
| 619 | do { | 651 | do { |
| @@ -670,7 +702,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) | |||
| 670 | { | 702 | { |
| 671 | struct irq_chip *chip = irq_desc_get_chip(desc); | 703 | struct irq_chip *chip = irq_desc_get_chip(desc); |
| 672 | struct irqaction *action = desc->action; | 704 | struct irqaction *action = desc->action; |
| 673 | void *dev_id = __this_cpu_ptr(action->percpu_dev_id); | 705 | void *dev_id = raw_cpu_ptr(action->percpu_dev_id); |
| 674 | irqreturn_t res; | 706 | irqreturn_t res; |
| 675 | 707 | ||
| 676 | kstat_incr_irqs_this_cpu(irq, desc); | 708 | kstat_incr_irqs_this_cpu(irq, desc); |
| @@ -699,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
| 699 | if (!handle) { | 731 | if (!handle) { |
| 700 | handle = handle_bad_irq; | 732 | handle = handle_bad_irq; |
| 701 | } else { | 733 | } else { |
| 702 | if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) | 734 | struct irq_data *irq_data = &desc->irq_data; |
| 735 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 736 | /* | ||
| 737 | * With hierarchical domains we might run into a | ||
| 738 | * situation where the outermost chip is not yet set | ||
| 739 | * up, but the inner chips are there. Instead of | ||
| 740 | * bailing we install the handler, but obviously we | ||
| 741 | * cannot enable/startup the interrupt at this point. | ||
| 742 | */ | ||
| 743 | while (irq_data) { | ||
| 744 | if (irq_data->chip != &no_irq_chip) | ||
| 745 | break; | ||
| 746 | /* | ||
| 747 | * Bail out if the outer chip is not set up | ||
| 748 | * and the interrrupt supposed to be started | ||
| 749 | * right away. | ||
| 750 | */ | ||
| 751 | if (WARN_ON(is_chained)) | ||
| 752 | goto out; | ||
| 753 | /* Try the parent */ | ||
| 754 | irq_data = irq_data->parent_data; | ||
| 755 | } | ||
| 756 | #endif | ||
| 757 | if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip)) | ||
| 703 | goto out; | 758 | goto out; |
| 704 | } | 759 | } |
| 705 | 760 | ||
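With hierarchical domains, __irq_set_handler() above walks irq_data->parent_data until it finds a chip other than no_irq_chip before deciding whether the handler can be installed. A minimal sketch of that walk is shown here; the types are made-up stand-ins, not the kernel's irq_data/irq_chip.

```c
#include <stdio.h>

struct chip_sketch {
    const char *name;
};

struct irq_data_sketch {
    struct chip_sketch *chip;
    struct irq_data_sketch *parent_data;
};

static struct chip_sketch no_irq_chip = { "none" };

/* Walk towards the root of the hierarchy until a real chip shows up. */
static struct irq_data_sketch *first_real_chip(struct irq_data_sketch *d)
{
    while (d && d->chip == &no_irq_chip)
        d = d->parent_data;
    return d;               /* NULL means no usable chip anywhere */
}

int main(void)
{
    struct chip_sketch gic = { "parent-gic" };
    struct irq_data_sketch parent = { &gic, NULL };
    struct irq_data_sketch outer = { &no_irq_chip, &parent };
    struct irq_data_sketch *d = first_real_chip(&outer);

    printf("install handler via chip '%s'\n", d ? d->chip->name : "none");
    return 0;
}
```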
| @@ -818,3 +873,105 @@ void irq_cpu_offline(void) | |||
| 818 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 873 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 819 | } | 874 | } |
| 820 | } | 875 | } |
| 876 | |||
| 877 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 878 | /** | ||
| 879 | * irq_chip_ack_parent - Acknowledge the parent interrupt | ||
| 880 | * @data: Pointer to interrupt specific data | ||
| 881 | */ | ||
| 882 | void irq_chip_ack_parent(struct irq_data *data) | ||
| 883 | { | ||
| 884 | data = data->parent_data; | ||
| 885 | data->chip->irq_ack(data); | ||
| 886 | } | ||
| 887 | |||
| 888 | /** | ||
| 889 | * irq_chip_mask_parent - Mask the parent interrupt | ||
| 890 | * @data: Pointer to interrupt specific data | ||
| 891 | */ | ||
| 892 | void irq_chip_mask_parent(struct irq_data *data) | ||
| 893 | { | ||
| 894 | data = data->parent_data; | ||
| 895 | data->chip->irq_mask(data); | ||
| 896 | } | ||
| 897 | |||
| 898 | /** | ||
| 899 | * irq_chip_unmask_parent - Unmask the parent interrupt | ||
| 900 | * @data: Pointer to interrupt specific data | ||
| 901 | */ | ||
| 902 | void irq_chip_unmask_parent(struct irq_data *data) | ||
| 903 | { | ||
| 904 | data = data->parent_data; | ||
| 905 | data->chip->irq_unmask(data); | ||
| 906 | } | ||
| 907 | |||
| 908 | /** | ||
| 909 | * irq_chip_eoi_parent - Invoke EOI on the parent interrupt | ||
| 910 | * @data: Pointer to interrupt specific data | ||
| 911 | */ | ||
| 912 | void irq_chip_eoi_parent(struct irq_data *data) | ||
| 913 | { | ||
| 914 | data = data->parent_data; | ||
| 915 | data->chip->irq_eoi(data); | ||
| 916 | } | ||
| 917 | |||
| 918 | /** | ||
| 919 | * irq_chip_set_affinity_parent - Set affinity on the parent interrupt | ||
| 920 | * @data: Pointer to interrupt specific data | ||
| 921 | * @dest: The affinity mask to set | ||
| 922 | * @force: Flag to enforce setting (disable online checks) | ||
| 923 | * | ||
| 924 | * Conditional, as the underlying parent chip might not implement it. | ||
| 925 | */ | ||
| 926 | int irq_chip_set_affinity_parent(struct irq_data *data, | ||
| 927 | const struct cpumask *dest, bool force) | ||
| 928 | { | ||
| 929 | data = data->parent_data; | ||
| 930 | if (data->chip->irq_set_affinity) | ||
| 931 | return data->chip->irq_set_affinity(data, dest, force); | ||
| 932 | |||
| 933 | return -ENOSYS; | ||
| 934 | } | ||
| 935 | |||
| 936 | /** | ||
| 937 | * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware | ||
| 938 | * @data: Pointer to interrupt specific data | ||
| 939 | * | ||
| 940 | * Iterate through the domain hierarchy of the interrupt and check | ||
| 941 | * whether a hw retrigger function exists. If yes, invoke it. | ||
| 942 | */ | ||
| 943 | int irq_chip_retrigger_hierarchy(struct irq_data *data) | ||
| 944 | { | ||
| 945 | for (data = data->parent_data; data; data = data->parent_data) | ||
| 946 | if (data->chip && data->chip->irq_retrigger) | ||
| 947 | return data->chip->irq_retrigger(data); | ||
| 948 | |||
| 949 | return -ENOSYS; | ||
| 950 | } | ||
| 951 | #endif | ||
| 952 | |||
| 953 | /** | ||
| 954 | * irq_chip_compose_msi_msg - Compose an MSI message for an irq chip | ||
| 955 | * @data: Pointer to interrupt specific data | ||
| 956 | * @msg: Pointer to the MSI message | ||
| 957 | * | ||
| 958 | * For hierarchical domains we find the first chip in the hierarchy | ||
| 959 | * which implements the irq_compose_msi_msg callback. For non- | ||
| 960 | * hierarchical domains we use the top level chip. | ||
| 961 | */ | ||
| 962 | int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) | ||
| 963 | { | ||
| 964 | struct irq_data *pos = NULL; | ||
| 965 | |||
| 966 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 967 | for (; data; data = data->parent_data) | ||
| 968 | #endif | ||
| 969 | if (data->chip && data->chip->irq_compose_msi_msg) | ||
| 970 | pos = data; | ||
| 971 | if (!pos) | ||
| 972 | return -ENOSYS; | ||
| 973 | |||
| 974 | pos->chip->irq_compose_msi_msg(pos, msg); | ||
| 975 | |||
| 976 | return 0; | ||
| 977 | } | ||
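The new irq_chip_*_parent() helpers all follow the same shape: hop to parent_data and delegate, or walk the chain until some ancestor implements the callback. The sketch below models irq_chip_retrigger_hierarchy() in user space; -1 plays the role of -ENOSYS and the chip types are illustrative only.

```c
#include <stdio.h>

struct chip_sketch {
    const char *name;
    int (*retrigger)(void);
};

struct irq_data_sketch {
    struct chip_sketch *chip;
    struct irq_data_sketch *parent_data;
};

static int gic_retrigger(void)
{
    printf("root chip retriggers the interrupt in hardware\n");
    return 0;
}

/* Ask the first ancestor that knows how to retrigger; -1 stands in for -ENOSYS. */
static int retrigger_hierarchy(struct irq_data_sketch *data)
{
    for (data = data->parent_data; data; data = data->parent_data)
        if (data->chip && data->chip->retrigger)
            return data->chip->retrigger();
    return -1;
}

int main(void)
{
    struct chip_sketch root = { "gic", gic_retrigger };
    struct chip_sketch msi = { "msi-chip", NULL };
    struct irq_data_sketch root_data = { &root, NULL };
    struct irq_data_sketch leaf_data = { &msi, &root_data };

    return retrigger_hierarchy(&leaf_data) ? 1 : 0;
}
```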
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1ef0606797c9..d5d0f7345c54 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c | |||
| @@ -38,7 +38,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data) | |||
| 38 | * | 38 | * |
| 39 | * Except for the extra @dev argument, this function takes the | 39 | * Except for the extra @dev argument, this function takes the |
| 40 | * same arguments and performs the same function as | 40 | * same arguments and performs the same function as |
| 41 | * request_irq(). IRQs requested with this function will be | 41 | * request_threaded_irq(). IRQs requested with this function will be |
| 42 | * automatically freed on driver detach. | 42 | * automatically freed on driver detach. |
| 43 | * | 43 | * |
| 44 | * If an IRQ allocated with this function needs to be freed | 44 | * If an IRQ allocated with this function needs to be freed |
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index cf80e7b0ddab..61024e8abdef 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
| @@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) | |||
| 39 | u32 mask = d->mask; | 39 | u32 mask = d->mask; |
| 40 | 40 | ||
| 41 | irq_gc_lock(gc); | 41 | irq_gc_lock(gc); |
| 42 | irq_reg_writel(mask, gc->reg_base + ct->regs.disable); | 42 | irq_reg_writel(gc, mask, ct->regs.disable); |
| 43 | *ct->mask_cache &= ~mask; | 43 | *ct->mask_cache &= ~mask; |
| 44 | irq_gc_unlock(gc); | 44 | irq_gc_unlock(gc); |
| 45 | } | 45 | } |
| @@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d) | |||
| 59 | 59 | ||
| 60 | irq_gc_lock(gc); | 60 | irq_gc_lock(gc); |
| 61 | *ct->mask_cache |= mask; | 61 | *ct->mask_cache |= mask; |
| 62 | irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); | 62 | irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); |
| 63 | irq_gc_unlock(gc); | 63 | irq_gc_unlock(gc); |
| 64 | } | 64 | } |
| 65 | EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); | 65 | EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); |
| @@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d) | |||
| 79 | 79 | ||
| 80 | irq_gc_lock(gc); | 80 | irq_gc_lock(gc); |
| 81 | *ct->mask_cache &= ~mask; | 81 | *ct->mask_cache &= ~mask; |
| 82 | irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); | 82 | irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); |
| 83 | irq_gc_unlock(gc); | 83 | irq_gc_unlock(gc); |
| 84 | } | 84 | } |
| 85 | EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); | 85 | EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); |
| @@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) | |||
| 98 | u32 mask = d->mask; | 98 | u32 mask = d->mask; |
| 99 | 99 | ||
| 100 | irq_gc_lock(gc); | 100 | irq_gc_lock(gc); |
| 101 | irq_reg_writel(mask, gc->reg_base + ct->regs.enable); | 101 | irq_reg_writel(gc, mask, ct->regs.enable); |
| 102 | *ct->mask_cache |= mask; | 102 | *ct->mask_cache |= mask; |
| 103 | irq_gc_unlock(gc); | 103 | irq_gc_unlock(gc); |
| 104 | } | 104 | } |
| @@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d) | |||
| 114 | u32 mask = d->mask; | 114 | u32 mask = d->mask; |
| 115 | 115 | ||
| 116 | irq_gc_lock(gc); | 116 | irq_gc_lock(gc); |
| 117 | irq_reg_writel(mask, gc->reg_base + ct->regs.ack); | 117 | irq_reg_writel(gc, mask, ct->regs.ack); |
| 118 | irq_gc_unlock(gc); | 118 | irq_gc_unlock(gc); |
| 119 | } | 119 | } |
| 120 | EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); | 120 | EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); |
| @@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d) | |||
| 130 | u32 mask = ~d->mask; | 130 | u32 mask = ~d->mask; |
| 131 | 131 | ||
| 132 | irq_gc_lock(gc); | 132 | irq_gc_lock(gc); |
| 133 | irq_reg_writel(mask, gc->reg_base + ct->regs.ack); | 133 | irq_reg_writel(gc, mask, ct->regs.ack); |
| 134 | irq_gc_unlock(gc); | 134 | irq_gc_unlock(gc); |
| 135 | } | 135 | } |
| 136 | 136 | ||
| @@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) | |||
| 145 | u32 mask = d->mask; | 145 | u32 mask = d->mask; |
| 146 | 146 | ||
| 147 | irq_gc_lock(gc); | 147 | irq_gc_lock(gc); |
| 148 | irq_reg_writel(mask, gc->reg_base + ct->regs.mask); | 148 | irq_reg_writel(gc, mask, ct->regs.mask); |
| 149 | irq_reg_writel(mask, gc->reg_base + ct->regs.ack); | 149 | irq_reg_writel(gc, mask, ct->regs.ack); |
| 150 | irq_gc_unlock(gc); | 150 | irq_gc_unlock(gc); |
| 151 | } | 151 | } |
| 152 | 152 | ||
| @@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d) | |||
| 161 | u32 mask = d->mask; | 161 | u32 mask = d->mask; |
| 162 | 162 | ||
| 163 | irq_gc_lock(gc); | 163 | irq_gc_lock(gc); |
| 164 | irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); | 164 | irq_reg_writel(gc, mask, ct->regs.eoi); |
| 165 | irq_gc_unlock(gc); | 165 | irq_gc_unlock(gc); |
| 166 | } | 166 | } |
| 167 | 167 | ||
| @@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) | |||
| 191 | return 0; | 191 | return 0; |
| 192 | } | 192 | } |
| 193 | 193 | ||
| 194 | static u32 irq_readl_be(void __iomem *addr) | ||
| 195 | { | ||
| 196 | return ioread32be(addr); | ||
| 197 | } | ||
| 198 | |||
| 199 | static void irq_writel_be(u32 val, void __iomem *addr) | ||
| 200 | { | ||
| 201 | iowrite32be(val, addr); | ||
| 202 | } | ||
| 203 | |||
| 194 | static void | 204 | static void |
| 195 | irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, | 205 | irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, |
| 196 | int num_ct, unsigned int irq_base, | 206 | int num_ct, unsigned int irq_base, |
| @@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) | |||
| 245 | } | 255 | } |
| 246 | ct[i].mask_cache = mskptr; | 256 | ct[i].mask_cache = mskptr; |
| 247 | if (flags & IRQ_GC_INIT_MASK_CACHE) | 257 | if (flags & IRQ_GC_INIT_MASK_CACHE) |
| 248 | *mskptr = irq_reg_readl(gc->reg_base + mskreg); | 258 | *mskptr = irq_reg_readl(gc, mskreg); |
| 249 | } | 259 | } |
| 250 | } | 260 | } |
| 251 | 261 | ||
| @@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | |||
| 300 | dgc->gc[i] = gc = tmp; | 310 | dgc->gc[i] = gc = tmp; |
| 301 | irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, | 311 | irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, |
| 302 | NULL, handler); | 312 | NULL, handler); |
| 313 | |||
| 303 | gc->domain = d; | 314 | gc->domain = d; |
| 315 | if (gcflags & IRQ_GC_BE_IO) { | ||
| 316 | gc->reg_readl = &irq_readl_be; | ||
| 317 | gc->reg_writel = &irq_writel_be; | ||
| 318 | } | ||
| 319 | |||
| 304 | raw_spin_lock_irqsave(&gc_lock, flags); | 320 | raw_spin_lock_irqsave(&gc_lock, flags); |
| 305 | list_add_tail(&gc->list, &gc_list); | 321 | list_add_tail(&gc->list, &gc_list); |
| 306 | raw_spin_unlock_irqrestore(&gc_lock, flags); | 322 | raw_spin_unlock_irqrestore(&gc_lock, flags); |
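The generic-chip hunks above switch irq_reg_readl()/irq_reg_writel() to take the irq_chip_generic pointer and add ioread32be/iowrite32be wrappers that get installed when the new IRQ_GC_BE_IO flag is passed to irq_alloc_domain_generic_chips(). Purely as a hedged sketch of how a driver might use this (the register offset, base pointer and function names below are invented for illustration, not taken from this merge):

```c
#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define MY_INTC_ENABLE	0x10	/* hypothetical enable-register offset */

static int __init my_intc_setup(struct irq_domain *domain,
				struct device_node *np, void __iomem *base)
{
	struct irq_chip_generic *gc;
	int ret;

	/* IRQ_GC_BE_IO selects the big-endian reg_readl/reg_writel hooks */
	ret = irq_alloc_domain_generic_chips(domain, 32, 1, np->name,
					     handle_level_irq, 0, 0,
					     IRQ_GC_INIT_MASK_CACHE |
					     IRQ_GC_BE_IO);
	if (ret)
		return ret;

	gc = irq_get_domain_generic_chip(domain, 0);
	gc->reg_base = base;
	gc->chip_types[0].regs.enable = MY_INTC_ENABLE;
	/* irq_gc_unmask_enable_reg() now writes through gc->reg_writel */
	gc->chip_types[0].chip.irq_unmask = irq_gc_unmask_enable_reg;
	return 0;
}
```

Drivers that do not pass IRQ_GC_BE_IO keep the previous little-endian behaviour, since the accessors are only overridden when the flag is set.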
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 099ea2e0eb88..df553b0af936 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -63,8 +63,8 @@ enum { | |||
| 63 | 63 | ||
| 64 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | 64 | extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, |
| 65 | unsigned long flags); | 65 | unsigned long flags); |
| 66 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); | 66 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq); |
| 67 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); | 67 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq); |
| 68 | 68 | ||
| 69 | extern int irq_startup(struct irq_desc *desc, bool resend); | 69 | extern int irq_startup(struct irq_desc *desc, bool resend); |
| 70 | extern void irq_shutdown(struct irq_desc *desc); | 70 | extern void irq_shutdown(struct irq_desc *desc); |
| @@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc); | |||
| 78 | 78 | ||
| 79 | #ifdef CONFIG_SPARSE_IRQ | 79 | #ifdef CONFIG_SPARSE_IRQ |
| 80 | static inline void irq_mark_irq(unsigned int irq) { } | 80 | static inline void irq_mark_irq(unsigned int irq) { } |
| 81 | extern void irq_lock_sparse(void); | ||
| 82 | extern void irq_unlock_sparse(void); | ||
| 81 | #else | 83 | #else |
| 82 | extern void irq_mark_irq(unsigned int irq); | 84 | extern void irq_mark_irq(unsigned int irq); |
| 85 | static inline void irq_lock_sparse(void) { } | ||
| 86 | static inline void irq_unlock_sparse(void) { } | ||
| 83 | #endif | 87 | #endif |
| 84 | 88 | ||
| 85 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 89 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
| @@ -194,3 +198,15 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d | |||
| 194 | __this_cpu_inc(*desc->kstat_irqs); | 198 | __this_cpu_inc(*desc->kstat_irqs); |
| 195 | __this_cpu_inc(kstat.irqs_sum); | 199 | __this_cpu_inc(kstat.irqs_sum); |
| 196 | } | 200 | } |
| 201 | |||
| 202 | #ifdef CONFIG_PM_SLEEP | ||
| 203 | bool irq_pm_check_wakeup(struct irq_desc *desc); | ||
| 204 | void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); | ||
| 205 | void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); | ||
| 206 | #else | ||
| 207 | static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; } | ||
| 208 | static inline void | ||
| 209 | irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } | ||
| 210 | static inline void | ||
| 211 | irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } | ||
| 212 | #endif | ||
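The new irq_pm_* and irq_lock_sparse()/irq_unlock_sparse() declarations above follow the usual kernel pattern of pairing the real prototypes with inline no-op stubs, so that callers such as __setup_irq(), __free_irq() and the kstat readers can invoke them unconditionally without sprinkling #ifdefs. A minimal sketch of that idiom, with an invented feature name only to illustrate the shape:

```c
/* Hypothetical example of the conditional-stub pattern used above */
#ifdef CONFIG_MY_FEATURE
extern void my_feature_hook(struct irq_desc *desc);
#else
static inline void my_feature_hook(struct irq_desc *desc) { }
#endif

/* Callers stay #ifdef-free; the stub compiles away when disabled */
static void my_caller(struct irq_desc *desc)
{
	my_feature_hook(desc);
}
```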
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1487a123db5c..99793b9b6d23 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/kernel_stat.h> | 14 | #include <linux/kernel_stat.h> |
| 15 | #include <linux/radix-tree.h> | 15 | #include <linux/radix-tree.h> |
| 16 | #include <linux/bitmap.h> | 16 | #include <linux/bitmap.h> |
| 17 | #include <linux/irqdomain.h> | ||
| 17 | 18 | ||
| 18 | #include "internals.h" | 19 | #include "internals.h" |
| 19 | 20 | ||
| @@ -131,6 +132,16 @@ static void free_masks(struct irq_desc *desc) | |||
| 131 | static inline void free_masks(struct irq_desc *desc) { } | 132 | static inline void free_masks(struct irq_desc *desc) { } |
| 132 | #endif | 133 | #endif |
| 133 | 134 | ||
| 135 | void irq_lock_sparse(void) | ||
| 136 | { | ||
| 137 | mutex_lock(&sparse_irq_lock); | ||
| 138 | } | ||
| 139 | |||
| 140 | void irq_unlock_sparse(void) | ||
| 141 | { | ||
| 142 | mutex_unlock(&sparse_irq_lock); | ||
| 143 | } | ||
| 144 | |||
| 134 | static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) | 145 | static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) |
| 135 | { | 146 | { |
| 136 | struct irq_desc *desc; | 147 | struct irq_desc *desc; |
| @@ -167,6 +178,12 @@ static void free_desc(unsigned int irq) | |||
| 167 | 178 | ||
| 168 | unregister_irq_proc(irq, desc); | 179 | unregister_irq_proc(irq, desc); |
| 169 | 180 | ||
| 181 | /* | ||
| 182 | * sparse_irq_lock also protects show_interrupts() and | ||
| 183 | * kstat_irqs_usr(). Once we have deleted the descriptor from the | ||
| 184 | * sparse tree we can free it. Lookups from proc will then fail to | ||
| 185 | * find the descriptor. | ||
| 186 | */ | ||
| 170 | mutex_lock(&sparse_irq_lock); | 187 | mutex_lock(&sparse_irq_lock); |
| 171 | delete_irq_desc(irq); | 188 | delete_irq_desc(irq); |
| 172 | mutex_unlock(&sparse_irq_lock); | 189 | mutex_unlock(&sparse_irq_lock); |
| @@ -336,6 +353,47 @@ int generic_handle_irq(unsigned int irq) | |||
| 336 | } | 353 | } |
| 337 | EXPORT_SYMBOL_GPL(generic_handle_irq); | 354 | EXPORT_SYMBOL_GPL(generic_handle_irq); |
| 338 | 355 | ||
| 356 | #ifdef CONFIG_HANDLE_DOMAIN_IRQ | ||
| 357 | /** | ||
| 358 | * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain | ||
| 359 | * @domain: The domain where to perform the lookup | ||
| 360 | * @hwirq: The HW irq number to convert to a logical one | ||
| 361 | * @lookup: Whether to perform the domain lookup or not | ||
| 362 | * @regs: Register file coming from the low-level handling code | ||
| 363 | * | ||
| 364 | * Returns: 0 on success, or -EINVAL if conversion has failed | ||
| 365 | */ | ||
| 366 | int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, | ||
| 367 | bool lookup, struct pt_regs *regs) | ||
| 368 | { | ||
| 369 | struct pt_regs *old_regs = set_irq_regs(regs); | ||
| 370 | unsigned int irq = hwirq; | ||
| 371 | int ret = 0; | ||
| 372 | |||
| 373 | irq_enter(); | ||
| 374 | |||
| 375 | #ifdef CONFIG_IRQ_DOMAIN | ||
| 376 | if (lookup) | ||
| 377 | irq = irq_find_mapping(domain, hwirq); | ||
| 378 | #endif | ||
| 379 | |||
| 380 | /* | ||
| 381 | * Some hardware gives randomly wrong interrupts. Rather | ||
| 382 | * than crashing, do something sensible. | ||
| 383 | */ | ||
| 384 | if (unlikely(!irq || irq >= nr_irqs)) { | ||
| 385 | ack_bad_irq(irq); | ||
| 386 | ret = -EINVAL; | ||
| 387 | } else { | ||
| 388 | generic_handle_irq(irq); | ||
| 389 | } | ||
| 390 | |||
| 391 | irq_exit(); | ||
| 392 | set_irq_regs(old_regs); | ||
| 393 | return ret; | ||
| 394 | } | ||
| 395 | #endif | ||
| 396 | |||
| 339 | /* Dynamic interrupt handling */ | 397 | /* Dynamic interrupt handling */ |
| 340 | 398 | ||
| 341 | /** | 399 | /** |
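__handle_domain_irq() above folds the usual irq_enter()/domain lookup/generic_handle_irq()/irq_exit() sequence into one helper, guarded by CONFIG_HANDLE_DOMAIN_IRQ. As an illustrative sketch only (the register layout and the my_* identifiers are made up, and the low-level hook-up differs per architecture), a root interrupt controller driver could call it from its top-level handler like this:

```c
#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>

#define MY_INTC_PENDING	0x00		/* hypothetical pending-status register */

static struct irq_domain *my_domain;	/* created at probe time */
static void __iomem *my_base;

/* Top-level handler, e.g. installed with set_handle_irq() on ARM */
static void my_intc_handle_irq(struct pt_regs *regs)
{
	u32 status;

	while ((status = readl_relaxed(my_base + MY_INTC_PENDING))) {
		unsigned int hwirq = __ffs(status);

		/* lookup=true: translate hwirq through my_domain first */
		__handle_domain_irq(my_domain, hwirq, true, regs);
	}
}
```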
| @@ -532,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq) | |||
| 532 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); | 590 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); |
| 533 | } | 591 | } |
| 534 | 592 | ||
| 593 | /** | ||
| 594 | * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu | ||
| 595 | * @irq: The interrupt number | ||
| 596 | * @cpu: The cpu number | ||
| 597 | * | ||
| 598 | * Returns the sum of interrupt counts on @cpu since boot for | ||
| 599 | * @irq. The caller must ensure that the interrupt is not removed | ||
| 600 | * concurrently. | ||
| 601 | */ | ||
| 535 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | 602 | unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) |
| 536 | { | 603 | { |
| 537 | struct irq_desc *desc = irq_to_desc(irq); | 604 | struct irq_desc *desc = irq_to_desc(irq); |
| @@ -540,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | |||
| 540 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; | 607 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; |
| 541 | } | 608 | } |
| 542 | 609 | ||
| 610 | /** | ||
| 611 | * kstat_irqs - Get the statistics for an interrupt | ||
| 612 | * @irq: The interrupt number | ||
| 613 | * | ||
| 614 | * Returns the sum of interrupt counts on all cpus since boot for | ||
| 615 | * @irq. The caller must ensure that the interrupt is not removed | ||
| 616 | * concurrently. | ||
| 617 | */ | ||
| 543 | unsigned int kstat_irqs(unsigned int irq) | 618 | unsigned int kstat_irqs(unsigned int irq) |
| 544 | { | 619 | { |
| 545 | struct irq_desc *desc = irq_to_desc(irq); | 620 | struct irq_desc *desc = irq_to_desc(irq); |
| @@ -552,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq) | |||
| 552 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); | 627 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); |
| 553 | return sum; | 628 | return sum; |
| 554 | } | 629 | } |
| 630 | |||
| 631 | /** | ||
| 632 | * kstat_irqs_usr - Get the statistics for an interrupt | ||
| 633 | * @irq: The interrupt number | ||
| 634 | * | ||
| 635 | * Returns the sum of interrupt counts on all cpus since boot for | ||
| 636 | * @irq. Contrary to kstat_irqs() this can be called from any | ||
| 637 | * preemptible context. It's protected against concurrent removal of | ||
| 638 | * an interrupt descriptor when sparse irqs are enabled. | ||
| 639 | */ | ||
| 640 | unsigned int kstat_irqs_usr(unsigned int irq) | ||
| 641 | { | ||
| 642 | int sum; | ||
| 643 | |||
| 644 | irq_lock_sparse(); | ||
| 645 | sum = kstat_irqs(irq); | ||
| 646 | irq_unlock_sparse(); | ||
| 647 | return sum; | ||
| 648 | } | ||
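The combination of irq_lock_sparse()/irq_unlock_sparse() and kstat_irqs_usr() above exists so that readers outside the core (the proc paths mentioned in the free_desc() comment) cannot race with a descriptor being freed. A hedged sketch of the intended calling pattern, with an invented seq_file callback standing in for the real proc code:

```c
#include <linux/irqnr.h>
#include <linux/kernel_stat.h>
#include <linux/seq_file.h>

/* Hypothetical reader; the real users are show_interrupts() and /proc/stat */
static int my_irq_stats_show(struct seq_file *m, void *v)
{
	int irq;

	for_each_irq_nr(irq) {
		/* Safe in preemptible context: takes sparse_irq_lock internally */
		unsigned int count = kstat_irqs_usr(irq);

		if (count)
			seq_printf(m, "irq %d: %u\n", irq, count);
	}
	return 0;
}
```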
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6534ff6ce02e..7fac311057b8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
| @@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex); | |||
| 23 | static DEFINE_MUTEX(revmap_trees_mutex); | 23 | static DEFINE_MUTEX(revmap_trees_mutex); |
| 24 | static struct irq_domain *irq_default_domain; | 24 | static struct irq_domain *irq_default_domain; |
| 25 | 25 | ||
| 26 | static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, | ||
| 27 | irq_hw_number_t hwirq, int node); | ||
| 28 | static void irq_domain_check_hierarchy(struct irq_domain *domain); | ||
| 29 | |||
| 26 | /** | 30 | /** |
| 27 | * __irq_domain_add() - Allocate a new irq_domain data structure | 31 | * __irq_domain_add() - Allocate a new irq_domain data structure |
| 28 | * @of_node: optional device-tree node of the interrupt controller | 32 | * @of_node: optional device-tree node of the interrupt controller |
| @@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain; | |||
| 30 | * @hwirq_max: Maximum number of interrupts supported by controller | 34 | * @hwirq_max: Maximum number of interrupts supported by controller |
| 31 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no | 35 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no |
| 32 | * direct mapping | 36 | * direct mapping |
| 33 | * @ops: map/unmap domain callbacks | 37 | * @ops: domain callbacks |
| 34 | * @host_data: Controller private data pointer | 38 | * @host_data: Controller private data pointer |
| 35 | * | 39 | * |
| 36 | * Allocates and initializes an irq_domain structure. | 40 | * Allocates and initializes an irq_domain structure. |
| @@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, | |||
| 56 | domain->hwirq_max = hwirq_max; | 60 | domain->hwirq_max = hwirq_max; |
| 57 | domain->revmap_size = size; | 61 | domain->revmap_size = size; |
| 58 | domain->revmap_direct_max_irq = direct_max; | 62 | domain->revmap_direct_max_irq = direct_max; |
| 63 | irq_domain_check_hierarchy(domain); | ||
| 59 | 64 | ||
| 60 | mutex_lock(&irq_domain_mutex); | 65 | mutex_lock(&irq_domain_mutex); |
| 61 | list_add(&domain->link, &irq_domain_list); | 66 | list_add(&domain->link, &irq_domain_list); |
| @@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove); | |||
| 109 | * @first_irq: first number of irq block assigned to the domain, | 114 | * @first_irq: first number of irq block assigned to the domain, |
| 110 | * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then | 115 | * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then |
| 111 | * pre-map all of the irqs in the domain to virqs starting at first_irq. | 116 | * pre-map all of the irqs in the domain to virqs starting at first_irq. |
| 112 | * @ops: map/unmap domain callbacks | 117 | * @ops: domain callbacks |
| 113 | * @host_data: Controller private data pointer | 118 | * @host_data: Controller private data pointer |
| 114 | * | 119 | * |
| 115 | * Allocates an irq_domain, and optionally if first_irq is positive then also | 120 | * Allocates an irq_domain, and optionally if first_irq is positive then also |
| @@ -174,10 +179,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
| 174 | 179 | ||
| 175 | domain = __irq_domain_add(of_node, first_hwirq + size, | 180 | domain = __irq_domain_add(of_node, first_hwirq + size, |
| 176 | first_hwirq + size, 0, ops, host_data); | 181 | first_hwirq + size, 0, ops, host_data); |
| 177 | if (!domain) | 182 | if (domain) |
| 178 | return NULL; | 183 | irq_domain_associate_many(domain, first_irq, first_hwirq, size); |
| 179 | |||
| 180 | irq_domain_associate_many(domain, first_irq, first_hwirq, size); | ||
| 181 | 184 | ||
| 182 | return domain; | 185 | return domain; |
| 183 | } | 186 | } |
| @@ -388,7 +391,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); | |||
| 388 | unsigned int irq_create_mapping(struct irq_domain *domain, | 391 | unsigned int irq_create_mapping(struct irq_domain *domain, |
| 389 | irq_hw_number_t hwirq) | 392 | irq_hw_number_t hwirq) |
| 390 | { | 393 | { |
| 391 | unsigned int hint; | ||
| 392 | int virq; | 394 | int virq; |
| 393 | 395 | ||
| 394 | pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); | 396 | pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); |
| @@ -410,12 +412,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
| 410 | } | 412 | } |
| 411 | 413 | ||
| 412 | /* Allocate a virtual interrupt number */ | 414 | /* Allocate a virtual interrupt number */ |
| 413 | hint = hwirq % nr_irqs; | 415 | virq = irq_domain_alloc_descs(-1, 1, hwirq, |
| 414 | if (hint == 0) | 416 | of_node_to_nid(domain->of_node)); |
| 415 | hint++; | ||
| 416 | virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node)); | ||
| 417 | if (virq <= 0) | ||
| 418 | virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); | ||
| 419 | if (virq <= 0) { | 417 | if (virq <= 0) { |
| 420 | pr_debug("-> virq allocation failed\n"); | 418 | pr_debug("-> virq allocation failed\n"); |
| 421 | return 0; | 419 | return 0; |
| @@ -471,7 +469,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) | |||
| 471 | struct irq_domain *domain; | 469 | struct irq_domain *domain; |
| 472 | irq_hw_number_t hwirq; | 470 | irq_hw_number_t hwirq; |
| 473 | unsigned int type = IRQ_TYPE_NONE; | 471 | unsigned int type = IRQ_TYPE_NONE; |
| 474 | unsigned int virq; | 472 | int virq; |
| 475 | 473 | ||
| 476 | domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; | 474 | domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; |
| 477 | if (!domain) { | 475 | if (!domain) { |
| @@ -489,10 +487,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) | |||
| 489 | return 0; | 487 | return 0; |
| 490 | } | 488 | } |
| 491 | 489 | ||
| 492 | /* Create mapping */ | 490 | if (irq_domain_is_hierarchy(domain)) { |
| 493 | virq = irq_create_mapping(domain, hwirq); | 491 | /* |
| 494 | if (!virq) | 492 | * If we've already configured this interrupt, |
| 495 | return virq; | 493 | * don't do it again, or hell will break loose. |
| 494 | */ | ||
| 495 | virq = irq_find_mapping(domain, hwirq); | ||
| 496 | if (virq) | ||
| 497 | return virq; | ||
| 498 | |||
| 499 | virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); | ||
| 500 | if (virq <= 0) | ||
| 501 | return 0; | ||
| 502 | } else { | ||
| 503 | /* Create mapping */ | ||
| 504 | virq = irq_create_mapping(domain, hwirq); | ||
| 505 | if (!virq) | ||
| 506 | return virq; | ||
| 507 | } | ||
| 496 | 508 | ||
| 497 | /* Set type if specified and different than the current one */ | 509 | /* Set type if specified and different than the current one */ |
| 498 | if (type != IRQ_TYPE_NONE && | 510 | if (type != IRQ_TYPE_NONE && |
| @@ -540,8 +552,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain, | |||
| 540 | return 0; | 552 | return 0; |
| 541 | 553 | ||
| 542 | if (hwirq < domain->revmap_direct_max_irq) { | 554 | if (hwirq < domain->revmap_direct_max_irq) { |
| 543 | data = irq_get_irq_data(hwirq); | 555 | data = irq_domain_get_irq_data(domain, hwirq); |
| 544 | if (data && (data->domain == domain) && (data->hwirq == hwirq)) | 556 | if (data && data->hwirq == hwirq) |
| 545 | return hwirq; | 557 | return hwirq; |
| 546 | } | 558 | } |
| 547 | 559 | ||
| @@ -709,3 +721,518 @@ const struct irq_domain_ops irq_domain_simple_ops = { | |||
| 709 | .xlate = irq_domain_xlate_onetwocell, | 721 | .xlate = irq_domain_xlate_onetwocell, |
| 710 | }; | 722 | }; |
| 711 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | 723 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); |
| 724 | |||
| 725 | static int irq_domain_alloc_descs(int virq, unsigned int cnt, | ||
| 726 | irq_hw_number_t hwirq, int node) | ||
| 727 | { | ||
| 728 | unsigned int hint; | ||
| 729 | |||
| 730 | if (virq >= 0) { | ||
| 731 | virq = irq_alloc_descs(virq, virq, cnt, node); | ||
| 732 | } else { | ||
| 733 | hint = hwirq % nr_irqs; | ||
| 734 | if (hint == 0) | ||
| 735 | hint++; | ||
| 736 | virq = irq_alloc_descs_from(hint, cnt, node); | ||
| 737 | if (virq <= 0 && hint > 1) | ||
| 738 | virq = irq_alloc_descs_from(1, cnt, node); | ||
| 739 | } | ||
| 740 | |||
| 741 | return virq; | ||
| 742 | } | ||
| 743 | |||
| 744 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 745 | /** | ||
| 746 | * irq_domain_add_hierarchy - Add an irqdomain into the hierarchy | ||
| 747 | * @parent: Parent irq domain to associate with the new domain | ||
| 748 | * @flags: Irq domain flags associated to the domain | ||
| 749 | * @size: Size of the domain. See below | ||
| 750 | * @node: Optional device-tree node of the interrupt controller | ||
| 751 | * @ops: Pointer to the interrupt domain callbacks | ||
| 752 | * @host_data: Controller private data pointer | ||
| 753 | * | ||
| 754 | * If @size is 0 a tree domain is created, otherwise a linear domain. | ||
| 755 | * | ||
| 756 | * If successful the parent is associated to the new domain and the | ||
| 757 | * domain flags are set. | ||
| 758 | * Returns pointer to IRQ domain, or NULL on failure. | ||
| 759 | */ | ||
| 760 | struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, | ||
| 761 | unsigned int flags, | ||
| 762 | unsigned int size, | ||
| 763 | struct device_node *node, | ||
| 764 | const struct irq_domain_ops *ops, | ||
| 765 | void *host_data) | ||
| 766 | { | ||
| 767 | struct irq_domain *domain; | ||
| 768 | |||
| 769 | if (size) | ||
| 770 | domain = irq_domain_add_linear(node, size, ops, host_data); | ||
| 771 | else | ||
| 772 | domain = irq_domain_add_tree(node, ops, host_data); | ||
| 773 | if (domain) { | ||
| 774 | domain->parent = parent; | ||
| 775 | domain->flags |= flags; | ||
| 776 | } | ||
| 777 | |||
| 778 | return domain; | ||
| 779 | } | ||
| 780 | |||
| 781 | static void irq_domain_insert_irq(int virq) | ||
| 782 | { | ||
| 783 | struct irq_data *data; | ||
| 784 | |||
| 785 | for (data = irq_get_irq_data(virq); data; data = data->parent_data) { | ||
| 786 | struct irq_domain *domain = data->domain; | ||
| 787 | irq_hw_number_t hwirq = data->hwirq; | ||
| 788 | |||
| 789 | if (hwirq < domain->revmap_size) { | ||
| 790 | domain->linear_revmap[hwirq] = virq; | ||
| 791 | } else { | ||
| 792 | mutex_lock(&revmap_trees_mutex); | ||
| 793 | radix_tree_insert(&domain->revmap_tree, hwirq, data); | ||
| 794 | mutex_unlock(&revmap_trees_mutex); | ||
| 795 | } | ||
| 796 | |||
| 797 | /* If not already assigned, give the domain the chip's name */ | ||
| 798 | if (!domain->name && data->chip) | ||
| 799 | domain->name = data->chip->name; | ||
| 800 | } | ||
| 801 | |||
| 802 | irq_clear_status_flags(virq, IRQ_NOREQUEST); | ||
| 803 | } | ||
| 804 | |||
| 805 | static void irq_domain_remove_irq(int virq) | ||
| 806 | { | ||
| 807 | struct irq_data *data; | ||
| 808 | |||
| 809 | irq_set_status_flags(virq, IRQ_NOREQUEST); | ||
| 810 | irq_set_chip_and_handler(virq, NULL, NULL); | ||
| 811 | synchronize_irq(virq); | ||
| 812 | smp_mb(); | ||
| 813 | |||
| 814 | for (data = irq_get_irq_data(virq); data; data = data->parent_data) { | ||
| 815 | struct irq_domain *domain = data->domain; | ||
| 816 | irq_hw_number_t hwirq = data->hwirq; | ||
| 817 | |||
| 818 | if (hwirq < domain->revmap_size) { | ||
| 819 | domain->linear_revmap[hwirq] = 0; | ||
| 820 | } else { | ||
| 821 | mutex_lock(&revmap_trees_mutex); | ||
| 822 | radix_tree_delete(&domain->revmap_tree, hwirq); | ||
| 823 | mutex_unlock(&revmap_trees_mutex); | ||
| 824 | } | ||
| 825 | } | ||
| 826 | } | ||
| 827 | |||
| 828 | static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, | ||
| 829 | struct irq_data *child) | ||
| 830 | { | ||
| 831 | struct irq_data *irq_data; | ||
| 832 | |||
| 833 | irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node); | ||
| 834 | if (irq_data) { | ||
| 835 | child->parent_data = irq_data; | ||
| 836 | irq_data->irq = child->irq; | ||
| 837 | irq_data->node = child->node; | ||
| 838 | irq_data->domain = domain; | ||
| 839 | } | ||
| 840 | |||
| 841 | return irq_data; | ||
| 842 | } | ||
| 843 | |||
| 844 | static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) | ||
| 845 | { | ||
| 846 | struct irq_data *irq_data, *tmp; | ||
| 847 | int i; | ||
| 848 | |||
| 849 | for (i = 0; i < nr_irqs; i++) { | ||
| 850 | irq_data = irq_get_irq_data(virq + i); | ||
| 851 | tmp = irq_data->parent_data; | ||
| 852 | irq_data->parent_data = NULL; | ||
| 853 | irq_data->domain = NULL; | ||
| 854 | |||
| 855 | while (tmp) { | ||
| 856 | irq_data = tmp; | ||
| 857 | tmp = tmp->parent_data; | ||
| 858 | kfree(irq_data); | ||
| 859 | } | ||
| 860 | } | ||
| 861 | } | ||
| 862 | |||
| 863 | static int irq_domain_alloc_irq_data(struct irq_domain *domain, | ||
| 864 | unsigned int virq, unsigned int nr_irqs) | ||
| 865 | { | ||
| 866 | struct irq_data *irq_data; | ||
| 867 | struct irq_domain *parent; | ||
| 868 | int i; | ||
| 869 | |||
| 870 | /* The outermost irq_data is embedded in struct irq_desc */ | ||
| 871 | for (i = 0; i < nr_irqs; i++) { | ||
| 872 | irq_data = irq_get_irq_data(virq + i); | ||
| 873 | irq_data->domain = domain; | ||
| 874 | |||
| 875 | for (parent = domain->parent; parent; parent = parent->parent) { | ||
| 876 | irq_data = irq_domain_insert_irq_data(parent, irq_data); | ||
| 877 | if (!irq_data) { | ||
| 878 | irq_domain_free_irq_data(virq, i + 1); | ||
| 879 | return -ENOMEM; | ||
| 880 | } | ||
| 881 | } | ||
| 882 | } | ||
| 883 | |||
| 884 | return 0; | ||
| 885 | } | ||
| 886 | |||
| 887 | /** | ||
| 888 | * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain | ||
| 889 | * @domain: domain to match | ||
| 890 | * @virq: IRQ number to get irq_data | ||
| 891 | */ | ||
| 892 | struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, | ||
| 893 | unsigned int virq) | ||
| 894 | { | ||
| 895 | struct irq_data *irq_data; | ||
| 896 | |||
| 897 | for (irq_data = irq_get_irq_data(virq); irq_data; | ||
| 898 | irq_data = irq_data->parent_data) | ||
| 899 | if (irq_data->domain == domain) | ||
| 900 | return irq_data; | ||
| 901 | |||
| 902 | return NULL; | ||
| 903 | } | ||
| 904 | |||
| 905 | /** | ||
| 906 | * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain | ||
| 907 | * @domain: Interrupt domain to match | ||
| 908 | * @virq: IRQ number | ||
| 909 | * @hwirq: The hwirq number | ||
| 910 | * @chip: The associated interrupt chip | ||
| 911 | * @chip_data: The associated chip data | ||
| 912 | */ | ||
| 913 | int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, | ||
| 914 | irq_hw_number_t hwirq, struct irq_chip *chip, | ||
| 915 | void *chip_data) | ||
| 916 | { | ||
| 917 | struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); | ||
| 918 | |||
| 919 | if (!irq_data) | ||
| 920 | return -ENOENT; | ||
| 921 | |||
| 922 | irq_data->hwirq = hwirq; | ||
| 923 | irq_data->chip = chip ? chip : &no_irq_chip; | ||
| 924 | irq_data->chip_data = chip_data; | ||
| 925 | |||
| 926 | return 0; | ||
| 927 | } | ||
| 928 | |||
| 929 | /** | ||
| 930 | * irq_domain_set_info - Set the complete data for a @virq in @domain | ||
| 931 | * @domain: Interrupt domain to match | ||
| 932 | * @virq: IRQ number | ||
| 933 | * @hwirq: The hardware interrupt number | ||
| 934 | * @chip: The associated interrupt chip | ||
| 935 | * @chip_data: The associated interrupt chip data | ||
| 936 | * @handler: The interrupt flow handler | ||
| 937 | * @handler_data: The interrupt flow handler data | ||
| 938 | * @handler_name: The interrupt handler name | ||
| 939 | */ | ||
| 940 | void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, | ||
| 941 | irq_hw_number_t hwirq, struct irq_chip *chip, | ||
| 942 | void *chip_data, irq_flow_handler_t handler, | ||
| 943 | void *handler_data, const char *handler_name) | ||
| 944 | { | ||
| 945 | irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data); | ||
| 946 | __irq_set_handler(virq, handler, 0, handler_name); | ||
| 947 | irq_set_handler_data(virq, handler_data); | ||
| 948 | } | ||
| 949 | |||
| 950 | /** | ||
| 951 | * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data | ||
| 952 | * @irq_data: The pointer to irq_data | ||
| 953 | */ | ||
| 954 | void irq_domain_reset_irq_data(struct irq_data *irq_data) | ||
| 955 | { | ||
| 956 | irq_data->hwirq = 0; | ||
| 957 | irq_data->chip = &no_irq_chip; | ||
| 958 | irq_data->chip_data = NULL; | ||
| 959 | } | ||
| 960 | |||
| 961 | /** | ||
| 962 | * irq_domain_free_irqs_common - Clear irq_data and free the parent | ||
| 963 | * @domain: Interrupt domain to match | ||
| 964 | * @virq: IRQ number to start with | ||
| 965 | * @nr_irqs: The number of irqs to free | ||
| 966 | */ | ||
| 967 | void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, | ||
| 968 | unsigned int nr_irqs) | ||
| 969 | { | ||
| 970 | struct irq_data *irq_data; | ||
| 971 | int i; | ||
| 972 | |||
| 973 | for (i = 0; i < nr_irqs; i++) { | ||
| 974 | irq_data = irq_domain_get_irq_data(domain, virq + i); | ||
| 975 | if (irq_data) | ||
| 976 | irq_domain_reset_irq_data(irq_data); | ||
| 977 | } | ||
| 978 | irq_domain_free_irqs_parent(domain, virq, nr_irqs); | ||
| 979 | } | ||
| 980 | |||
| 981 | /** | ||
| 982 | * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent | ||
| 983 | * @domain: Interrupt domain to match | ||
| 984 | * @virq: IRQ number to start with | ||
| 985 | * @nr_irqs: The number of irqs to free | ||
| 986 | */ | ||
| 987 | void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, | ||
| 988 | unsigned int nr_irqs) | ||
| 989 | { | ||
| 990 | int i; | ||
| 991 | |||
| 992 | for (i = 0; i < nr_irqs; i++) { | ||
| 993 | irq_set_handler_data(virq + i, NULL); | ||
| 994 | irq_set_handler(virq + i, NULL); | ||
| 995 | } | ||
| 996 | irq_domain_free_irqs_common(domain, virq, nr_irqs); | ||
| 997 | } | ||
| 998 | |||
| 999 | static bool irq_domain_is_auto_recursive(struct irq_domain *domain) | ||
| 1000 | { | ||
| 1001 | return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE; | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static void irq_domain_free_irqs_recursive(struct irq_domain *domain, | ||
| 1005 | unsigned int irq_base, | ||
| 1006 | unsigned int nr_irqs) | ||
| 1007 | { | ||
| 1008 | domain->ops->free(domain, irq_base, nr_irqs); | ||
| 1009 | if (irq_domain_is_auto_recursive(domain)) { | ||
| 1010 | BUG_ON(!domain->parent); | ||
| 1011 | irq_domain_free_irqs_recursive(domain->parent, irq_base, | ||
| 1012 | nr_irqs); | ||
| 1013 | } | ||
| 1014 | } | ||
| 1015 | |||
| 1016 | static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, | ||
| 1017 | unsigned int irq_base, | ||
| 1018 | unsigned int nr_irqs, void *arg) | ||
| 1019 | { | ||
| 1020 | int ret = 0; | ||
| 1021 | struct irq_domain *parent = domain->parent; | ||
| 1022 | bool recursive = irq_domain_is_auto_recursive(domain); | ||
| 1023 | |||
| 1024 | BUG_ON(recursive && !parent); | ||
| 1025 | if (recursive) | ||
| 1026 | ret = irq_domain_alloc_irqs_recursive(parent, irq_base, | ||
| 1027 | nr_irqs, arg); | ||
| 1028 | if (ret >= 0) | ||
| 1029 | ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); | ||
| 1030 | if (ret < 0 && recursive) | ||
| 1031 | irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); | ||
| 1032 | |||
| 1033 | return ret; | ||
| 1034 | } | ||
| 1035 | |||
| 1036 | /** | ||
| 1037 | * __irq_domain_alloc_irqs - Allocate IRQs from domain | ||
| 1038 | * @domain: domain to allocate from | ||
| 1039 | * @irq_base: allocate specified IRQ number if irq_base >= 0 | ||
| 1040 | * @nr_irqs: number of IRQs to allocate | ||
| 1041 | * @node: NUMA node id for memory allocation | ||
| 1042 | * @arg: domain specific argument | ||
| 1043 | * @realloc: IRQ descriptors have already been allocated if true | ||
| 1044 | * | ||
| 1045 | * Allocate IRQ numbers and initialize all data structures to support | ||
| 1046 | * hierarchy IRQ domains. | ||
| 1047 | * Parameter @realloc is mainly to support legacy IRQs. | ||
| 1048 | * Returns error code or allocated IRQ number | ||
| 1049 | * | ||
| 1050 | * The whole process of setting up an IRQ has been split into two steps. | ||
| 1051 | * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ | ||
| 1052 | * descriptor and required hardware resources. The second step, | ||
| 1053 | * irq_domain_activate_irq(), is to program hardwares with preallocated | ||
| 1054 | * resources. In this way, it's easier to rollback when failing to | ||
| 1055 | * allocate resources. | ||
| 1056 | */ | ||
| 1057 | int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, | ||
| 1058 | unsigned int nr_irqs, int node, void *arg, | ||
| 1059 | bool realloc) | ||
| 1060 | { | ||
| 1061 | int i, ret, virq; | ||
| 1062 | |||
| 1063 | if (domain == NULL) { | ||
| 1064 | domain = irq_default_domain; | ||
| 1065 | if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) | ||
| 1066 | return -EINVAL; | ||
| 1067 | } | ||
| 1068 | |||
| 1069 | if (!domain->ops->alloc) { | ||
| 1070 | pr_debug("domain->ops->alloc() is NULL\n"); | ||
| 1071 | return -ENOSYS; | ||
| 1072 | } | ||
| 1073 | |||
| 1074 | if (realloc && irq_base >= 0) { | ||
| 1075 | virq = irq_base; | ||
| 1076 | } else { | ||
| 1077 | virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); | ||
| 1078 | if (virq < 0) { | ||
| 1079 | pr_debug("cannot allocate IRQ(base %d, count %d)\n", | ||
| 1080 | irq_base, nr_irqs); | ||
| 1081 | return virq; | ||
| 1082 | } | ||
| 1083 | } | ||
| 1084 | |||
| 1085 | if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) { | ||
| 1086 | pr_debug("cannot allocate memory for IRQ%d\n", virq); | ||
| 1087 | ret = -ENOMEM; | ||
| 1088 | goto out_free_desc; | ||
| 1089 | } | ||
| 1090 | |||
| 1091 | mutex_lock(&irq_domain_mutex); | ||
| 1092 | ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg); | ||
| 1093 | if (ret < 0) { | ||
| 1094 | mutex_unlock(&irq_domain_mutex); | ||
| 1095 | goto out_free_irq_data; | ||
| 1096 | } | ||
| 1097 | for (i = 0; i < nr_irqs; i++) | ||
| 1098 | irq_domain_insert_irq(virq + i); | ||
| 1099 | mutex_unlock(&irq_domain_mutex); | ||
| 1100 | |||
| 1101 | return virq; | ||
| 1102 | |||
| 1103 | out_free_irq_data: | ||
| 1104 | irq_domain_free_irq_data(virq, nr_irqs); | ||
| 1105 | out_free_desc: | ||
| 1106 | irq_free_descs(virq, nr_irqs); | ||
| 1107 | return ret; | ||
| 1108 | } | ||
| 1109 | |||
| 1110 | /** | ||
| 1111 | * irq_domain_free_irqs - Free IRQ number and associated data structures | ||
| 1112 | * @virq: base IRQ number | ||
| 1113 | * @nr_irqs: number of IRQs to free | ||
| 1114 | */ | ||
| 1115 | void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) | ||
| 1116 | { | ||
| 1117 | struct irq_data *data = irq_get_irq_data(virq); | ||
| 1118 | int i; | ||
| 1119 | |||
| 1120 | if (WARN(!data || !data->domain || !data->domain->ops->free, | ||
| 1121 | "NULL pointer, cannot free irq\n")) | ||
| 1122 | return; | ||
| 1123 | |||
| 1124 | mutex_lock(&irq_domain_mutex); | ||
| 1125 | for (i = 0; i < nr_irqs; i++) | ||
| 1126 | irq_domain_remove_irq(virq + i); | ||
| 1127 | irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs); | ||
| 1128 | mutex_unlock(&irq_domain_mutex); | ||
| 1129 | |||
| 1130 | irq_domain_free_irq_data(virq, nr_irqs); | ||
| 1131 | irq_free_descs(virq, nr_irqs); | ||
| 1132 | } | ||
| 1133 | |||
| 1134 | /** | ||
| 1135 | * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain | ||
| 1136 | * @irq_base: Base IRQ number | ||
| 1137 | * @nr_irqs: Number of IRQs to allocate | ||
| 1138 | * @arg: Allocation data (arch/domain specific) | ||
| 1139 | * | ||
| 1140 | * Check whether the domain has been set up as recursive. If not, allocate | ||
| 1141 | * through the parent domain. | ||
| 1142 | */ | ||
| 1143 | int irq_domain_alloc_irqs_parent(struct irq_domain *domain, | ||
| 1144 | unsigned int irq_base, unsigned int nr_irqs, | ||
| 1145 | void *arg) | ||
| 1146 | { | ||
| 1147 | /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */ | ||
| 1148 | if (irq_domain_is_auto_recursive(domain)) | ||
| 1149 | return 0; | ||
| 1150 | |||
| 1151 | domain = domain->parent; | ||
| 1152 | if (domain) | ||
| 1153 | return irq_domain_alloc_irqs_recursive(domain, irq_base, | ||
| 1154 | nr_irqs, arg); | ||
| 1155 | return -ENOSYS; | ||
| 1156 | } | ||
| 1157 | |||
| 1158 | /** | ||
| 1159 | * irq_domain_free_irqs_parent - Free interrupts from parent domain | ||
| 1160 | * @irq_base: Base IRQ number | ||
| 1161 | * @nr_irqs: Number of IRQs to free | ||
| 1162 | * | ||
| 1163 | * Check whether the domain has been set up as recursive. If not, free | ||
| 1164 | * through the parent domain. | ||
| 1165 | */ | ||
| 1166 | void irq_domain_free_irqs_parent(struct irq_domain *domain, | ||
| 1167 | unsigned int irq_base, unsigned int nr_irqs) | ||
| 1168 | { | ||
| 1169 | /* irq_domain_free_irqs_recursive() will call parent's free */ | ||
| 1170 | if (!irq_domain_is_auto_recursive(domain) && domain->parent) | ||
| 1171 | irq_domain_free_irqs_recursive(domain->parent, irq_base, | ||
| 1172 | nr_irqs); | ||
| 1173 | } | ||
| 1174 | |||
| 1175 | /** | ||
| 1176 | * irq_domain_activate_irq - Call domain_ops->activate recursively to activate | ||
| 1177 | * interrupt | ||
| 1178 | * @irq_data: outermost irq_data associated with interrupt | ||
| 1179 | * | ||
| 1180 | * This is the second step to call domain_ops->activate to program interrupt | ||
| 1181 | * controllers, so that the interrupt can actually be delivered. | ||
| 1182 | */ | ||
| 1183 | void irq_domain_activate_irq(struct irq_data *irq_data) | ||
| 1184 | { | ||
| 1185 | if (irq_data && irq_data->domain) { | ||
| 1186 | struct irq_domain *domain = irq_data->domain; | ||
| 1187 | |||
| 1188 | if (irq_data->parent_data) | ||
| 1189 | irq_domain_activate_irq(irq_data->parent_data); | ||
| 1190 | if (domain->ops->activate) | ||
| 1191 | domain->ops->activate(domain, irq_data); | ||
| 1192 | } | ||
| 1193 | } | ||
| 1194 | |||
| 1195 | /** | ||
| 1196 | * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to | ||
| 1197 | * deactivate interrupt | ||
| 1198 | * @irq_data: outermost irq_data associated with interrupt | ||
| 1199 | * | ||
| 1200 | * It calls domain_ops->deactivate to program interrupt controllers to disable | ||
| 1201 | * interrupt delivery. | ||
| 1202 | */ | ||
| 1203 | void irq_domain_deactivate_irq(struct irq_data *irq_data) | ||
| 1204 | { | ||
| 1205 | if (irq_data && irq_data->domain) { | ||
| 1206 | struct irq_domain *domain = irq_data->domain; | ||
| 1207 | |||
| 1208 | if (domain->ops->deactivate) | ||
| 1209 | domain->ops->deactivate(domain, irq_data); | ||
| 1210 | if (irq_data->parent_data) | ||
| 1211 | irq_domain_deactivate_irq(irq_data->parent_data); | ||
| 1212 | } | ||
| 1213 | } | ||
| 1214 | |||
| 1215 | static void irq_domain_check_hierarchy(struct irq_domain *domain) | ||
| 1216 | { | ||
| 1217 | /* Hierarchy irq_domains must implement callback alloc() */ | ||
| 1218 | if (domain->ops->alloc) | ||
| 1219 | domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; | ||
| 1220 | } | ||
| 1221 | #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ | ||
| 1222 | /** | ||
| 1223 | * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain | ||
| 1224 | * @domain: domain to match | ||
| 1225 | * @virq: IRQ number to get irq_data | ||
| 1226 | */ | ||
| 1227 | struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, | ||
| 1228 | unsigned int virq) | ||
| 1229 | { | ||
| 1230 | struct irq_data *irq_data = irq_get_irq_data(virq); | ||
| 1231 | |||
| 1232 | return (irq_data && irq_data->domain == domain) ? irq_data : NULL; | ||
| 1233 | } | ||
| 1234 | |||
| 1235 | static void irq_domain_check_hierarchy(struct irq_domain *domain) | ||
| 1236 | { | ||
| 1237 | } | ||
| 1238 | #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ | ||
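Taken together, the new CONFIG_IRQ_DOMAIN_HIERARCHY code above lets a stacked domain allocate from its parent and attach its own chip per interrupt. The following is only a sketch of how a hypothetical muxing controller might plug into that API; the chip callbacks, the hwirq translation and all my_* names are placeholders, and the parent helpers (irq_chip_mask_parent() and friends) come from the companion chip.c changes in this same series rather than from this file:

```c
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static struct irq_chip my_mux_chip = {
	.name		= "my-mux",
	.irq_mask	= irq_chip_mask_parent,
	.irq_unmask	= irq_chip_unmask_parent,
	.irq_eoi	= irq_chip_eoi_parent,
};

static int my_mux_domain_alloc(struct irq_domain *domain, unsigned int virq,
			       unsigned int nr_irqs, void *arg)
{
	irq_hw_number_t hwirq = *(irq_hw_number_t *)arg; /* placeholder xlate */
	int i, ret;

	/* Let the parent (e.g. the root controller) allocate its data first */
	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_irqs; i++)
		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
					      &my_mux_chip, NULL);
	return 0;
}

static const struct irq_domain_ops my_mux_domain_ops = {
	.alloc	= my_mux_domain_alloc,
	.free	= irq_domain_free_irqs_common,
};

/* Registration from the mux driver's init code; 64 is an arbitrary size */
static struct irq_domain *my_mux_add_domain(struct irq_domain *parent,
					    struct device_node *node)
{
	return irq_domain_add_hierarchy(parent, 0, 64, node,
					&my_mux_domain_ops, NULL);
}
```

Because __irq_domain_add() now calls irq_domain_check_hierarchy(), providing an .alloc callback is what marks the domain as hierarchical, which in turn makes irq_create_of_mapping() take the irq_domain_alloc_irqs() path shown above.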
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3dc6a61bf06a..80692373abd6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -183,6 +183,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | |||
| 183 | ret = chip->irq_set_affinity(data, mask, force); | 183 | ret = chip->irq_set_affinity(data, mask, force); |
| 184 | switch (ret) { | 184 | switch (ret) { |
| 185 | case IRQ_SET_MASK_OK: | 185 | case IRQ_SET_MASK_OK: |
| 186 | case IRQ_SET_MASK_OK_DONE: | ||
| 186 | cpumask_copy(data->affinity, mask); | 187 | cpumask_copy(data->affinity, mask); |
| 187 | case IRQ_SET_MASK_OK_NOCOPY: | 188 | case IRQ_SET_MASK_OK_NOCOPY: |
| 188 | irq_set_thread_affinity(desc); | 189 | irq_set_thread_affinity(desc); |
| @@ -382,14 +383,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | |||
| 382 | } | 383 | } |
| 383 | #endif | 384 | #endif |
| 384 | 385 | ||
| 385 | void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | 386 | void __disable_irq(struct irq_desc *desc, unsigned int irq) |
| 386 | { | 387 | { |
| 387 | if (suspend) { | ||
| 388 | if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) | ||
| 389 | return; | ||
| 390 | desc->istate |= IRQS_SUSPENDED; | ||
| 391 | } | ||
| 392 | |||
| 393 | if (!desc->depth++) | 388 | if (!desc->depth++) |
| 394 | irq_disable(desc); | 389 | irq_disable(desc); |
| 395 | } | 390 | } |
| @@ -401,7 +396,7 @@ static int __disable_irq_nosync(unsigned int irq) | |||
| 401 | 396 | ||
| 402 | if (!desc) | 397 | if (!desc) |
| 403 | return -EINVAL; | 398 | return -EINVAL; |
| 404 | __disable_irq(desc, irq, false); | 399 | __disable_irq(desc, irq); |
| 405 | irq_put_desc_busunlock(desc, flags); | 400 | irq_put_desc_busunlock(desc, flags); |
| 406 | return 0; | 401 | return 0; |
| 407 | } | 402 | } |
| @@ -442,20 +437,8 @@ void disable_irq(unsigned int irq) | |||
| 442 | } | 437 | } |
| 443 | EXPORT_SYMBOL(disable_irq); | 438 | EXPORT_SYMBOL(disable_irq); |
| 444 | 439 | ||
| 445 | void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | 440 | void __enable_irq(struct irq_desc *desc, unsigned int irq) |
| 446 | { | 441 | { |
| 447 | if (resume) { | ||
| 448 | if (!(desc->istate & IRQS_SUSPENDED)) { | ||
| 449 | if (!desc->action) | ||
| 450 | return; | ||
| 451 | if (!(desc->action->flags & IRQF_FORCE_RESUME)) | ||
| 452 | return; | ||
| 453 | /* Pretend that it got disabled ! */ | ||
| 454 | desc->depth++; | ||
| 455 | } | ||
| 456 | desc->istate &= ~IRQS_SUSPENDED; | ||
| 457 | } | ||
| 458 | |||
| 459 | switch (desc->depth) { | 442 | switch (desc->depth) { |
| 460 | case 0: | 443 | case 0: |
| 461 | err_out: | 444 | err_out: |
| @@ -497,7 +480,7 @@ void enable_irq(unsigned int irq) | |||
| 497 | KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) | 480 | KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) |
| 498 | goto out; | 481 | goto out; |
| 499 | 482 | ||
| 500 | __enable_irq(desc, irq, false); | 483 | __enable_irq(desc, irq); |
| 501 | out: | 484 | out: |
| 502 | irq_put_desc_busunlock(desc, flags); | 485 | irq_put_desc_busunlock(desc, flags); |
| 503 | } | 486 | } |
| @@ -618,6 +601,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
| 618 | 601 | ||
| 619 | switch (ret) { | 602 | switch (ret) { |
| 620 | case IRQ_SET_MASK_OK: | 603 | case IRQ_SET_MASK_OK: |
| 604 | case IRQ_SET_MASK_OK_DONE: | ||
| 621 | irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); | 605 | irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); |
| 622 | irqd_set(&desc->irq_data, flags); | 606 | irqd_set(&desc->irq_data, flags); |
| 623 | 607 | ||
| @@ -1218,6 +1202,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1218 | new->irq = irq; | 1202 | new->irq = irq; |
| 1219 | *old_ptr = new; | 1203 | *old_ptr = new; |
| 1220 | 1204 | ||
| 1205 | irq_pm_install_action(desc, new); | ||
| 1206 | |||
| 1221 | /* Reset broken irq detection when installing new handler */ | 1207 | /* Reset broken irq detection when installing new handler */ |
| 1222 | desc->irq_count = 0; | 1208 | desc->irq_count = 0; |
| 1223 | desc->irqs_unhandled = 0; | 1209 | desc->irqs_unhandled = 0; |
| @@ -1228,7 +1214,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1228 | */ | 1214 | */ |
| 1229 | if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { | 1215 | if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { |
| 1230 | desc->istate &= ~IRQS_SPURIOUS_DISABLED; | 1216 | desc->istate &= ~IRQS_SPURIOUS_DISABLED; |
| 1231 | __enable_irq(desc, irq, false); | 1217 | __enable_irq(desc, irq); |
| 1232 | } | 1218 | } |
| 1233 | 1219 | ||
| 1234 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1220 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| @@ -1336,6 +1322,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1336 | /* Found it - now remove it from the list of entries: */ | 1322 | /* Found it - now remove it from the list of entries: */ |
| 1337 | *action_ptr = action->next; | 1323 | *action_ptr = action->next; |
| 1338 | 1324 | ||
| 1325 | irq_pm_remove_action(desc, action); | ||
| 1326 | |||
| 1339 | /* If this was the last handler, shut down the IRQ line: */ | 1327 | /* If this was the last handler, shut down the IRQ line: */ |
| 1340 | if (!desc->action) { | 1328 | if (!desc->action) { |
| 1341 | irq_shutdown(desc); | 1329 | irq_shutdown(desc); |
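The manage.c hunks teach irq_do_set_affinity() and __irq_set_trigger() to treat the new IRQ_SET_MASK_OK_DONE return value like IRQ_SET_MASK_OK, while msi_domain_set_affinity() in the new msi.c below uses it to skip recomposing the MSI message. As a hedged illustration of the intent only (the routing register and my_* names are invented), a chip that fully programs the new affinity itself might report it like this:

```c
#include <linux/cpumask.h>
#include <linux/io.h>
#include <linux/irq.h>

#define MY_ROUTE(hwirq)	(0x100 + 4 * (hwirq))	/* hypothetical routing regs */

static void __iomem *my_base;

static int my_chip_set_affinity(struct irq_data *data,
				const struct cpumask *mask, bool force)
{
	unsigned int cpu = cpumask_first_and(mask, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		return -EINVAL;

	writel_relaxed(cpu, my_base + MY_ROUTE(data->hwirq));

	/*
	 * Everything is done here, including what chips stacked on top
	 * (such as MSI) would otherwise redo; the core still copies the
	 * mask and updates the thread affinity as for IRQ_SET_MASK_OK.
	 */
	return IRQ_SET_MASK_OK_DONE;
}
```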
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c new file mode 100644 index 000000000000..3e18163f336f --- /dev/null +++ b/kernel/irq/msi.c | |||
| @@ -0,0 +1,330 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/irq/msi.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2014 Intel Corp. | ||
| 5 | * Author: Jiang Liu <jiang.liu@linux.intel.com> | ||
| 6 | * | ||
| 7 | * This file is licensed under GPLv2. | ||
| 8 | * | ||
| 9 | * This file contains common code to support Message Signalled Interrupts | ||
| 10 | * for PCI compatible and non-PCI compatible devices. | ||
| 11 | */ | ||
| 12 | #include <linux/types.h> | ||
| 13 | #include <linux/device.h> | ||
| 14 | #include <linux/irq.h> | ||
| 15 | #include <linux/irqdomain.h> | ||
| 16 | #include <linux/msi.h> | ||
| 17 | |||
| 18 | /* Temporary solution for building, will be removed later */ | ||
| 19 | #include <linux/pci.h> | ||
| 20 | |||
| 21 | void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) | ||
| 22 | { | ||
| 23 | *msg = entry->msg; | ||
| 24 | } | ||
| 25 | |||
| 26 | void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) | ||
| 27 | { | ||
| 28 | struct msi_desc *entry = irq_get_msi_desc(irq); | ||
| 29 | |||
| 30 | __get_cached_msi_msg(entry, msg); | ||
| 31 | } | ||
| 32 | EXPORT_SYMBOL_GPL(get_cached_msi_msg); | ||
| 33 | |||
| 34 | #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN | ||
| 35 | static inline void irq_chip_write_msi_msg(struct irq_data *data, | ||
| 36 | struct msi_msg *msg) | ||
| 37 | { | ||
| 38 | data->chip->irq_write_msi_msg(data, msg); | ||
| 39 | } | ||
| 40 | |||
| 41 | /** | ||
| 42 | * msi_domain_set_affinity - Generic affinity setter function for MSI domains | ||
| 43 | * @irq_data: The irq data associated to the interrupt | ||
| 44 | * @mask: The affinity mask to set | ||
| 45 | * @force: Flag to enforce setting (disable online checks) | ||
| 46 | * | ||
| 47 | * Intended to be used by MSI interrupt controllers which are | ||
| 48 | * implemented with hierarchical domains. | ||
| 49 | */ | ||
| 50 | int msi_domain_set_affinity(struct irq_data *irq_data, | ||
| 51 | const struct cpumask *mask, bool force) | ||
| 52 | { | ||
| 53 | struct irq_data *parent = irq_data->parent_data; | ||
| 54 | struct msi_msg msg; | ||
| 55 | int ret; | ||
| 56 | |||
| 57 | ret = parent->chip->irq_set_affinity(parent, mask, force); | ||
| 58 | if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { | ||
| 59 | BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); | ||
| 60 | irq_chip_write_msi_msg(irq_data, &msg); | ||
| 61 | } | ||
| 62 | |||
| 63 | return ret; | ||
| 64 | } | ||
| 65 | |||
| 66 | static void msi_domain_activate(struct irq_domain *domain, | ||
| 67 | struct irq_data *irq_data) | ||
| 68 | { | ||
| 69 | struct msi_msg msg; | ||
| 70 | |||
| 71 | BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); | ||
| 72 | irq_chip_write_msi_msg(irq_data, &msg); | ||
| 73 | } | ||
| 74 | |||
| 75 | static void msi_domain_deactivate(struct irq_domain *domain, | ||
| 76 | struct irq_data *irq_data) | ||
| 77 | { | ||
| 78 | struct msi_msg msg; | ||
| 79 | |||
| 80 | memset(&msg, 0, sizeof(msg)); | ||
| 81 | irq_chip_write_msi_msg(irq_data, &msg); | ||
| 82 | } | ||
| 83 | |||
| 84 | static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, | ||
| 85 | unsigned int nr_irqs, void *arg) | ||
| 86 | { | ||
| 87 | struct msi_domain_info *info = domain->host_data; | ||
| 88 | struct msi_domain_ops *ops = info->ops; | ||
| 89 | irq_hw_number_t hwirq = ops->get_hwirq(info, arg); | ||
| 90 | int i, ret; | ||
| 91 | |||
| 92 | if (irq_find_mapping(domain, hwirq) > 0) | ||
| 93 | return -EEXIST; | ||
| 94 | |||
| 95 | ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); | ||
| 96 | if (ret < 0) | ||
| 97 | return ret; | ||
| 98 | |||
| 99 | for (i = 0; i < nr_irqs; i++) { | ||
| 100 | ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); | ||
| 101 | if (ret < 0) { | ||
| 102 | if (ops->msi_free) { | ||
| 103 | for (i--; i >= 0; i--) | ||
| 104 | ops->msi_free(domain, info, virq + i); | ||
| 105 | } | ||
| 106 | irq_domain_free_irqs_top(domain, virq, nr_irqs); | ||
| 107 | return ret; | ||
| 108 | } | ||
| 109 | } | ||
| 110 | |||
| 111 | return 0; | ||
| 112 | } | ||
| 113 | |||
| 114 | static void msi_domain_free(struct irq_domain *domain, unsigned int virq, | ||
| 115 | unsigned int nr_irqs) | ||
| 116 | { | ||
| 117 | struct msi_domain_info *info = domain->host_data; | ||
| 118 | int i; | ||
| 119 | |||
| 120 | if (info->ops->msi_free) { | ||
| 121 | for (i = 0; i < nr_irqs; i++) | ||
| 122 | info->ops->msi_free(domain, info, virq + i); | ||
| 123 | } | ||
| 124 | irq_domain_free_irqs_top(domain, virq, nr_irqs); | ||
| 125 | } | ||
| 126 | |||
| 127 | static struct irq_domain_ops msi_domain_ops = { | ||
| 128 | .alloc = msi_domain_alloc, | ||
| 129 | .free = msi_domain_free, | ||
| 130 | .activate = msi_domain_activate, | ||
| 131 | .deactivate = msi_domain_deactivate, | ||
| 132 | }; | ||
| 133 | |||
| 134 | #ifdef GENERIC_MSI_DOMAIN_OPS | ||
| 135 | static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, | ||
| 136 | msi_alloc_info_t *arg) | ||
| 137 | { | ||
| 138 | return arg->hwirq; | ||
| 139 | } | ||
| 140 | |||
| 141 | static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev, | ||
| 142 | int nvec, msi_alloc_info_t *arg) | ||
| 143 | { | ||
| 144 | memset(arg, 0, sizeof(*arg)); | ||
| 145 | return 0; | ||
| 146 | } | ||
| 147 | |||
| 148 | static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, | ||
| 149 | struct msi_desc *desc) | ||
| 150 | { | ||
| 151 | arg->desc = desc; | ||
| 152 | } | ||
| 153 | #else | ||
| 154 | #define msi_domain_ops_get_hwirq NULL | ||
| 155 | #define msi_domain_ops_prepare NULL | ||
| 156 | #define msi_domain_ops_set_desc NULL | ||
| 157 | #endif /* !GENERIC_MSI_DOMAIN_OPS */ | ||
| 158 | |||
| 159 | static int msi_domain_ops_init(struct irq_domain *domain, | ||
| 160 | struct msi_domain_info *info, | ||
| 161 | unsigned int virq, irq_hw_number_t hwirq, | ||
| 162 | msi_alloc_info_t *arg) | ||
| 163 | { | ||
| 164 | irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip, | ||
| 165 | info->chip_data); | ||
| 166 | if (info->handler && info->handler_name) { | ||
| 167 | __irq_set_handler(virq, info->handler, 0, info->handler_name); | ||
| 168 | if (info->handler_data) | ||
| 169 | irq_set_handler_data(virq, info->handler_data); | ||
| 170 | } | ||
| 171 | return 0; | ||
| 172 | } | ||
| 173 | |||
| 174 | static int msi_domain_ops_check(struct irq_domain *domain, | ||
| 175 | struct msi_domain_info *info, | ||
| 176 | struct device *dev) | ||
| 177 | { | ||
| 178 | return 0; | ||
| 179 | } | ||
| 180 | |||
| 181 | static struct msi_domain_ops msi_domain_ops_default = { | ||
| 182 | .get_hwirq = msi_domain_ops_get_hwirq, | ||
| 183 | .msi_init = msi_domain_ops_init, | ||
| 184 | .msi_check = msi_domain_ops_check, | ||
| 185 | .msi_prepare = msi_domain_ops_prepare, | ||
| 186 | .set_desc = msi_domain_ops_set_desc, | ||
| 187 | }; | ||
| 188 | |||
| 189 | static void msi_domain_update_dom_ops(struct msi_domain_info *info) | ||
| 190 | { | ||
| 191 | struct msi_domain_ops *ops = info->ops; | ||
| 192 | |||
| 193 | if (ops == NULL) { | ||
| 194 | info->ops = &msi_domain_ops_default; | ||
| 195 | return; | ||
| 196 | } | ||
| 197 | |||
| 198 | if (ops->get_hwirq == NULL) | ||
| 199 | ops->get_hwirq = msi_domain_ops_default.get_hwirq; | ||
| 200 | if (ops->msi_init == NULL) | ||
| 201 | ops->msi_init = msi_domain_ops_default.msi_init; | ||
| 202 | if (ops->msi_check == NULL) | ||
| 203 | ops->msi_check = msi_domain_ops_default.msi_check; | ||
| 204 | if (ops->msi_prepare == NULL) | ||
| 205 | ops->msi_prepare = msi_domain_ops_default.msi_prepare; | ||
| 206 | if (ops->set_desc == NULL) | ||
| 207 | ops->set_desc = msi_domain_ops_default.set_desc; | ||
| 208 | } | ||
| 209 | |||
| 210 | static void msi_domain_update_chip_ops(struct msi_domain_info *info) | ||
| 211 | { | ||
| 212 | struct irq_chip *chip = info->chip; | ||
| 213 | |||
| 214 | BUG_ON(!chip); | ||
| 215 | if (!chip->irq_mask) | ||
| 216 | chip->irq_mask = pci_msi_mask_irq; | ||
| 217 | if (!chip->irq_unmask) | ||
| 218 | chip->irq_unmask = pci_msi_unmask_irq; | ||
| 219 | if (!chip->irq_set_affinity) | ||
| 220 | chip->irq_set_affinity = msi_domain_set_affinity; | ||
| 221 | } | ||
| 222 | |||
| 223 | /** | ||
| 224 | * msi_create_irq_domain - Create a MSI interrupt domain | ||
| 225 | * @node: Optional device-tree node of the interrupt controller | ||
| 226 | * @info: MSI domain info | ||
| 227 | * @parent: Parent irq domain | ||
| 228 | */ | ||
| 229 | struct irq_domain *msi_create_irq_domain(struct device_node *node, | ||
| 230 | struct msi_domain_info *info, | ||
| 231 | struct irq_domain *parent) | ||
| 232 | { | ||
| 233 | if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) | ||
| 234 | msi_domain_update_dom_ops(info); | ||
| 235 | if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) | ||
| 236 | msi_domain_update_chip_ops(info); | ||
| 237 | |||
| 238 | return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops, | ||
| 239 | info); | ||
| 240 | } | ||
| 241 | |||
| 242 | /** | ||
| 243 | * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain | ||
| 244 | * @domain: The domain to allocate from | ||
| 245 | * @dev: Pointer to device struct of the device for which the interrupts | ||
| 246 | * are allocated | ||
| 247 | * @nvec: The number of interrupts to allocate | ||
| 248 | * | ||
| 249 | * Returns 0 on success or an error code. | ||
| 250 | */ | ||
| 251 | int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, | ||
| 252 | int nvec) | ||
| 253 | { | ||
| 254 | struct msi_domain_info *info = domain->host_data; | ||
| 255 | struct msi_domain_ops *ops = info->ops; | ||
| 256 | msi_alloc_info_t arg; | ||
| 257 | struct msi_desc *desc; | ||
| 258 | int i, ret, virq = -1; | ||
| 259 | |||
| 260 | ret = ops->msi_check(domain, info, dev); | ||
| 261 | if (ret == 0) | ||
| 262 | ret = ops->msi_prepare(domain, dev, nvec, &arg); | ||
| 263 | if (ret) | ||
| 264 | return ret; | ||
| 265 | |||
| 266 | for_each_msi_entry(desc, dev) { | ||
| 267 | ops->set_desc(&arg, desc); | ||
| 268 | if (info->flags & MSI_FLAG_IDENTITY_MAP) | ||
| 269 | virq = (int)ops->get_hwirq(info, &arg); | ||
| 270 | else | ||
| 271 | virq = -1; | ||
| 272 | |||
| 273 | virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, | ||
| 274 | dev_to_node(dev), &arg, false); | ||
| 275 | if (virq < 0) { | ||
| 276 | ret = -ENOSPC; | ||
| 277 | if (ops->handle_error) | ||
| 278 | ret = ops->handle_error(domain, desc, ret); | ||
| 279 | if (ops->msi_finish) | ||
| 280 | ops->msi_finish(&arg, ret); | ||
| 281 | return ret; | ||
| 282 | } | ||
| 283 | |||
| 284 | for (i = 0; i < desc->nvec_used; i++) | ||
| 285 | irq_set_msi_desc_off(virq, i, desc); | ||
| 286 | } | ||
| 287 | |||
| 288 | if (ops->msi_finish) | ||
| 289 | ops->msi_finish(&arg, 0); | ||
| 290 | |||
| 291 | for_each_msi_entry(desc, dev) { | ||
| 292 | if (desc->nvec_used == 1) | ||
| 293 | dev_dbg(dev, "irq %d for MSI\n", virq); | ||
| 294 | else | ||
| 295 | dev_dbg(dev, "irq [%d-%d] for MSI\n", | ||
| 296 | virq, virq + desc->nvec_used - 1); | ||
| 297 | } | ||
| 298 | |||
| 299 | return 0; | ||
| 300 | } | ||
| 301 | |||
| 302 | /** | ||
| 303 | * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated with @dev | ||
| 304 | * @domain: The domain managing the interrupts | ||
| 305 | * @dev: Pointer to device struct of the device for which the interrupts | ||
| 306 | * are freed | ||
| 307 | */ | ||
| 308 | void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) | ||
| 309 | { | ||
| 310 | struct msi_desc *desc; | ||
| 311 | |||
| 312 | for_each_msi_entry(desc, dev) { | ||
| 313 | irq_domain_free_irqs(desc->irq, desc->nvec_used); | ||
| 314 | desc->irq = 0; | ||
| 315 | } | ||
| 316 | } | ||
| 317 | |||
| 318 | /** | ||
| 319 | * msi_get_domain_info - Get the MSI interrupt domain info for @domain | ||
| 320 | * @domain: The interrupt domain to retrieve data from | ||
| 321 | * | ||
| 322 | * Returns the pointer to the msi_domain_info stored in | ||
| 323 | * @domain->host_data. | ||
| 324 | */ | ||
| 325 | struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) | ||
| 326 | { | ||
| 327 | return (struct msi_domain_info *)domain->host_data; | ||
| 328 | } | ||
| 329 | |||
| 330 | #endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ | ||
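
For orientation, here is a minimal driver-side sketch of the generic MSI domain API added above. Everything prefixed `foo_` is illustrative and not part of this patch, and the sketch assumes the `msi_domain_info` layout used in this tree.

```c
#include <linux/irqdomain.h>
#include <linux/msi.h>

static struct irq_chip foo_msi_chip = {
	.name = "FOO-MSI",
	/* irq_mask/irq_unmask/irq_set_affinity are filled in by
	 * msi_domain_update_chip_ops() because MSI_FLAG_USE_DEF_CHIP_OPS
	 * is set below. */
};

static struct msi_domain_info foo_msi_info = {
	.flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
	.chip  = &foo_msi_chip,
};

static struct irq_domain *foo_create_msi_domain(struct device_node *node,
						struct irq_domain *parent)
{
	/* Per-device vectors are later handed out with
	 * msi_domain_alloc_irqs() and returned with msi_domain_free_irqs(). */
	return msi_create_irq_domain(node, &foo_msi_info, parent);
}
```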
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index abcd6ca86cb7..3ca532592704 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
| @@ -9,17 +9,105 @@ | |||
| 9 | #include <linux/irq.h> | 9 | #include <linux/irq.h> |
| 10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
| 11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
| 12 | #include <linux/suspend.h> | ||
| 12 | #include <linux/syscore_ops.h> | 13 | #include <linux/syscore_ops.h> |
| 13 | 14 | ||
| 14 | #include "internals.h" | 15 | #include "internals.h" |
| 15 | 16 | ||
| 17 | bool irq_pm_check_wakeup(struct irq_desc *desc) | ||
| 18 | { | ||
| 19 | if (irqd_is_wakeup_armed(&desc->irq_data)) { | ||
| 20 | irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); | ||
| 21 | desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; | ||
| 22 | desc->depth++; | ||
| 23 | irq_disable(desc); | ||
| 24 | pm_system_wakeup(); | ||
| 25 | return true; | ||
| 26 | } | ||
| 27 | return false; | ||
| 28 | } | ||
| 29 | |||
| 30 | /* | ||
| 31 | * Called from __setup_irq() with desc->lock held after @action has | ||
| 32 | * been installed in the action chain. | ||
| 33 | */ | ||
| 34 | void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) | ||
| 35 | { | ||
| 36 | desc->nr_actions++; | ||
| 37 | |||
| 38 | if (action->flags & IRQF_FORCE_RESUME) | ||
| 39 | desc->force_resume_depth++; | ||
| 40 | |||
| 41 | WARN_ON_ONCE(desc->force_resume_depth && | ||
| 42 | desc->force_resume_depth != desc->nr_actions); | ||
| 43 | |||
| 44 | if (action->flags & IRQF_NO_SUSPEND) | ||
| 45 | desc->no_suspend_depth++; | ||
| 46 | |||
| 47 | WARN_ON_ONCE(desc->no_suspend_depth && | ||
| 48 | desc->no_suspend_depth != desc->nr_actions); | ||
| 49 | } | ||
| 50 | |||
| 51 | /* | ||
| 52 | * Called from __free_irq() with desc->lock held after @action has | ||
| 53 | * been removed from the action chain. | ||
| 54 | */ | ||
| 55 | void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) | ||
| 56 | { | ||
| 57 | desc->nr_actions--; | ||
| 58 | |||
| 59 | if (action->flags & IRQF_FORCE_RESUME) | ||
| 60 | desc->force_resume_depth--; | ||
| 61 | |||
| 62 | if (action->flags & IRQF_NO_SUSPEND) | ||
| 63 | desc->no_suspend_depth--; | ||
| 64 | } | ||
| 65 | |||
| 66 | static bool suspend_device_irq(struct irq_desc *desc, int irq) | ||
| 67 | { | ||
| 68 | if (!desc->action || desc->no_suspend_depth) | ||
| 69 | return false; | ||
| 70 | |||
| 71 | if (irqd_is_wakeup_set(&desc->irq_data)) { | ||
| 72 | irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); | ||
| 73 | /* | ||
| 74 | * We return true here to force the caller to issue | ||
| 75 | * synchronize_irq(). We need to make sure that the | ||
| 76 | * IRQD_WAKEUP_ARMED is visible before we return from | ||
| 77 | * suspend_device_irqs(). | ||
| 78 | */ | ||
| 79 | return true; | ||
| 80 | } | ||
| 81 | |||
| 82 | desc->istate |= IRQS_SUSPENDED; | ||
| 83 | __disable_irq(desc, irq); | ||
| 84 | |||
| 85 | /* | ||
| 86 | * Hardware which has no wakeup source configuration facility | ||
| 87 | * requires that the non wakeup interrupts are masked at the | ||
| 88 | * chip level. The chip implementation indicates that with | ||
| 89 | * IRQCHIP_MASK_ON_SUSPEND. | ||
| 90 | */ | ||
| 91 | if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) | ||
| 92 | mask_irq(desc); | ||
| 93 | return true; | ||
| 94 | } | ||
| 95 | |||
| 16 | /** | 96 | /** |
| 17 | * suspend_device_irqs - disable all currently enabled interrupt lines | 97 | * suspend_device_irqs - disable all currently enabled interrupt lines |
| 18 | * | 98 | * |
| 19 | * During system-wide suspend or hibernation device drivers need to be prevented | 99 | * During system-wide suspend or hibernation device drivers need to be |
| 20 | * from receiving interrupts and this function is provided for this purpose. | 100 | * prevented from receiving interrupts and this function is provided |
| 21 | * It marks all interrupt lines in use, except for the timer ones, as disabled | 101 | * for this purpose. |
| 22 | * and sets the IRQS_SUSPENDED flag for each of them. | 102 | * |
| 103 | * So we disable all interrupts and mark them IRQS_SUSPENDED except | ||
| 104 | * for those which are unused, those which are marked as not | ||
| 105 | * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND | ||
| 106 | * set and those which are marked as active wakeup sources. | ||
| 107 | * | ||
| 108 | * The active wakeup sources are handled by the flow handler entry | ||
| 109 | * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the | ||
| 110 | * interrupt and notifies the pm core about the wakeup. | ||
| 23 | */ | 111 | */ |
| 24 | void suspend_device_irqs(void) | 112 | void suspend_device_irqs(void) |
| 25 | { | 113 | { |
| @@ -28,18 +116,36 @@ void suspend_device_irqs(void) | |||
| 28 | 116 | ||
| 29 | for_each_irq_desc(irq, desc) { | 117 | for_each_irq_desc(irq, desc) { |
| 30 | unsigned long flags; | 118 | unsigned long flags; |
| 119 | bool sync; | ||
| 31 | 120 | ||
| 32 | raw_spin_lock_irqsave(&desc->lock, flags); | 121 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 33 | __disable_irq(desc, irq, true); | 122 | sync = suspend_device_irq(desc, irq); |
| 34 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 123 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 35 | } | ||
| 36 | 124 | ||
| 37 | for_each_irq_desc(irq, desc) | 125 | if (sync) |
| 38 | if (desc->istate & IRQS_SUSPENDED) | ||
| 39 | synchronize_irq(irq); | 126 | synchronize_irq(irq); |
| 127 | } | ||
| 40 | } | 128 | } |
| 41 | EXPORT_SYMBOL_GPL(suspend_device_irqs); | 129 | EXPORT_SYMBOL_GPL(suspend_device_irqs); |
| 42 | 130 | ||
| 131 | static void resume_irq(struct irq_desc *desc, int irq) | ||
| 132 | { | ||
| 133 | irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); | ||
| 134 | |||
| 135 | if (desc->istate & IRQS_SUSPENDED) | ||
| 136 | goto resume; | ||
| 137 | |||
| 138 | /* Force resume the interrupt? */ | ||
| 139 | if (!desc->force_resume_depth) | ||
| 140 | return; | ||
| 141 | |||
| 142 | /* Pretend that it got disabled ! */ | ||
| 143 | desc->depth++; | ||
| 144 | resume: | ||
| 145 | desc->istate &= ~IRQS_SUSPENDED; | ||
| 146 | __enable_irq(desc, irq); | ||
| 147 | } | ||
| 148 | |||
| 43 | static void resume_irqs(bool want_early) | 149 | static void resume_irqs(bool want_early) |
| 44 | { | 150 | { |
| 45 | struct irq_desc *desc; | 151 | struct irq_desc *desc; |
| @@ -54,7 +160,7 @@ static void resume_irqs(bool want_early) | |||
| 54 | continue; | 160 | continue; |
| 55 | 161 | ||
| 56 | raw_spin_lock_irqsave(&desc->lock, flags); | 162 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 57 | __enable_irq(desc, irq, true); | 163 | resume_irq(desc, irq); |
| 58 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 164 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 59 | } | 165 | } |
| 60 | } | 166 | } |
| @@ -93,38 +199,3 @@ void resume_device_irqs(void) | |||
| 93 | resume_irqs(false); | 199 | resume_irqs(false); |
| 94 | } | 200 | } |
| 95 | EXPORT_SYMBOL_GPL(resume_device_irqs); | 201 | EXPORT_SYMBOL_GPL(resume_device_irqs); |
| 96 | |||
| 97 | /** | ||
| 98 | * check_wakeup_irqs - check if any wake-up interrupts are pending | ||
| 99 | */ | ||
| 100 | int check_wakeup_irqs(void) | ||
| 101 | { | ||
| 102 | struct irq_desc *desc; | ||
| 103 | int irq; | ||
| 104 | |||
| 105 | for_each_irq_desc(irq, desc) { | ||
| 106 | /* | ||
| 107 | * Only interrupts which are marked as wakeup source | ||
| 108 | * and have not been disabled before the suspend check | ||
| 109 | * can abort suspend. | ||
| 110 | */ | ||
| 111 | if (irqd_is_wakeup_set(&desc->irq_data)) { | ||
| 112 | if (desc->depth == 1 && desc->istate & IRQS_PENDING) | ||
| 113 | return -EBUSY; | ||
| 114 | continue; | ||
| 115 | } | ||
| 116 | /* | ||
| 117 | * Check the non wakeup interrupts whether they need | ||
| 118 | * to be masked before finally going into suspend | ||
| 119 | * state. That's for hardware which has no wakeup | ||
| 120 | * source configuration facility. The chip | ||
| 121 | * implementation indicates that with | ||
| 122 | * IRQCHIP_MASK_ON_SUSPEND. | ||
| 123 | */ | ||
| 124 | if (desc->istate & IRQS_SUSPENDED && | ||
| 125 | irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) | ||
| 126 | mask_irq(desc); | ||
| 127 | } | ||
| 128 | |||
| 129 | return 0; | ||
| 130 | } | ||
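
The reworked suspend path above special-cases two kinds of interrupt requests: IRQF_NO_SUSPEND actions and armed wakeup sources. A driver-side sketch of what triggers each path; the `foo_*` names and irq numbers are assumptions, not taken from the patch.

```c
#include <linux/interrupt.h>

static irqreturn_t foo_irq_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;	/* keep hard-IRQ work short */
}

static int foo_setup_irqs(int timer_irq, int wake_irq, void *dev)
{
	int ret;

	/* Stays enabled across suspend_device_irqs(); this is what bumps
	 * desc->no_suspend_depth in irq_pm_install_action(). */
	ret = request_irq(timer_irq, foo_irq_handler, IRQF_NO_SUSPEND,
			  "foo-timer", dev);
	if (ret)
		return ret;

	/* Ordinary interrupt armed as a wakeup source: suspend_device_irq()
	 * does not disable it, it only sets IRQD_WAKEUP_ARMED, and a pending
	 * event then aborts suspend via irq_pm_check_wakeup(). */
	ret = request_irq(wake_irq, foo_irq_handler, 0, "foo-wake", dev);
	if (ret) {
		free_irq(timer_irq, dev);
		return ret;
	}
	enable_irq_wake(wake_irq);
	return 0;
}
```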
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index ac1ba2f11032..9dc9bfd8a678 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -15,6 +15,23 @@ | |||
| 15 | 15 | ||
| 16 | #include "internals.h" | 16 | #include "internals.h" |
| 17 | 17 | ||
| 18 | /* | ||
| 19 | * Access rules: | ||
| 20 | * | ||
| 21 | * procfs protects read/write of /proc/irq/N/ files against a | ||
| 22 | * concurrent free of the interrupt descriptor. remove_proc_entry() | ||
| 23 | * immediately prevents new read/writes from happening and waits for | ||
| 24 | * already running read/write functions to complete. | ||
| 25 | * | ||
| 26 | * We remove the proc entries first and then delete the interrupt | ||
| 27 | * descriptor from the radix tree and free it. So it is guaranteed | ||
| 28 | * that irq_to_desc(N) is valid as long as the read/writes are | ||
| 29 | * permitted by procfs. | ||
| 30 | * | ||
| 31 | * The read from /proc/interrupts is a different problem because there | ||
| 32 | * is no protection. So the lookup and the access to irqdesc | ||
| 33 | * information must be protected by sparse_irq_lock. | ||
| 34 | */ | ||
| 18 | static struct proc_dir_entry *root_irq_dir; | 35 | static struct proc_dir_entry *root_irq_dir; |
| 19 | 36 | ||
| 20 | #ifdef CONFIG_SMP | 37 | #ifdef CONFIG_SMP |
| @@ -437,9 +454,10 @@ int show_interrupts(struct seq_file *p, void *v) | |||
| 437 | seq_putc(p, '\n'); | 454 | seq_putc(p, '\n'); |
| 438 | } | 455 | } |
| 439 | 456 | ||
| 457 | irq_lock_sparse(); | ||
| 440 | desc = irq_to_desc(i); | 458 | desc = irq_to_desc(i); |
| 441 | if (!desc) | 459 | if (!desc) |
| 442 | return 0; | 460 | goto outsparse; |
| 443 | 461 | ||
| 444 | raw_spin_lock_irqsave(&desc->lock, flags); | 462 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 445 | for_each_online_cpu(j) | 463 | for_each_online_cpu(j) |
| @@ -479,6 +497,8 @@ int show_interrupts(struct seq_file *p, void *v) | |||
| 479 | seq_putc(p, '\n'); | 497 | seq_putc(p, '\n'); |
| 480 | out: | 498 | out: |
| 481 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 499 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 500 | outsparse: | ||
| 501 | irq_unlock_sparse(); | ||
| 482 | return 0; | 502 | return 0; |
| 483 | } | 503 | } |
| 484 | #endif | 504 | #endif |
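
The sparse-irq rule described in the new comment, shown as an isolated pattern. This is only a sketch with an assumed `foo_*` name: `irq_lock_sparse()` is declared in kernel/irq/internals.h, so the pattern applies to code inside kernel/irq/ such as show_interrupts().

```c
#include <linux/irq.h>
#include "internals.h"	/* irq_lock_sparse(), irq_to_desc() */

static void foo_inspect_irq(int i)
{
	struct irq_desc *desc;
	unsigned long flags;

	irq_lock_sparse();		/* keeps the descriptor from being freed */
	desc = irq_to_desc(i);
	if (desc) {
		raw_spin_lock_irqsave(&desc->lock, flags);
		/* ... safely read desc->action, per-CPU kstat counts, ... */
		raw_spin_unlock_irqrestore(&desc->lock, flags);
	}
	irq_unlock_sparse();		/* the lookup result must not be used past here */
}
```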
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e6bcbe756663..cbf9fb899d92 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
| @@ -95,11 +95,11 @@ bool irq_work_queue(struct irq_work *work) | |||
| 95 | 95 | ||
| 96 | /* If the work is "lazy", handle it from next tick if any */ | 96 | /* If the work is "lazy", handle it from next tick if any */ |
| 97 | if (work->flags & IRQ_WORK_LAZY) { | 97 | if (work->flags & IRQ_WORK_LAZY) { |
| 98 | if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && | 98 | if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && |
| 99 | tick_nohz_tick_stopped()) | 99 | tick_nohz_tick_stopped()) |
| 100 | arch_irq_work_raise(); | 100 | arch_irq_work_raise(); |
| 101 | } else { | 101 | } else { |
| 102 | if (llist_add(&work->llnode, &__get_cpu_var(raised_list))) | 102 | if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) |
| 103 | arch_irq_work_raise(); | 103 | arch_irq_work_raise(); |
| 104 | } | 104 | } |
| 105 | 105 | ||
| @@ -113,10 +113,12 @@ bool irq_work_needs_cpu(void) | |||
| 113 | { | 113 | { |
| 114 | struct llist_head *raised, *lazy; | 114 | struct llist_head *raised, *lazy; |
| 115 | 115 | ||
| 116 | raised = &__get_cpu_var(raised_list); | 116 | raised = this_cpu_ptr(&raised_list); |
| 117 | lazy = &__get_cpu_var(lazy_list); | 117 | lazy = this_cpu_ptr(&lazy_list); |
| 118 | if (llist_empty(raised) && llist_empty(lazy)) | 118 | |
| 119 | return false; | 119 | if (llist_empty(raised) || arch_irq_work_has_interrupt()) |
| 120 | if (llist_empty(lazy)) | ||
| 121 | return false; | ||
| 120 | 122 | ||
| 121 | /* All work should have been flushed before going offline */ | 123 | /* All work should have been flushed before going offline */ |
| 122 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | 124 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); |
| @@ -166,11 +168,20 @@ static void irq_work_run_list(struct llist_head *list) | |||
| 166 | */ | 168 | */ |
| 167 | void irq_work_run(void) | 169 | void irq_work_run(void) |
| 168 | { | 170 | { |
| 169 | irq_work_run_list(&__get_cpu_var(raised_list)); | 171 | irq_work_run_list(this_cpu_ptr(&raised_list)); |
| 170 | irq_work_run_list(&__get_cpu_var(lazy_list)); | 172 | irq_work_run_list(this_cpu_ptr(&lazy_list)); |
| 171 | } | 173 | } |
| 172 | EXPORT_SYMBOL_GPL(irq_work_run); | 174 | EXPORT_SYMBOL_GPL(irq_work_run); |
| 173 | 175 | ||
| 176 | void irq_work_tick(void) | ||
| 177 | { | ||
| 178 | struct llist_head *raised = this_cpu_ptr(&raised_list); | ||
| 179 | |||
| 180 | if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) | ||
| 181 | irq_work_run_list(raised); | ||
| 182 | irq_work_run_list(this_cpu_ptr(&lazy_list)); | ||
| 183 | } | ||
| 184 | |||
| 174 | /* | 185 | /* |
| 175 | * Synchronize against the irq_work @entry, ensures the entry is not | 186 | * Synchronize against the irq_work @entry, ensures the entry is not |
| 176 | * currently in use. | 187 | * currently in use. |
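
For reference, a usage sketch of the irq_work API these hunks convert to `this_cpu_ptr()`. The `foo_*` names are assumptions; the behaviour noted in the comments follows the queueing logic above.

```c
#include <linux/irq_work.h>

static void foo_irq_work_fn(struct irq_work *work)
{
	/* Runs in hard-IRQ context on the queueing CPU; keep it short. */
}

static struct irq_work foo_work = {
	.func  = foo_irq_work_fn,
	.flags = IRQ_WORK_LAZY,	/* lands on lazy_list, run from irq_work_tick()
				 * unless the tick is stopped, in which case
				 * arch_irq_work_raise() fires immediately */
};

static void foo_kick(void)
{
	irq_work_queue(&foo_work);	/* returns false if already queued */
}
```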
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ae5167087845..5c5987f10819 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -565,19 +565,12 @@ static int kallsyms_open(struct inode *inode, struct file *file) | |||
| 565 | * using get_symbol_offset for every symbol. | 565 | * using get_symbol_offset for every symbol. |
| 566 | */ | 566 | */ |
| 567 | struct kallsym_iter *iter; | 567 | struct kallsym_iter *iter; |
| 568 | int ret; | 568 | iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter)); |
| 569 | |||
| 570 | iter = kmalloc(sizeof(*iter), GFP_KERNEL); | ||
| 571 | if (!iter) | 569 | if (!iter) |
| 572 | return -ENOMEM; | 570 | return -ENOMEM; |
| 573 | reset_iter(iter, 0); | 571 | reset_iter(iter, 0); |
| 574 | 572 | ||
| 575 | ret = seq_open(file, &kallsyms_op); | 573 | return 0; |
| 576 | if (ret == 0) | ||
| 577 | ((struct seq_file *)file->private_data)->private = iter; | ||
| 578 | else | ||
| 579 | kfree(iter); | ||
| 580 | return ret; | ||
| 581 | } | 574 | } |
| 582 | 575 | ||
| 583 | #ifdef CONFIG_KGDB_KDB | 576 | #ifdef CONFIG_KGDB_KDB |
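
The kallsyms change is an instance of a general seq_file idiom; a generic sketch of the same pattern, with assumed `foo_*` names:

```c
#include <linux/fs.h>
#include <linux/seq_file.h>

struct foo_iter {
	loff_t pos;
};

extern const struct seq_operations foo_seq_ops;	/* assumed to exist elsewhere */

static int foo_open(struct inode *inode, struct file *file)
{
	struct foo_iter *iter;

	/* Allocates the zeroed private iterator and attaches it to the
	 * seq_file in one call, replacing the kmalloc + seq_open +
	 * ->private dance. Pair with seq_release_private() in .release. */
	iter = __seq_open_private(file, &foo_seq_ops, sizeof(*iter));
	if (!iter)
		return -ENOMEM;
	iter->pos = 0;
	return 0;
}
```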
diff --git a/kernel/kexec.c b/kernel/kexec.c index 2bee072268d9..9a8a01abbaed 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, | |||
| 600 | if (!kexec_on_panic) { | 600 | if (!kexec_on_panic) { |
| 601 | image->swap_page = kimage_alloc_control_pages(image, 0); | 601 | image->swap_page = kimage_alloc_control_pages(image, 0); |
| 602 | if (!image->swap_page) { | 602 | if (!image->swap_page) { |
| 603 | pr_err(KERN_ERR "Could not allocate swap buffer\n"); | 603 | pr_err("Could not allocate swap buffer\n"); |
| 604 | goto out_free_control_pages; | 604 | goto out_free_control_pages; |
| 605 | } | 605 | } |
| 606 | } | 606 | } |
| @@ -1759,7 +1759,6 @@ static __initdata char *suffix_tbl[] = { | |||
| 1759 | */ | 1759 | */ |
| 1760 | static int __init parse_crashkernel_suffix(char *cmdline, | 1760 | static int __init parse_crashkernel_suffix(char *cmdline, |
| 1761 | unsigned long long *crash_size, | 1761 | unsigned long long *crash_size, |
| 1762 | unsigned long long *crash_base, | ||
| 1763 | const char *suffix) | 1762 | const char *suffix) |
| 1764 | { | 1763 | { |
| 1765 | char *cur = cmdline; | 1764 | char *cur = cmdline; |
| @@ -1848,7 +1847,7 @@ static int __init __parse_crashkernel(char *cmdline, | |||
| 1848 | 1847 | ||
| 1849 | if (suffix) | 1848 | if (suffix) |
| 1850 | return parse_crashkernel_suffix(ck_cmdline, crash_size, | 1849 | return parse_crashkernel_suffix(ck_cmdline, crash_size, |
| 1851 | crash_base, suffix); | 1850 | suffix); |
| 1852 | /* | 1851 | /* |
| 1853 | * if the commandline contains a ':', then that's the extended | 1852 | * if the commandline contains a ':', then that's the extended |
| 1854 | * syntax -- if not, it must be the classic syntax | 1853 | * syntax -- if not, it must be the classic syntax |
| @@ -2016,22 +2015,6 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
| 2016 | subsys_initcall(crash_save_vmcoreinfo_init); | 2015 | subsys_initcall(crash_save_vmcoreinfo_init); |
| 2017 | 2016 | ||
| 2018 | #ifdef CONFIG_KEXEC_FILE | 2017 | #ifdef CONFIG_KEXEC_FILE |
| 2019 | static int __kexec_add_segment(struct kimage *image, char *buf, | ||
| 2020 | unsigned long bufsz, unsigned long mem, | ||
| 2021 | unsigned long memsz) | ||
| 2022 | { | ||
| 2023 | struct kexec_segment *ksegment; | ||
| 2024 | |||
| 2025 | ksegment = &image->segment[image->nr_segments]; | ||
| 2026 | ksegment->kbuf = buf; | ||
| 2027 | ksegment->bufsz = bufsz; | ||
| 2028 | ksegment->mem = mem; | ||
| 2029 | ksegment->memsz = memsz; | ||
| 2030 | image->nr_segments++; | ||
| 2031 | |||
| 2032 | return 0; | ||
| 2033 | } | ||
| 2034 | |||
| 2035 | static int locate_mem_hole_top_down(unsigned long start, unsigned long end, | 2018 | static int locate_mem_hole_top_down(unsigned long start, unsigned long end, |
| 2036 | struct kexec_buf *kbuf) | 2019 | struct kexec_buf *kbuf) |
| 2037 | { | 2020 | { |
| @@ -2064,8 +2047,7 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, | |||
| 2064 | } while (1); | 2047 | } while (1); |
| 2065 | 2048 | ||
| 2066 | /* If we are here, we found a suitable memory range */ | 2049 | /* If we are here, we found a suitable memory range */ |
| 2067 | __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, | 2050 | kbuf->mem = temp_start; |
| 2068 | kbuf->memsz); | ||
| 2069 | 2051 | ||
| 2070 | /* Success, stop navigating through remaining System RAM ranges */ | 2052 | /* Success, stop navigating through remaining System RAM ranges */ |
| 2071 | return 1; | 2053 | return 1; |
| @@ -2099,8 +2081,7 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, | |||
| 2099 | } while (1); | 2081 | } while (1); |
| 2100 | 2082 | ||
| 2101 | /* If we are here, we found a suitable memory range */ | 2083 | /* If we are here, we found a suitable memory range */ |
| 2102 | __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, | 2084 | kbuf->mem = temp_start; |
| 2103 | kbuf->memsz); | ||
| 2104 | 2085 | ||
| 2105 | /* Success, stop navigating through remaining System RAM ranges */ | 2086 | /* Success, stop navigating through remaining System RAM ranges */ |
| 2106 | return 1; | 2087 | return 1; |
| @@ -2187,7 +2168,12 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, | |||
| 2187 | } | 2168 | } |
| 2188 | 2169 | ||
| 2189 | /* Found a suitable memory range */ | 2170 | /* Found a suitable memory range */ |
| 2190 | ksegment = &image->segment[image->nr_segments - 1]; | 2171 | ksegment = &image->segment[image->nr_segments]; |
| 2172 | ksegment->kbuf = kbuf->buffer; | ||
| 2173 | ksegment->bufsz = kbuf->bufsz; | ||
| 2174 | ksegment->mem = kbuf->mem; | ||
| 2175 | ksegment->memsz = kbuf->memsz; | ||
| 2176 | image->nr_segments++; | ||
| 2191 | *load_addr = ksegment->mem; | 2177 | *load_addr = ksegment->mem; |
| 2192 | return 0; | 2178 | return 0; |
| 2193 | } | 2179 | } |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 8637e041a247..2777f40a9c7b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -47,13 +47,6 @@ extern int max_threads; | |||
| 47 | 47 | ||
| 48 | static struct workqueue_struct *khelper_wq; | 48 | static struct workqueue_struct *khelper_wq; |
| 49 | 49 | ||
| 50 | /* | ||
| 51 | * kmod_thread_locker is used for deadlock avoidance. There is no explicit | ||
| 52 | * locking to protect this global - it is private to the singleton khelper | ||
| 53 | * thread and should only ever be modified by that thread. | ||
| 54 | */ | ||
| 55 | static const struct task_struct *kmod_thread_locker; | ||
| 56 | |||
| 57 | #define CAP_BSET (void *)1 | 50 | #define CAP_BSET (void *)1 |
| 58 | #define CAP_PI (void *)2 | 51 | #define CAP_PI (void *)2 |
| 59 | 52 | ||
| @@ -196,6 +189,27 @@ int __request_module(bool wait, const char *fmt, ...) | |||
| 196 | EXPORT_SYMBOL(__request_module); | 189 | EXPORT_SYMBOL(__request_module); |
| 197 | #endif /* CONFIG_MODULES */ | 190 | #endif /* CONFIG_MODULES */ |
| 198 | 191 | ||
| 192 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) | ||
| 193 | { | ||
| 194 | if (info->cleanup) | ||
| 195 | (*info->cleanup)(info); | ||
| 196 | kfree(info); | ||
| 197 | } | ||
| 198 | |||
| 199 | static void umh_complete(struct subprocess_info *sub_info) | ||
| 200 | { | ||
| 201 | struct completion *comp = xchg(&sub_info->complete, NULL); | ||
| 202 | /* | ||
| 203 | * See call_usermodehelper_exec(). If xchg() returns NULL | ||
| 204 | * we own sub_info, the UMH_KILLABLE caller has gone away | ||
| 205 | * or the caller used UMH_NO_WAIT. | ||
| 206 | */ | ||
| 207 | if (comp) | ||
| 208 | complete(comp); | ||
| 209 | else | ||
| 210 | call_usermodehelper_freeinfo(sub_info); | ||
| 211 | } | ||
| 212 | |||
| 199 | /* | 213 | /* |
| 200 | * This is the task which runs the usermode application | 214 | * This is the task which runs the usermode application |
| 201 | */ | 215 | */ |
| @@ -221,7 +235,7 @@ static int ____call_usermodehelper(void *data) | |||
| 221 | retval = -ENOMEM; | 235 | retval = -ENOMEM; |
| 222 | new = prepare_kernel_cred(current); | 236 | new = prepare_kernel_cred(current); |
| 223 | if (!new) | 237 | if (!new) |
| 224 | goto fail; | 238 | goto out; |
| 225 | 239 | ||
| 226 | spin_lock(&umh_sysctl_lock); | 240 | spin_lock(&umh_sysctl_lock); |
| 227 | new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); | 241 | new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); |
| @@ -233,7 +247,7 @@ static int ____call_usermodehelper(void *data) | |||
| 233 | retval = sub_info->init(sub_info, new); | 247 | retval = sub_info->init(sub_info, new); |
| 234 | if (retval) { | 248 | if (retval) { |
| 235 | abort_creds(new); | 249 | abort_creds(new); |
| 236 | goto fail; | 250 | goto out; |
| 237 | } | 251 | } |
| 238 | } | 252 | } |
| 239 | 253 | ||
| @@ -242,42 +256,16 @@ static int ____call_usermodehelper(void *data) | |||
| 242 | retval = do_execve(getname_kernel(sub_info->path), | 256 | retval = do_execve(getname_kernel(sub_info->path), |
| 243 | (const char __user *const __user *)sub_info->argv, | 257 | (const char __user *const __user *)sub_info->argv, |
| 244 | (const char __user *const __user *)sub_info->envp); | 258 | (const char __user *const __user *)sub_info->envp); |
| 259 | out: | ||
| 260 | sub_info->retval = retval; | ||
| 261 | /* wait_for_helper() will call umh_complete if UMH_WAIT_PROC. */ | ||
| 262 | if (!(sub_info->wait & UMH_WAIT_PROC)) | ||
| 263 | umh_complete(sub_info); | ||
| 245 | if (!retval) | 264 | if (!retval) |
| 246 | return 0; | 265 | return 0; |
| 247 | |||
| 248 | /* Exec failed? */ | ||
| 249 | fail: | ||
| 250 | sub_info->retval = retval; | ||
| 251 | do_exit(0); | 266 | do_exit(0); |
| 252 | } | 267 | } |
| 253 | 268 | ||
| 254 | static int call_helper(void *data) | ||
| 255 | { | ||
| 256 | /* Worker thread started blocking khelper thread. */ | ||
| 257 | kmod_thread_locker = current; | ||
| 258 | return ____call_usermodehelper(data); | ||
| 259 | } | ||
| 260 | |||
| 261 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) | ||
| 262 | { | ||
| 263 | if (info->cleanup) | ||
| 264 | (*info->cleanup)(info); | ||
| 265 | kfree(info); | ||
| 266 | } | ||
| 267 | |||
| 268 | static void umh_complete(struct subprocess_info *sub_info) | ||
| 269 | { | ||
| 270 | struct completion *comp = xchg(&sub_info->complete, NULL); | ||
| 271 | /* | ||
| 272 | * See call_usermodehelper_exec(). If xchg() returns NULL | ||
| 273 | * we own sub_info, the UMH_KILLABLE caller has gone away. | ||
| 274 | */ | ||
| 275 | if (comp) | ||
| 276 | complete(comp); | ||
| 277 | else | ||
| 278 | call_usermodehelper_freeinfo(sub_info); | ||
| 279 | } | ||
| 280 | |||
| 281 | /* Keventd can't block, but this (a child) can. */ | 269 | /* Keventd can't block, but this (a child) can. */ |
| 282 | static int wait_for_helper(void *data) | 270 | static int wait_for_helper(void *data) |
| 283 | { | 271 | { |
| @@ -320,34 +308,17 @@ static void __call_usermodehelper(struct work_struct *work) | |||
| 320 | { | 308 | { |
| 321 | struct subprocess_info *sub_info = | 309 | struct subprocess_info *sub_info = |
| 322 | container_of(work, struct subprocess_info, work); | 310 | container_of(work, struct subprocess_info, work); |
| 323 | int wait = sub_info->wait & ~UMH_KILLABLE; | ||
| 324 | pid_t pid; | 311 | pid_t pid; |
| 325 | 312 | ||
| 326 | /* CLONE_VFORK: wait until the usermode helper has execve'd | 313 | if (sub_info->wait & UMH_WAIT_PROC) |
| 327 | * successfully We need the data structures to stay around | ||
| 328 | * until that is done. */ | ||
| 329 | if (wait == UMH_WAIT_PROC) | ||
| 330 | pid = kernel_thread(wait_for_helper, sub_info, | 314 | pid = kernel_thread(wait_for_helper, sub_info, |
| 331 | CLONE_FS | CLONE_FILES | SIGCHLD); | 315 | CLONE_FS | CLONE_FILES | SIGCHLD); |
| 332 | else { | 316 | else |
| 333 | pid = kernel_thread(call_helper, sub_info, | 317 | pid = kernel_thread(____call_usermodehelper, sub_info, |
| 334 | CLONE_VFORK | SIGCHLD); | 318 | SIGCHLD); |
| 335 | /* Worker thread stopped blocking khelper thread. */ | ||
| 336 | kmod_thread_locker = NULL; | ||
| 337 | } | ||
| 338 | |||
| 339 | switch (wait) { | ||
| 340 | case UMH_NO_WAIT: | ||
| 341 | call_usermodehelper_freeinfo(sub_info); | ||
| 342 | break; | ||
| 343 | 319 | ||
| 344 | case UMH_WAIT_PROC: | 320 | if (pid < 0) { |
| 345 | if (pid > 0) | 321 | sub_info->retval = pid; |
| 346 | break; | ||
| 347 | /* FALLTHROUGH */ | ||
| 348 | case UMH_WAIT_EXEC: | ||
| 349 | if (pid < 0) | ||
| 350 | sub_info->retval = pid; | ||
| 351 | umh_complete(sub_info); | 322 | umh_complete(sub_info); |
| 352 | } | 323 | } |
| 353 | } | 324 | } |
| @@ -578,17 +549,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | |||
| 578 | goto out; | 549 | goto out; |
| 579 | } | 550 | } |
| 580 | /* | 551 | /* |
| 581 | * Worker thread must not wait for khelper thread at below | 552 | * Set the completion pointer only if there is a waiter. |
| 582 | * wait_for_completion() if the thread was created with CLONE_VFORK | 553 | * This makes it possible to use umh_complete to free |
| 583 | * flag, for khelper thread is already waiting for the thread at | 554 | * the data structure in case of UMH_NO_WAIT. |
| 584 | * wait_for_completion() in do_fork(). | ||
| 585 | */ | 555 | */ |
| 586 | if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { | 556 | sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; |
| 587 | retval = -EBUSY; | ||
| 588 | goto out; | ||
| 589 | } | ||
| 590 | |||
| 591 | sub_info->complete = &done; | ||
| 592 | sub_info->wait = wait; | 557 | sub_info->wait = wait; |
| 593 | 558 | ||
| 594 | queue_work(khelper_wq, &sub_info->work); | 559 | queue_work(khelper_wq, &sub_info->work); |
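
Caller-side sketch of the usermode-helper API whose completion handling is reworked above. The helper path and argv are illustrative only.

```c
#include <linux/kmod.h>

static int foo_run_helper(void)
{
	char *argv[] = { "/sbin/foo-helper", "start", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	/* UMH_NO_WAIT:   sub_info->complete stays NULL, umh_complete() frees it.
	 * UMH_WAIT_EXEC: return once exec() has been attempted.
	 * UMH_WAIT_PROC: wait_for_helper() reaps the child and completes. */
	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
}
```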
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3995f546d0f3..06f58309fed2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | |||
| 915 | #ifdef CONFIG_KPROBES_ON_FTRACE | 915 | #ifdef CONFIG_KPROBES_ON_FTRACE |
| 916 | static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { | 916 | static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { |
| 917 | .func = kprobe_ftrace_handler, | 917 | .func = kprobe_ftrace_handler, |
| 918 | .flags = FTRACE_OPS_FL_SAVE_REGS, | 918 | .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY, |
| 919 | }; | 919 | }; |
| 920 | static int kprobe_ftrace_enabled; | 920 | static int kprobe_ftrace_enabled; |
| 921 | 921 | ||
| @@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p) | |||
| 1410 | return ret; | 1410 | return ret; |
| 1411 | } | 1411 | } |
| 1412 | 1412 | ||
| 1413 | static int check_kprobe_address_safe(struct kprobe *p, | 1413 | int __weak arch_check_ftrace_location(struct kprobe *p) |
| 1414 | struct module **probed_mod) | ||
| 1415 | { | 1414 | { |
| 1416 | int ret = 0; | ||
| 1417 | unsigned long ftrace_addr; | 1415 | unsigned long ftrace_addr; |
| 1418 | 1416 | ||
| 1419 | /* | ||
| 1420 | * If the address is located on a ftrace nop, set the | ||
| 1421 | * breakpoint to the following instruction. | ||
| 1422 | */ | ||
| 1423 | ftrace_addr = ftrace_location((unsigned long)p->addr); | 1417 | ftrace_addr = ftrace_location((unsigned long)p->addr); |
| 1424 | if (ftrace_addr) { | 1418 | if (ftrace_addr) { |
| 1425 | #ifdef CONFIG_KPROBES_ON_FTRACE | 1419 | #ifdef CONFIG_KPROBES_ON_FTRACE |
| @@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p, | |||
| 1431 | return -EINVAL; | 1425 | return -EINVAL; |
| 1432 | #endif | 1426 | #endif |
| 1433 | } | 1427 | } |
| 1428 | return 0; | ||
| 1429 | } | ||
| 1434 | 1430 | ||
| 1431 | static int check_kprobe_address_safe(struct kprobe *p, | ||
| 1432 | struct module **probed_mod) | ||
| 1433 | { | ||
| 1434 | int ret; | ||
| 1435 | |||
| 1436 | ret = arch_check_ftrace_location(p); | ||
| 1437 | if (ret) | ||
| 1438 | return ret; | ||
| 1435 | jump_label_lock(); | 1439 | jump_label_lock(); |
| 1436 | preempt_disable(); | 1440 | preempt_disable(); |
| 1437 | 1441 | ||
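
A hedged sketch of what the new `__weak arch_check_ftrace_location()` hook permits: an architecture can override it to decide whether a probe may sit on an ftrace location. The body below mirrors the generic version rather than any real architecture's implementation.

```c
#include <linux/kprobes.h>
#include <linux/ftrace.h>

int arch_check_ftrace_location(struct kprobe *p)
{
	if (ftrace_location((unsigned long)p->addr)) {
		/* Accept the probe, but route it through the ftrace-based
		 * path instead of a breakpoint (needs KPROBES_ON_FTRACE). */
		p->flags |= KPROBE_FLAG_FTRACE;
	}
	return 0;	/* non-zero would reject the probe address */
}
```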
diff --git a/kernel/kthread.c b/kernel/kthread.c index ef483220e855..10e489c448fe 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), | |||
| 369 | { | 369 | { |
| 370 | struct task_struct *p; | 370 | struct task_struct *p; |
| 371 | 371 | ||
| 372 | p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, | 372 | p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, |
| 373 | cpu); | 373 | cpu); |
| 374 | if (IS_ERR(p)) | 374 | if (IS_ERR(p)) |
| 375 | return p; | 375 | return p; |
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 0955b885d0dc..ec8cce259779 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c | |||
| @@ -20,30 +20,20 @@ | |||
| 20 | * Author: Paul E. McKenney <paulmck@us.ibm.com> | 20 | * Author: Paul E. McKenney <paulmck@us.ibm.com> |
| 21 | * Based on kernel/rcu/torture.c. | 21 | * Based on kernel/rcu/torture.c. |
| 22 | */ | 22 | */ |
| 23 | #include <linux/types.h> | ||
| 24 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
| 25 | #include <linux/init.h> | ||
| 26 | #include <linux/module.h> | 24 | #include <linux/module.h> |
| 27 | #include <linux/kthread.h> | 25 | #include <linux/kthread.h> |
| 28 | #include <linux/err.h> | ||
| 29 | #include <linux/spinlock.h> | 26 | #include <linux/spinlock.h> |
| 27 | #include <linux/rwlock.h> | ||
| 28 | #include <linux/mutex.h> | ||
| 29 | #include <linux/rwsem.h> | ||
| 30 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
| 31 | #include <linux/interrupt.h> | 31 | #include <linux/interrupt.h> |
| 32 | #include <linux/sched.h> | 32 | #include <linux/sched.h> |
| 33 | #include <linux/atomic.h> | 33 | #include <linux/atomic.h> |
| 34 | #include <linux/bitops.h> | ||
| 35 | #include <linux/completion.h> | ||
| 36 | #include <linux/moduleparam.h> | 34 | #include <linux/moduleparam.h> |
| 37 | #include <linux/percpu.h> | ||
| 38 | #include <linux/notifier.h> | ||
| 39 | #include <linux/reboot.h> | ||
| 40 | #include <linux/freezer.h> | ||
| 41 | #include <linux/cpu.h> | ||
| 42 | #include <linux/delay.h> | 35 | #include <linux/delay.h> |
| 43 | #include <linux/stat.h> | ||
| 44 | #include <linux/slab.h> | 36 | #include <linux/slab.h> |
| 45 | #include <linux/trace_clock.h> | ||
| 46 | #include <asm/byteorder.h> | ||
| 47 | #include <linux/torture.h> | 37 | #include <linux/torture.h> |
| 48 | 38 | ||
| 49 | MODULE_LICENSE("GPL"); | 39 | MODULE_LICENSE("GPL"); |
| @@ -51,6 +41,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); | |||
| 51 | 41 | ||
| 52 | torture_param(int, nwriters_stress, -1, | 42 | torture_param(int, nwriters_stress, -1, |
| 53 | "Number of write-locking stress-test threads"); | 43 | "Number of write-locking stress-test threads"); |
| 44 | torture_param(int, nreaders_stress, -1, | ||
| 45 | "Number of read-locking stress-test threads"); | ||
| 54 | torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); | 46 | torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); |
| 55 | torture_param(int, onoff_interval, 0, | 47 | torture_param(int, onoff_interval, 0, |
| 56 | "Time between CPU hotplugs (s), 0=disable"); | 48 | "Time between CPU hotplugs (s), 0=disable"); |
| @@ -66,30 +58,28 @@ torture_param(bool, verbose, true, | |||
| 66 | static char *torture_type = "spin_lock"; | 58 | static char *torture_type = "spin_lock"; |
| 67 | module_param(torture_type, charp, 0444); | 59 | module_param(torture_type, charp, 0444); |
| 68 | MODULE_PARM_DESC(torture_type, | 60 | MODULE_PARM_DESC(torture_type, |
| 69 | "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); | 61 | "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); |
| 70 | |||
| 71 | static atomic_t n_lock_torture_errors; | ||
| 72 | 62 | ||
| 73 | static struct task_struct *stats_task; | 63 | static struct task_struct *stats_task; |
| 74 | static struct task_struct **writer_tasks; | 64 | static struct task_struct **writer_tasks; |
| 65 | static struct task_struct **reader_tasks; | ||
| 75 | 66 | ||
| 76 | static int nrealwriters_stress; | ||
| 77 | static bool lock_is_write_held; | 67 | static bool lock_is_write_held; |
| 68 | static bool lock_is_read_held; | ||
| 78 | 69 | ||
| 79 | struct lock_writer_stress_stats { | 70 | struct lock_stress_stats { |
| 80 | long n_write_lock_fail; | 71 | long n_lock_fail; |
| 81 | long n_write_lock_acquired; | 72 | long n_lock_acquired; |
| 82 | }; | 73 | }; |
| 83 | static struct lock_writer_stress_stats *lwsa; | ||
| 84 | 74 | ||
| 85 | #if defined(MODULE) | 75 | #if defined(MODULE) |
| 86 | #define LOCKTORTURE_RUNNABLE_INIT 1 | 76 | #define LOCKTORTURE_RUNNABLE_INIT 1 |
| 87 | #else | 77 | #else |
| 88 | #define LOCKTORTURE_RUNNABLE_INIT 0 | 78 | #define LOCKTORTURE_RUNNABLE_INIT 0 |
| 89 | #endif | 79 | #endif |
| 90 | int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; | 80 | int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; |
| 91 | module_param(locktorture_runnable, int, 0444); | 81 | module_param(torture_runnable, int, 0444); |
| 92 | MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); | 82 | MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); |
| 93 | 83 | ||
| 94 | /* Forward reference. */ | 84 | /* Forward reference. */ |
| 95 | static void lock_torture_cleanup(void); | 85 | static void lock_torture_cleanup(void); |
| @@ -102,12 +92,25 @@ struct lock_torture_ops { | |||
| 102 | int (*writelock)(void); | 92 | int (*writelock)(void); |
| 103 | void (*write_delay)(struct torture_random_state *trsp); | 93 | void (*write_delay)(struct torture_random_state *trsp); |
| 104 | void (*writeunlock)(void); | 94 | void (*writeunlock)(void); |
| 95 | int (*readlock)(void); | ||
| 96 | void (*read_delay)(struct torture_random_state *trsp); | ||
| 97 | void (*readunlock)(void); | ||
| 105 | unsigned long flags; | 98 | unsigned long flags; |
| 106 | const char *name; | 99 | const char *name; |
| 107 | }; | 100 | }; |
| 108 | 101 | ||
| 109 | static struct lock_torture_ops *cur_ops; | 102 | struct lock_torture_cxt { |
| 110 | 103 | int nrealwriters_stress; | |
| 104 | int nrealreaders_stress; | ||
| 105 | bool debug_lock; | ||
| 106 | atomic_t n_lock_torture_errors; | ||
| 107 | struct lock_torture_ops *cur_ops; | ||
| 108 | struct lock_stress_stats *lwsa; /* writer statistics */ | ||
| 109 | struct lock_stress_stats *lrsa; /* reader statistics */ | ||
| 110 | }; | ||
| 111 | static struct lock_torture_cxt cxt = { 0, 0, false, | ||
| 112 | ATOMIC_INIT(0), | ||
| 113 | NULL, NULL}; | ||
| 111 | /* | 114 | /* |
| 112 | * Definitions for lock torture testing. | 115 | * Definitions for lock torture testing. |
| 113 | */ | 116 | */ |
| @@ -123,10 +126,10 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp) | |||
| 123 | 126 | ||
| 124 | /* We want a long delay occasionally to force massive contention. */ | 127 | /* We want a long delay occasionally to force massive contention. */ |
| 125 | if (!(torture_random(trsp) % | 128 | if (!(torture_random(trsp) % |
| 126 | (nrealwriters_stress * 2000 * longdelay_us))) | 129 | (cxt.nrealwriters_stress * 2000 * longdelay_us))) |
| 127 | mdelay(longdelay_us); | 130 | mdelay(longdelay_us); |
| 128 | #ifdef CONFIG_PREEMPT | 131 | #ifdef CONFIG_PREEMPT |
| 129 | if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) | 132 | if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) |
| 130 | preempt_schedule(); /* Allow test to be preempted. */ | 133 | preempt_schedule(); /* Allow test to be preempted. */ |
| 131 | #endif | 134 | #endif |
| 132 | } | 135 | } |
| @@ -140,6 +143,9 @@ static struct lock_torture_ops lock_busted_ops = { | |||
| 140 | .writelock = torture_lock_busted_write_lock, | 143 | .writelock = torture_lock_busted_write_lock, |
| 141 | .write_delay = torture_lock_busted_write_delay, | 144 | .write_delay = torture_lock_busted_write_delay, |
| 142 | .writeunlock = torture_lock_busted_write_unlock, | 145 | .writeunlock = torture_lock_busted_write_unlock, |
| 146 | .readlock = NULL, | ||
| 147 | .read_delay = NULL, | ||
| 148 | .readunlock = NULL, | ||
| 143 | .name = "lock_busted" | 149 | .name = "lock_busted" |
| 144 | }; | 150 | }; |
| 145 | 151 | ||
| @@ -160,13 +166,13 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp) | |||
| 160 | * we want a long delay occasionally to force massive contention. | 166 | * we want a long delay occasionally to force massive contention. |
| 161 | */ | 167 | */ |
| 162 | if (!(torture_random(trsp) % | 168 | if (!(torture_random(trsp) % |
| 163 | (nrealwriters_stress * 2000 * longdelay_us))) | 169 | (cxt.nrealwriters_stress * 2000 * longdelay_us))) |
| 164 | mdelay(longdelay_us); | 170 | mdelay(longdelay_us); |
| 165 | if (!(torture_random(trsp) % | 171 | if (!(torture_random(trsp) % |
| 166 | (nrealwriters_stress * 2 * shortdelay_us))) | 172 | (cxt.nrealwriters_stress * 2 * shortdelay_us))) |
| 167 | udelay(shortdelay_us); | 173 | udelay(shortdelay_us); |
| 168 | #ifdef CONFIG_PREEMPT | 174 | #ifdef CONFIG_PREEMPT |
| 169 | if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) | 175 | if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) |
| 170 | preempt_schedule(); /* Allow test to be preempted. */ | 176 | preempt_schedule(); /* Allow test to be preempted. */ |
| 171 | #endif | 177 | #endif |
| 172 | } | 178 | } |
| @@ -180,39 +186,253 @@ static struct lock_torture_ops spin_lock_ops = { | |||
| 180 | .writelock = torture_spin_lock_write_lock, | 186 | .writelock = torture_spin_lock_write_lock, |
| 181 | .write_delay = torture_spin_lock_write_delay, | 187 | .write_delay = torture_spin_lock_write_delay, |
| 182 | .writeunlock = torture_spin_lock_write_unlock, | 188 | .writeunlock = torture_spin_lock_write_unlock, |
| 189 | .readlock = NULL, | ||
| 190 | .read_delay = NULL, | ||
| 191 | .readunlock = NULL, | ||
| 183 | .name = "spin_lock" | 192 | .name = "spin_lock" |
| 184 | }; | 193 | }; |
| 185 | 194 | ||
| 186 | static int torture_spin_lock_write_lock_irq(void) | 195 | static int torture_spin_lock_write_lock_irq(void) |
| 187 | __acquires(torture_spinlock_irq) | 196 | __acquires(torture_spinlock) |
| 188 | { | 197 | { |
| 189 | unsigned long flags; | 198 | unsigned long flags; |
| 190 | 199 | ||
| 191 | spin_lock_irqsave(&torture_spinlock, flags); | 200 | spin_lock_irqsave(&torture_spinlock, flags); |
| 192 | cur_ops->flags = flags; | 201 | cxt.cur_ops->flags = flags; |
| 193 | return 0; | 202 | return 0; |
| 194 | } | 203 | } |
| 195 | 204 | ||
| 196 | static void torture_lock_spin_write_unlock_irq(void) | 205 | static void torture_lock_spin_write_unlock_irq(void) |
| 197 | __releases(torture_spinlock) | 206 | __releases(torture_spinlock) |
| 198 | { | 207 | { |
| 199 | spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); | 208 | spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags); |
| 200 | } | 209 | } |
| 201 | 210 | ||
| 202 | static struct lock_torture_ops spin_lock_irq_ops = { | 211 | static struct lock_torture_ops spin_lock_irq_ops = { |
| 203 | .writelock = torture_spin_lock_write_lock_irq, | 212 | .writelock = torture_spin_lock_write_lock_irq, |
| 204 | .write_delay = torture_spin_lock_write_delay, | 213 | .write_delay = torture_spin_lock_write_delay, |
| 205 | .writeunlock = torture_lock_spin_write_unlock_irq, | 214 | .writeunlock = torture_lock_spin_write_unlock_irq, |
| 215 | .readlock = NULL, | ||
| 216 | .read_delay = NULL, | ||
| 217 | .readunlock = NULL, | ||
| 206 | .name = "spin_lock_irq" | 218 | .name = "spin_lock_irq" |
| 207 | }; | 219 | }; |
| 208 | 220 | ||
| 221 | static DEFINE_RWLOCK(torture_rwlock); | ||
| 222 | |||
| 223 | static int torture_rwlock_write_lock(void) __acquires(torture_rwlock) | ||
| 224 | { | ||
| 225 | write_lock(&torture_rwlock); | ||
| 226 | return 0; | ||
| 227 | } | ||
| 228 | |||
| 229 | static void torture_rwlock_write_delay(struct torture_random_state *trsp) | ||
| 230 | { | ||
| 231 | const unsigned long shortdelay_us = 2; | ||
| 232 | const unsigned long longdelay_ms = 100; | ||
| 233 | |||
| 234 | /* We want a short delay mostly to emulate likely code, and | ||
| 235 | * we want a long delay occasionally to force massive contention. | ||
| 236 | */ | ||
| 237 | if (!(torture_random(trsp) % | ||
| 238 | (cxt.nrealwriters_stress * 2000 * longdelay_ms))) | ||
| 239 | mdelay(longdelay_ms); | ||
| 240 | else | ||
| 241 | udelay(shortdelay_us); | ||
| 242 | } | ||
| 243 | |||
| 244 | static void torture_rwlock_write_unlock(void) __releases(torture_rwlock) | ||
| 245 | { | ||
| 246 | write_unlock(&torture_rwlock); | ||
| 247 | } | ||
| 248 | |||
| 249 | static int torture_rwlock_read_lock(void) __acquires(torture_rwlock) | ||
| 250 | { | ||
| 251 | read_lock(&torture_rwlock); | ||
| 252 | return 0; | ||
| 253 | } | ||
| 254 | |||
| 255 | static void torture_rwlock_read_delay(struct torture_random_state *trsp) | ||
| 256 | { | ||
| 257 | const unsigned long shortdelay_us = 10; | ||
| 258 | const unsigned long longdelay_ms = 100; | ||
| 259 | |||
| 260 | /* We want a short delay mostly to emulate likely code, and | ||
| 261 | * we want a long delay occasionally to force massive contention. | ||
| 262 | */ | ||
| 263 | if (!(torture_random(trsp) % | ||
| 264 | (cxt.nrealreaders_stress * 2000 * longdelay_ms))) | ||
| 265 | mdelay(longdelay_ms); | ||
| 266 | else | ||
| 267 | udelay(shortdelay_us); | ||
| 268 | } | ||
| 269 | |||
| 270 | static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) | ||
| 271 | { | ||
| 272 | read_unlock(&torture_rwlock); | ||
| 273 | } | ||
| 274 | |||
| 275 | static struct lock_torture_ops rw_lock_ops = { | ||
| 276 | .writelock = torture_rwlock_write_lock, | ||
| 277 | .write_delay = torture_rwlock_write_delay, | ||
| 278 | .writeunlock = torture_rwlock_write_unlock, | ||
| 279 | .readlock = torture_rwlock_read_lock, | ||
| 280 | .read_delay = torture_rwlock_read_delay, | ||
| 281 | .readunlock = torture_rwlock_read_unlock, | ||
| 282 | .name = "rw_lock" | ||
| 283 | }; | ||
| 284 | |||
| 285 | static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock) | ||
| 286 | { | ||
| 287 | unsigned long flags; | ||
| 288 | |||
| 289 | write_lock_irqsave(&torture_rwlock, flags); | ||
| 290 | cxt.cur_ops->flags = flags; | ||
| 291 | return 0; | ||
| 292 | } | ||
| 293 | |||
| 294 | static void torture_rwlock_write_unlock_irq(void) | ||
| 295 | __releases(torture_rwlock) | ||
| 296 | { | ||
| 297 | write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); | ||
| 298 | } | ||
| 299 | |||
| 300 | static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) | ||
| 301 | { | ||
| 302 | unsigned long flags; | ||
| 303 | |||
| 304 | read_lock_irqsave(&torture_rwlock, flags); | ||
| 305 | cxt.cur_ops->flags = flags; | ||
| 306 | return 0; | ||
| 307 | } | ||
| 308 | |||
| 309 | static void torture_rwlock_read_unlock_irq(void) | ||
| 310 | __releases(torture_rwlock) | ||
| 311 | { | ||
| 312 | read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); | ||
| 313 | } | ||
| 314 | |||
| 315 | static struct lock_torture_ops rw_lock_irq_ops = { | ||
| 316 | .writelock = torture_rwlock_write_lock_irq, | ||
| 317 | .write_delay = torture_rwlock_write_delay, | ||
| 318 | .writeunlock = torture_rwlock_write_unlock_irq, | ||
| 319 | .readlock = torture_rwlock_read_lock_irq, | ||
| 320 | .read_delay = torture_rwlock_read_delay, | ||
| 321 | .readunlock = torture_rwlock_read_unlock_irq, | ||
| 322 | .name = "rw_lock_irq" | ||
| 323 | }; | ||
| 324 | |||
| 325 | static DEFINE_MUTEX(torture_mutex); | ||
| 326 | |||
| 327 | static int torture_mutex_lock(void) __acquires(torture_mutex) | ||
| 328 | { | ||
| 329 | mutex_lock(&torture_mutex); | ||
| 330 | return 0; | ||
| 331 | } | ||
| 332 | |||
| 333 | static void torture_mutex_delay(struct torture_random_state *trsp) | ||
| 334 | { | ||
| 335 | const unsigned long longdelay_ms = 100; | ||
| 336 | |||
| 337 | /* We want a long delay occasionally to force massive contention. */ | ||
| 338 | if (!(torture_random(trsp) % | ||
| 339 | (cxt.nrealwriters_stress * 2000 * longdelay_ms))) | ||
| 340 | mdelay(longdelay_ms * 5); | ||
| 341 | else | ||
| 342 | mdelay(longdelay_ms / 5); | ||
| 343 | #ifdef CONFIG_PREEMPT | ||
| 344 | if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) | ||
| 345 | preempt_schedule(); /* Allow test to be preempted. */ | ||
| 346 | #endif | ||
| 347 | } | ||
| 348 | |||
| 349 | static void torture_mutex_unlock(void) __releases(torture_mutex) | ||
| 350 | { | ||
| 351 | mutex_unlock(&torture_mutex); | ||
| 352 | } | ||
| 353 | |||
| 354 | static struct lock_torture_ops mutex_lock_ops = { | ||
| 355 | .writelock = torture_mutex_lock, | ||
| 356 | .write_delay = torture_mutex_delay, | ||
| 357 | .writeunlock = torture_mutex_unlock, | ||
| 358 | .readlock = NULL, | ||
| 359 | .read_delay = NULL, | ||
| 360 | .readunlock = NULL, | ||
| 361 | .name = "mutex_lock" | ||
| 362 | }; | ||
| 363 | |||
| 364 | static DECLARE_RWSEM(torture_rwsem); | ||
| 365 | static int torture_rwsem_down_write(void) __acquires(torture_rwsem) | ||
| 366 | { | ||
| 367 | down_write(&torture_rwsem); | ||
| 368 | return 0; | ||
| 369 | } | ||
| 370 | |||
| 371 | static void torture_rwsem_write_delay(struct torture_random_state *trsp) | ||
| 372 | { | ||
| 373 | const unsigned long longdelay_ms = 100; | ||
| 374 | |||
| 375 | /* We want a long delay occasionally to force massive contention. */ | ||
| 376 | if (!(torture_random(trsp) % | ||
| 377 | (cxt.nrealwriters_stress * 2000 * longdelay_ms))) | ||
| 378 | mdelay(longdelay_ms * 10); | ||
| 379 | else | ||
| 380 | mdelay(longdelay_ms / 10); | ||
| 381 | #ifdef CONFIG_PREEMPT | ||
| 382 | if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) | ||
| 383 | preempt_schedule(); /* Allow test to be preempted. */ | ||
| 384 | #endif | ||
| 385 | } | ||
| 386 | |||
| 387 | static void torture_rwsem_up_write(void) __releases(torture_rwsem) | ||
| 388 | { | ||
| 389 | up_write(&torture_rwsem); | ||
| 390 | } | ||
| 391 | |||
| 392 | static int torture_rwsem_down_read(void) __acquires(torture_rwsem) | ||
| 393 | { | ||
| 394 | down_read(&torture_rwsem); | ||
| 395 | return 0; | ||
| 396 | } | ||
| 397 | |||
| 398 | static void torture_rwsem_read_delay(struct torture_random_state *trsp) | ||
| 399 | { | ||
| 400 | const unsigned long longdelay_ms = 100; | ||
| 401 | |||
| 402 | /* We want a long delay occasionally to force massive contention. */ | ||
| 403 | if (!(torture_random(trsp) % | ||
| 404 | (cxt.nrealwriters_stress * 2000 * longdelay_ms))) | ||
| 405 | mdelay(longdelay_ms * 2); | ||
| 406 | else | ||
| 407 | mdelay(longdelay_ms / 2); | ||
| 408 | #ifdef CONFIG_PREEMPT | ||
| 409 | if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) | ||
| 410 | preempt_schedule(); /* Allow test to be preempted. */ | ||
| 411 | #endif | ||
| 412 | } | ||
| 413 | |||
| 414 | static void torture_rwsem_up_read(void) __releases(torture_rwsem) | ||
| 415 | { | ||
| 416 | up_read(&torture_rwsem); | ||
| 417 | } | ||
| 418 | |||
| 419 | static struct lock_torture_ops rwsem_lock_ops = { | ||
| 420 | .writelock = torture_rwsem_down_write, | ||
| 421 | .write_delay = torture_rwsem_write_delay, | ||
| 422 | .writeunlock = torture_rwsem_up_write, | ||
| 423 | .readlock = torture_rwsem_down_read, | ||
| 424 | .read_delay = torture_rwsem_read_delay, | ||
| 425 | .readunlock = torture_rwsem_up_read, | ||
| 426 | .name = "rwsem_lock" | ||
| 427 | }; | ||
| 428 | |||
| 209 | /* | 429 | /* |
| 210 | * Lock torture writer kthread. Repeatedly acquires and releases | 430 | * Lock torture writer kthread. Repeatedly acquires and releases |
| 211 | * the lock, checking for duplicate acquisitions. | 431 | * the lock, checking for duplicate acquisitions. |
| 212 | */ | 432 | */ |
| 213 | static int lock_torture_writer(void *arg) | 433 | static int lock_torture_writer(void *arg) |
| 214 | { | 434 | { |
| 215 | struct lock_writer_stress_stats *lwsp = arg; | 435 | struct lock_stress_stats *lwsp = arg; |
| 216 | static DEFINE_TORTURE_RANDOM(rand); | 436 | static DEFINE_TORTURE_RANDOM(rand); |
| 217 | 437 | ||
| 218 | VERBOSE_TOROUT_STRING("lock_torture_writer task started"); | 438 | VERBOSE_TOROUT_STRING("lock_torture_writer task started"); |
| @@ -221,14 +441,19 @@ static int lock_torture_writer(void *arg) | |||
| 221 | do { | 441 | do { |
| 222 | if ((torture_random(&rand) & 0xfffff) == 0) | 442 | if ((torture_random(&rand) & 0xfffff) == 0) |
| 223 | schedule_timeout_uninterruptible(1); | 443 | schedule_timeout_uninterruptible(1); |
| 224 | cur_ops->writelock(); | 444 | |
| 445 | cxt.cur_ops->writelock(); | ||
| 225 | if (WARN_ON_ONCE(lock_is_write_held)) | 446 | if (WARN_ON_ONCE(lock_is_write_held)) |
| 226 | lwsp->n_write_lock_fail++; | 447 | lwsp->n_lock_fail++; |
| 227 | lock_is_write_held = 1; | 448 | lock_is_write_held = 1; |
| 228 | lwsp->n_write_lock_acquired++; | 449 | if (WARN_ON_ONCE(lock_is_read_held)) |
| 229 | cur_ops->write_delay(&rand); | 450 | lwsp->n_lock_fail++; /* rare, but... */ |
| 451 | |||
| 452 | lwsp->n_lock_acquired++; | ||
| 453 | cxt.cur_ops->write_delay(&rand); | ||
| 230 | lock_is_write_held = 0; | 454 | lock_is_write_held = 0; |
| 231 | cur_ops->writeunlock(); | 455 | cxt.cur_ops->writeunlock(); |
| 456 | |||
| 232 | stutter_wait("lock_torture_writer"); | 457 | stutter_wait("lock_torture_writer"); |
| 233 | } while (!torture_must_stop()); | 458 | } while (!torture_must_stop()); |
| 234 | torture_kthread_stopping("lock_torture_writer"); | 459 | torture_kthread_stopping("lock_torture_writer"); |
| @@ -236,32 +461,66 @@ static int lock_torture_writer(void *arg) | |||
| 236 | } | 461 | } |
| 237 | 462 | ||
| 238 | /* | 463 | /* |
| 464 | * Lock torture reader kthread. Repeatedly acquires and releases | ||
| 465 | * the reader lock. | ||
| 466 | */ | ||
| 467 | static int lock_torture_reader(void *arg) | ||
| 468 | { | ||
| 469 | struct lock_stress_stats *lrsp = arg; | ||
| 470 | static DEFINE_TORTURE_RANDOM(rand); | ||
| 471 | |||
| 472 | VERBOSE_TOROUT_STRING("lock_torture_reader task started"); | ||
| 473 | set_user_nice(current, MAX_NICE); | ||
| 474 | |||
| 475 | do { | ||
| 476 | if ((torture_random(&rand) & 0xfffff) == 0) | ||
| 477 | schedule_timeout_uninterruptible(1); | ||
| 478 | |||
| 479 | cxt.cur_ops->readlock(); | ||
| 480 | lock_is_read_held = 1; | ||
| 481 | if (WARN_ON_ONCE(lock_is_write_held)) | ||
| 482 | lrsp->n_lock_fail++; /* rare, but... */ | ||
| 483 | |||
| 484 | lrsp->n_lock_acquired++; | ||
| 485 | cxt.cur_ops->read_delay(&rand); | ||
| 486 | lock_is_read_held = 0; | ||
| 487 | cxt.cur_ops->readunlock(); | ||
| 488 | |||
| 489 | stutter_wait("lock_torture_reader"); | ||
| 490 | } while (!torture_must_stop()); | ||
| 491 | torture_kthread_stopping("lock_torture_reader"); | ||
| 492 | return 0; | ||
| 493 | } | ||
| 494 | |||
| 495 | /* | ||
| 239 | * Create an lock-torture-statistics message in the specified buffer. | 496 | * Create an lock-torture-statistics message in the specified buffer. |
| 240 | */ | 497 | */ |
| 241 | static void lock_torture_printk(char *page) | 498 | static void __torture_print_stats(char *page, |
| 499 | struct lock_stress_stats *statp, bool write) | ||
| 242 | { | 500 | { |
| 243 | bool fail = 0; | 501 | bool fail = 0; |
| 244 | int i; | 502 | int i, n_stress; |
| 245 | long max = 0; | 503 | long max = 0; |
| 246 | long min = lwsa[0].n_write_lock_acquired; | 504 | long min = statp[0].n_lock_acquired; |
| 247 | long long sum = 0; | 505 | long long sum = 0; |
| 248 | 506 | ||
| 249 | for (i = 0; i < nrealwriters_stress; i++) { | 507 | n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; |
| 250 | if (lwsa[i].n_write_lock_fail) | 508 | for (i = 0; i < n_stress; i++) { |
| 509 | if (statp[i].n_lock_fail) | ||
| 251 | fail = true; | 510 | fail = true; |
| 252 | sum += lwsa[i].n_write_lock_acquired; | 511 | sum += statp[i].n_lock_acquired; |
| 253 | if (max < lwsa[i].n_write_lock_fail) | 512 | if (max < statp[i].n_lock_fail) |
| 254 | max = lwsa[i].n_write_lock_fail; | 513 | max = statp[i].n_lock_fail; |
| 255 | if (min > lwsa[i].n_write_lock_fail) | 514 | if (min > statp[i].n_lock_fail) |
| 256 | min = lwsa[i].n_write_lock_fail; | 515 | min = statp[i].n_lock_fail; |
| 257 | } | 516 | } |
| 258 | page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); | ||
| 259 | page += sprintf(page, | 517 | page += sprintf(page, |
| 260 | "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", | 518 | "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", |
| 519 | write ? "Writes" : "Reads ", | ||
| 261 | sum, max, min, max / 2 > min ? "???" : "", | 520 | sum, max, min, max / 2 > min ? "???" : "", |
| 262 | fail, fail ? "!!!" : ""); | 521 | fail, fail ? "!!!" : ""); |
| 263 | if (fail) | 522 | if (fail) |
| 264 | atomic_inc(&n_lock_torture_errors); | 523 | atomic_inc(&cxt.n_lock_torture_errors); |
| 265 | } | 524 | } |
| 266 | 525 | ||
| 267 | /* | 526 | /* |
| @@ -274,18 +533,35 @@ static void lock_torture_printk(char *page) | |||
| 274 | */ | 533 | */ |
| 275 | static void lock_torture_stats_print(void) | 534 | static void lock_torture_stats_print(void) |
| 276 | { | 535 | { |
| 277 | int size = nrealwriters_stress * 200 + 8192; | 536 | int size = cxt.nrealwriters_stress * 200 + 8192; |
| 278 | char *buf; | 537 | char *buf; |
| 279 | 538 | ||
| 539 | if (cxt.cur_ops->readlock) | ||
| 540 | size += cxt.nrealreaders_stress * 200 + 8192; | ||
| 541 | |||
| 280 | buf = kmalloc(size, GFP_KERNEL); | 542 | buf = kmalloc(size, GFP_KERNEL); |
| 281 | if (!buf) { | 543 | if (!buf) { |
| 282 | pr_err("lock_torture_stats_print: Out of memory, need: %d", | 544 | pr_err("lock_torture_stats_print: Out of memory, need: %d", |
| 283 | size); | 545 | size); |
| 284 | return; | 546 | return; |
| 285 | } | 547 | } |
| 286 | lock_torture_printk(buf); | 548 | |
| 549 | __torture_print_stats(buf, cxt.lwsa, true); | ||
| 287 | pr_alert("%s", buf); | 550 | pr_alert("%s", buf); |
| 288 | kfree(buf); | 551 | kfree(buf); |
| 552 | |||
| 553 | if (cxt.cur_ops->readlock) { | ||
| 554 | buf = kmalloc(size, GFP_KERNEL); | ||
| 555 | if (!buf) { | ||
| 556 | pr_err("lock_torture_stats_print: Out of memory, need: %d", | ||
| 557 | size); | ||
| 558 | return; | ||
| 559 | } | ||
| 560 | |||
| 561 | __torture_print_stats(buf, cxt.lrsa, false); | ||
| 562 | pr_alert("%s", buf); | ||
| 563 | kfree(buf); | ||
| 564 | } | ||
| 289 | } | 565 | } |
| 290 | 566 | ||
| 291 | /* | 567 | /* |
| @@ -312,9 +588,10 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, | |||
| 312 | const char *tag) | 588 | const char *tag) |
| 313 | { | 589 | { |
| 314 | pr_alert("%s" TORTURE_FLAG | 590 | pr_alert("%s" TORTURE_FLAG |
| 315 | "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", | 591 | "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", |
| 316 | torture_type, tag, nrealwriters_stress, stat_interval, verbose, | 592 | torture_type, tag, cxt.debug_lock ? " [debug]": "", |
| 317 | shuffle_interval, stutter, shutdown_secs, | 593 | cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval, |
| 594 | verbose, shuffle_interval, stutter, shutdown_secs, | ||
| 318 | onoff_interval, onoff_holdoff); | 595 | onoff_interval, onoff_holdoff); |
| 319 | } | 596 | } |
| 320 | 597 | ||
| @@ -322,46 +599,59 @@ static void lock_torture_cleanup(void) | |||
| 322 | { | 599 | { |
| 323 | int i; | 600 | int i; |
| 324 | 601 | ||
| 325 | if (torture_cleanup()) | 602 | if (torture_cleanup_begin()) |
| 326 | return; | 603 | return; |
| 327 | 604 | ||
| 328 | if (writer_tasks) { | 605 | if (writer_tasks) { |
| 329 | for (i = 0; i < nrealwriters_stress; i++) | 606 | for (i = 0; i < cxt.nrealwriters_stress; i++) |
| 330 | torture_stop_kthread(lock_torture_writer, | 607 | torture_stop_kthread(lock_torture_writer, |
| 331 | writer_tasks[i]); | 608 | writer_tasks[i]); |
| 332 | kfree(writer_tasks); | 609 | kfree(writer_tasks); |
| 333 | writer_tasks = NULL; | 610 | writer_tasks = NULL; |
| 334 | } | 611 | } |
| 335 | 612 | ||
| 613 | if (reader_tasks) { | ||
| 614 | for (i = 0; i < cxt.nrealreaders_stress; i++) | ||
| 615 | torture_stop_kthread(lock_torture_reader, | ||
| 616 | reader_tasks[i]); | ||
| 617 | kfree(reader_tasks); | ||
| 618 | reader_tasks = NULL; | ||
| 619 | } | ||
| 620 | |||
| 336 | torture_stop_kthread(lock_torture_stats, stats_task); | 621 | torture_stop_kthread(lock_torture_stats, stats_task); |
| 337 | lock_torture_stats_print(); /* -After- the stats thread is stopped! */ | 622 | lock_torture_stats_print(); /* -After- the stats thread is stopped! */ |
| 338 | 623 | ||
| 339 | if (atomic_read(&n_lock_torture_errors)) | 624 | if (atomic_read(&cxt.n_lock_torture_errors)) |
| 340 | lock_torture_print_module_parms(cur_ops, | 625 | lock_torture_print_module_parms(cxt.cur_ops, |
| 341 | "End of test: FAILURE"); | 626 | "End of test: FAILURE"); |
| 342 | else if (torture_onoff_failures()) | 627 | else if (torture_onoff_failures()) |
| 343 | lock_torture_print_module_parms(cur_ops, | 628 | lock_torture_print_module_parms(cxt.cur_ops, |
| 344 | "End of test: LOCK_HOTPLUG"); | 629 | "End of test: LOCK_HOTPLUG"); |
| 345 | else | 630 | else |
| 346 | lock_torture_print_module_parms(cur_ops, | 631 | lock_torture_print_module_parms(cxt.cur_ops, |
| 347 | "End of test: SUCCESS"); | 632 | "End of test: SUCCESS"); |
| 633 | torture_cleanup_end(); | ||
| 348 | } | 634 | } |
| 349 | 635 | ||
| 350 | static int __init lock_torture_init(void) | 636 | static int __init lock_torture_init(void) |
| 351 | { | 637 | { |
| 352 | int i; | 638 | int i, j; |
| 353 | int firsterr = 0; | 639 | int firsterr = 0; |
| 354 | static struct lock_torture_ops *torture_ops[] = { | 640 | static struct lock_torture_ops *torture_ops[] = { |
| 355 | &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, | 641 | &lock_busted_ops, |
| 642 | &spin_lock_ops, &spin_lock_irq_ops, | ||
| 643 | &rw_lock_ops, &rw_lock_irq_ops, | ||
| 644 | &mutex_lock_ops, | ||
| 645 | &rwsem_lock_ops, | ||
| 356 | }; | 646 | }; |
| 357 | 647 | ||
| 358 | if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) | 648 | if (!torture_init_begin(torture_type, verbose, &torture_runnable)) |
| 359 | return -EBUSY; | 649 | return -EBUSY; |
| 360 | 650 | ||
| 361 | /* Process args and tell the world that the torturer is on the job. */ | 651 | /* Process args and tell the world that the torturer is on the job. */ |
| 362 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { | 652 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { |
| 363 | cur_ops = torture_ops[i]; | 653 | cxt.cur_ops = torture_ops[i]; |
| 364 | if (strcmp(torture_type, cur_ops->name) == 0) | 654 | if (strcmp(torture_type, cxt.cur_ops->name) == 0) |
| 365 | break; | 655 | break; |
| 366 | } | 656 | } |
| 367 | if (i == ARRAY_SIZE(torture_ops)) { | 657 | if (i == ARRAY_SIZE(torture_ops)) { |
| @@ -374,31 +664,69 @@ static int __init lock_torture_init(void) | |||
| 374 | torture_init_end(); | 664 | torture_init_end(); |
| 375 | return -EINVAL; | 665 | return -EINVAL; |
| 376 | } | 666 | } |
| 377 | if (cur_ops->init) | 667 | if (cxt.cur_ops->init) |
| 378 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | 668 | cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ |
| 379 | 669 | ||
| 380 | if (nwriters_stress >= 0) | 670 | if (nwriters_stress >= 0) |
| 381 | nrealwriters_stress = nwriters_stress; | 671 | cxt.nrealwriters_stress = nwriters_stress; |
| 382 | else | 672 | else |
| 383 | nrealwriters_stress = 2 * num_online_cpus(); | 673 | cxt.nrealwriters_stress = 2 * num_online_cpus(); |
| 384 | lock_torture_print_module_parms(cur_ops, "Start of test"); | 674 | |
| 675 | #ifdef CONFIG_DEBUG_MUTEXES | ||
| 676 | if (strncmp(torture_type, "mutex", 5) == 0) | ||
| 677 | cxt.debug_lock = true; | ||
| 678 | #endif | ||
| 679 | #ifdef CONFIG_DEBUG_SPINLOCK | ||
| 680 | if ((strncmp(torture_type, "spin", 4) == 0) || | ||
| 681 | (strncmp(torture_type, "rw_lock", 7) == 0)) | ||
| 682 | cxt.debug_lock = true; | ||
| 683 | #endif | ||
| 385 | 684 | ||
| 386 | /* Initialize the statistics so that each run gets its own numbers. */ | 685 | /* Initialize the statistics so that each run gets its own numbers. */ |
| 387 | 686 | ||
| 388 | lock_is_write_held = 0; | 687 | lock_is_write_held = 0; |
| 389 | lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); | 688 | cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); |
| 390 | if (lwsa == NULL) { | 689 | if (cxt.lwsa == NULL) { |
| 391 | VERBOSE_TOROUT_STRING("lwsa: Out of memory"); | 690 | VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); |
| 392 | firsterr = -ENOMEM; | 691 | firsterr = -ENOMEM; |
| 393 | goto unwind; | 692 | goto unwind; |
| 394 | } | 693 | } |
| 395 | for (i = 0; i < nrealwriters_stress; i++) { | 694 | for (i = 0; i < cxt.nrealwriters_stress; i++) { |
| 396 | lwsa[i].n_write_lock_fail = 0; | 695 | cxt.lwsa[i].n_lock_fail = 0; |
| 397 | lwsa[i].n_write_lock_acquired = 0; | 696 | cxt.lwsa[i].n_lock_acquired = 0; |
| 398 | } | 697 | } |
| 399 | 698 | ||
| 400 | /* Start up the kthreads. */ | 699 | if (cxt.cur_ops->readlock) { |
| 700 | if (nreaders_stress >= 0) | ||
| 701 | cxt.nrealreaders_stress = nreaders_stress; | ||
| 702 | else { | ||
| 703 | /* | ||
| 704 | * By default distribute evenly the number of | ||
| 705 | * readers and writers. We still run the same number | ||
| 706 | * of threads as the writer-only locks default. | ||
| 707 | */ | ||
| 708 | if (nwriters_stress < 0) /* user doesn't care */ | ||
| 709 | cxt.nrealwriters_stress = num_online_cpus(); | ||
| 710 | cxt.nrealreaders_stress = cxt.nrealwriters_stress; | ||
| 711 | } | ||
| 712 | |||
| 713 | lock_is_read_held = 0; | ||
| 714 | cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); | ||
| 715 | if (cxt.lrsa == NULL) { | ||
| 716 | VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); | ||
| 717 | firsterr = -ENOMEM; | ||
| 718 | kfree(cxt.lwsa); | ||
| 719 | goto unwind; | ||
| 720 | } | ||
| 721 | |||
| 722 | for (i = 0; i < cxt.nrealreaders_stress; i++) { | ||
| 723 | cxt.lrsa[i].n_lock_fail = 0; | ||
| 724 | cxt.lrsa[i].n_lock_acquired = 0; | ||
| 725 | } | ||
| 726 | } | ||
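When the chosen lock type provides a readlock, the defaults above split the thread budget evenly between readers and writers while keeping the same total as the writer-only case. A minimal standalone sketch of that sizing policy (the variables here are illustrative stand-ins for the module parameters, and sysconf() stands in for num_online_cpus()):

    #include <stdio.h>
    #include <unistd.h>    /* sysconf(), standing in for num_online_cpus() */

    int main(void)
    {
        long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
        int nwriters = -1;  /* negative means "user doesn't care" */
        int nreaders = -1;
        int realwriters, realreaders;

        realwriters = (nwriters >= 0) ? nwriters : 2 * (int)ncpus;
        if (nreaders >= 0) {
            realreaders = nreaders;
        } else {
            /* Split evenly, keeping the writer-only total thread count. */
            if (nwriters < 0)
                realwriters = (int)ncpus;
            realreaders = realwriters;
        }
        printf("writers=%d readers=%d\n", realwriters, realreaders);
        return 0;
    }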
| 727 | lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); | ||
| 401 | 728 | ||
| 729 | /* Prepare torture context. */ | ||
| 402 | if (onoff_interval > 0) { | 730 | if (onoff_interval > 0) { |
| 403 | firsterr = torture_onoff_init(onoff_holdoff * HZ, | 731 | firsterr = torture_onoff_init(onoff_holdoff * HZ, |
| 404 | onoff_interval * HZ); | 732 | onoff_interval * HZ); |
| @@ -422,18 +750,51 @@ static int __init lock_torture_init(void) | |||
| 422 | goto unwind; | 750 | goto unwind; |
| 423 | } | 751 | } |
| 424 | 752 | ||
| 425 | writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), | 753 | writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), |
| 426 | GFP_KERNEL); | 754 | GFP_KERNEL); |
| 427 | if (writer_tasks == NULL) { | 755 | if (writer_tasks == NULL) { |
| 428 | VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); | 756 | VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); |
| 429 | firsterr = -ENOMEM; | 757 | firsterr = -ENOMEM; |
| 430 | goto unwind; | 758 | goto unwind; |
| 431 | } | 759 | } |
| 432 | for (i = 0; i < nrealwriters_stress; i++) { | 760 | |
| 433 | firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], | 761 | if (cxt.cur_ops->readlock) { |
| 762 | reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]), | ||
| 763 | GFP_KERNEL); | ||
| 764 | if (reader_tasks == NULL) { | ||
| 765 | VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); | ||
| 766 | firsterr = -ENOMEM; | ||
| 767 | goto unwind; | ||
| 768 | } | ||
| 769 | } | ||
| 770 | |||
| 771 | /* | ||
| 772 | * Create the kthreads and start torturing (oh, those poor little locks). | ||
| 773 | * | ||
| 774 | * TODO: Note that we interleave writers with readers, giving writers a | ||
| 775 | * slight advantage, by creating their kthreads first. This can be modified | ||
| 776 | * for very specific needs, or even let the user choose the policy, if | ||
| 777 | * ever wanted. | ||
| 778 | */ | ||
| 779 | for (i = 0, j = 0; i < cxt.nrealwriters_stress || | ||
| 780 | j < cxt.nrealreaders_stress; i++, j++) { | ||
| 781 | if (i >= cxt.nrealwriters_stress) | ||
| 782 | goto create_reader; | ||
| 783 | |||
| 784 | /* Create writer. */ | ||
| 785 | firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i], | ||
| 434 | writer_tasks[i]); | 786 | writer_tasks[i]); |
| 435 | if (firsterr) | 787 | if (firsterr) |
| 436 | goto unwind; | 788 | goto unwind; |
| 789 | |||
| 790 | create_reader: | ||
| 791 | if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) | ||
| 792 | continue; | ||
| 793 | /* Create reader. */ | ||
| 794 | firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j], | ||
| 795 | reader_tasks[j]); | ||
| 796 | if (firsterr) | ||
| 797 | goto unwind; | ||
| 437 | } | 798 | } |
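The creation loop above starts one writer and then one reader per round until both counts are exhausted, which is what gives writers their slight head start. A hedged user-space analogue using pthreads (the stub worker bodies are placeholders, not the torture kthreads):

    #include <pthread.h>
    #include <stddef.h>

    static void *writer_fn(void *arg) { (void)arg; return NULL; } /* stub body */
    static void *reader_fn(void *arg) { (void)arg; return NULL; } /* stub body */

    /* One writer, then one reader, per round: writers are created first. */
    int create_interleaved(pthread_t *writers, int nwriters,
                           pthread_t *readers, int nreaders)
    {
        int i, j, err;

        for (i = 0, j = 0; i < nwriters || j < nreaders; i++, j++) {
            if (i < nwriters) {
                err = pthread_create(&writers[i], NULL, writer_fn, NULL);
                if (err)
                    return err;
            }
            if (j < nreaders) {
                err = pthread_create(&readers[j], NULL, reader_fn, NULL);
                if (err)
                    return err;
            }
        }
        return 0;
    }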
| 438 | if (stat_interval > 0) { | 799 | if (stat_interval > 0) { |
| 439 | firsterr = torture_create_kthread(lock_torture_stats, NULL, | 800 | firsterr = torture_create_kthread(lock_torture_stats, NULL, |
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 23e89c5930e9..4d60986fcbee 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h | |||
| @@ -56,9 +56,6 @@ do { \ | |||
| 56 | * If the lock has already been acquired, then this will proceed to spin | 56 | * If the lock has already been acquired, then this will proceed to spin |
| 57 | * on this node->locked until the previous lock holder sets the node->locked | 57 | * on this node->locked until the previous lock holder sets the node->locked |
| 58 | * in mcs_spin_unlock(). | 58 | * in mcs_spin_unlock(). |
| 59 | * | ||
| 60 | * We don't inline mcs_spin_lock() so that perf can correctly account for the | ||
| 61 | * time spent in this lock function. | ||
| 62 | */ | 59 | */ |
| 63 | static inline | 60 | static inline |
| 64 | void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | 61 | void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) |
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 5cf6731b98e9..3ef3736002d8 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c | |||
| @@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock) | |||
| 80 | DEBUG_LOCKS_WARN_ON(lock->owner != current); | 80 | DEBUG_LOCKS_WARN_ON(lock->owner != current); |
| 81 | 81 | ||
| 82 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 82 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
| 83 | mutex_clear_owner(lock); | ||
| 84 | } | 83 | } |
| 85 | 84 | ||
| 86 | /* | 85 | /* |
| 87 | * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug | 86 | * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug |
| 88 | * mutexes so that we can do it here after we've verified state. | 87 | * mutexes so that we can do it here after we've verified state. |
| 89 | */ | 88 | */ |
| 89 | mutex_clear_owner(lock); | ||
| 90 | atomic_set(&lock->count, 1); | 90 | atomic_set(&lock->count, 1); |
| 91 | } | 91 | } |
| 92 | 92 | ||
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index ae712b25e492..454195194d4a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
| @@ -15,7 +15,7 @@ | |||
| 15 | * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale | 15 | * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale |
| 16 | * and Sven Dietrich. | 16 | * and Sven Dietrich. |
| 17 | * | 17 | * |
| 18 | * Also see Documentation/mutex-design.txt. | 18 | * Also see Documentation/locking/mutex-design.txt. |
| 19 | */ | 19 | */ |
| 20 | #include <linux/mutex.h> | 20 | #include <linux/mutex.h> |
| 21 | #include <linux/ww_mutex.h> | 21 | #include <linux/ww_mutex.h> |
| @@ -106,6 +106,92 @@ void __sched mutex_lock(struct mutex *lock) | |||
| 106 | EXPORT_SYMBOL(mutex_lock); | 106 | EXPORT_SYMBOL(mutex_lock); |
| 107 | #endif | 107 | #endif |
| 108 | 108 | ||
| 109 | static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, | ||
| 110 | struct ww_acquire_ctx *ww_ctx) | ||
| 111 | { | ||
| 112 | #ifdef CONFIG_DEBUG_MUTEXES | ||
| 113 | /* | ||
| 114 | * If this WARN_ON triggers, you used ww_mutex_lock to acquire, | ||
| 115 | * but released with a normal mutex_unlock in this call. | ||
| 116 | * | ||
| 117 | * This should never happen, always use ww_mutex_unlock. | ||
| 118 | */ | ||
| 119 | DEBUG_LOCKS_WARN_ON(ww->ctx); | ||
| 120 | |||
| 121 | /* | ||
| 122 | * Not quite done after calling ww_acquire_done() ? | ||
| 123 | */ | ||
| 124 | DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); | ||
| 125 | |||
| 126 | if (ww_ctx->contending_lock) { | ||
| 127 | /* | ||
| 128 | * After -EDEADLK you tried to | ||
| 129 | * acquire a different ww_mutex? Bad! | ||
| 130 | */ | ||
| 131 | DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); | ||
| 132 | |||
| 133 | /* | ||
| 134 | * You called ww_mutex_lock after receiving -EDEADLK, | ||
| 135 | * but 'forgot' to unlock everything else first? | ||
| 136 | */ | ||
| 137 | DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); | ||
| 138 | ww_ctx->contending_lock = NULL; | ||
| 139 | } | ||
| 140 | |||
| 141 | /* | ||
| 142 | * Naughty, using a different class will lead to undefined behavior! | ||
| 143 | */ | ||
| 144 | DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); | ||
| 145 | #endif | ||
| 146 | ww_ctx->acquired++; | ||
| 147 | } | ||
| 148 | |||
| 149 | /* | ||
| 150 | * after acquiring lock with fastpath or when we lost out in contested | ||
| 151 | * slowpath, set ctx and wake up any waiters so they can recheck. | ||
| 152 | * | ||
| 153 | * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, | ||
| 154 | * as the fastpath and opportunistic spinning are disabled in that case. | ||
| 155 | */ | ||
| 156 | static __always_inline void | ||
| 157 | ww_mutex_set_context_fastpath(struct ww_mutex *lock, | ||
| 158 | struct ww_acquire_ctx *ctx) | ||
| 159 | { | ||
| 160 | unsigned long flags; | ||
| 161 | struct mutex_waiter *cur; | ||
| 162 | |||
| 163 | ww_mutex_lock_acquired(lock, ctx); | ||
| 164 | |||
| 165 | lock->ctx = ctx; | ||
| 166 | |||
| 167 | /* | ||
| 168 | * The lock->ctx update should be visible on all cores before | ||
| 169 | * the atomic read is done, otherwise contended waiters might be | ||
| 170 | * missed. The contended waiters will either see ww_ctx == NULL | ||
| 171 | * and keep spinning, or it will acquire wait_lock, add itself | ||
| 172 | * to waiter list and sleep. | ||
| 173 | */ | ||
| 174 | smp_mb(); /* ^^^ */ | ||
| 175 | |||
| 176 | /* | ||
| 177 | * Check if lock is contended, if not there is nobody to wake up | ||
| 178 | */ | ||
| 179 | if (likely(atomic_read(&lock->base.count) == 0)) | ||
| 180 | return; | ||
| 181 | |||
| 182 | /* | ||
| 183 | * Uh oh, we raced in fastpath, wake up everyone in this case, | ||
| 184 | * so they can see the new lock->ctx. | ||
| 185 | */ | ||
| 186 | spin_lock_mutex(&lock->base.wait_lock, flags); | ||
| 187 | list_for_each_entry(cur, &lock->base.wait_list, list) { | ||
| 188 | debug_mutex_wake_waiter(&lock->base, cur); | ||
| 189 | wake_up_process(cur->task); | ||
| 190 | } | ||
| 191 | spin_unlock_mutex(&lock->base.wait_lock, flags); | ||
| 192 | } | ||
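The fastpath context setter above publishes lock->ctx, issues a full barrier, and only then inspects the counter, so a contended waiter is guaranteed to either observe the new ctx or be found on the wait list and woken. A minimal C11-atomics sketch of that publish-then-check ordering (the structure and names are illustrative, not the kernel API):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct fake_lock {
        _Atomic(void *) ctx;   /* published context */
        atomic_int      count; /* toy model: 0 = locked, no waiters; <0 = waiters */
    };

    /* Publish ctx, then decide whether the caller must wake any waiters. */
    bool publish_ctx_then_check(struct fake_lock *l, void *ctx)
    {
        atomic_store_explicit(&l->ctx, ctx, memory_order_relaxed);

        /* Full fence: the ctx store must be visible before the count read. */
        atomic_thread_fence(memory_order_seq_cst);

        if (atomic_load_explicit(&l->count, memory_order_relaxed) == 0)
            return false;   /* no waiters queued: nobody to wake */
        return true;        /* contended: caller walks the list and wakes them */
    }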
| 193 | |||
| 194 | |||
| 109 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 195 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
| 110 | /* | 196 | /* |
| 111 | * In order to avoid a stampede of mutex spinners from acquiring the mutex | 197 | * In order to avoid a stampede of mutex spinners from acquiring the mutex |
| @@ -180,6 +266,135 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) | |||
| 180 | */ | 266 | */ |
| 181 | return retval; | 267 | return retval; |
| 182 | } | 268 | } |
| 269 | |||
| 270 | /* | ||
| 271 | * Atomically try to take the lock when it is available | ||
| 272 | */ | ||
| 273 | static inline bool mutex_try_to_acquire(struct mutex *lock) | ||
| 274 | { | ||
| 275 | return !mutex_is_locked(lock) && | ||
| 276 | (atomic_cmpxchg(&lock->count, 1, 0) == 1); | ||
| 277 | } | ||
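mutex_try_to_acquire() checks the counter with a plain read before issuing the cmpxchg, so an obviously held lock never pays for the atomic read-modify-write. The same test-then-CAS idiom in standalone C11 form (a toy counter, not the kernel mutex):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* 1 == unlocked, 0 == locked, mirroring the mutex counter convention. */
    bool try_to_acquire(atomic_int *count)
    {
        int expected = 1;

        /* Cheap read first: skip the expensive RMW if clearly locked. */
        if (atomic_load_explicit(count, memory_order_relaxed) != 1)
            return false;

        return atomic_compare_exchange_strong(count, &expected, 0);
    }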
| 278 | |||
| 279 | /* | ||
| 280 | * Optimistic spinning. | ||
| 281 | * | ||
| 282 | * We try to spin for acquisition when we find that the lock owner | ||
| 283 | * is currently running on a (different) CPU and while we don't | ||
| 284 | * need to reschedule. The rationale is that if the lock owner is | ||
| 285 | * running, it is likely to release the lock soon. | ||
| 286 | * | ||
| 287 | * Since this needs the lock owner, and this mutex implementation | ||
| 288 | * doesn't track the owner atomically in the lock field, we need to | ||
| 289 | * track it non-atomically. | ||
| 290 | * | ||
| 291 | * We can't do this for DEBUG_MUTEXES because that relies on wait_lock | ||
| 292 | * to serialize everything. | ||
| 293 | * | ||
| 294 | * The mutex spinners are queued up using MCS lock so that only one | ||
| 295 | * spinner can compete for the mutex. However, if mutex spinning isn't | ||
| 296 | * going to happen, there is no point in going through the lock/unlock | ||
| 297 | * overhead. | ||
| 298 | * | ||
| 299 | * Returns true when the lock was taken, otherwise false, indicating | ||
| 300 | * that we need to jump to the slowpath and sleep. | ||
| 301 | */ | ||
| 302 | static bool mutex_optimistic_spin(struct mutex *lock, | ||
| 303 | struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) | ||
| 304 | { | ||
| 305 | struct task_struct *task = current; | ||
| 306 | |||
| 307 | if (!mutex_can_spin_on_owner(lock)) | ||
| 308 | goto done; | ||
| 309 | |||
| 310 | if (!osq_lock(&lock->osq)) | ||
| 311 | goto done; | ||
| 312 | |||
| 313 | while (true) { | ||
| 314 | struct task_struct *owner; | ||
| 315 | |||
| 316 | if (use_ww_ctx && ww_ctx->acquired > 0) { | ||
| 317 | struct ww_mutex *ww; | ||
| 318 | |||
| 319 | ww = container_of(lock, struct ww_mutex, base); | ||
| 320 | /* | ||
| 321 | * If ww->ctx is set the contents are undefined, only | ||
| 322 | * by acquiring wait_lock there is a guarantee that | ||
| 323 | * they are not invalid when reading. | ||
| 324 | * | ||
| 325 | * As such, when deadlock detection needs to be | ||
| 326 | * performed the optimistic spinning cannot be done. | ||
| 327 | */ | ||
| 328 | if (ACCESS_ONCE(ww->ctx)) | ||
| 329 | break; | ||
| 330 | } | ||
| 331 | |||
| 332 | /* | ||
| 333 | * If there's an owner, wait for it to either | ||
| 334 | * release the lock or go to sleep. | ||
| 335 | */ | ||
| 336 | owner = ACCESS_ONCE(lock->owner); | ||
| 337 | if (owner && !mutex_spin_on_owner(lock, owner)) | ||
| 338 | break; | ||
| 339 | |||
| 340 | /* Try to acquire the mutex if it is unlocked. */ | ||
| 341 | if (mutex_try_to_acquire(lock)) { | ||
| 342 | lock_acquired(&lock->dep_map, ip); | ||
| 343 | |||
| 344 | if (use_ww_ctx) { | ||
| 345 | struct ww_mutex *ww; | ||
| 346 | ww = container_of(lock, struct ww_mutex, base); | ||
| 347 | |||
| 348 | ww_mutex_set_context_fastpath(ww, ww_ctx); | ||
| 349 | } | ||
| 350 | |||
| 351 | mutex_set_owner(lock); | ||
| 352 | osq_unlock(&lock->osq); | ||
| 353 | return true; | ||
| 354 | } | ||
| 355 | |||
| 356 | /* | ||
| 357 | * When there's no owner, we might have preempted between the | ||
| 358 | * owner acquiring the lock and setting the owner field. If | ||
| 359 | * we're an RT task that will live-lock because we won't let | ||
| 360 | * the owner complete. | ||
| 361 | */ | ||
| 362 | if (!owner && (need_resched() || rt_task(task))) | ||
| 363 | break; | ||
| 364 | |||
| 365 | /* | ||
| 366 | * The cpu_relax() call is a compiler barrier which forces | ||
| 367 | * everything in this loop to be re-loaded. We don't need | ||
| 368 | * memory barriers as we'll eventually observe the right | ||
| 369 | * values at the cost of a few extra spins. | ||
| 370 | */ | ||
| 371 | cpu_relax_lowlatency(); | ||
| 372 | } | ||
| 373 | |||
| 374 | osq_unlock(&lock->osq); | ||
| 375 | done: | ||
| 376 | /* | ||
| 377 | * If we fell out of the spin path because of need_resched(), | ||
| 378 | * reschedule now, before we try-lock the mutex. This avoids getting | ||
| 379 | * scheduled out right after we obtained the mutex. | ||
| 380 | */ | ||
| 381 | if (need_resched()) { | ||
| 382 | /* | ||
| 383 | * We _should_ have TASK_RUNNING here, but just in case | ||
| 384 | * we do not, make it so, otherwise we might get stuck. | ||
| 385 | */ | ||
| 386 | __set_current_state(TASK_RUNNING); | ||
| 387 | schedule_preempt_disabled(); | ||
| 388 | } | ||
| 389 | |||
| 390 | return false; | ||
| 391 | } | ||
| 392 | #else | ||
| 393 | static bool mutex_optimistic_spin(struct mutex *lock, | ||
| 394 | struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) | ||
| 395 | { | ||
| 396 | return false; | ||
| 397 | } | ||
| 183 | #endif | 398 | #endif |
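mutex_optimistic_spin() now encapsulates the whole spinning policy: spin only while the owner is running on another CPU, bail out if a reschedule is needed or deadlock detection may be required, and let the caller fall back to the sleeping slowpath. A rough user-space analogue, where a simple iteration bound stands in for the kernel's owner-is-running check (that information is not available outside the scheduler):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <sched.h>

    /* Toy lock: 1 == unlocked, 0 == locked. */
    struct toy_lock { atomic_int count; };

    static bool toy_try_acquire(struct toy_lock *l)
    {
        int expected = 1;
        return atomic_load_explicit(&l->count, memory_order_relaxed) == 1 &&
               atomic_compare_exchange_strong(&l->count, &expected, 0);
    }

    /*
     * Spin for a bounded number of rounds hoping the holder releases soon,
     * then report failure so the caller can take a sleeping slowpath.
     */
    bool optimistic_spin(struct toy_lock *l, int max_spins)
    {
        for (int i = 0; i < max_spins; i++) {
            if (toy_try_acquire(l))
                return true;
            sched_yield();   /* stand-in for cpu_relax_lowlatency() */
        }
        return false;        /* caller blocks instead of burning CPU */
    }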
| 184 | 399 | ||
| 185 | __visible __used noinline | 400 | __visible __used noinline |
| @@ -277,91 +492,6 @@ __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | |||
| 277 | return 0; | 492 | return 0; |
| 278 | } | 493 | } |
| 279 | 494 | ||
| 280 | static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, | ||
| 281 | struct ww_acquire_ctx *ww_ctx) | ||
| 282 | { | ||
| 283 | #ifdef CONFIG_DEBUG_MUTEXES | ||
| 284 | /* | ||
| 285 | * If this WARN_ON triggers, you used ww_mutex_lock to acquire, | ||
| 286 | * but released with a normal mutex_unlock in this call. | ||
| 287 | * | ||
| 288 | * This should never happen, always use ww_mutex_unlock. | ||
| 289 | */ | ||
| 290 | DEBUG_LOCKS_WARN_ON(ww->ctx); | ||
| 291 | |||
| 292 | /* | ||
| 293 | * Not quite done after calling ww_acquire_done() ? | ||
| 294 | */ | ||
| 295 | DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); | ||
| 296 | |||
| 297 | if (ww_ctx->contending_lock) { | ||
| 298 | /* | ||
| 299 | * After -EDEADLK you tried to | ||
| 300 | * acquire a different ww_mutex? Bad! | ||
| 301 | */ | ||
| 302 | DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); | ||
| 303 | |||
| 304 | /* | ||
| 305 | * You called ww_mutex_lock after receiving -EDEADLK, | ||
| 306 | * but 'forgot' to unlock everything else first? | ||
| 307 | */ | ||
| 308 | DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); | ||
| 309 | ww_ctx->contending_lock = NULL; | ||
| 310 | } | ||
| 311 | |||
| 312 | /* | ||
| 313 | * Naughty, using a different class will lead to undefined behavior! | ||
| 314 | */ | ||
| 315 | DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); | ||
| 316 | #endif | ||
| 317 | ww_ctx->acquired++; | ||
| 318 | } | ||
| 319 | |||
| 320 | /* | ||
| 321 | * after acquiring lock with fastpath or when we lost out in contested | ||
| 322 | * slowpath, set ctx and wake up any waiters so they can recheck. | ||
| 323 | * | ||
| 324 | * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, | ||
| 325 | * as the fastpath and opportunistic spinning are disabled in that case. | ||
| 326 | */ | ||
| 327 | static __always_inline void | ||
| 328 | ww_mutex_set_context_fastpath(struct ww_mutex *lock, | ||
| 329 | struct ww_acquire_ctx *ctx) | ||
| 330 | { | ||
| 331 | unsigned long flags; | ||
| 332 | struct mutex_waiter *cur; | ||
| 333 | |||
| 334 | ww_mutex_lock_acquired(lock, ctx); | ||
| 335 | |||
| 336 | lock->ctx = ctx; | ||
| 337 | |||
| 338 | /* | ||
| 339 | * The lock->ctx update should be visible on all cores before | ||
| 340 | * the atomic read is done, otherwise contended waiters might be | ||
| 341 | * missed. The contended waiters will either see ww_ctx == NULL | ||
| 342 | * and keep spinning, or it will acquire wait_lock, add itself | ||
| 343 | * to waiter list and sleep. | ||
| 344 | */ | ||
| 345 | smp_mb(); /* ^^^ */ | ||
| 346 | |||
| 347 | /* | ||
| 348 | * Check if lock is contended, if not there is nobody to wake up | ||
| 349 | */ | ||
| 350 | if (likely(atomic_read(&lock->base.count) == 0)) | ||
| 351 | return; | ||
| 352 | |||
| 353 | /* | ||
| 354 | * Uh oh, we raced in fastpath, wake up everyone in this case, | ||
| 355 | * so they can see the new lock->ctx. | ||
| 356 | */ | ||
| 357 | spin_lock_mutex(&lock->base.wait_lock, flags); | ||
| 358 | list_for_each_entry(cur, &lock->base.wait_list, list) { | ||
| 359 | debug_mutex_wake_waiter(&lock->base, cur); | ||
| 360 | wake_up_process(cur->task); | ||
| 361 | } | ||
| 362 | spin_unlock_mutex(&lock->base.wait_lock, flags); | ||
| 363 | } | ||
| 364 | |||
| 365 | /* | 495 | /* |
| 366 | * Lock a mutex (possibly interruptible), slowpath: | 496 | * Lock a mutex (possibly interruptible), slowpath: |
| 367 | */ | 497 | */ |
| @@ -378,104 +508,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 378 | preempt_disable(); | 508 | preempt_disable(); |
| 379 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); | 509 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); |
| 380 | 510 | ||
| 381 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 511 | if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { |
| 382 | /* | 512 | /* got the lock, yay! */ |
| 383 | * Optimistic spinning. | 513 | preempt_enable(); |
| 384 | * | 514 | return 0; |
| 385 | * We try to spin for acquisition when we find that the lock owner | ||
| 386 | * is currently running on a (different) CPU and while we don't | ||
| 387 | * need to reschedule. The rationale is that if the lock owner is | ||
| 388 | * running, it is likely to release the lock soon. | ||
| 389 | * | ||
| 390 | * Since this needs the lock owner, and this mutex implementation | ||
| 391 | * doesn't track the owner atomically in the lock field, we need to | ||
| 392 | * track it non-atomically. | ||
| 393 | * | ||
| 394 | * We can't do this for DEBUG_MUTEXES because that relies on wait_lock | ||
| 395 | * to serialize everything. | ||
| 396 | * | ||
| 397 | * The mutex spinners are queued up using MCS lock so that only one | ||
| 398 | * spinner can compete for the mutex. However, if mutex spinning isn't | ||
| 399 | * going to happen, there is no point in going through the lock/unlock | ||
| 400 | * overhead. | ||
| 401 | */ | ||
| 402 | if (!mutex_can_spin_on_owner(lock)) | ||
| 403 | goto slowpath; | ||
| 404 | |||
| 405 | if (!osq_lock(&lock->osq)) | ||
| 406 | goto slowpath; | ||
| 407 | |||
| 408 | for (;;) { | ||
| 409 | struct task_struct *owner; | ||
| 410 | |||
| 411 | if (use_ww_ctx && ww_ctx->acquired > 0) { | ||
| 412 | struct ww_mutex *ww; | ||
| 413 | |||
| 414 | ww = container_of(lock, struct ww_mutex, base); | ||
| 415 | /* | ||
| 416 | * If ww->ctx is set the contents are undefined, only | ||
| 417 | * by acquiring wait_lock there is a guarantee that | ||
| 418 | * they are not invalid when reading. | ||
| 419 | * | ||
| 420 | * As such, when deadlock detection needs to be | ||
| 421 | * performed the optimistic spinning cannot be done. | ||
| 422 | */ | ||
| 423 | if (ACCESS_ONCE(ww->ctx)) | ||
| 424 | break; | ||
| 425 | } | ||
| 426 | |||
| 427 | /* | ||
| 428 | * If there's an owner, wait for it to either | ||
| 429 | * release the lock or go to sleep. | ||
| 430 | */ | ||
| 431 | owner = ACCESS_ONCE(lock->owner); | ||
| 432 | if (owner && !mutex_spin_on_owner(lock, owner)) | ||
| 433 | break; | ||
| 434 | |||
| 435 | /* Try to acquire the mutex if it is unlocked. */ | ||
| 436 | if (!mutex_is_locked(lock) && | ||
| 437 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { | ||
| 438 | lock_acquired(&lock->dep_map, ip); | ||
| 439 | if (use_ww_ctx) { | ||
| 440 | struct ww_mutex *ww; | ||
| 441 | ww = container_of(lock, struct ww_mutex, base); | ||
| 442 | |||
| 443 | ww_mutex_set_context_fastpath(ww, ww_ctx); | ||
| 444 | } | ||
| 445 | |||
| 446 | mutex_set_owner(lock); | ||
| 447 | osq_unlock(&lock->osq); | ||
| 448 | preempt_enable(); | ||
| 449 | return 0; | ||
| 450 | } | ||
| 451 | |||
| 452 | /* | ||
| 453 | * When there's no owner, we might have preempted between the | ||
| 454 | * owner acquiring the lock and setting the owner field. If | ||
| 455 | * we're an RT task that will live-lock because we won't let | ||
| 456 | * the owner complete. | ||
| 457 | */ | ||
| 458 | if (!owner && (need_resched() || rt_task(task))) | ||
| 459 | break; | ||
| 460 | |||
| 461 | /* | ||
| 462 | * The cpu_relax() call is a compiler barrier which forces | ||
| 463 | * everything in this loop to be re-loaded. We don't need | ||
| 464 | * memory barriers as we'll eventually observe the right | ||
| 465 | * values at the cost of a few extra spins. | ||
| 466 | */ | ||
| 467 | cpu_relax_lowlatency(); | ||
| 468 | } | 515 | } |
| 469 | osq_unlock(&lock->osq); | 516 | |
| 470 | slowpath: | ||
| 471 | /* | ||
| 472 | * If we fell out of the spin path because of need_resched(), | ||
| 473 | * reschedule now, before we try-lock the mutex. This avoids getting | ||
| 474 | * scheduled out right after we obtained the mutex. | ||
| 475 | */ | ||
| 476 | if (need_resched()) | ||
| 477 | schedule_preempt_disabled(); | ||
| 478 | #endif | ||
| 479 | spin_lock_mutex(&lock->wait_lock, flags); | 517 | spin_lock_mutex(&lock->wait_lock, flags); |
| 480 | 518 | ||
| 481 | /* | 519 | /* |

| @@ -679,15 +717,21 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); | |||
| 679 | * Release the lock, slowpath: | 717 | * Release the lock, slowpath: |
| 680 | */ | 718 | */ |
| 681 | static inline void | 719 | static inline void |
| 682 | __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) | 720 | __mutex_unlock_common_slowpath(struct mutex *lock, int nested) |
| 683 | { | 721 | { |
| 684 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
| 685 | unsigned long flags; | 722 | unsigned long flags; |
| 686 | 723 | ||
| 687 | /* | 724 | /* |
| 688 | * some architectures leave the lock unlocked in the fastpath failure | 725 | * As a performance measure, release the lock before doing other |
| 726 | * wakeup-related duties that follow. This allows other tasks to acquire | ||
| 727 | * the lock sooner, while still handling cleanups in past unlock calls. | ||
| 728 | * This can be done as we do not enforce strict equivalence between the | ||
| 729 | * mutex counter and wait_list. | ||
| 730 | * | ||
| 731 | * | ||
| 732 | * Some architectures leave the lock unlocked in the fastpath failure | ||
| 689 | * case, others need to leave it locked. In the latter case we have to | 733 | * case, others need to leave it locked. In the latter case we have to |
| 690 | * unlock it here | 734 | * unlock it here - as the lock counter is currently 0 or negative. |
| 691 | */ | 735 | */ |
| 692 | if (__mutex_slowpath_needs_to_unlock()) | 736 | if (__mutex_slowpath_needs_to_unlock()) |
| 693 | atomic_set(&lock->count, 1); | 737 | atomic_set(&lock->count, 1); |
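The reworked unlock slowpath above sets the counter back to 1 first and only then walks the wait list, so another CPU can take the mutex while the wakeup bookkeeping is still in flight. A hedged pthread sketch of the same ordering (toy structures, not the kernel's wait_lock/wait_list):

    #include <stdatomic.h>
    #include <pthread.h>

    struct toy_mutex {
        atomic_int      count;     /* 1 == unlocked, <=0 == locked/contended */
        pthread_mutex_t wait_lock; /* protects the (elided) wait list */
        pthread_cond_t  waiters;
    };

    void toy_unlock_slowpath(struct toy_mutex *m)
    {
        /* Release first: a spinning acquirer can take the lock right away. */
        atomic_store(&m->count, 1);

        /* Then do the wakeup bookkeeping under the internal lock. */
        pthread_mutex_lock(&m->wait_lock);
        pthread_cond_signal(&m->waiters);
        pthread_mutex_unlock(&m->wait_lock);
    }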
| @@ -716,7 +760,9 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) | |||
| 716 | __visible void | 760 | __visible void |
| 717 | __mutex_unlock_slowpath(atomic_t *lock_count) | 761 | __mutex_unlock_slowpath(atomic_t *lock_count) |
| 718 | { | 762 | { |
| 719 | __mutex_unlock_common_slowpath(lock_count, 1); | 763 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 764 | |||
| 765 | __mutex_unlock_common_slowpath(lock, 1); | ||
| 720 | } | 766 | } |
| 721 | 767 | ||
| 722 | #ifndef CONFIG_DEBUG_LOCK_ALLOC | 768 | #ifndef CONFIG_DEBUG_LOCK_ALLOC |
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 4115fbf83b12..5cda397607f2 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h | |||
| @@ -16,7 +16,7 @@ | |||
| 16 | #define mutex_remove_waiter(lock, waiter, ti) \ | 16 | #define mutex_remove_waiter(lock, waiter, ti) \ |
| 17 | __list_del((waiter)->list.prev, (waiter)->list.next) | 17 | __list_del((waiter)->list.prev, (waiter)->list.next) |
| 18 | 18 | ||
| 19 | #ifdef CONFIG_SMP | 19 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
| 20 | static inline void mutex_set_owner(struct mutex *lock) | 20 | static inline void mutex_set_owner(struct mutex *lock) |
| 21 | { | 21 | { |
| 22 | lock->owner = current; | 22 | lock->owner = current; |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a0ea2a141b3b..7c98873a3077 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -8,7 +8,7 @@ | |||
| 8 | * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt | 8 | * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt |
| 9 | * Copyright (C) 2006 Esben Nielsen | 9 | * Copyright (C) 2006 Esben Nielsen |
| 10 | * | 10 | * |
| 11 | * See Documentation/rt-mutex-design.txt for details. | 11 | * See Documentation/locking/rt-mutex-design.txt for details. |
| 12 | */ | 12 | */ |
| 13 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
| 14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index d6203faf2eb1..7628c3fc37ca 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
| @@ -246,19 +246,22 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 246 | 246 | ||
| 247 | return sem; | 247 | return sem; |
| 248 | } | 248 | } |
| 249 | EXPORT_SYMBOL(rwsem_down_read_failed); | ||
| 249 | 250 | ||
| 250 | static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | 251 | static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) |
| 251 | { | 252 | { |
| 252 | if (!(count & RWSEM_ACTIVE_MASK)) { | 253 | /* |
| 253 | /* try acquiring the write lock */ | 254 | * Try acquiring the write lock. Check count first in order |
| 254 | if (sem->count == RWSEM_WAITING_BIAS && | 255 | * to reduce unnecessary expensive cmpxchg() operations. |
| 255 | cmpxchg(&sem->count, RWSEM_WAITING_BIAS, | 256 | */ |
| 256 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | 257 | if (count == RWSEM_WAITING_BIAS && |
| 257 | if (!list_is_singular(&sem->wait_list)) | 258 | cmpxchg(&sem->count, RWSEM_WAITING_BIAS, |
| 258 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 259 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { |
| 259 | return true; | 260 | if (!list_is_singular(&sem->wait_list)) |
| 260 | } | 261 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); |
| 262 | return true; | ||
| 261 | } | 263 | } |
| 264 | |||
| 262 | return false; | 265 | return false; |
| 263 | } | 266 | } |
| 264 | 267 | ||
| @@ -465,6 +468,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) | |||
| 465 | 468 | ||
| 466 | return sem; | 469 | return sem; |
| 467 | } | 470 | } |
| 471 | EXPORT_SYMBOL(rwsem_down_write_failed); | ||
| 468 | 472 | ||
| 469 | /* | 473 | /* |
| 470 | * handle waking up a waiter on the semaphore | 474 | * handle waking up a waiter on the semaphore |
| @@ -485,6 +489,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | |||
| 485 | 489 | ||
| 486 | return sem; | 490 | return sem; |
| 487 | } | 491 | } |
| 492 | EXPORT_SYMBOL(rwsem_wake); | ||
| 488 | 493 | ||
| 489 | /* | 494 | /* |
| 490 | * downgrade a write lock into a read lock | 495 | * downgrade a write lock into a read lock |
| @@ -506,8 +511,4 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) | |||
| 506 | 511 | ||
| 507 | return sem; | 512 | return sem; |
| 508 | } | 513 | } |
| 509 | |||
| 510 | EXPORT_SYMBOL(rwsem_down_read_failed); | ||
| 511 | EXPORT_SYMBOL(rwsem_down_write_failed); | ||
| 512 | EXPORT_SYMBOL(rwsem_wake); | ||
| 513 | EXPORT_SYMBOL(rwsem_downgrade_wake); | 514 | EXPORT_SYMBOL(rwsem_downgrade_wake); |
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 6815171a4fff..b8120abe594b 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c | |||
| @@ -36,7 +36,7 @@ | |||
| 36 | static noinline void __down(struct semaphore *sem); | 36 | static noinline void __down(struct semaphore *sem); |
| 37 | static noinline int __down_interruptible(struct semaphore *sem); | 37 | static noinline int __down_interruptible(struct semaphore *sem); |
| 38 | static noinline int __down_killable(struct semaphore *sem); | 38 | static noinline int __down_killable(struct semaphore *sem); |
| 39 | static noinline int __down_timeout(struct semaphore *sem, long jiffies); | 39 | static noinline int __down_timeout(struct semaphore *sem, long timeout); |
| 40 | static noinline void __up(struct semaphore *sem); | 40 | static noinline void __up(struct semaphore *sem); |
| 41 | 41 | ||
| 42 | /** | 42 | /** |
| @@ -145,14 +145,14 @@ EXPORT_SYMBOL(down_trylock); | |||
| 145 | /** | 145 | /** |
| 146 | * down_timeout - acquire the semaphore within a specified time | 146 | * down_timeout - acquire the semaphore within a specified time |
| 147 | * @sem: the semaphore to be acquired | 147 | * @sem: the semaphore to be acquired |
| 148 | * @jiffies: how long to wait before failing | 148 | * @timeout: how long to wait before failing |
| 149 | * | 149 | * |
| 150 | * Attempts to acquire the semaphore. If no more tasks are allowed to | 150 | * Attempts to acquire the semaphore. If no more tasks are allowed to |
| 151 | * acquire the semaphore, calling this function will put the task to sleep. | 151 | * acquire the semaphore, calling this function will put the task to sleep. |
| 152 | * If the semaphore is not released within the specified number of jiffies, | 152 | * If the semaphore is not released within the specified number of jiffies, |
| 153 | * this function returns -ETIME. It returns 0 if the semaphore was acquired. | 153 | * this function returns -ETIME. It returns 0 if the semaphore was acquired. |
| 154 | */ | 154 | */ |
| 155 | int down_timeout(struct semaphore *sem, long jiffies) | 155 | int down_timeout(struct semaphore *sem, long timeout) |
| 156 | { | 156 | { |
| 157 | unsigned long flags; | 157 | unsigned long flags; |
| 158 | int result = 0; | 158 | int result = 0; |
| @@ -161,7 +161,7 @@ int down_timeout(struct semaphore *sem, long jiffies) | |||
| 161 | if (likely(sem->count > 0)) | 161 | if (likely(sem->count > 0)) |
| 162 | sem->count--; | 162 | sem->count--; |
| 163 | else | 163 | else |
| 164 | result = __down_timeout(sem, jiffies); | 164 | result = __down_timeout(sem, timeout); |
| 165 | raw_spin_unlock_irqrestore(&sem->lock, flags); | 165 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
| 166 | 166 | ||
| 167 | return result; | 167 | return result; |
| @@ -248,9 +248,9 @@ static noinline int __sched __down_killable(struct semaphore *sem) | |||
| 248 | return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); | 248 | return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); |
| 249 | } | 249 | } |
| 250 | 250 | ||
| 251 | static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) | 251 | static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) |
| 252 | { | 252 | { |
| 253 | return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); | 253 | return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout); |
| 254 | } | 254 | } |
| 255 | 255 | ||
| 256 | static noinline void __sched __up(struct semaphore *sem) | 256 | static noinline void __sched __up(struct semaphore *sem) |
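Renaming the parameter from @jiffies to @timeout does not change the semantics: the value is still a jiffies count and the call returns -ETIME on expiry, 0 on success. A brief, hedged usage sketch as it might appear in a driver (foo_dev and its semaphore are made up for illustration):

    #include <linux/semaphore.h>
    #include <linux/jiffies.h>
    #include <linux/errno.h>

    /* Hypothetical device context; only the semaphore matters here. */
    struct foo_dev {
        struct semaphore cmd_sem;
    };

    static int foo_wait_for_slot(struct foo_dev *dev)
    {
        /* Wait at most one second (expressed in jiffies) for a command slot. */
        int ret = down_timeout(&dev->cmd_sem, msecs_to_jiffies(1000));

        if (ret == -ETIME)
            return -EBUSY;   /* timed out: report the device as busy */
        return ret;          /* 0 on success */
    }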
diff --git a/kernel/module.c b/kernel/module.c index 03214bd288e9..3965511ae133 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -42,7 +42,6 @@ | |||
| 42 | #include <linux/vermagic.h> | 42 | #include <linux/vermagic.h> |
| 43 | #include <linux/notifier.h> | 43 | #include <linux/notifier.h> |
| 44 | #include <linux/sched.h> | 44 | #include <linux/sched.h> |
| 45 | #include <linux/stop_machine.h> | ||
| 46 | #include <linux/device.h> | 45 | #include <linux/device.h> |
| 47 | #include <linux/string.h> | 46 | #include <linux/string.h> |
| 48 | #include <linux/mutex.h> | 47 | #include <linux/mutex.h> |
| @@ -98,7 +97,7 @@ | |||
| 98 | * 1) List of modules (also safely readable with preempt_disable), | 97 | * 1) List of modules (also safely readable with preempt_disable), |
| 99 | * 2) module_use links, | 98 | * 2) module_use links, |
| 100 | * 3) module_addr_min/module_addr_max. | 99 | * 3) module_addr_min/module_addr_max. |
| 101 | * (delete uses stop_machine/add uses RCU list operations). */ | 100 | * (delete and add uses RCU list operations). */ |
| 102 | DEFINE_MUTEX(module_mutex); | 101 | DEFINE_MUTEX(module_mutex); |
| 103 | EXPORT_SYMBOL_GPL(module_mutex); | 102 | EXPORT_SYMBOL_GPL(module_mutex); |
| 104 | static LIST_HEAD(modules); | 103 | static LIST_HEAD(modules); |
| @@ -135,7 +134,7 @@ static int param_set_bool_enable_only(const char *val, | |||
| 135 | } | 134 | } |
| 136 | 135 | ||
| 137 | static const struct kernel_param_ops param_ops_bool_enable_only = { | 136 | static const struct kernel_param_ops param_ops_bool_enable_only = { |
| 138 | .flags = KERNEL_PARAM_FL_NOARG, | 137 | .flags = KERNEL_PARAM_OPS_FL_NOARG, |
| 139 | .set = param_set_bool_enable_only, | 138 | .set = param_set_bool_enable_only, |
| 140 | .get = param_get_bool, | 139 | .get = param_get_bool, |
| 141 | }; | 140 | }; |
| @@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list); | |||
| 158 | * Protected by module_mutex. */ | 157 | * Protected by module_mutex. */ |
| 159 | static unsigned long module_addr_min = -1UL, module_addr_max = 0; | 158 | static unsigned long module_addr_min = -1UL, module_addr_max = 0; |
| 160 | 159 | ||
| 161 | int register_module_notifier(struct notifier_block * nb) | 160 | int register_module_notifier(struct notifier_block *nb) |
| 162 | { | 161 | { |
| 163 | return blocking_notifier_chain_register(&module_notify_list, nb); | 162 | return blocking_notifier_chain_register(&module_notify_list, nb); |
| 164 | } | 163 | } |
| 165 | EXPORT_SYMBOL(register_module_notifier); | 164 | EXPORT_SYMBOL(register_module_notifier); |
| 166 | 165 | ||
| 167 | int unregister_module_notifier(struct notifier_block * nb) | 166 | int unregister_module_notifier(struct notifier_block *nb) |
| 168 | { | 167 | { |
| 169 | return blocking_notifier_chain_unregister(&module_notify_list, nb); | 168 | return blocking_notifier_chain_unregister(&module_notify_list, nb); |
| 170 | } | 169 | } |
| @@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; | |||
| 628 | 627 | ||
| 629 | EXPORT_TRACEPOINT_SYMBOL(module_get); | 628 | EXPORT_TRACEPOINT_SYMBOL(module_get); |
| 630 | 629 | ||
| 630 | /* MODULE_REF_BASE is the base reference count set by the module loader. */ | ||
| 631 | #define MODULE_REF_BASE 1 | ||
| 632 | |||
| 631 | /* Init the unload section of the module. */ | 633 | /* Init the unload section of the module. */ |
| 632 | static int module_unload_init(struct module *mod) | 634 | static int module_unload_init(struct module *mod) |
| 633 | { | 635 | { |
| 634 | mod->refptr = alloc_percpu(struct module_ref); | 636 | /* |
| 635 | if (!mod->refptr) | 637 | * Initialize reference counter to MODULE_REF_BASE. |
| 636 | return -ENOMEM; | 638 | * refcnt == 0 means module is going. |
| 639 | */ | ||
| 640 | atomic_set(&mod->refcnt, MODULE_REF_BASE); | ||
| 637 | 641 | ||
| 638 | INIT_LIST_HEAD(&mod->source_list); | 642 | INIT_LIST_HEAD(&mod->source_list); |
| 639 | INIT_LIST_HEAD(&mod->target_list); | 643 | INIT_LIST_HEAD(&mod->target_list); |
| 640 | 644 | ||
| 641 | /* Hold reference count during initialization. */ | 645 | /* Hold reference count during initialization. */ |
| 642 | raw_cpu_write(mod->refptr->incs, 1); | 646 | atomic_inc(&mod->refcnt); |
| 643 | 647 | ||
| 644 | return 0; | 648 | return 0; |
| 645 | } | 649 | } |
| @@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod) | |||
| 721 | kfree(use); | 725 | kfree(use); |
| 722 | } | 726 | } |
| 723 | mutex_unlock(&module_mutex); | 727 | mutex_unlock(&module_mutex); |
| 724 | |||
| 725 | free_percpu(mod->refptr); | ||
| 726 | } | 728 | } |
| 727 | 729 | ||
| 728 | #ifdef CONFIG_MODULE_FORCE_UNLOAD | 730 | #ifdef CONFIG_MODULE_FORCE_UNLOAD |
| @@ -740,60 +742,39 @@ static inline int try_force_unload(unsigned int flags) | |||
| 740 | } | 742 | } |
| 741 | #endif /* CONFIG_MODULE_FORCE_UNLOAD */ | 743 | #endif /* CONFIG_MODULE_FORCE_UNLOAD */ |
| 742 | 744 | ||
| 743 | struct stopref | 745 | /* Try to release refcount of module, 0 means success. */ |
| 746 | static int try_release_module_ref(struct module *mod) | ||
| 744 | { | 747 | { |
| 745 | struct module *mod; | 748 | int ret; |
| 746 | int flags; | ||
| 747 | int *forced; | ||
| 748 | }; | ||
| 749 | 749 | ||
| 750 | /* Whole machine is stopped with interrupts off when this runs. */ | 750 | /* Try to decrement refcnt which we set at loading */ |
| 751 | static int __try_stop_module(void *_sref) | 751 | ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt); |
| 752 | { | 752 | BUG_ON(ret < 0); |
| 753 | struct stopref *sref = _sref; | 753 | if (ret) |
| 754 | /* Someone can put this right now, recover with checking */ | ||
| 755 | ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0); | ||
| 756 | |||
| 757 | return ret; | ||
| 758 | } | ||
| 754 | 759 | ||
| 760 | static int try_stop_module(struct module *mod, int flags, int *forced) | ||
| 761 | { | ||
| 755 | /* If it's not unused, quit unless we're forcing. */ | 762 | /* If it's not unused, quit unless we're forcing. */ |
| 756 | if (module_refcount(sref->mod) != 0) { | 763 | if (try_release_module_ref(mod) != 0) { |
| 757 | if (!(*sref->forced = try_force_unload(sref->flags))) | 764 | *forced = try_force_unload(flags); |
| 765 | if (!(*forced)) | ||
| 758 | return -EWOULDBLOCK; | 766 | return -EWOULDBLOCK; |
| 759 | } | 767 | } |
| 760 | 768 | ||
| 761 | /* Mark it as dying. */ | 769 | /* Mark it as dying. */ |
| 762 | sref->mod->state = MODULE_STATE_GOING; | 770 | mod->state = MODULE_STATE_GOING; |
| 763 | return 0; | ||
| 764 | } | ||
| 765 | 771 | ||
| 766 | static int try_stop_module(struct module *mod, int flags, int *forced) | 772 | return 0; |
| 767 | { | ||
| 768 | struct stopref sref = { mod, flags, forced }; | ||
| 769 | |||
| 770 | return stop_machine(__try_stop_module, &sref, NULL); | ||
| 771 | } | 773 | } |
| 772 | 774 | ||
| 773 | unsigned long module_refcount(struct module *mod) | 775 | unsigned long module_refcount(struct module *mod) |
| 774 | { | 776 | { |
| 775 | unsigned long incs = 0, decs = 0; | 777 | return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE; |
| 776 | int cpu; | ||
| 777 | |||
| 778 | for_each_possible_cpu(cpu) | ||
| 779 | decs += per_cpu_ptr(mod->refptr, cpu)->decs; | ||
| 780 | /* | ||
| 781 | * ensure the incs are added up after the decs. | ||
| 782 | * module_put ensures incs are visible before decs with smp_wmb. | ||
| 783 | * | ||
| 784 | * This 2-count scheme avoids the situation where the refcount | ||
| 785 | * for CPU0 is read, then CPU0 increments the module refcount, | ||
| 786 | * then CPU1 drops that refcount, then the refcount for CPU1 is | ||
| 787 | * read. We would record a decrement but not its corresponding | ||
| 788 | * increment so we would see a low count (disaster). | ||
| 789 | * | ||
| 790 | * Rare situation? But module_refcount can be preempted, and we | ||
| 791 | * might be tallying up 4096+ CPUs. So it is not impossible. | ||
| 792 | */ | ||
| 793 | smp_rmb(); | ||
| 794 | for_each_possible_cpu(cpu) | ||
| 795 | incs += per_cpu_ptr(mod->refptr, cpu)->incs; | ||
| 796 | return incs - decs; | ||
| 797 | } | 778 | } |
| 798 | EXPORT_SYMBOL(module_refcount); | 779 | EXPORT_SYMBOL(module_refcount); |
| 799 | 780 | ||
| @@ -877,8 +858,10 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) | |||
| 877 | 858 | ||
| 878 | seq_printf(m, " %lu ", module_refcount(mod)); | 859 | seq_printf(m, " %lu ", module_refcount(mod)); |
| 879 | 860 | ||
| 880 | /* Always include a trailing , so userspace can differentiate | 861 | /* |
| 881 | between this and the old multi-field proc format. */ | 862 | * Always include a trailing , so userspace can differentiate |
| 863 | * between this and the old multi-field proc format. | ||
| 864 | */ | ||
| 882 | list_for_each_entry(use, &mod->source_list, source_list) { | 865 | list_for_each_entry(use, &mod->source_list, source_list) { |
| 883 | printed_something = 1; | 866 | printed_something = 1; |
| 884 | seq_printf(m, "%s,", use->source->name); | 867 | seq_printf(m, "%s,", use->source->name); |
| @@ -886,11 +869,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) | |||
| 886 | 869 | ||
| 887 | if (mod->init != NULL && mod->exit == NULL) { | 870 | if (mod->init != NULL && mod->exit == NULL) { |
| 888 | printed_something = 1; | 871 | printed_something = 1; |
| 889 | seq_printf(m, "[permanent],"); | 872 | seq_puts(m, "[permanent],"); |
| 890 | } | 873 | } |
| 891 | 874 | ||
| 892 | if (!printed_something) | 875 | if (!printed_something) |
| 893 | seq_printf(m, "-"); | 876 | seq_puts(m, "-"); |
| 894 | } | 877 | } |
| 895 | 878 | ||
| 896 | void __symbol_put(const char *symbol) | 879 | void __symbol_put(const char *symbol) |
| @@ -935,7 +918,7 @@ void __module_get(struct module *module) | |||
| 935 | { | 918 | { |
| 936 | if (module) { | 919 | if (module) { |
| 937 | preempt_disable(); | 920 | preempt_disable(); |
| 938 | __this_cpu_inc(module->refptr->incs); | 921 | atomic_inc(&module->refcnt); |
| 939 | trace_module_get(module, _RET_IP_); | 922 | trace_module_get(module, _RET_IP_); |
| 940 | preempt_enable(); | 923 | preempt_enable(); |
| 941 | } | 924 | } |
| @@ -948,11 +931,11 @@ bool try_module_get(struct module *module) | |||
| 948 | 931 | ||
| 949 | if (module) { | 932 | if (module) { |
| 950 | preempt_disable(); | 933 | preempt_disable(); |
| 951 | 934 | /* Note: here, we can fail to get a reference */ | |
| 952 | if (likely(module_is_live(module))) { | 935 | if (likely(module_is_live(module) && |
| 953 | __this_cpu_inc(module->refptr->incs); | 936 | atomic_inc_not_zero(&module->refcnt) != 0)) |
| 954 | trace_module_get(module, _RET_IP_); | 937 | trace_module_get(module, _RET_IP_); |
| 955 | } else | 938 | else |
| 956 | ret = false; | 939 | ret = false; |
| 957 | 940 | ||
| 958 | preempt_enable(); | 941 | preempt_enable(); |
| @@ -963,11 +946,12 @@ EXPORT_SYMBOL(try_module_get); | |||
| 963 | 946 | ||
| 964 | void module_put(struct module *module) | 947 | void module_put(struct module *module) |
| 965 | { | 948 | { |
| 949 | int ret; | ||
| 950 | |||
| 966 | if (module) { | 951 | if (module) { |
| 967 | preempt_disable(); | 952 | preempt_disable(); |
| 968 | smp_wmb(); /* see comment in module_refcount */ | 953 | ret = atomic_dec_if_positive(&module->refcnt); |
| 969 | __this_cpu_inc(module->refptr->decs); | 954 | WARN_ON(ret < 0); /* Failed to put refcount */ |
| 970 | |||
| 971 | trace_module_put(module, _RET_IP_); | 955 | trace_module_put(module, _RET_IP_); |
| 972 | preempt_enable(); | 956 | preempt_enable(); |
| 973 | } | 957 | } |
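try_module_get() and module_put() now reduce to atomic_inc_not_zero() and atomic_dec_if_positive(): a get only succeeds while the counter is still non-zero, and a put can never drive it below zero. The same pair expressed with C11 CAS loops (a toy refcount, not the kernel primitives):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Take a reference only if the object is still live (count != 0). */
    bool ref_get_not_zero(atomic_int *cnt)
    {
        int cur = atomic_load(cnt);

        while (cur != 0) {
            if (atomic_compare_exchange_weak(cnt, &cur, cur + 1))
                return true;
        }
        return false;
    }

    /*
     * Drop a reference without going below zero; returns the new count,
     * or -1 if the counter was already zero (a put without a matching get).
     */
    int ref_put_if_positive(atomic_int *cnt)
    {
        int cur = atomic_load(cnt);

        while (cur > 0) {
            if (atomic_compare_exchange_weak(cnt, &cur, cur - 1))
                return cur - 1;
        }
        return -1;
    }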
| @@ -978,7 +962,7 @@ EXPORT_SYMBOL(module_put); | |||
| 978 | static inline void print_unload_info(struct seq_file *m, struct module *mod) | 962 | static inline void print_unload_info(struct seq_file *m, struct module *mod) |
| 979 | { | 963 | { |
| 980 | /* We don't know the usage count, or what modules are using. */ | 964 | /* We don't know the usage count, or what modules are using. */ |
| 981 | seq_printf(m, " - -"); | 965 | seq_puts(m, " - -"); |
| 982 | } | 966 | } |
| 983 | 967 | ||
| 984 | static inline void module_unload_free(struct module *mod) | 968 | static inline void module_unload_free(struct module *mod) |
| @@ -1131,7 +1115,7 @@ static unsigned long maybe_relocated(unsigned long crc, | |||
| 1131 | static int check_version(Elf_Shdr *sechdrs, | 1115 | static int check_version(Elf_Shdr *sechdrs, |
| 1132 | unsigned int versindex, | 1116 | unsigned int versindex, |
| 1133 | const char *symname, | 1117 | const char *symname, |
| 1134 | struct module *mod, | 1118 | struct module *mod, |
| 1135 | const unsigned long *crc, | 1119 | const unsigned long *crc, |
| 1136 | const struct module *crc_owner) | 1120 | const struct module *crc_owner) |
| 1137 | { | 1121 | { |
| @@ -1165,7 +1149,7 @@ static int check_version(Elf_Shdr *sechdrs, | |||
| 1165 | return 0; | 1149 | return 0; |
| 1166 | 1150 | ||
| 1167 | bad_version: | 1151 | bad_version: |
| 1168 | printk("%s: disagrees about version of symbol %s\n", | 1152 | pr_warn("%s: disagrees about version of symbol %s\n", |
| 1169 | mod->name, symname); | 1153 | mod->name, symname); |
| 1170 | return 0; | 1154 | return 0; |
| 1171 | } | 1155 | } |
| @@ -1200,7 +1184,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, | |||
| 1200 | static inline int check_version(Elf_Shdr *sechdrs, | 1184 | static inline int check_version(Elf_Shdr *sechdrs, |
| 1201 | unsigned int versindex, | 1185 | unsigned int versindex, |
| 1202 | const char *symname, | 1186 | const char *symname, |
| 1203 | struct module *mod, | 1187 | struct module *mod, |
| 1204 | const unsigned long *crc, | 1188 | const unsigned long *crc, |
| 1205 | const struct module *crc_owner) | 1189 | const struct module *crc_owner) |
| 1206 | { | 1190 | { |
| @@ -1288,15 +1272,13 @@ static inline bool sect_empty(const Elf_Shdr *sect) | |||
| 1288 | return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; | 1272 | return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; |
| 1289 | } | 1273 | } |
| 1290 | 1274 | ||
| 1291 | struct module_sect_attr | 1275 | struct module_sect_attr { |
| 1292 | { | ||
| 1293 | struct module_attribute mattr; | 1276 | struct module_attribute mattr; |
| 1294 | char *name; | 1277 | char *name; |
| 1295 | unsigned long address; | 1278 | unsigned long address; |
| 1296 | }; | 1279 | }; |
| 1297 | 1280 | ||
| 1298 | struct module_sect_attrs | 1281 | struct module_sect_attrs { |
| 1299 | { | ||
| 1300 | struct attribute_group grp; | 1282 | struct attribute_group grp; |
| 1301 | unsigned int nsections; | 1283 | unsigned int nsections; |
| 1302 | struct module_sect_attr attrs[0]; | 1284 | struct module_sect_attr attrs[0]; |
| @@ -1550,7 +1532,8 @@ static int module_add_modinfo_attrs(struct module *mod) | |||
| 1550 | (attr->test && attr->test(mod))) { | 1532 | (attr->test && attr->test(mod))) { |
| 1551 | memcpy(temp_attr, attr, sizeof(*temp_attr)); | 1533 | memcpy(temp_attr, attr, sizeof(*temp_attr)); |
| 1552 | sysfs_attr_init(&temp_attr->attr); | 1534 | sysfs_attr_init(&temp_attr->attr); |
| 1553 | error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); | 1535 | error = sysfs_create_file(&mod->mkobj.kobj, |
| 1536 | &temp_attr->attr); | ||
| 1554 | ++temp_attr; | 1537 | ++temp_attr; |
| 1555 | } | 1538 | } |
| 1556 | } | 1539 | } |
| @@ -1566,7 +1549,7 @@ static void module_remove_modinfo_attrs(struct module *mod) | |||
| 1566 | /* pick a field to test for end of list */ | 1549 | /* pick a field to test for end of list */ |
| 1567 | if (!attr->attr.name) | 1550 | if (!attr->attr.name) |
| 1568 | break; | 1551 | break; |
| 1569 | sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); | 1552 | sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); |
| 1570 | if (attr->free) | 1553 | if (attr->free) |
| 1571 | attr->free(mod); | 1554 | attr->free(mod); |
| 1572 | } | 1555 | } |
| @@ -1697,18 +1680,6 @@ static void mod_sysfs_teardown(struct module *mod) | |||
| 1697 | mod_sysfs_fini(mod); | 1680 | mod_sysfs_fini(mod); |
| 1698 | } | 1681 | } |
| 1699 | 1682 | ||
| 1700 | /* | ||
| 1701 | * unlink the module with the whole machine is stopped with interrupts off | ||
| 1702 | * - this defends against kallsyms not taking locks | ||
| 1703 | */ | ||
| 1704 | static int __unlink_module(void *_mod) | ||
| 1705 | { | ||
| 1706 | struct module *mod = _mod; | ||
| 1707 | list_del(&mod->list); | ||
| 1708 | module_bug_cleanup(mod); | ||
| 1709 | return 0; | ||
| 1710 | } | ||
| 1711 | |||
| 1712 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX | 1683 | #ifdef CONFIG_DEBUG_SET_MODULE_RONX |
| 1713 | /* | 1684 | /* |
| 1714 | * LKM RO/NX protection: protect module's text/ro-data | 1685 | * LKM RO/NX protection: protect module's text/ro-data |
| @@ -1842,7 +1813,9 @@ static void free_module(struct module *mod) | |||
| 1842 | 1813 | ||
| 1843 | /* We leave it in list to prevent duplicate loads, but make sure | 1814 | /* We leave it in list to prevent duplicate loads, but make sure |
| 1844 | * that noone uses it while it's being deconstructed. */ | 1815 | * that noone uses it while it's being deconstructed. */ |
| 1816 | mutex_lock(&module_mutex); | ||
| 1845 | mod->state = MODULE_STATE_UNFORMED; | 1817 | mod->state = MODULE_STATE_UNFORMED; |
| 1818 | mutex_unlock(&module_mutex); | ||
| 1846 | 1819 | ||
| 1847 | /* Remove dynamic debug info */ | 1820 | /* Remove dynamic debug info */ |
| 1848 | ddebug_remove_module(mod->name); | 1821 | ddebug_remove_module(mod->name); |
| @@ -1858,7 +1831,12 @@ static void free_module(struct module *mod) | |||
| 1858 | 1831 | ||
| 1859 | /* Now we can delete it from the lists */ | 1832 | /* Now we can delete it from the lists */ |
| 1860 | mutex_lock(&module_mutex); | 1833 | mutex_lock(&module_mutex); |
| 1861 | stop_machine(__unlink_module, mod, NULL); | 1834 | /* Unlink carefully: kallsyms could be walking list. */ |
| 1835 | list_del_rcu(&mod->list); | ||
| 1836 | /* Remove this module from bug list, this uses list_del_rcu */ | ||
| 1837 | module_bug_cleanup(mod); | ||
| 1838 | /* Wait for RCU synchronizing before releasing mod->list and buglist. */ | ||
| 1839 | synchronize_rcu(); | ||
| 1862 | mutex_unlock(&module_mutex); | 1840 | mutex_unlock(&module_mutex); |
| 1863 | 1841 | ||
| 1864 | /* This may be NULL, but that's OK */ | 1842 | /* This may be NULL, but that's OK */ |
| @@ -1953,7 +1931,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) | |||
| 1953 | /* We compiled with -fno-common. These are not | 1931 | /* We compiled with -fno-common. These are not |
| 1954 | supposed to happen. */ | 1932 | supposed to happen. */ |
| 1955 | pr_debug("Common symbol: %s\n", name); | 1933 | pr_debug("Common symbol: %s\n", name); |
| 1956 | printk("%s: please compile with -fno-common\n", | 1934 | pr_warn("%s: please compile with -fno-common\n", |
| 1957 | mod->name); | 1935 | mod->name); |
| 1958 | ret = -ENOEXEC; | 1936 | ret = -ENOEXEC; |
| 1959 | break; | 1937 | break; |
| @@ -2257,7 +2235,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) | |||
| 2257 | } | 2235 | } |
| 2258 | 2236 | ||
| 2259 | static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, | 2237 | static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, |
| 2260 | unsigned int shnum) | 2238 | unsigned int shnum) |
| 2261 | { | 2239 | { |
| 2262 | const Elf_Shdr *sec; | 2240 | const Elf_Shdr *sec; |
| 2263 | 2241 | ||
| @@ -2733,7 +2711,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) | |||
| 2733 | * This shouldn't happen with same compiler and binutils | 2711 | * This shouldn't happen with same compiler and binutils |
| 2734 | * building all parts of the module. | 2712 | * building all parts of the module. |
| 2735 | */ | 2713 | */ |
| 2736 | printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", | 2714 | pr_warn("%s: has both .ctors and .init_array.\n", |
| 2737 | mod->name); | 2715 | mod->name); |
| 2738 | return -EINVAL; | 2716 | return -EINVAL; |
| 2739 | } | 2717 | } |
| @@ -3021,8 +2999,10 @@ static int do_init_module(struct module *mod) | |||
| 3021 | if (mod->init != NULL) | 2999 | if (mod->init != NULL) |
| 3022 | ret = do_one_initcall(mod->init); | 3000 | ret = do_one_initcall(mod->init); |
| 3023 | if (ret < 0) { | 3001 | if (ret < 0) { |
| 3024 | /* Init routine failed: abort. Try to protect us from | 3002 | /* |
| 3025 | buggy refcounters. */ | 3003 | * Init routine failed: abort. Try to protect us from |
| 3004 | * buggy refcounters. | ||
| 3005 | */ | ||
| 3026 | mod->state = MODULE_STATE_GOING; | 3006 | mod->state = MODULE_STATE_GOING; |
| 3027 | synchronize_sched(); | 3007 | synchronize_sched(); |
| 3028 | module_put(mod); | 3008 | module_put(mod); |
| @@ -3095,6 +3075,32 @@ static int may_init_module(void) | |||
| 3095 | } | 3075 | } |
| 3096 | 3076 | ||
| 3097 | /* | 3077 | /* |
| 3078 | * Can't use wait_event_interruptible() because our condition | ||
| 3079 | * 'finished_loading()' contains a blocking primitive itself (mutex_lock). | ||
| 3080 | */ | ||
| 3081 | static int wait_finished_loading(struct module *mod) | ||
| 3082 | { | ||
| 3083 | DEFINE_WAIT_FUNC(wait, woken_wake_function); | ||
| 3084 | int ret = 0; | ||
| 3085 | |||
| 3086 | add_wait_queue(&module_wq, &wait); | ||
| 3087 | for (;;) { | ||
| 3088 | if (finished_loading(mod->name)) | ||
| 3089 | break; | ||
| 3090 | |||
| 3091 | if (signal_pending(current)) { | ||
| 3092 | ret = -ERESTARTSYS; | ||
| 3093 | break; | ||
| 3094 | } | ||
| 3095 | |||
| 3096 | wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
| 3097 | } | ||
| 3098 | remove_wait_queue(&module_wq, &wait); | ||
| 3099 | |||
| 3100 | return ret; | ||
| 3101 | } | ||
| 3102 | |||
| 3103 | /* | ||
| 3098 | * We try to place it in the list now to make sure it's unique before | 3104 | * We try to place it in the list now to make sure it's unique before |
| 3099 | * we dedicate too many resources. In particular, temporary percpu | 3105 | * we dedicate too many resources. In particular, temporary percpu |
| 3100 | * memory exhaustion. | 3106 | * memory exhaustion. |
| @@ -3114,8 +3120,8 @@ again: | |||
| 3114 | || old->state == MODULE_STATE_UNFORMED) { | 3120 | || old->state == MODULE_STATE_UNFORMED) { |
| 3115 | /* Wait in case it fails to load. */ | 3121 | /* Wait in case it fails to load. */ |
| 3116 | mutex_unlock(&module_mutex); | 3122 | mutex_unlock(&module_mutex); |
| 3117 | err = wait_event_interruptible(module_wq, | 3123 | |
| 3118 | finished_loading(mod->name)); | 3124 | err = wait_finished_loading(mod); |
| 3119 | if (err) | 3125 | if (err) |
| 3120 | goto out_unlocked; | 3126 | goto out_unlocked; |
| 3121 | goto again; | 3127 | goto again; |
| @@ -3174,7 +3180,7 @@ out: | |||
| 3174 | 3180 | ||
| 3175 | static int unknown_module_param_cb(char *param, char *val, const char *modname) | 3181 | static int unknown_module_param_cb(char *param, char *val, const char *modname) |
| 3176 | { | 3182 | { |
| 3177 | /* Check for magic 'dyndbg' arg */ | 3183 | /* Check for magic 'dyndbg' arg */ |
| 3178 | int ret = ddebug_dyndbg_module_param_cb(param, val, modname); | 3184 | int ret = ddebug_dyndbg_module_param_cb(param, val, modname); |
| 3179 | if (ret != 0) | 3185 | if (ret != 0) |
| 3180 | pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); | 3186 | pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); |
| @@ -3324,6 +3330,8 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3324 | /* Unlink carefully: kallsyms could be walking list. */ | 3330 | /* Unlink carefully: kallsyms could be walking list. */ |
| 3325 | list_del_rcu(&mod->list); | 3331 | list_del_rcu(&mod->list); |
| 3326 | wake_up_all(&module_wq); | 3332 | wake_up_all(&module_wq); |
| 3333 | /* Wait for RCU synchronizing before releasing mod->list. */ | ||
| 3334 | synchronize_rcu(); | ||
| 3327 | mutex_unlock(&module_mutex); | 3335 | mutex_unlock(&module_mutex); |
| 3328 | free_module: | 3336 | free_module: |
| 3329 | module_deallocate(mod, info); | 3337 | module_deallocate(mod, info); |
| @@ -3388,7 +3396,7 @@ static inline int is_arm_mapping_symbol(const char *str) | |||
| 3388 | { | 3396 | { |
| 3389 | if (str[0] == '.' && str[1] == 'L') | 3397 | if (str[0] == '.' && str[1] == 'L') |
| 3390 | return true; | 3398 | return true; |
| 3391 | return str[0] == '$' && strchr("atd", str[1]) | 3399 | return str[0] == '$' && strchr("axtd", str[1]) |
| 3392 | && (str[2] == '\0' || str[2] == '.'); | 3400 | && (str[2] == '\0' || str[2] == '.'); |
| 3393 | } | 3401 | } |
| 3394 | 3402 | ||
| @@ -3657,8 +3665,8 @@ static int m_show(struct seq_file *m, void *p) | |||
| 3657 | 3665 | ||
| 3658 | /* Informative for users. */ | 3666 | /* Informative for users. */ |
| 3659 | seq_printf(m, " %s", | 3667 | seq_printf(m, " %s", |
| 3660 | mod->state == MODULE_STATE_GOING ? "Unloading": | 3668 | mod->state == MODULE_STATE_GOING ? "Unloading" : |
| 3661 | mod->state == MODULE_STATE_COMING ? "Loading": | 3669 | mod->state == MODULE_STATE_COMING ? "Loading" : |
| 3662 | "Live"); | 3670 | "Live"); |
| 3663 | /* Used by oprofile and other similar tools. */ | 3671 | /* Used by oprofile and other similar tools. */ |
| 3664 | seq_printf(m, " 0x%pK", mod->module_core); | 3672 | seq_printf(m, " 0x%pK", mod->module_core); |
| @@ -3667,7 +3675,7 @@ static int m_show(struct seq_file *m, void *p) | |||
| 3667 | if (mod->taints) | 3675 | if (mod->taints) |
| 3668 | seq_printf(m, " %s", module_flags(mod, buf)); | 3676 | seq_printf(m, " %s", module_flags(mod, buf)); |
| 3669 | 3677 | ||
| 3670 | seq_printf(m, "\n"); | 3678 | seq_puts(m, "\n"); |
| 3671 | return 0; | 3679 | return 0; |
| 3672 | } | 3680 | } |
| 3673 | 3681 | ||
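The module.c hunks above replace the stop_machine()-based __unlink_module() with an RCU-protected removal: free_module() now takes module_mutex, unlinks the entry with list_del_rcu(), and calls synchronize_rcu() before the memory can go away, so kallsyms walkers traversing the list under rcu_read_lock() never see a half-freed module. A minimal kernel-style sketch of that pattern (illustrative names, not code from the patch):

```c
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct tracked {
	struct list_head list;
	int payload;
};

/* Illustrative only: unlink an entry that lockless readers may still be
 * walking.  Writers are serialized by the mutex; readers only need
 * rcu_read_lock() around their list_for_each_entry_rcu() walk. */
static void tracked_remove(struct mutex *lock, struct tracked *t)
{
	mutex_lock(lock);
	list_del_rcu(&t->list);		/* readers already in flight may still see it */
	mutex_unlock(lock);

	synchronize_rcu();		/* wait for every pre-existing reader */
	kfree(t);			/* now nothing can hold a reference */
}
```

Compared with the old stop_machine() approach, this avoids quiescing the whole machine while still giving lockless readers a consistent view; the load_module() error path above gains the same synchronize_rcu() before module_deallocate() for the same reason.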
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ef42d0ab3115..49746c81ad8d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p) | |||
| 220 | 220 | ||
| 221 | SYSCALL_DEFINE2(setns, int, fd, int, nstype) | 221 | SYSCALL_DEFINE2(setns, int, fd, int, nstype) |
| 222 | { | 222 | { |
| 223 | const struct proc_ns_operations *ops; | ||
| 224 | struct task_struct *tsk = current; | 223 | struct task_struct *tsk = current; |
| 225 | struct nsproxy *new_nsproxy; | 224 | struct nsproxy *new_nsproxy; |
| 226 | struct proc_ns *ei; | ||
| 227 | struct file *file; | 225 | struct file *file; |
| 226 | struct ns_common *ns; | ||
| 228 | int err; | 227 | int err; |
| 229 | 228 | ||
| 230 | file = proc_ns_fget(fd); | 229 | file = proc_ns_fget(fd); |
| @@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
| 232 | return PTR_ERR(file); | 231 | return PTR_ERR(file); |
| 233 | 232 | ||
| 234 | err = -EINVAL; | 233 | err = -EINVAL; |
| 235 | ei = get_proc_ns(file_inode(file)); | 234 | ns = get_proc_ns(file_inode(file)); |
| 236 | ops = ei->ns_ops; | 235 | if (nstype && (ns->ops->type != nstype)) |
| 237 | if (nstype && (ops->type != nstype)) | ||
| 238 | goto out; | 236 | goto out; |
| 239 | 237 | ||
| 240 | new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); | 238 | new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); |
| @@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
| 243 | goto out; | 241 | goto out; |
| 244 | } | 242 | } |
| 245 | 243 | ||
| 246 | err = ops->install(new_nsproxy, ei->ns); | 244 | err = ns->ops->install(new_nsproxy, ns); |
| 247 | if (err) { | 245 | if (err) { |
| 248 | free_nsproxy(new_nsproxy); | 246 | free_nsproxy(new_nsproxy); |
| 249 | goto out; | 247 | goto out; |
diff --git a/kernel/panic.c b/kernel/panic.c index d09dc5c32c67..4d8d6f906dec 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -33,6 +33,7 @@ static int pause_on_oops; | |||
| 33 | static int pause_on_oops_flag; | 33 | static int pause_on_oops_flag; |
| 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); | 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); |
| 35 | static bool crash_kexec_post_notifiers; | 35 | static bool crash_kexec_post_notifiers; |
| 36 | int panic_on_warn __read_mostly; | ||
| 36 | 37 | ||
| 37 | int panic_timeout = CONFIG_PANIC_TIMEOUT; | 38 | int panic_timeout = CONFIG_PANIC_TIMEOUT; |
| 38 | EXPORT_SYMBOL_GPL(panic_timeout); | 39 | EXPORT_SYMBOL_GPL(panic_timeout); |
| @@ -244,6 +245,7 @@ static const struct tnt tnts[] = { | |||
| 244 | * 'I' - Working around severe firmware bug. | 245 | * 'I' - Working around severe firmware bug. |
| 245 | * 'O' - Out-of-tree module has been loaded. | 246 | * 'O' - Out-of-tree module has been loaded. |
| 246 | * 'E' - Unsigned module has been loaded. | 247 | * 'E' - Unsigned module has been loaded. |
| 248 | * 'L' - A soft lockup has previously occurred. | ||
| 247 | * | 249 | * |
| 248 | * The string is overwritten by the next call to print_tainted(). | 250 | * The string is overwritten by the next call to print_tainted(). |
| 249 | */ | 251 | */ |
| @@ -427,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller, | |||
| 427 | if (args) | 429 | if (args) |
| 428 | vprintk(args->fmt, args->args); | 430 | vprintk(args->fmt, args->args); |
| 429 | 431 | ||
| 432 | if (panic_on_warn) { | ||
| 433 | /* | ||
| 434 | * This thread may hit another WARN() in the panic path. | ||
| 435 | * Resetting this prevents additional WARN() from panicking the | ||
| 436 | * system on this thread. Other threads are blocked by the | ||
| 437 | * panic_mutex in panic(). | ||
| 438 | */ | ||
| 439 | panic_on_warn = 0; | ||
| 440 | panic("panic_on_warn set ...\n"); | ||
| 441 | } | ||
| 442 | |||
| 430 | print_modules(); | 443 | print_modules(); |
| 431 | dump_stack(); | 444 | dump_stack(); |
| 432 | print_oops_end_marker(); | 445 | print_oops_end_marker(); |
| @@ -484,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail); | |||
| 484 | 497 | ||
| 485 | core_param(panic, panic_timeout, int, 0644); | 498 | core_param(panic, panic_timeout, int, 0644); |
| 486 | core_param(pause_on_oops, pause_on_oops, int, 0644); | 499 | core_param(pause_on_oops, pause_on_oops, int, 0644); |
| 500 | core_param(panic_on_warn, panic_on_warn, int, 0644); | ||
| 487 | 501 | ||
| 488 | static int __init setup_crash_kexec_post_notifiers(char *s) | 502 | static int __init setup_crash_kexec_post_notifiers(char *s) |
| 489 | { | 503 | { |
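The panic.c hunk introduces panic_on_warn and clears the flag before calling panic(), so a WARN() hit on the panic path itself cannot recurse. A hypothetical userspace analogue of that one-shot escalation (plain C, not kernel code):

```c
#include <stdio.h>
#include <stdlib.h>

static int panic_on_warn = 1;		/* the new core_param, modeled as a plain flag */

static void panic(const char *msg)
{
	fprintf(stderr, "Kernel panic - not syncing: %s\n", msg);
	exit(1);
}

static void warn(const char *msg)
{
	fprintf(stderr, "WARNING: %s\n", msg);
	if (panic_on_warn) {
		/* one-shot: a nested warning on the panic path must not recurse */
		panic_on_warn = 0;
		panic("panic_on_warn set ...");
	}
}

int main(void)
{
	warn("example warning");	/* escalates to panic exactly once */
	return 0;
}
```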
diff --git a/kernel/params.c b/kernel/params.c index 34f527023794..0af9b2c4e56c 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/string.h> | 19 | #include <linux/string.h> |
| 20 | #include <linux/errno.h> | 20 | #include <linux/errno.h> |
| 21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
| 22 | #include <linux/moduleparam.h> | ||
| 22 | #include <linux/device.h> | 23 | #include <linux/device.h> |
| 23 | #include <linux/err.h> | 24 | #include <linux/err.h> |
| 24 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
| @@ -83,6 +84,15 @@ bool parameq(const char *a, const char *b) | |||
| 83 | return parameqn(a, b, strlen(a)+1); | 84 | return parameqn(a, b, strlen(a)+1); |
| 84 | } | 85 | } |
| 85 | 86 | ||
| 87 | static void param_check_unsafe(const struct kernel_param *kp) | ||
| 88 | { | ||
| 89 | if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { | ||
| 90 | pr_warn("Setting dangerous option %s - tainting kernel\n", | ||
| 91 | kp->name); | ||
| 92 | add_taint(TAINT_USER, LOCKDEP_STILL_OK); | ||
| 93 | } | ||
| 94 | } | ||
| 95 | |||
| 86 | static int parse_one(char *param, | 96 | static int parse_one(char *param, |
| 87 | char *val, | 97 | char *val, |
| 88 | const char *doing, | 98 | const char *doing, |
| @@ -104,11 +114,12 @@ static int parse_one(char *param, | |||
| 104 | return 0; | 114 | return 0; |
| 105 | /* No one handled NULL, so do it here. */ | 115 | /* No one handled NULL, so do it here. */ |
| 106 | if (!val && | 116 | if (!val && |
| 107 | !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) | 117 | !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG)) |
| 108 | return -EINVAL; | 118 | return -EINVAL; |
| 109 | pr_debug("handling %s with %p\n", param, | 119 | pr_debug("handling %s with %p\n", param, |
| 110 | params[i].ops->set); | 120 | params[i].ops->set); |
| 111 | mutex_lock(¶m_lock); | 121 | mutex_lock(¶m_lock); |
| 122 | param_check_unsafe(¶ms[i]); | ||
| 112 | err = params[i].ops->set(val, ¶ms[i]); | 123 | err = params[i].ops->set(val, ¶ms[i]); |
| 113 | mutex_unlock(¶m_lock); | 124 | mutex_unlock(¶m_lock); |
| 114 | return err; | 125 | return err; |
| @@ -318,7 +329,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) | |||
| 318 | EXPORT_SYMBOL(param_get_bool); | 329 | EXPORT_SYMBOL(param_get_bool); |
| 319 | 330 | ||
| 320 | struct kernel_param_ops param_ops_bool = { | 331 | struct kernel_param_ops param_ops_bool = { |
| 321 | .flags = KERNEL_PARAM_FL_NOARG, | 332 | .flags = KERNEL_PARAM_OPS_FL_NOARG, |
| 322 | .set = param_set_bool, | 333 | .set = param_set_bool, |
| 323 | .get = param_get_bool, | 334 | .get = param_get_bool, |
| 324 | }; | 335 | }; |
| @@ -369,7 +380,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) | |||
| 369 | EXPORT_SYMBOL(param_set_bint); | 380 | EXPORT_SYMBOL(param_set_bint); |
| 370 | 381 | ||
| 371 | struct kernel_param_ops param_ops_bint = { | 382 | struct kernel_param_ops param_ops_bint = { |
| 372 | .flags = KERNEL_PARAM_FL_NOARG, | 383 | .flags = KERNEL_PARAM_OPS_FL_NOARG, |
| 373 | .set = param_set_bint, | 384 | .set = param_set_bint, |
| 374 | .get = param_get_int, | 385 | .get = param_get_int, |
| 375 | }; | 386 | }; |
| @@ -503,8 +514,6 @@ EXPORT_SYMBOL(param_ops_string); | |||
| 503 | #define to_module_attr(n) container_of(n, struct module_attribute, attr) | 514 | #define to_module_attr(n) container_of(n, struct module_attribute, attr) |
| 504 | #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) | 515 | #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) |
| 505 | 516 | ||
| 506 | extern struct kernel_param __start___param[], __stop___param[]; | ||
| 507 | |||
| 508 | struct param_attribute | 517 | struct param_attribute |
| 509 | { | 518 | { |
| 510 | struct module_attribute mattr; | 519 | struct module_attribute mattr; |
| @@ -552,6 +561,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, | |||
| 552 | return -EPERM; | 561 | return -EPERM; |
| 553 | 562 | ||
| 554 | mutex_lock(¶m_lock); | 563 | mutex_lock(¶m_lock); |
| 564 | param_check_unsafe(attribute->param); | ||
| 555 | err = attribute->param->ops->set(buf, attribute->param); | 565 | err = attribute->param->ops->set(buf, attribute->param); |
| 556 | mutex_unlock(¶m_lock); | 566 | mutex_unlock(¶m_lock); |
| 557 | if (!err) | 567 | if (!err) |
| @@ -593,74 +603,67 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, | |||
| 593 | const struct kernel_param *kp, | 603 | const struct kernel_param *kp, |
| 594 | const char *name) | 604 | const char *name) |
| 595 | { | 605 | { |
| 596 | struct module_param_attrs *new; | 606 | struct module_param_attrs *new_mp; |
| 597 | struct attribute **attrs; | 607 | struct attribute **new_attrs; |
| 598 | int err, num; | 608 | unsigned int i; |
| 599 | 609 | ||
| 600 | /* We don't bother calling this with invisible parameters. */ | 610 | /* We don't bother calling this with invisible parameters. */ |
| 601 | BUG_ON(!kp->perm); | 611 | BUG_ON(!kp->perm); |
| 602 | 612 | ||
| 603 | if (!mk->mp) { | 613 | if (!mk->mp) { |
| 604 | num = 0; | 614 | /* First allocation. */ |
| 605 | attrs = NULL; | 615 | mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL); |
| 606 | } else { | 616 | if (!mk->mp) |
| 607 | num = mk->mp->num; | 617 | return -ENOMEM; |
| 608 | attrs = mk->mp->grp.attrs; | 618 | mk->mp->grp.name = "parameters"; |
| 619 | /* NULL-terminated attribute array. */ | ||
| 620 | mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]), | ||
| 621 | GFP_KERNEL); | ||
| 622 | /* Caller will cleanup via free_module_param_attrs */ | ||
| 623 | if (!mk->mp->grp.attrs) | ||
| 624 | return -ENOMEM; | ||
| 609 | } | 625 | } |
| 610 | 626 | ||
| 611 | /* Enlarge. */ | 627 | /* Enlarge allocations. */ |
| 612 | new = krealloc(mk->mp, | 628 | new_mp = krealloc(mk->mp, |
| 613 | sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), | 629 | sizeof(*mk->mp) + |
| 614 | GFP_KERNEL); | 630 | sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), |
| 615 | if (!new) { | 631 | GFP_KERNEL); |
| 616 | kfree(attrs); | 632 | if (!new_mp) |
| 617 | err = -ENOMEM; | 633 | return -ENOMEM; |
| 618 | goto fail; | 634 | mk->mp = new_mp; |
| 619 | } | ||
| 620 | /* Despite looking like the typical realloc() bug, this is safe. | ||
| 621 | * We *want* the old 'attrs' to be freed either way, and we'll store | ||
| 622 | * the new one in the success case. */ | ||
| 623 | attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); | ||
| 624 | if (!attrs) { | ||
| 625 | err = -ENOMEM; | ||
| 626 | goto fail_free_new; | ||
| 627 | } | ||
| 628 | 635 | ||
| 629 | /* Sysfs wants everything zeroed. */ | 636 | /* Extra pointer for NULL terminator */ |
| 630 | memset(new, 0, sizeof(*new)); | 637 | new_attrs = krealloc(mk->mp->grp.attrs, |
| 631 | memset(&new->attrs[num], 0, sizeof(new->attrs[num])); | 638 | sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), |
| 632 | memset(&attrs[num], 0, sizeof(attrs[num])); | 639 | GFP_KERNEL); |
| 633 | new->grp.name = "parameters"; | 640 | if (!new_attrs) |
| 634 | new->grp.attrs = attrs; | 641 | return -ENOMEM; |
| 642 | mk->mp->grp.attrs = new_attrs; | ||
| 635 | 643 | ||
| 636 | /* Tack new one on the end. */ | 644 | /* Tack new one on the end. */ |
| 637 | sysfs_attr_init(&new->attrs[num].mattr.attr); | 645 | sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); |
| 638 | new->attrs[num].param = kp; | 646 | mk->mp->attrs[mk->mp->num].param = kp; |
| 639 | new->attrs[num].mattr.show = param_attr_show; | 647 | mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; |
| 640 | new->attrs[num].mattr.store = param_attr_store; | 648 | /* Do not allow runtime DAC changes to make param writable. */ |
| 641 | new->attrs[num].mattr.attr.name = (char *)name; | 649 | if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) |
| 642 | new->attrs[num].mattr.attr.mode = kp->perm; | 650 | mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; |
| 643 | new->num = num+1; | 651 | mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; |
| 652 | mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; | ||
| 653 | mk->mp->num++; | ||
| 644 | 654 | ||
| 645 | /* Fix up all the pointers, since krealloc can move us */ | 655 | /* Fix up all the pointers, since krealloc can move us */ |
| 646 | for (num = 0; num < new->num; num++) | 656 | for (i = 0; i < mk->mp->num; i++) |
| 647 | new->grp.attrs[num] = &new->attrs[num].mattr.attr; | 657 | mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr; |
| 648 | new->grp.attrs[num] = NULL; | 658 | mk->mp->grp.attrs[mk->mp->num] = NULL; |
| 649 | |||
| 650 | mk->mp = new; | ||
| 651 | return 0; | 659 | return 0; |
| 652 | |||
| 653 | fail_free_new: | ||
| 654 | kfree(new); | ||
| 655 | fail: | ||
| 656 | mk->mp = NULL; | ||
| 657 | return err; | ||
| 658 | } | 660 | } |
| 659 | 661 | ||
| 660 | #ifdef CONFIG_MODULES | 662 | #ifdef CONFIG_MODULES |
| 661 | static void free_module_param_attrs(struct module_kobject *mk) | 663 | static void free_module_param_attrs(struct module_kobject *mk) |
| 662 | { | 664 | { |
| 663 | kfree(mk->mp->grp.attrs); | 665 | if (mk->mp) |
| 666 | kfree(mk->mp->grp.attrs); | ||
| 664 | kfree(mk->mp); | 667 | kfree(mk->mp); |
| 665 | mk->mp = NULL; | 668 | mk->mp = NULL; |
| 666 | } | 669 | } |
| @@ -685,8 +688,10 @@ int module_param_sysfs_setup(struct module *mod, | |||
| 685 | if (kparam[i].perm == 0) | 688 | if (kparam[i].perm == 0) |
| 686 | continue; | 689 | continue; |
| 687 | err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); | 690 | err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); |
| 688 | if (err) | 691 | if (err) { |
| 692 | free_module_param_attrs(&mod->mkobj); | ||
| 689 | return err; | 693 | return err; |
| 694 | } | ||
| 690 | params = true; | 695 | params = true; |
| 691 | } | 696 | } |
| 692 | 697 | ||
| @@ -763,7 +768,7 @@ static struct module_kobject * __init locate_module_kobject(const char *name) | |||
| 763 | } | 768 | } |
| 764 | 769 | ||
| 765 | static void __init kernel_add_sysfs_param(const char *name, | 770 | static void __init kernel_add_sysfs_param(const char *name, |
| 766 | struct kernel_param *kparam, | 771 | const struct kernel_param *kparam, |
| 767 | unsigned int name_skip) | 772 | unsigned int name_skip) |
| 768 | { | 773 | { |
| 769 | struct module_kobject *mk; | 774 | struct module_kobject *mk; |
| @@ -798,7 +803,7 @@ static void __init kernel_add_sysfs_param(const char *name, | |||
| 798 | */ | 803 | */ |
| 799 | static void __init param_sysfs_builtin(void) | 804 | static void __init param_sysfs_builtin(void) |
| 800 | { | 805 | { |
| 801 | struct kernel_param *kp; | 806 | const struct kernel_param *kp; |
| 802 | unsigned int name_len; | 807 | unsigned int name_len; |
| 803 | char modname[MODULE_NAME_LEN]; | 808 | char modname[MODULE_NAME_LEN]; |
| 804 | 809 | ||
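Besides the KERNEL_PARAM_OPS_FL_NOARG rename and the new param_check_unsafe() taint, the largest params.c change rewrites add_sysfs_param() so the attribute array is always kept NULL-terminated and grown in place with krealloc(), leaving cleanup on failure to free_module_param_attrs(). A small userspace sketch of that grow-and-reterminate bookkeeping, with assumed structure names and standard realloc() standing in for krealloc():

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct attrs {
	size_t num;
	char **names;		/* kept NULL-terminated, like mp->grp.attrs */
};

static int add_attr(struct attrs *a, const char *name)
{
	/* existing entries + the new one + the NULL terminator */
	char **grown = realloc(a->names, (a->num + 2) * sizeof(*grown));

	if (!grown)
		return -1;	/* caller frees whatever is already there */
	a->names = grown;
	a->names[a->num++] = strdup(name);
	a->names[a->num] = NULL;
	return 0;
}

int main(void)
{
	struct attrs a = { 0, NULL };

	if (add_attr(&a, "debug") || add_attr(&a, "verbose"))
		return 1;
	for (char **p = a.names; *p; p++)
		puts(*p);
	return 0;
}
```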
diff --git a/kernel/pid.c b/kernel/pid.c index 9b9a26698144..cd36a5e0d173 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = { | |||
| 79 | .level = 0, | 79 | .level = 0, |
| 80 | .child_reaper = &init_task, | 80 | .child_reaper = &init_task, |
| 81 | .user_ns = &init_user_ns, | 81 | .user_ns = &init_user_ns, |
| 82 | .proc_inum = PROC_PID_INIT_INO, | 82 | .ns.inum = PROC_PID_INIT_INO, |
| 83 | #ifdef CONFIG_PID_NS | ||
| 84 | .ns.ops = &pidns_operations, | ||
| 85 | #endif | ||
| 83 | }; | 86 | }; |
| 84 | EXPORT_SYMBOL_GPL(init_pid_ns); | 87 | EXPORT_SYMBOL_GPL(init_pid_ns); |
| 85 | 88 | ||
| @@ -341,6 +344,8 @@ out: | |||
| 341 | 344 | ||
| 342 | out_unlock: | 345 | out_unlock: |
| 343 | spin_unlock_irq(&pidmap_lock); | 346 | spin_unlock_irq(&pidmap_lock); |
| 347 | put_pid_ns(ns); | ||
| 348 | |||
| 344 | out_free: | 349 | out_free: |
| 345 | while (++i <= ns->level) | 350 | while (++i <= ns->level) |
| 346 | free_pidmap(pid->numbers + i); | 351 | free_pidmap(pid->numbers + i); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index db95d8eb761b..a65ba137fd15 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
| @@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns | |||
| 105 | if (ns->pid_cachep == NULL) | 105 | if (ns->pid_cachep == NULL) |
| 106 | goto out_free_map; | 106 | goto out_free_map; |
| 107 | 107 | ||
| 108 | err = proc_alloc_inum(&ns->proc_inum); | 108 | err = ns_alloc_inum(&ns->ns); |
| 109 | if (err) | 109 | if (err) |
| 110 | goto out_free_map; | 110 | goto out_free_map; |
| 111 | ns->ns.ops = &pidns_operations; | ||
| 111 | 112 | ||
| 112 | kref_init(&ns->kref); | 113 | kref_init(&ns->kref); |
| 113 | ns->level = level; | 114 | ns->level = level; |
| @@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) | |||
| 142 | { | 143 | { |
| 143 | int i; | 144 | int i; |
| 144 | 145 | ||
| 145 | proc_free_inum(ns->proc_inum); | 146 | ns_free_inum(&ns->ns); |
| 146 | for (i = 0; i < PIDMAP_ENTRIES; i++) | 147 | for (i = 0; i < PIDMAP_ENTRIES; i++) |
| 147 | kfree(ns->pidmap[i].page); | 148 | kfree(ns->pidmap[i].page); |
| 148 | put_user_ns(ns->user_ns); | 149 | put_user_ns(ns->user_ns); |
| @@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
| 190 | /* Don't allow any more processes into the pid namespace */ | 191 | /* Don't allow any more processes into the pid namespace */ |
| 191 | disable_pid_allocation(pid_ns); | 192 | disable_pid_allocation(pid_ns); |
| 192 | 193 | ||
| 193 | /* Ignore SIGCHLD causing any terminated children to autoreap */ | 194 | /* |
| 195 | * Ignore SIGCHLD causing any terminated children to autoreap. | ||
| 196 | * This speeds up the namespace shutdown, plus see the comment | ||
| 197 | * below. | ||
| 198 | */ | ||
| 194 | spin_lock_irq(&me->sighand->siglock); | 199 | spin_lock_irq(&me->sighand->siglock); |
| 195 | me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; | 200 | me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; |
| 196 | spin_unlock_irq(&me->sighand->siglock); | 201 | spin_unlock_irq(&me->sighand->siglock); |
| @@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
| 223 | } | 228 | } |
| 224 | read_unlock(&tasklist_lock); | 229 | read_unlock(&tasklist_lock); |
| 225 | 230 | ||
| 226 | /* Firstly reap the EXIT_ZOMBIE children we may have. */ | 231 | /* |
| 232 | * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. | ||
| 233 | * sys_wait4() will also block until our children traced from the | ||
| 234 | * parent namespace are detached and become EXIT_DEAD. | ||
| 235 | */ | ||
| 227 | do { | 236 | do { |
| 228 | clear_thread_flag(TIF_SIGPENDING); | 237 | clear_thread_flag(TIF_SIGPENDING); |
| 229 | rc = sys_wait4(-1, NULL, __WALL, NULL); | 238 | rc = sys_wait4(-1, NULL, __WALL, NULL); |
| 230 | } while (rc != -ECHILD); | 239 | } while (rc != -ECHILD); |
| 231 | 240 | ||
| 232 | /* | 241 | /* |
| 233 | * sys_wait4() above can't reap the TASK_DEAD children. | 242 | * sys_wait4() above can't reap the EXIT_DEAD children but we do not |
| 234 | * Make sure they all go away, see free_pid(). | 243 | * really care, we could reparent them to the global init. We could |
| 244 | * exit and reap ->child_reaper even if it is not the last thread in | ||
| 245 | * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), | ||
| 246 | * pid_ns can not go away until proc_kill_sb() drops the reference. | ||
| 247 | * | ||
| 248 | * But this ns can also have other tasks injected by setns()+fork(). | ||
| 249 | * Again, ignoring the user visible semantics we do not really need | ||
| 250 | * to wait until they are all reaped, but they can be reparented to | ||
| 251 | * us and thus we need to ensure that pid->child_reaper stays valid | ||
| 252 | * until they all go away. See free_pid()->wake_up_process(). | ||
| 253 | * | ||
| 254 | * We rely on ignored SIGCHLD, an injected zombie must be autoreaped | ||
| 255 | * if reparented. | ||
| 235 | */ | 256 | */ |
| 236 | for (;;) { | 257 | for (;;) { |
| 237 | set_current_state(TASK_UNINTERRUPTIBLE); | 258 | set_current_state(TASK_UNINTERRUPTIBLE); |
| @@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | |||
| 313 | return 0; | 334 | return 0; |
| 314 | } | 335 | } |
| 315 | 336 | ||
| 316 | static void *pidns_get(struct task_struct *task) | 337 | static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) |
| 338 | { | ||
| 339 | return container_of(ns, struct pid_namespace, ns); | ||
| 340 | } | ||
| 341 | |||
| 342 | static struct ns_common *pidns_get(struct task_struct *task) | ||
| 317 | { | 343 | { |
| 318 | struct pid_namespace *ns; | 344 | struct pid_namespace *ns; |
| 319 | 345 | ||
| @@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task) | |||
| 323 | get_pid_ns(ns); | 349 | get_pid_ns(ns); |
| 324 | rcu_read_unlock(); | 350 | rcu_read_unlock(); |
| 325 | 351 | ||
| 326 | return ns; | 352 | return ns ? &ns->ns : NULL; |
| 327 | } | 353 | } |
| 328 | 354 | ||
| 329 | static void pidns_put(void *ns) | 355 | static void pidns_put(struct ns_common *ns) |
| 330 | { | 356 | { |
| 331 | put_pid_ns(ns); | 357 | put_pid_ns(to_pid_ns(ns)); |
| 332 | } | 358 | } |
| 333 | 359 | ||
| 334 | static int pidns_install(struct nsproxy *nsproxy, void *ns) | 360 | static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) |
| 335 | { | 361 | { |
| 336 | struct pid_namespace *active = task_active_pid_ns(current); | 362 | struct pid_namespace *active = task_active_pid_ns(current); |
| 337 | struct pid_namespace *ancestor, *new = ns; | 363 | struct pid_namespace *ancestor, *new = to_pid_ns(ns); |
| 338 | 364 | ||
| 339 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || | 365 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || |
| 340 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) | 366 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
| @@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) | |||
| 362 | return 0; | 388 | return 0; |
| 363 | } | 389 | } |
| 364 | 390 | ||
| 365 | static unsigned int pidns_inum(void *ns) | ||
| 366 | { | ||
| 367 | struct pid_namespace *pid_ns = ns; | ||
| 368 | return pid_ns->proc_inum; | ||
| 369 | } | ||
| 370 | |||
| 371 | const struct proc_ns_operations pidns_operations = { | 391 | const struct proc_ns_operations pidns_operations = { |
| 372 | .name = "pid", | 392 | .name = "pid", |
| 373 | .type = CLONE_NEWPID, | 393 | .type = CLONE_NEWPID, |
| 374 | .get = pidns_get, | 394 | .get = pidns_get, |
| 375 | .put = pidns_put, | 395 | .put = pidns_put, |
| 376 | .install = pidns_install, | 396 | .install = pidns_install, |
| 377 | .inum = pidns_inum, | ||
| 378 | }; | 397 | }; |
| 379 | 398 | ||
| 380 | static __init int pid_namespaces_init(void) | 399 | static __init int pid_namespaces_init(void) |
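The nsproxy.c and pid_namespace.c hunks are two halves of the same conversion: every namespace now embeds a struct ns_common that carries the inode number and the ops table, setns() dispatches through ns->ops, and each subsystem converts back to its own type with a container_of() helper such as to_pid_ns(). A hypothetical, self-contained userspace model of that layout:

```c
#include <stdio.h>
#include <stddef.h>

struct ns_ops;

struct ns_common {			/* shared header, as in the conversion above */
	const struct ns_ops *ops;
	unsigned int inum;
};

struct ns_ops {
	const char *name;
	int type;
	int (*install)(struct ns_common *ns);
};

struct pid_ns {				/* one concrete namespace type */
	int level;
	struct ns_common ns;		/* embedded common part */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static int pidns_install(struct ns_common *ns)
{
	struct pid_ns *p = container_of(ns, struct pid_ns, ns);

	printf("installing pid ns, level %d, inum %u\n", p->level, ns->inum);
	return 0;
}

static const struct ns_ops pidns_ops = { "pid", 0x20000000 /* CLONE_NEWPID */, pidns_install };

int main(void)
{
	struct pid_ns p = { .level = 1, .ns = { &pidns_ops, 4026531836u } };
	struct ns_common *ns = &p.ns;	/* what get_proc_ns() would hand back */

	return ns->ops->install(ns);	/* dispatch through the common header */
}
```

With the ops pointer living in the common header, the per-type .inum callback becomes redundant, which is exactly why the pid_namespace.c hunk deletes pidns_inum().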
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e4e4121fa327..48b28d387c7f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -94,6 +94,7 @@ config PM_STD_PARTITION | |||
| 94 | config PM_SLEEP | 94 | config PM_SLEEP |
| 95 | def_bool y | 95 | def_bool y |
| 96 | depends on SUSPEND || HIBERNATE_CALLBACKS | 96 | depends on SUSPEND || HIBERNATE_CALLBACKS |
| 97 | select PM | ||
| 97 | 98 | ||
| 98 | config PM_SLEEP_SMP | 99 | config PM_SLEEP_SMP |
| 99 | def_bool y | 100 | def_bool y |
| @@ -129,24 +130,19 @@ config PM_WAKELOCKS_GC | |||
| 129 | depends on PM_WAKELOCKS | 130 | depends on PM_WAKELOCKS |
| 130 | default y | 131 | default y |
| 131 | 132 | ||
| 132 | config PM_RUNTIME | 133 | config PM |
| 133 | bool "Run-time PM core functionality" | 134 | bool "Device power management core functionality" |
| 134 | depends on !IA64_HP_SIM | ||
| 135 | ---help--- | 135 | ---help--- |
| 136 | Enable functionality allowing I/O devices to be put into energy-saving | 136 | Enable functionality allowing I/O devices to be put into energy-saving |
| 137 | (low power) states at run time (or autosuspended) after a specified | 137 | (low power) states, for example after a specified period of inactivity |
| 138 | period of inactivity and woken up in response to a hardware-generated | 138 | (autosuspended), and woken up in response to a hardware-generated |
| 139 | wake-up event or a driver's request. | 139 | wake-up event or a driver's request. |
| 140 | 140 | ||
| 141 | Hardware support is generally required for this functionality to work | 141 | Hardware support is generally required for this functionality to work |
| 142 | and the bus type drivers of the buses the devices are on are | 142 | and the bus type drivers of the buses the devices are on are |
| 143 | responsible for the actual handling of the autosuspend requests and | 143 | responsible for the actual handling of device suspend requests and |
| 144 | wake-up events. | 144 | wake-up events. |
| 145 | 145 | ||
| 146 | config PM | ||
| 147 | def_bool y | ||
| 148 | depends on PM_SLEEP || PM_RUNTIME | ||
| 149 | |||
| 150 | config PM_DEBUG | 146 | config PM_DEBUG |
| 151 | bool "Power Management Debug Support" | 147 | bool "Power Management Debug Support" |
| 152 | depends on PM | 148 | depends on PM |
| @@ -298,10 +294,9 @@ config PM_GENERIC_DOMAINS_SLEEP | |||
| 298 | def_bool y | 294 | def_bool y |
| 299 | depends on PM_SLEEP && PM_GENERIC_DOMAINS | 295 | depends on PM_SLEEP && PM_GENERIC_DOMAINS |
| 300 | 296 | ||
| 301 | config PM_GENERIC_DOMAINS_RUNTIME | 297 | config PM_GENERIC_DOMAINS_OF |
| 302 | def_bool y | 298 | def_bool y |
| 303 | depends on PM_RUNTIME && PM_GENERIC_DOMAINS | 299 | depends on PM_GENERIC_DOMAINS && OF |
| 304 | 300 | ||
| 305 | config CPU_PM | 301 | config CPU_PM |
| 306 | bool | 302 | bool |
| 307 | depends on SUSPEND || CPU_IDLE | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a9dfa79b6bab..2329daae5255 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/syscore_ops.h> | 28 | #include <linux/syscore_ops.h> |
| 29 | #include <linux/ctype.h> | 29 | #include <linux/ctype.h> |
| 30 | #include <linux/genhd.h> | 30 | #include <linux/genhd.h> |
| 31 | #include <linux/ktime.h> | ||
| 31 | #include <trace/events/power.h> | 32 | #include <trace/events/power.h> |
| 32 | 33 | ||
| 33 | #include "power.h" | 34 | #include "power.h" |
| @@ -232,20 +233,17 @@ static void platform_recover(int platform_mode) | |||
| 232 | * @nr_pages: Number of memory pages processed between @start and @stop. | 233 | * @nr_pages: Number of memory pages processed between @start and @stop. |
| 233 | * @msg: Additional diagnostic message to print. | 234 | * @msg: Additional diagnostic message to print. |
| 234 | */ | 235 | */ |
| 235 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, | 236 | void swsusp_show_speed(ktime_t start, ktime_t stop, |
| 236 | unsigned nr_pages, char *msg) | 237 | unsigned nr_pages, char *msg) |
| 237 | { | 238 | { |
| 239 | ktime_t diff; | ||
| 238 | u64 elapsed_centisecs64; | 240 | u64 elapsed_centisecs64; |
| 239 | unsigned int centisecs; | 241 | unsigned int centisecs; |
| 240 | unsigned int k; | 242 | unsigned int k; |
| 241 | unsigned int kps; | 243 | unsigned int kps; |
| 242 | 244 | ||
| 243 | elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); | 245 | diff = ktime_sub(stop, start); |
| 244 | /* | 246 | elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC); |
| 245 | * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time, | ||
| 246 | * it is obvious enough for what went wrong. | ||
| 247 | */ | ||
| 248 | do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); | ||
| 249 | centisecs = elapsed_centisecs64; | 247 | centisecs = elapsed_centisecs64; |
| 250 | if (centisecs == 0) | 248 | if (centisecs == 0) |
| 251 | centisecs = 1; /* avoid div-by-zero */ | 249 | centisecs = 1; /* avoid div-by-zero */ |
| @@ -502,8 +500,14 @@ int hibernation_restore(int platform_mode) | |||
| 502 | error = dpm_suspend_start(PMSG_QUIESCE); | 500 | error = dpm_suspend_start(PMSG_QUIESCE); |
| 503 | if (!error) { | 501 | if (!error) { |
| 504 | error = resume_target_kernel(platform_mode); | 502 | error = resume_target_kernel(platform_mode); |
| 505 | dpm_resume_end(PMSG_RECOVER); | 503 | /* |
| 504 | * The above should either succeed and jump to the new kernel, | ||
| 505 | * or return with an error. Otherwise things are just | ||
| 506 | * undefined, so let's be paranoid. | ||
| 507 | */ | ||
| 508 | BUG_ON(!error); | ||
| 506 | } | 509 | } |
| 510 | dpm_resume_end(PMSG_RECOVER); | ||
| 507 | pm_restore_gfp_mask(); | 511 | pm_restore_gfp_mask(); |
| 508 | resume_console(); | 512 | resume_console(); |
| 509 | pm_restore_console(); | 513 | pm_restore_console(); |
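swsusp_show_speed() above switches from struct timeval arithmetic to ktime_t, reducing the interval with ktime_sub()/ktime_divns() to centiseconds before computing throughput. A rough userspace equivalent of the same computation, assuming 4 KiB pages and clock_gettime() standing in for ktime_get():

```c
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NSEC_PER_MSEC	1000000LL
#define PAGE_KB		4		/* assumption: 4 KiB pages */

static int64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void show_speed(int64_t start, int64_t stop, unsigned int nr_pages,
		       const char *msg)
{
	int64_t centisecs = (stop - start) / (10 * NSEC_PER_MSEC);
	unsigned int k, kps;

	if (centisecs == 0)
		centisecs = 1;			/* avoid div-by-zero */
	k = nr_pages * PAGE_KB;
	kps = (unsigned int)(k * 100 / centisecs);
	printf("%s %u kbytes in %lld.%02lld seconds (%u.%02u MB/s)\n",
	       msg, k, (long long)(centisecs / 100),
	       (long long)(centisecs % 100), kps / 1000, (kps % 1000) / 10);
}

int main(void)
{
	int64_t start = now_ns();
	/* pretend the timed work took a quarter of a second */
	int64_t stop = now_ns() + 250 * NSEC_PER_MSEC;

	show_speed(start, stop, 1000, "Allocated");
	return 0;
}
```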
diff --git a/kernel/power/power.h b/kernel/power/power.h index 2df883a9d3cb..ce9b8328a689 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain); | |||
| 174 | 174 | ||
| 175 | struct timeval; | 175 | struct timeval; |
| 176 | /* kernel/power/swsusp.c */ | 176 | /* kernel/power/swsusp.c */ |
| 177 | extern void swsusp_show_speed(struct timeval *, struct timeval *, | 177 | extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); |
| 178 | unsigned int, char *); | ||
| 179 | 178 | ||
| 180 | #ifdef CONFIG_SUSPEND | 179 | #ifdef CONFIG_SUSPEND |
| 181 | /* kernel/power/suspend.c */ | 180 | /* kernel/power/suspend.c */ |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 4ee194eb524b..5a6ec8678b9a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 46 | while (true) { | 46 | while (true) { |
| 47 | todo = 0; | 47 | todo = 0; |
| 48 | read_lock(&tasklist_lock); | 48 | read_lock(&tasklist_lock); |
| 49 | do_each_thread(g, p) { | 49 | for_each_process_thread(g, p) { |
| 50 | if (p == current || !freeze_task(p)) | 50 | if (p == current || !freeze_task(p)) |
| 51 | continue; | 51 | continue; |
| 52 | 52 | ||
| 53 | if (!freezer_should_skip(p)) | 53 | if (!freezer_should_skip(p)) |
| 54 | todo++; | 54 | todo++; |
| 55 | } while_each_thread(g, p); | 55 | } |
| 56 | read_unlock(&tasklist_lock); | 56 | read_unlock(&tasklist_lock); |
| 57 | 57 | ||
| 58 | if (!user_only) { | 58 | if (!user_only) { |
| @@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 93 | 93 | ||
| 94 | if (!wakeup) { | 94 | if (!wakeup) { |
| 95 | read_lock(&tasklist_lock); | 95 | read_lock(&tasklist_lock); |
| 96 | do_each_thread(g, p) { | 96 | for_each_process_thread(g, p) { |
| 97 | if (p != current && !freezer_should_skip(p) | 97 | if (p != current && !freezer_should_skip(p) |
| 98 | && freezing(p) && !frozen(p)) | 98 | && freezing(p) && !frozen(p)) |
| 99 | sched_show_task(p); | 99 | sched_show_task(p); |
| 100 | } while_each_thread(g, p); | 100 | } |
| 101 | read_unlock(&tasklist_lock); | 101 | read_unlock(&tasklist_lock); |
| 102 | } | 102 | } |
| 103 | } else { | 103 | } else { |
| @@ -108,6 +108,30 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 108 | return todo ? -EBUSY : 0; | 108 | return todo ? -EBUSY : 0; |
| 109 | } | 109 | } |
| 110 | 110 | ||
| 111 | static bool __check_frozen_processes(void) | ||
| 112 | { | ||
| 113 | struct task_struct *g, *p; | ||
| 114 | |||
| 115 | for_each_process_thread(g, p) | ||
| 116 | if (p != current && !freezer_should_skip(p) && !frozen(p)) | ||
| 117 | return false; | ||
| 118 | |||
| 119 | return true; | ||
| 120 | } | ||
| 121 | |||
| 122 | /* | ||
| 123 | * Returns true if all freezable tasks (except for current) are frozen already | ||
| 124 | */ | ||
| 125 | static bool check_frozen_processes(void) | ||
| 126 | { | ||
| 127 | bool ret; | ||
| 128 | |||
| 129 | read_lock(&tasklist_lock); | ||
| 130 | ret = __check_frozen_processes(); | ||
| 131 | read_unlock(&tasklist_lock); | ||
| 132 | return ret; | ||
| 133 | } | ||
| 134 | |||
| 111 | /** | 135 | /** |
| 112 | * freeze_processes - Signal user space processes to enter the refrigerator. | 136 | * freeze_processes - Signal user space processes to enter the refrigerator. |
| 113 | * The current thread will not be frozen. The same process that calls | 137 | * The current thread will not be frozen. The same process that calls |
| @@ -118,6 +142,7 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 118 | int freeze_processes(void) | 142 | int freeze_processes(void) |
| 119 | { | 143 | { |
| 120 | int error; | 144 | int error; |
| 145 | int oom_kills_saved; | ||
| 121 | 146 | ||
| 122 | error = __usermodehelper_disable(UMH_FREEZING); | 147 | error = __usermodehelper_disable(UMH_FREEZING); |
| 123 | if (error) | 148 | if (error) |
| @@ -129,13 +154,28 @@ int freeze_processes(void) | |||
| 129 | if (!pm_freezing) | 154 | if (!pm_freezing) |
| 130 | atomic_inc(&system_freezing_cnt); | 155 | atomic_inc(&system_freezing_cnt); |
| 131 | 156 | ||
| 157 | pm_wakeup_clear(); | ||
| 132 | printk("Freezing user space processes ... "); | 158 | printk("Freezing user space processes ... "); |
| 133 | pm_freezing = true; | 159 | pm_freezing = true; |
| 160 | oom_kills_saved = oom_kills_count(); | ||
| 134 | error = try_to_freeze_tasks(true); | 161 | error = try_to_freeze_tasks(true); |
| 135 | if (!error) { | 162 | if (!error) { |
| 136 | printk("done."); | ||
| 137 | __usermodehelper_set_disable_depth(UMH_DISABLED); | 163 | __usermodehelper_set_disable_depth(UMH_DISABLED); |
| 138 | oom_killer_disable(); | 164 | oom_killer_disable(); |
| 165 | |||
| 166 | /* | ||
| 167 | * There might have been an OOM kill while we were | ||
| 168 | * freezing tasks and the killed task might be still | ||
| 169 | * on the way out so we have to double check for race. | ||
| 170 | */ | ||
| 171 | if (oom_kills_count() != oom_kills_saved && | ||
| 172 | !check_frozen_processes()) { | ||
| 173 | __usermodehelper_set_disable_depth(UMH_ENABLED); | ||
| 174 | printk("OOM in progress."); | ||
| 175 | error = -EBUSY; | ||
| 176 | } else { | ||
| 177 | printk("done."); | ||
| 178 | } | ||
| 139 | } | 179 | } |
| 140 | printk("\n"); | 180 | printk("\n"); |
| 141 | BUG_ON(in_atomic()); | 181 | BUG_ON(in_atomic()); |
| @@ -190,11 +230,11 @@ void thaw_processes(void) | |||
| 190 | thaw_workqueues(); | 230 | thaw_workqueues(); |
| 191 | 231 | ||
| 192 | read_lock(&tasklist_lock); | 232 | read_lock(&tasklist_lock); |
| 193 | do_each_thread(g, p) { | 233 | for_each_process_thread(g, p) { |
| 194 | /* No other threads should have PF_SUSPEND_TASK set */ | 234 | /* No other threads should have PF_SUSPEND_TASK set */ |
| 195 | WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); | 235 | WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); |
| 196 | __thaw_task(p); | 236 | __thaw_task(p); |
| 197 | } while_each_thread(g, p); | 237 | } |
| 198 | read_unlock(&tasklist_lock); | 238 | read_unlock(&tasklist_lock); |
| 199 | 239 | ||
| 200 | WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); | 240 | WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); |
| @@ -217,10 +257,10 @@ void thaw_kernel_threads(void) | |||
| 217 | thaw_workqueues(); | 257 | thaw_workqueues(); |
| 218 | 258 | ||
| 219 | read_lock(&tasklist_lock); | 259 | read_lock(&tasklist_lock); |
| 220 | do_each_thread(g, p) { | 260 | for_each_process_thread(g, p) { |
| 221 | if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) | 261 | if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) |
| 222 | __thaw_task(p); | 262 | __thaw_task(p); |
| 223 | } while_each_thread(g, p); | 263 | } |
| 224 | read_unlock(&tasklist_lock); | 264 | read_unlock(&tasklist_lock); |
| 225 | 265 | ||
| 226 | schedule(); | 266 | schedule(); |
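freeze_processes() now snapshots oom_kills_count() before freezing and, if the counter moved while tasks were being frozen, re-verifies with check_frozen_processes() that nothing escaped the freezer, returning -EBUSY otherwise. Stripped of the kernel plumbing, the snapshot-and-recheck shape looks roughly like this (hypothetical stand-in variables):

```c
#include <stdio.h>
#include <stdbool.h>

static int oom_kills;			/* stands in for oom_kills_count() */
static bool all_frozen = true;		/* stands in for check_frozen_processes() */

static int freeze_processes(void)
{
	int oom_kills_saved = oom_kills;	/* snapshot before freezing */

	/* ... freeze every task: slow, and it can race with the OOM killer ... */

	if (oom_kills != oom_kills_saved && !all_frozen) {
		fprintf(stderr, "OOM in progress.\n");
		return -1;			/* -EBUSY in the kernel */
	}
	return 0;				/* everything frozen */
}

int main(void)
{
	return freeze_processes() ? 1 : 0;
}
```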
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 884b77058864..5f4c006c4b1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
| @@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = { | |||
| 105 | }; | 105 | }; |
| 106 | 106 | ||
| 107 | 107 | ||
| 108 | static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); | ||
| 109 | static struct pm_qos_constraints memory_bw_constraints = { | ||
| 110 | .list = PLIST_HEAD_INIT(memory_bw_constraints.list), | ||
| 111 | .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, | ||
| 112 | .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, | ||
| 113 | .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, | ||
| 114 | .type = PM_QOS_SUM, | ||
| 115 | .notifiers = &memory_bandwidth_notifier, | ||
| 116 | }; | ||
| 117 | static struct pm_qos_object memory_bandwidth_pm_qos = { | ||
| 118 | .constraints = &memory_bw_constraints, | ||
| 119 | .name = "memory_bandwidth", | ||
| 120 | }; | ||
| 121 | |||
| 122 | |||
| 108 | static struct pm_qos_object *pm_qos_array[] = { | 123 | static struct pm_qos_object *pm_qos_array[] = { |
| 109 | &null_pm_qos, | 124 | &null_pm_qos, |
| 110 | &cpu_dma_pm_qos, | 125 | &cpu_dma_pm_qos, |
| 111 | &network_lat_pm_qos, | 126 | &network_lat_pm_qos, |
| 112 | &network_throughput_pm_qos | 127 | &network_throughput_pm_qos, |
| 128 | &memory_bandwidth_pm_qos, | ||
| 113 | }; | 129 | }; |
| 114 | 130 | ||
| 115 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | 131 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, |
| @@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = { | |||
| 130 | /* unlocked internal variant */ | 146 | /* unlocked internal variant */ |
| 131 | static inline int pm_qos_get_value(struct pm_qos_constraints *c) | 147 | static inline int pm_qos_get_value(struct pm_qos_constraints *c) |
| 132 | { | 148 | { |
| 149 | struct plist_node *node; | ||
| 150 | int total_value = 0; | ||
| 151 | |||
| 133 | if (plist_head_empty(&c->list)) | 152 | if (plist_head_empty(&c->list)) |
| 134 | return c->no_constraint_value; | 153 | return c->no_constraint_value; |
| 135 | 154 | ||
| @@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) | |||
| 140 | case PM_QOS_MAX: | 159 | case PM_QOS_MAX: |
| 141 | return plist_last(&c->list)->prio; | 160 | return plist_last(&c->list)->prio; |
| 142 | 161 | ||
| 162 | case PM_QOS_SUM: | ||
| 163 | plist_for_each(node, &c->list) | ||
| 164 | total_value += node->prio; | ||
| 165 | |||
| 166 | return total_value; | ||
| 167 | |||
| 143 | default: | 168 | default: |
| 144 | /* runtime check for not using enum */ | 169 | /* runtime check for not using enum */ |
| 145 | BUG(); | 170 | BUG(); |
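The qos.c hunks add a memory-bandwidth constraint class whose aggregate value is the sum of all requests (PM_QOS_SUM) rather than the minimum or maximum used by the existing latency and throughput classes. A compact userspace illustration of the three aggregation rules, using plain arrays instead of a plist:

```c
#include <stdio.h>

enum qos_type { QOS_MIN, QOS_MAX, QOS_SUM };

static int qos_get_value(enum qos_type type, const int *req, int n, int none)
{
	int i, min, max, total = 0;

	if (n == 0)
		return none;			/* no_constraint_value */

	switch (type) {
	case QOS_MIN:
		for (min = req[0], i = 1; i < n; i++)
			if (req[i] < min)
				min = req[i];
		return min;
	case QOS_MAX:
		for (max = req[0], i = 1; i < n; i++)
			if (req[i] > max)
				max = req[i];
		return max;
	case QOS_SUM:
		for (i = 0; i < n; i++)
			total += req[i];	/* aggregate, e.g. MB/s requests */
		return total;
	}
	return none;
}

int main(void)
{
	int requests[] = { 100, 250, 50 };	/* three bandwidth requests */

	printf("summed constraint: %d\n",
	       qos_get_value(QOS_SUM, requests, 3, 0));
	return 0;
}
```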
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f1604d8cf489..0c40c16174b4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/list.h> | 28 | #include <linux/list.h> |
| 29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
| 30 | #include <linux/compiler.h> | 30 | #include <linux/compiler.h> |
| 31 | #include <linux/ktime.h> | ||
| 31 | 32 | ||
| 32 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
| 33 | #include <asm/mmu_context.h> | 34 | #include <asm/mmu_context.h> |
| @@ -725,6 +726,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) | |||
| 725 | clear_bit(bit, addr); | 726 | clear_bit(bit, addr); |
| 726 | } | 727 | } |
| 727 | 728 | ||
| 729 | static void memory_bm_clear_current(struct memory_bitmap *bm) | ||
| 730 | { | ||
| 731 | int bit; | ||
| 732 | |||
| 733 | bit = max(bm->cur.node_bit - 1, 0); | ||
| 734 | clear_bit(bit, bm->cur.node->data); | ||
| 735 | } | ||
| 736 | |||
| 728 | static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) | 737 | static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) |
| 729 | { | 738 | { |
| 730 | void *addr; | 739 | void *addr; |
| @@ -1333,23 +1342,39 @@ static struct memory_bitmap copy_bm; | |||
| 1333 | 1342 | ||
| 1334 | void swsusp_free(void) | 1343 | void swsusp_free(void) |
| 1335 | { | 1344 | { |
| 1336 | struct zone *zone; | 1345 | unsigned long fb_pfn, fr_pfn; |
| 1337 | unsigned long pfn, max_zone_pfn; | ||
| 1338 | 1346 | ||
| 1339 | for_each_populated_zone(zone) { | 1347 | if (!forbidden_pages_map || !free_pages_map) |
| 1340 | max_zone_pfn = zone_end_pfn(zone); | 1348 | goto out; |
| 1341 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1349 | |
| 1342 | if (pfn_valid(pfn)) { | 1350 | memory_bm_position_reset(forbidden_pages_map); |
| 1343 | struct page *page = pfn_to_page(pfn); | 1351 | memory_bm_position_reset(free_pages_map); |
| 1344 | 1352 | ||
| 1345 | if (swsusp_page_is_forbidden(page) && | 1353 | loop: |
| 1346 | swsusp_page_is_free(page)) { | 1354 | fr_pfn = memory_bm_next_pfn(free_pages_map); |
| 1347 | swsusp_unset_page_forbidden(page); | 1355 | fb_pfn = memory_bm_next_pfn(forbidden_pages_map); |
| 1348 | swsusp_unset_page_free(page); | 1356 | |
| 1349 | __free_page(page); | 1357 | /* |
| 1350 | } | 1358 | * Find the next bit set in both bitmaps. This is guaranteed to |
| 1351 | } | 1359 | * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. |
| 1360 | */ | ||
| 1361 | do { | ||
| 1362 | if (fb_pfn < fr_pfn) | ||
| 1363 | fb_pfn = memory_bm_next_pfn(forbidden_pages_map); | ||
| 1364 | if (fr_pfn < fb_pfn) | ||
| 1365 | fr_pfn = memory_bm_next_pfn(free_pages_map); | ||
| 1366 | } while (fb_pfn != fr_pfn); | ||
| 1367 | |||
| 1368 | if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { | ||
| 1369 | struct page *page = pfn_to_page(fr_pfn); | ||
| 1370 | |||
| 1371 | memory_bm_clear_current(forbidden_pages_map); | ||
| 1372 | memory_bm_clear_current(free_pages_map); | ||
| 1373 | __free_page(page); | ||
| 1374 | goto loop; | ||
| 1352 | } | 1375 | } |
| 1376 | |||
| 1377 | out: | ||
| 1353 | nr_copy_pages = 0; | 1378 | nr_copy_pages = 0; |
| 1354 | nr_meta_pages = 0; | 1379 | nr_meta_pages = 0; |
| 1355 | restore_pblist = NULL; | 1380 | restore_pblist = NULL; |
| @@ -1552,11 +1577,11 @@ int hibernate_preallocate_memory(void) | |||
| 1552 | struct zone *zone; | 1577 | struct zone *zone; |
| 1553 | unsigned long saveable, size, max_size, count, highmem, pages = 0; | 1578 | unsigned long saveable, size, max_size, count, highmem, pages = 0; |
| 1554 | unsigned long alloc, save_highmem, pages_highmem, avail_normal; | 1579 | unsigned long alloc, save_highmem, pages_highmem, avail_normal; |
| 1555 | struct timeval start, stop; | 1580 | ktime_t start, stop; |
| 1556 | int error; | 1581 | int error; |
| 1557 | 1582 | ||
| 1558 | printk(KERN_INFO "PM: Preallocating image memory... "); | 1583 | printk(KERN_INFO "PM: Preallocating image memory... "); |
| 1559 | do_gettimeofday(&start); | 1584 | start = ktime_get(); |
| 1560 | 1585 | ||
| 1561 | error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); | 1586 | error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); |
| 1562 | if (error) | 1587 | if (error) |
| @@ -1685,9 +1710,9 @@ int hibernate_preallocate_memory(void) | |||
| 1685 | free_unnecessary_pages(); | 1710 | free_unnecessary_pages(); |
| 1686 | 1711 | ||
| 1687 | out: | 1712 | out: |
| 1688 | do_gettimeofday(&stop); | 1713 | stop = ktime_get(); |
| 1689 | printk(KERN_CONT "done (allocated %lu pages)\n", pages); | 1714 | printk(KERN_CONT "done (allocated %lu pages)\n", pages); |
| 1690 | swsusp_show_speed(&start, &stop, pages, "Allocated"); | 1715 | swsusp_show_speed(start, stop, pages, "Allocated"); |
| 1691 | 1716 | ||
| 1692 | return 0; | 1717 | return 0; |
| 1693 | 1718 | ||
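The rewritten swsusp_free() above no longer scans every pfn of every zone; it walks the forbidden and free bitmaps as two ordered iterators, advancing whichever lags until they agree, and frees only the pages set in both. The same merge-style walk over two sorted sequences, as a hypothetical userspace sketch:

```c
#include <stdio.h>
#include <limits.h>

#define END_OF_MAP ULONG_MAX	/* plays the role of BM_END_OF_MAP */

/* Return the next pfn from a sorted set, or END_OF_MAP when exhausted,
 * mirroring what memory_bm_next_pfn() does for a memory bitmap. */
static unsigned long next_pfn(const unsigned long *map, int len, int *pos)
{
	return (*pos < len) ? map[(*pos)++] : END_OF_MAP;
}

int main(void)
{
	unsigned long forbidden[] = { 3, 7, 9, 12 };	/* sorted pfn sets */
	unsigned long freemap[]   = { 2, 7, 10, 12 };
	int fb_pos = 0, fr_pos = 0;
	unsigned long fb = next_pfn(forbidden, 4, &fb_pos);
	unsigned long fr = next_pfn(freemap, 4, &fr_pos);

	for (;;) {
		/* advance the lagging side until both point at the same pfn */
		while (fb != fr) {
			if (fb < fr)
				fb = next_pfn(forbidden, 4, &fb_pos);
			else
				fr = next_pfn(freemap, 4, &fr_pos);
		}
		if (fr == END_OF_MAP)
			break;				/* both exhausted */
		printf("would free pfn %lu\n", fr);	/* set in both maps: 7, 12 */
		fb = next_pfn(forbidden, 4, &fb_pos);
		fr = next_pfn(freemap, 4, &fr_pos);
	}
	return 0;
}
```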
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 18c62195660f..c347e3ce3a55 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -146,17 +146,29 @@ static int platform_suspend_prepare(suspend_state_t state) | |||
| 146 | 146 | ||
| 147 | static int platform_suspend_prepare_late(suspend_state_t state) | 147 | static int platform_suspend_prepare_late(suspend_state_t state) |
| 148 | { | 148 | { |
| 149 | return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ? | ||
| 150 | freeze_ops->prepare() : 0; | ||
| 151 | } | ||
| 152 | |||
| 153 | static int platform_suspend_prepare_noirq(suspend_state_t state) | ||
| 154 | { | ||
| 149 | return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? | 155 | return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? |
| 150 | suspend_ops->prepare_late() : 0; | 156 | suspend_ops->prepare_late() : 0; |
| 151 | } | 157 | } |
| 152 | 158 | ||
| 153 | static void platform_suspend_wake(suspend_state_t state) | 159 | static void platform_resume_noirq(suspend_state_t state) |
| 154 | { | 160 | { |
| 155 | if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) | 161 | if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) |
| 156 | suspend_ops->wake(); | 162 | suspend_ops->wake(); |
| 157 | } | 163 | } |
| 158 | 164 | ||
| 159 | static void platform_suspend_finish(suspend_state_t state) | 165 | static void platform_resume_early(suspend_state_t state) |
| 166 | { | ||
| 167 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore) | ||
| 168 | freeze_ops->restore(); | ||
| 169 | } | ||
| 170 | |||
| 171 | static void platform_resume_finish(suspend_state_t state) | ||
| 160 | { | 172 | { |
| 161 | if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) | 173 | if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) |
| 162 | suspend_ops->finish(); | 174 | suspend_ops->finish(); |
| @@ -172,7 +184,7 @@ static int platform_suspend_begin(suspend_state_t state) | |||
| 172 | return 0; | 184 | return 0; |
| 173 | } | 185 | } |
| 174 | 186 | ||
| 175 | static void platform_suspend_end(suspend_state_t state) | 187 | static void platform_resume_end(suspend_state_t state) |
| 176 | { | 188 | { |
| 177 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) | 189 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) |
| 178 | freeze_ops->end(); | 190 | freeze_ops->end(); |
| @@ -180,7 +192,7 @@ static void platform_suspend_end(suspend_state_t state) | |||
| 180 | suspend_ops->end(); | 192 | suspend_ops->end(); |
| 181 | } | 193 | } |
| 182 | 194 | ||
| 183 | static void platform_suspend_recover(suspend_state_t state) | 195 | static void platform_recover(suspend_state_t state) |
| 184 | { | 196 | { |
| 185 | if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) | 197 | if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) |
| 186 | suspend_ops->recover(); | 198 | suspend_ops->recover(); |
| @@ -265,13 +277,22 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
| 265 | if (error) | 277 | if (error) |
| 266 | goto Platform_finish; | 278 | goto Platform_finish; |
| 267 | 279 | ||
| 268 | error = dpm_suspend_end(PMSG_SUSPEND); | 280 | error = dpm_suspend_late(PMSG_SUSPEND); |
| 269 | if (error) { | 281 | if (error) { |
| 270 | printk(KERN_ERR "PM: Some devices failed to power down\n"); | 282 | printk(KERN_ERR "PM: late suspend of devices failed\n"); |
| 271 | goto Platform_finish; | 283 | goto Platform_finish; |
| 272 | } | 284 | } |
| 273 | error = platform_suspend_prepare_late(state); | 285 | error = platform_suspend_prepare_late(state); |
| 274 | if (error) | 286 | if (error) |
| 287 | goto Devices_early_resume; | ||
| 288 | |||
| 289 | error = dpm_suspend_noirq(PMSG_SUSPEND); | ||
| 290 | if (error) { | ||
| 291 | printk(KERN_ERR "PM: noirq suspend of devices failed\n"); | ||
| 292 | goto Platform_early_resume; | ||
| 293 | } | ||
| 294 | error = platform_suspend_prepare_noirq(state); | ||
| 295 | if (error) | ||
| 275 | goto Platform_wake; | 296 | goto Platform_wake; |
| 276 | 297 | ||
| 277 | if (suspend_test(TEST_PLATFORM)) | 298 | if (suspend_test(TEST_PLATFORM)) |
| @@ -318,11 +339,17 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
| 318 | enable_nonboot_cpus(); | 339 | enable_nonboot_cpus(); |
| 319 | 340 | ||
| 320 | Platform_wake: | 341 | Platform_wake: |
| 321 | platform_suspend_wake(state); | 342 | platform_resume_noirq(state); |
| 322 | dpm_resume_start(PMSG_RESUME); | 343 | dpm_resume_noirq(PMSG_RESUME); |
| 344 | |||
| 345 | Platform_early_resume: | ||
| 346 | platform_resume_early(state); | ||
| 347 | |||
| 348 | Devices_early_resume: | ||
| 349 | dpm_resume_early(PMSG_RESUME); | ||
| 323 | 350 | ||
| 324 | Platform_finish: | 351 | Platform_finish: |
| 325 | platform_suspend_finish(state); | 352 | platform_resume_finish(state); |
| 326 | return error; | 353 | return error; |
| 327 | } | 354 | } |
| 328 | 355 | ||
| @@ -361,14 +388,16 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 361 | suspend_test_start(); | 388 | suspend_test_start(); |
| 362 | dpm_resume_end(PMSG_RESUME); | 389 | dpm_resume_end(PMSG_RESUME); |
| 363 | suspend_test_finish("resume devices"); | 390 | suspend_test_finish("resume devices"); |
| 391 | trace_suspend_resume(TPS("resume_console"), state, true); | ||
| 364 | resume_console(); | 392 | resume_console(); |
| 393 | trace_suspend_resume(TPS("resume_console"), state, false); | ||
| 365 | 394 | ||
| 366 | Close: | 395 | Close: |
| 367 | platform_suspend_end(state); | 396 | platform_resume_end(state); |
| 368 | return error; | 397 | return error; |
| 369 | 398 | ||
| 370 | Recover_platform: | 399 | Recover_platform: |
| 371 | platform_suspend_recover(state); | 400 | platform_recover(state); |
| 372 | goto Resume_devices; | 401 | goto Resume_devices; |
| 373 | } | 402 | } |
| 374 | 403 | ||
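The reworked suspend_enter() above splits the old dpm_suspend_end() call into dpm_suspend_late() plus dpm_suspend_noirq(), with a matching resume label for each stage so a failure at any point unwinds only the steps that actually completed, in reverse order. A hedged, generic sketch of that goto-unwind idiom (step_a/step_b/step_c and their undo_* counterparts are made-up names, not kernel functions):

	int do_steps(void)
	{
		int err;

		err = step_a();		/* e.g. late suspend of devices */
		if (err)
			return err;

		err = step_b();		/* e.g. noirq suspend of devices */
		if (err)
			goto undo_a;

		err = step_c();		/* e.g. platform noirq prepare */
		if (err)
			goto undo_b;

		return 0;		/* fully entered the target state */

	undo_b:
		undo_step_b();
	undo_a:
		undo_step_a();
		return err;
	}

In the patch, the Platform_wake, Platform_early_resume and Devices_early_resume labels play the role of the undo_* labels for the noirq and late phases.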
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index bd91bc177c93..084452e34a12 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c | |||
| @@ -22,6 +22,8 @@ | |||
| 22 | #define TEST_SUSPEND_SECONDS 10 | 22 | #define TEST_SUSPEND_SECONDS 10 |
| 23 | 23 | ||
| 24 | static unsigned long suspend_test_start_time; | 24 | static unsigned long suspend_test_start_time; |
| 25 | static u32 test_repeat_count_max = 1; | ||
| 26 | static u32 test_repeat_count_current; | ||
| 25 | 27 | ||
| 26 | void suspend_test_start(void) | 28 | void suspend_test_start(void) |
| 27 | { | 29 | { |
| @@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
| 74 | int status; | 76 | int status; |
| 75 | 77 | ||
| 76 | /* this may fail if the RTC hasn't been initialized */ | 78 | /* this may fail if the RTC hasn't been initialized */ |
| 79 | repeat: | ||
| 77 | status = rtc_read_time(rtc, &alm.time); | 80 | status = rtc_read_time(rtc, &alm.time); |
| 78 | if (status < 0) { | 81 | if (status < 0) { |
| 79 | printk(err_readtime, dev_name(&rtc->dev), status); | 82 | printk(err_readtime, dev_name(&rtc->dev), status); |
| @@ -100,10 +103,21 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
| 100 | if (state == PM_SUSPEND_STANDBY) { | 103 | if (state == PM_SUSPEND_STANDBY) { |
| 101 | printk(info_test, pm_states[state]); | 104 | printk(info_test, pm_states[state]); |
| 102 | status = pm_suspend(state); | 105 | status = pm_suspend(state); |
| 106 | if (status < 0) | ||
| 107 | state = PM_SUSPEND_FREEZE; | ||
| 103 | } | 108 | } |
| 109 | if (state == PM_SUSPEND_FREEZE) { | ||
| 110 | printk(info_test, pm_states[state]); | ||
| 111 | status = pm_suspend(state); | ||
| 112 | } | ||
| 113 | |||
| 104 | if (status < 0) | 114 | if (status < 0) |
| 105 | printk(err_suspend, status); | 115 | printk(err_suspend, status); |
| 106 | 116 | ||
| 117 | test_repeat_count_current++; | ||
| 118 | if (test_repeat_count_current < test_repeat_count_max) | ||
| 119 | goto repeat; | ||
| 120 | |||
| 107 | /* Some platforms can't detect that the alarm triggered the | 121 | /* Some platforms can't detect that the alarm triggered the |
| 108 | * wakeup, or (accordingly) disable it afterwards. | 122 | * wakeup, or (accordingly) disable it afterwards. |
| 109 | * It's supposed to give oneshot behavior; cope. | 123 | * It's supposed to give oneshot behavior; cope. |
| @@ -137,16 +151,28 @@ static char warn_bad_state[] __initdata = | |||
| 137 | static int __init setup_test_suspend(char *value) | 151 | static int __init setup_test_suspend(char *value) |
| 138 | { | 152 | { |
| 139 | int i; | 153 | int i; |
| 154 | char *repeat; | ||
| 155 | char *suspend_type; | ||
| 140 | 156 | ||
| 141 | /* "=mem" ==> "mem" */ | 157 | /* example : "=mem[,N]" ==> "mem[,N]" */ |
| 142 | value++; | 158 | value++; |
| 159 | suspend_type = strsep(&value, ","); | ||
| 160 | if (!suspend_type) | ||
| 161 | return 0; | ||
| 162 | |||
| 163 | repeat = strsep(&value, ","); | ||
| 164 | if (repeat) { | ||
| 165 | if (kstrtou32(repeat, 0, &test_repeat_count_max)) | ||
| 166 | return 0; | ||
| 167 | } | ||
| 168 | |||
| 143 | for (i = 0; pm_labels[i]; i++) | 169 | for (i = 0; pm_labels[i]; i++) |
| 144 | if (!strcmp(pm_labels[i], value)) { | 170 | if (!strcmp(pm_labels[i], suspend_type)) { |
| 145 | test_state_label = pm_labels[i]; | 171 | test_state_label = pm_labels[i]; |
| 146 | return 0; | 172 | return 0; |
| 147 | } | 173 | } |
| 148 | 174 | ||
| 149 | printk(warn_bad_state, value); | 175 | printk(warn_bad_state, suspend_type); |
| 150 | return 0; | 176 | return 0; |
| 151 | } | 177 | } |
| 152 | __setup("test_suspend", setup_test_suspend); | 178 | __setup("test_suspend", setup_test_suspend); |
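setup_test_suspend() now accepts an optional repeat count, i.e. test_suspend=mem,N. A minimal sketch of the strsep()/kstrtou32() parsing it relies on (the helper name and error handling here are illustrative, not the exact kernel code):

	/* Parse "type[,count]"; *count is left untouched when no count is given. */
	static int parse_type_and_count(char *value, const char **type, u32 *count)
	{
		char *repeat;

		*type = strsep(&value, ",");	/* "mem,5" -> *type = "mem", value = "5" */
		if (!*type)
			return -EINVAL;

		repeat = strsep(&value, ",");	/* NULL when no ",N" suffix was given */
		if (repeat && kstrtou32(repeat, 0, count))
			return -EINVAL;		/* not a valid unsigned number */

		return 0;
	}

The parsed type string is then matched against pm_labels[] exactly as before, while the count drives the new repeat: loop in test_wakealarm().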
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index aaa3261dea5d..570aff817543 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/atomic.h> | 30 | #include <linux/atomic.h> |
| 31 | #include <linux/kthread.h> | 31 | #include <linux/kthread.h> |
| 32 | #include <linux/crc32.h> | 32 | #include <linux/crc32.h> |
| 33 | #include <linux/ktime.h> | ||
| 33 | 34 | ||
| 34 | #include "power.h" | 35 | #include "power.h" |
| 35 | 36 | ||
| @@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle, | |||
| 445 | int nr_pages; | 446 | int nr_pages; |
| 446 | int err2; | 447 | int err2; |
| 447 | struct bio *bio; | 448 | struct bio *bio; |
| 448 | struct timeval start; | 449 | ktime_t start; |
| 449 | struct timeval stop; | 450 | ktime_t stop; |
| 450 | 451 | ||
| 451 | printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", | 452 | printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", |
| 452 | nr_to_write); | 453 | nr_to_write); |
| @@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle, | |||
| 455 | m = 1; | 456 | m = 1; |
| 456 | nr_pages = 0; | 457 | nr_pages = 0; |
| 457 | bio = NULL; | 458 | bio = NULL; |
| 458 | do_gettimeofday(&start); | 459 | start = ktime_get(); |
| 459 | while (1) { | 460 | while (1) { |
| 460 | ret = snapshot_read_next(snapshot); | 461 | ret = snapshot_read_next(snapshot); |
| 461 | if (ret <= 0) | 462 | if (ret <= 0) |
| @@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle, | |||
| 469 | nr_pages++; | 470 | nr_pages++; |
| 470 | } | 471 | } |
| 471 | err2 = hib_wait_on_bio_chain(&bio); | 472 | err2 = hib_wait_on_bio_chain(&bio); |
| 472 | do_gettimeofday(&stop); | 473 | stop = ktime_get(); |
| 473 | if (!ret) | 474 | if (!ret) |
| 474 | ret = err2; | 475 | ret = err2; |
| 475 | if (!ret) | 476 | if (!ret) |
| 476 | printk(KERN_INFO "PM: Image saving done.\n"); | 477 | printk(KERN_INFO "PM: Image saving done.\n"); |
| 477 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); | 478 | swsusp_show_speed(start, stop, nr_to_write, "Wrote"); |
| 478 | return ret; | 479 | return ret; |
| 479 | } | 480 | } |
| 480 | 481 | ||
| @@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
| 580 | int nr_pages; | 581 | int nr_pages; |
| 581 | int err2; | 582 | int err2; |
| 582 | struct bio *bio; | 583 | struct bio *bio; |
| 583 | struct timeval start; | 584 | ktime_t start; |
| 584 | struct timeval stop; | 585 | ktime_t stop; |
| 585 | size_t off; | 586 | size_t off; |
| 586 | unsigned thr, run_threads, nr_threads; | 587 | unsigned thr, run_threads, nr_threads; |
| 587 | unsigned char *page = NULL; | 588 | unsigned char *page = NULL; |
| @@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
| 674 | m = 1; | 675 | m = 1; |
| 675 | nr_pages = 0; | 676 | nr_pages = 0; |
| 676 | bio = NULL; | 677 | bio = NULL; |
| 677 | do_gettimeofday(&start); | 678 | start = ktime_get(); |
| 678 | for (;;) { | 679 | for (;;) { |
| 679 | for (thr = 0; thr < nr_threads; thr++) { | 680 | for (thr = 0; thr < nr_threads; thr++) { |
| 680 | for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { | 681 | for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { |
| @@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
| 759 | 760 | ||
| 760 | out_finish: | 761 | out_finish: |
| 761 | err2 = hib_wait_on_bio_chain(&bio); | 762 | err2 = hib_wait_on_bio_chain(&bio); |
| 762 | do_gettimeofday(&stop); | 763 | stop = ktime_get(); |
| 763 | if (!ret) | 764 | if (!ret) |
| 764 | ret = err2; | 765 | ret = err2; |
| 765 | if (!ret) | 766 | if (!ret) |
| 766 | printk(KERN_INFO "PM: Image saving done.\n"); | 767 | printk(KERN_INFO "PM: Image saving done.\n"); |
| 767 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); | 768 | swsusp_show_speed(start, stop, nr_to_write, "Wrote"); |
| 768 | out_clean: | 769 | out_clean: |
| 769 | if (crc) { | 770 | if (crc) { |
| 770 | if (crc->thr) | 771 | if (crc->thr) |
| @@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle, | |||
| 965 | { | 966 | { |
| 966 | unsigned int m; | 967 | unsigned int m; |
| 967 | int ret = 0; | 968 | int ret = 0; |
| 968 | struct timeval start; | 969 | ktime_t start; |
| 969 | struct timeval stop; | 970 | ktime_t stop; |
| 970 | struct bio *bio; | 971 | struct bio *bio; |
| 971 | int err2; | 972 | int err2; |
| 972 | unsigned nr_pages; | 973 | unsigned nr_pages; |
| @@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle, | |||
| 978 | m = 1; | 979 | m = 1; |
| 979 | nr_pages = 0; | 980 | nr_pages = 0; |
| 980 | bio = NULL; | 981 | bio = NULL; |
| 981 | do_gettimeofday(&start); | 982 | start = ktime_get(); |
| 982 | for ( ; ; ) { | 983 | for ( ; ; ) { |
| 983 | ret = snapshot_write_next(snapshot); | 984 | ret = snapshot_write_next(snapshot); |
| 984 | if (ret <= 0) | 985 | if (ret <= 0) |
| @@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle, | |||
| 996 | nr_pages++; | 997 | nr_pages++; |
| 997 | } | 998 | } |
| 998 | err2 = hib_wait_on_bio_chain(&bio); | 999 | err2 = hib_wait_on_bio_chain(&bio); |
| 999 | do_gettimeofday(&stop); | 1000 | stop = ktime_get(); |
| 1000 | if (!ret) | 1001 | if (!ret) |
| 1001 | ret = err2; | 1002 | ret = err2; |
| 1002 | if (!ret) { | 1003 | if (!ret) { |
| @@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle, | |||
| 1005 | if (!snapshot_image_loaded(snapshot)) | 1006 | if (!snapshot_image_loaded(snapshot)) |
| 1006 | ret = -ENODATA; | 1007 | ret = -ENODATA; |
| 1007 | } | 1008 | } |
| 1008 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | 1009 | swsusp_show_speed(start, stop, nr_to_read, "Read"); |
| 1009 | return ret; | 1010 | return ret; |
| 1010 | } | 1011 | } |
| 1011 | 1012 | ||
| @@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 1067 | int ret = 0; | 1068 | int ret = 0; |
| 1068 | int eof = 0; | 1069 | int eof = 0; |
| 1069 | struct bio *bio; | 1070 | struct bio *bio; |
| 1070 | struct timeval start; | 1071 | ktime_t start; |
| 1071 | struct timeval stop; | 1072 | ktime_t stop; |
| 1072 | unsigned nr_pages; | 1073 | unsigned nr_pages; |
| 1073 | size_t off; | 1074 | size_t off; |
| 1074 | unsigned i, thr, run_threads, nr_threads; | 1075 | unsigned i, thr, run_threads, nr_threads; |
| @@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
| 1190 | m = 1; | 1191 | m = 1; |
| 1191 | nr_pages = 0; | 1192 | nr_pages = 0; |
| 1192 | bio = NULL; | 1193 | bio = NULL; |
| 1193 | do_gettimeofday(&start); | 1194 | start = ktime_get(); |
| 1194 | 1195 | ||
| 1195 | ret = snapshot_write_next(snapshot); | 1196 | ret = snapshot_write_next(snapshot); |
| 1196 | if (ret <= 0) | 1197 | if (ret <= 0) |
| @@ -1343,7 +1344,7 @@ out_finish: | |||
| 1343 | wait_event(crc->done, atomic_read(&crc->stop)); | 1344 | wait_event(crc->done, atomic_read(&crc->stop)); |
| 1344 | atomic_set(&crc->stop, 0); | 1345 | atomic_set(&crc->stop, 0); |
| 1345 | } | 1346 | } |
| 1346 | do_gettimeofday(&stop); | 1347 | stop = ktime_get(); |
| 1347 | if (!ret) { | 1348 | if (!ret) { |
| 1348 | printk(KERN_INFO "PM: Image loading done.\n"); | 1349 | printk(KERN_INFO "PM: Image loading done.\n"); |
| 1349 | snapshot_write_finalize(snapshot); | 1350 | snapshot_write_finalize(snapshot); |
| @@ -1359,7 +1360,7 @@ out_finish: | |||
| 1359 | } | 1360 | } |
| 1360 | } | 1361 | } |
| 1361 | } | 1362 | } |
| 1362 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | 1363 | swsusp_show_speed(start, stop, nr_to_read, "Read"); |
| 1363 | out_clean: | 1364 | out_clean: |
| 1364 | for (i = 0; i < ring_size; i++) | 1365 | for (i = 0; i < ring_size; i++) |
| 1365 | free_page((unsigned long)page[i]); | 1366 | free_page((unsigned long)page[i]); |
| @@ -1374,7 +1375,7 @@ out_clean: | |||
| 1374 | kthread_stop(data[thr].thr); | 1375 | kthread_stop(data[thr].thr); |
| 1375 | vfree(data); | 1376 | vfree(data); |
| 1376 | } | 1377 | } |
| 1377 | if (page) vfree(page); | 1378 | vfree(page); |
| 1378 | 1379 | ||
| 1379 | return ret; | 1380 | return ret; |
| 1380 | } | 1381 | } |
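Besides the same timeval-to-ktime_t conversion as in snapshot.c, the last swap.c hunk drops the NULL check before vfree(): like kfree(), vfree() is a no-op for a NULL pointer, so cleanup paths can call it unconditionally. A tiny hedged illustration (demo_release() is a hypothetical helper):

	#include <linux/vmalloc.h>

	static void demo_release(void *buf)
	{
		vfree(buf);	/* safe even when the earlier vmalloc() failed and buf == NULL */
	}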
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1ce770687ea8..02d6b6d28796 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -62,9 +62,6 @@ int console_printk[4] = { | |||
| 62 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ | 62 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ |
| 63 | }; | 63 | }; |
| 64 | 64 | ||
| 65 | /* Deferred messages from sched code are marked by this special level */ | ||
| 66 | #define SCHED_MESSAGE_LOGLEVEL -2 | ||
| 67 | |||
| 68 | /* | 65 | /* |
| 69 | * Low level drivers may need that to know if they can schedule in | 66 | * Low level drivers may need that to know if they can schedule in |
| 70 | * their unblank() callback or not. So let's export it. | 67 | * their unblank() callback or not. So let's export it. |
| @@ -267,7 +264,6 @@ static u32 clear_idx; | |||
| 267 | #define LOG_ALIGN __alignof__(struct printk_log) | 264 | #define LOG_ALIGN __alignof__(struct printk_log) |
| 268 | #endif | 265 | #endif |
| 269 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 266 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
| 270 | #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) | ||
| 271 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | 267 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); |
| 272 | static char *log_buf = __log_buf; | 268 | static char *log_buf = __log_buf; |
| 273 | static u32 log_buf_len = __LOG_BUF_LEN; | 269 | static u32 log_buf_len = __LOG_BUF_LEN; |
| @@ -481,7 +477,7 @@ static int syslog_action_restricted(int type) | |||
| 481 | type != SYSLOG_ACTION_SIZE_BUFFER; | 477 | type != SYSLOG_ACTION_SIZE_BUFFER; |
| 482 | } | 478 | } |
| 483 | 479 | ||
| 484 | static int check_syslog_permissions(int type, bool from_file) | 480 | int check_syslog_permissions(int type, bool from_file) |
| 485 | { | 481 | { |
| 486 | /* | 482 | /* |
| 487 | * If this is from /proc/kmsg and we've already opened it, then we've | 483 | * If this is from /proc/kmsg and we've already opened it, then we've |
| @@ -519,14 +515,13 @@ struct devkmsg_user { | |||
| 519 | char buf[8192]; | 515 | char buf[8192]; |
| 520 | }; | 516 | }; |
| 521 | 517 | ||
| 522 | static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | 518 | static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) |
| 523 | unsigned long count, loff_t pos) | ||
| 524 | { | 519 | { |
| 525 | char *buf, *line; | 520 | char *buf, *line; |
| 526 | int i; | 521 | int i; |
| 527 | int level = default_message_loglevel; | 522 | int level = default_message_loglevel; |
| 528 | int facility = 1; /* LOG_USER */ | 523 | int facility = 1; /* LOG_USER */ |
| 529 | size_t len = iov_length(iv, count); | 524 | size_t len = iocb->ki_nbytes; |
| 530 | ssize_t ret = len; | 525 | ssize_t ret = len; |
| 531 | 526 | ||
| 532 | if (len > LOG_LINE_MAX) | 527 | if (len > LOG_LINE_MAX) |
| @@ -535,13 +530,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | |||
| 535 | if (buf == NULL) | 530 | if (buf == NULL) |
| 536 | return -ENOMEM; | 531 | return -ENOMEM; |
| 537 | 532 | ||
| 538 | line = buf; | 533 | buf[len] = '\0'; |
| 539 | for (i = 0; i < count; i++) { | 534 | if (copy_from_iter(buf, len, from) != len) { |
| 540 | if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { | 535 | kfree(buf); |
| 541 | ret = -EFAULT; | 536 | return -EFAULT; |
| 542 | goto out; | ||
| 543 | } | ||
| 544 | line += iv[i].iov_len; | ||
| 545 | } | 537 | } |
| 546 | 538 | ||
| 547 | /* | 539 | /* |
| @@ -567,10 +559,8 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | |||
| 567 | line = endp; | 559 | line = endp; |
| 568 | } | 560 | } |
| 569 | } | 561 | } |
| 570 | line[len] = '\0'; | ||
| 571 | 562 | ||
| 572 | printk_emit(facility, level, NULL, 0, "%s", line); | 563 | printk_emit(facility, level, NULL, 0, "%s", line); |
| 573 | out: | ||
| 574 | kfree(buf); | 564 | kfree(buf); |
| 575 | return ret; | 565 | return ret; |
| 576 | } | 566 | } |
| @@ -802,7 +792,7 @@ static int devkmsg_release(struct inode *inode, struct file *file) | |||
| 802 | const struct file_operations kmsg_fops = { | 792 | const struct file_operations kmsg_fops = { |
| 803 | .open = devkmsg_open, | 793 | .open = devkmsg_open, |
| 804 | .read = devkmsg_read, | 794 | .read = devkmsg_read, |
| 805 | .aio_write = devkmsg_writev, | 795 | .write_iter = devkmsg_write, |
| 806 | .llseek = devkmsg_llseek, | 796 | .llseek = devkmsg_llseek, |
| 807 | .poll = devkmsg_poll, | 797 | .poll = devkmsg_poll, |
| 808 | .release = devkmsg_release, | 798 | .release = devkmsg_release, |
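devkmsg_write() is converted from the old ->aio_write()/iovec interface to ->write_iter() with copy_from_iter(), which pulls the whole (possibly multi-segment) user buffer in one call. A hedged sketch of that handler shape (demo_write() is illustrative; the patch reads the length from iocb->ki_nbytes, while iov_iter_count() is used here, and the kernel's version also parses an optional <level> prefix):

	static ssize_t demo_write(struct kiocb *iocb, struct iov_iter *from)
	{
		size_t len = iov_iter_count(from);
		char *buf;

		buf = kmalloc(len + 1, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		if (copy_from_iter(buf, len, from) != len) {
			kfree(buf);
			return -EFAULT;
		}
		buf[len] = '\0';	/* terminate once, instead of per-iovec copies */

		/* ... consume the NUL-terminated line in buf ... */

		kfree(buf);
		return len;
	}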
| @@ -858,6 +848,9 @@ static int __init log_buf_len_setup(char *str) | |||
| 858 | } | 848 | } |
| 859 | early_param("log_buf_len", log_buf_len_setup); | 849 | early_param("log_buf_len", log_buf_len_setup); |
| 860 | 850 | ||
| 851 | #ifdef CONFIG_SMP | ||
| 852 | #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) | ||
| 853 | |||
| 861 | static void __init log_buf_add_cpu(void) | 854 | static void __init log_buf_add_cpu(void) |
| 862 | { | 855 | { |
| 863 | unsigned int cpu_extra; | 856 | unsigned int cpu_extra; |
| @@ -884,6 +877,9 @@ static void __init log_buf_add_cpu(void) | |||
| 884 | 877 | ||
| 885 | log_buf_len_update(cpu_extra + __LOG_BUF_LEN); | 878 | log_buf_len_update(cpu_extra + __LOG_BUF_LEN); |
| 886 | } | 879 | } |
| 880 | #else /* !CONFIG_SMP */ | ||
| 881 | static inline void log_buf_add_cpu(void) {} | ||
| 882 | #endif /* CONFIG_SMP */ | ||
| 887 | 883 | ||
| 888 | void __init setup_log_buf(int early) | 884 | void __init setup_log_buf(int early) |
| 889 | { | 885 | { |
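log_buf_add_cpu() is only meaningful on SMP, so the extra-buffer computation moves under #ifdef CONFIG_SMP with an empty static inline stub for UP builds, letting setup_log_buf() call it unconditionally. The same idiom in a hedged, generic form (CONFIG_MY_FEATURE and my_feature_init() are placeholders):

	#ifdef CONFIG_MY_FEATURE
	void my_feature_init(void);			/* real implementation elsewhere */
	#else
	static inline void my_feature_init(void) { }	/* compiles away on other configs */
	#endif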
| @@ -1260,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
| 1260 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 1256 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
| 1261 | { | 1257 | { |
| 1262 | bool clear = false; | 1258 | bool clear = false; |
| 1263 | static int saved_console_loglevel = -1; | 1259 | static int saved_console_loglevel = LOGLEVEL_DEFAULT; |
| 1264 | int error; | 1260 | int error; |
| 1265 | 1261 | ||
| 1266 | error = check_syslog_permissions(type, from_file); | 1262 | error = check_syslog_permissions(type, from_file); |
| @@ -1317,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1317 | break; | 1313 | break; |
| 1318 | /* Disable logging to console */ | 1314 | /* Disable logging to console */ |
| 1319 | case SYSLOG_ACTION_CONSOLE_OFF: | 1315 | case SYSLOG_ACTION_CONSOLE_OFF: |
| 1320 | if (saved_console_loglevel == -1) | 1316 | if (saved_console_loglevel == LOGLEVEL_DEFAULT) |
| 1321 | saved_console_loglevel = console_loglevel; | 1317 | saved_console_loglevel = console_loglevel; |
| 1322 | console_loglevel = minimum_console_loglevel; | 1318 | console_loglevel = minimum_console_loglevel; |
| 1323 | break; | 1319 | break; |
| 1324 | /* Enable logging to console */ | 1320 | /* Enable logging to console */ |
| 1325 | case SYSLOG_ACTION_CONSOLE_ON: | 1321 | case SYSLOG_ACTION_CONSOLE_ON: |
| 1326 | if (saved_console_loglevel != -1) { | 1322 | if (saved_console_loglevel != LOGLEVEL_DEFAULT) { |
| 1327 | console_loglevel = saved_console_loglevel; | 1323 | console_loglevel = saved_console_loglevel; |
| 1328 | saved_console_loglevel = -1; | 1324 | saved_console_loglevel = LOGLEVEL_DEFAULT; |
| 1329 | } | 1325 | } |
| 1330 | break; | 1326 | break; |
| 1331 | /* Set level of messages printed to console */ | 1327 | /* Set level of messages printed to console */ |
| @@ -1337,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
| 1337 | len = minimum_console_loglevel; | 1333 | len = minimum_console_loglevel; |
| 1338 | console_loglevel = len; | 1334 | console_loglevel = len; |
| 1339 | /* Implicitly re-enable logging to console */ | 1335 | /* Implicitly re-enable logging to console */ |
| 1340 | saved_console_loglevel = -1; | 1336 | saved_console_loglevel = LOGLEVEL_DEFAULT; |
| 1341 | error = 0; | 1337 | error = 0; |
| 1342 | break; | 1338 | break; |
| 1343 | /* Number of chars in the log buffer */ | 1339 | /* Number of chars in the log buffer */ |
| @@ -1628,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1628 | int printed_len = 0; | 1624 | int printed_len = 0; |
| 1629 | bool in_sched = false; | 1625 | bool in_sched = false; |
| 1630 | /* cpu currently holding logbuf_lock in this function */ | 1626 | /* cpu currently holding logbuf_lock in this function */ |
| 1631 | static volatile unsigned int logbuf_cpu = UINT_MAX; | 1627 | static unsigned int logbuf_cpu = UINT_MAX; |
| 1632 | 1628 | ||
| 1633 | if (level == SCHED_MESSAGE_LOGLEVEL) { | 1629 | if (level == LOGLEVEL_SCHED) { |
| 1634 | level = -1; | 1630 | level = LOGLEVEL_DEFAULT; |
| 1635 | in_sched = true; | 1631 | in_sched = true; |
| 1636 | } | 1632 | } |
| 1637 | 1633 | ||
| @@ -1680,12 +1676,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1680 | * The printf needs to come first; we need the syslog | 1676 | * The printf needs to come first; we need the syslog |
| 1681 | * prefix which might be passed-in as a parameter. | 1677 | * prefix which might be passed-in as a parameter. |
| 1682 | */ | 1678 | */ |
| 1683 | if (in_sched) | 1679 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); |
| 1684 | text_len = scnprintf(text, sizeof(textbuf), | ||
| 1685 | KERN_WARNING "[sched_delayed] "); | ||
| 1686 | |||
| 1687 | text_len += vscnprintf(text + text_len, | ||
| 1688 | sizeof(textbuf) - text_len, fmt, args); | ||
| 1689 | 1680 | ||
| 1690 | /* mark and strip a trailing newline */ | 1681 | /* mark and strip a trailing newline */ |
| 1691 | if (text_len && text[text_len-1] == '\n') { | 1682 | if (text_len && text[text_len-1] == '\n') { |
| @@ -1701,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1701 | const char *end_of_header = printk_skip_level(text); | 1692 | const char *end_of_header = printk_skip_level(text); |
| 1702 | switch (kern_level) { | 1693 | switch (kern_level) { |
| 1703 | case '0' ... '7': | 1694 | case '0' ... '7': |
| 1704 | if (level == -1) | 1695 | if (level == LOGLEVEL_DEFAULT) |
| 1705 | level = kern_level - '0'; | 1696 | level = kern_level - '0'; |
| 1697 | /* fallthrough */ | ||
| 1706 | case 'd': /* KERN_DEFAULT */ | 1698 | case 'd': /* KERN_DEFAULT */ |
| 1707 | lflags |= LOG_PREFIX; | 1699 | lflags |= LOG_PREFIX; |
| 1708 | } | 1700 | } |
| @@ -1716,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1716 | } | 1708 | } |
| 1717 | } | 1709 | } |
| 1718 | 1710 | ||
| 1719 | if (level == -1) | 1711 | if (level == LOGLEVEL_DEFAULT) |
| 1720 | level = default_message_loglevel; | 1712 | level = default_message_loglevel; |
| 1721 | 1713 | ||
| 1722 | if (dict) | 1714 | if (dict) |
| @@ -1794,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit); | |||
| 1794 | 1786 | ||
| 1795 | asmlinkage int vprintk(const char *fmt, va_list args) | 1787 | asmlinkage int vprintk(const char *fmt, va_list args) |
| 1796 | { | 1788 | { |
| 1797 | return vprintk_emit(0, -1, NULL, 0, fmt, args); | 1789 | return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); |
| 1798 | } | 1790 | } |
| 1799 | EXPORT_SYMBOL(vprintk); | 1791 | EXPORT_SYMBOL(vprintk); |
| 1800 | 1792 | ||
| @@ -1813,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level, | |||
| 1813 | } | 1805 | } |
| 1814 | EXPORT_SYMBOL(printk_emit); | 1806 | EXPORT_SYMBOL(printk_emit); |
| 1815 | 1807 | ||
| 1808 | int vprintk_default(const char *fmt, va_list args) | ||
| 1809 | { | ||
| 1810 | int r; | ||
| 1811 | |||
| 1812 | #ifdef CONFIG_KGDB_KDB | ||
| 1813 | if (unlikely(kdb_trap_printk)) { | ||
| 1814 | r = vkdb_printf(fmt, args); | ||
| 1815 | return r; | ||
| 1816 | } | ||
| 1817 | #endif | ||
| 1818 | r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); | ||
| 1819 | |||
| 1820 | return r; | ||
| 1821 | } | ||
| 1822 | EXPORT_SYMBOL_GPL(vprintk_default); | ||
| 1823 | |||
| 1824 | /* | ||
| 1825 | * This allows printk to be diverted to another function per cpu. | ||
| 1826 | * This is useful for calling printk functions from within NMI | ||
| 1827 | * without worrying about race conditions that can lock up the | ||
| 1828 | * box. | ||
| 1829 | */ | ||
| 1830 | DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; | ||
| 1831 | |||
| 1816 | /** | 1832 | /** |
| 1817 | * printk - print a kernel message | 1833 | * printk - print a kernel message |
| 1818 | * @fmt: format string | 1834 | * @fmt: format string |
| @@ -1836,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit); | |||
| 1836 | */ | 1852 | */ |
| 1837 | asmlinkage __visible int printk(const char *fmt, ...) | 1853 | asmlinkage __visible int printk(const char *fmt, ...) |
| 1838 | { | 1854 | { |
| 1855 | printk_func_t vprintk_func; | ||
| 1839 | va_list args; | 1856 | va_list args; |
| 1840 | int r; | 1857 | int r; |
| 1841 | 1858 | ||
| 1842 | #ifdef CONFIG_KGDB_KDB | ||
| 1843 | if (unlikely(kdb_trap_printk)) { | ||
| 1844 | va_start(args, fmt); | ||
| 1845 | r = vkdb_printf(fmt, args); | ||
| 1846 | va_end(args); | ||
| 1847 | return r; | ||
| 1848 | } | ||
| 1849 | #endif | ||
| 1850 | va_start(args, fmt); | 1859 | va_start(args, fmt); |
| 1851 | r = vprintk_emit(0, -1, NULL, 0, fmt, args); | 1860 | |
| 1861 | /* | ||
| 1862 | * If a caller overrides the per_cpu printk_func, then it needs | ||
| 1863 | * to disable preemption when calling printk(). Otherwise | ||
| 1864 | * the printk_func should be set to the default. No need to | ||
| 1865 | * disable preemption here. | ||
| 1866 | */ | ||
| 1867 | vprintk_func = this_cpu_read(printk_func); | ||
| 1868 | r = vprintk_func(fmt, args); | ||
| 1869 | |||
| 1852 | va_end(args); | 1870 | va_end(args); |
| 1853 | 1871 | ||
| 1854 | return r; | 1872 | return r; |
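printk() itself no longer open-codes the kdb hook; it now calls through a per-CPU function pointer, printk_func, which defaults to vprintk_default(). The point of the indirection is that code running where taking logbuf_lock is unsafe (an NMI backtrace handler, for instance) can redirect printk() on its own CPU to a lock-free routine and restore the default afterwards. A hedged sketch of such a caller (nmi_safe_vprintk() is a hypothetical function, not something this patch adds):

	/* The caller must keep preemption disabled around the override, as the
	 * comment in printk() above requires. */
	static void demo_redirect_printk(void)
	{
		preempt_disable();
		this_cpu_write(printk_func, nmi_safe_vprintk);	/* assumed lock-free handler */

		printk("captured without taking logbuf_lock\n");

		this_cpu_write(printk_func, vprintk_default);
		preempt_enable();
	}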
| @@ -1882,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, | |||
| 1882 | bool syslog, char *buf, size_t size) { return 0; } | 1900 | bool syslog, char *buf, size_t size) { return 0; } |
| 1883 | static size_t cont_print_text(char *text, size_t size) { return 0; } | 1901 | static size_t cont_print_text(char *text, size_t size) { return 0; } |
| 1884 | 1902 | ||
| 1903 | /* Still needs to be defined for users */ | ||
| 1904 | DEFINE_PER_CPU(printk_func_t, printk_func); | ||
| 1905 | |||
| 1885 | #endif /* CONFIG_PRINTK */ | 1906 | #endif /* CONFIG_PRINTK */ |
| 1886 | 1907 | ||
| 1887 | #ifdef CONFIG_EARLY_PRINTK | 1908 | #ifdef CONFIG_EARLY_PRINTK |
| 1888 | struct console *early_console; | 1909 | struct console *early_console; |
| 1889 | 1910 | ||
| 1890 | void early_vprintk(const char *fmt, va_list ap) | ||
| 1891 | { | ||
| 1892 | if (early_console) { | ||
| 1893 | char buf[512]; | ||
| 1894 | int n = vscnprintf(buf, sizeof(buf), fmt, ap); | ||
| 1895 | |||
| 1896 | early_console->write(early_console, buf, n); | ||
| 1897 | } | ||
| 1898 | } | ||
| 1899 | |||
| 1900 | asmlinkage __visible void early_printk(const char *fmt, ...) | 1911 | asmlinkage __visible void early_printk(const char *fmt, ...) |
| 1901 | { | 1912 | { |
| 1902 | va_list ap; | 1913 | va_list ap; |
| 1914 | char buf[512]; | ||
| 1915 | int n; | ||
| 1916 | |||
| 1917 | if (!early_console) | ||
| 1918 | return; | ||
| 1903 | 1919 | ||
| 1904 | va_start(ap, fmt); | 1920 | va_start(ap, fmt); |
| 1905 | early_vprintk(fmt, ap); | 1921 | n = vscnprintf(buf, sizeof(buf), fmt, ap); |
| 1906 | va_end(ap); | 1922 | va_end(ap); |
| 1923 | |||
| 1924 | early_console->write(early_console, buf, n); | ||
| 1907 | } | 1925 | } |
| 1908 | #endif | 1926 | #endif |
| 1909 | 1927 | ||
| @@ -2628,7 +2646,7 @@ void wake_up_klogd(void) | |||
| 2628 | preempt_disable(); | 2646 | preempt_disable(); |
| 2629 | if (waitqueue_active(&log_wait)) { | 2647 | if (waitqueue_active(&log_wait)) { |
| 2630 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | 2648 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
| 2631 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); | 2649 | irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); |
| 2632 | } | 2650 | } |
| 2633 | preempt_enable(); | 2651 | preempt_enable(); |
| 2634 | } | 2652 | } |
| @@ -2640,11 +2658,11 @@ int printk_deferred(const char *fmt, ...) | |||
| 2640 | 2658 | ||
| 2641 | preempt_disable(); | 2659 | preempt_disable(); |
| 2642 | va_start(args, fmt); | 2660 | va_start(args, fmt); |
| 2643 | r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); | 2661 | r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); |
| 2644 | va_end(args); | 2662 | va_end(args); |
| 2645 | 2663 | ||
| 2646 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); | 2664 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); |
| 2647 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); | 2665 | irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); |
| 2648 | preempt_enable(); | 2666 | preempt_enable(); |
| 2649 | 2667 | ||
| 2650 | return r; | 2668 | return r; |
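The last two printk.c hunks switch the irq_work enqueue from the deprecated __get_cpu_var() to this_cpu_ptr(), and printk_deferred() now tags its messages with LOGLEVEL_SCHED instead of the removed SCHED_MESSAGE_LOGLEVEL. A minimal hedged sketch of the this_cpu_ptr() pattern for queueing this CPU's irq_work instance (demo_work is made up, and each instance is assumed to have been set up with init_irq_work() beforehand):

	#include <linux/irq_work.h>
	#include <linux/percpu.h>

	static DEFINE_PER_CPU(struct irq_work, demo_work);

	static void demo_kick_this_cpu(void)
	{
		preempt_disable();
		irq_work_queue(this_cpu_ptr(&demo_work));	/* pointer to this CPU's copy */
		preempt_enable();
	}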
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54e75226c2c4..1eb9d90c3af9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
| 485 | 485 | ||
| 486 | /* | 486 | /* |
| 487 | * Detach all tasks we were using ptrace on. Called with tasklist held | 487 | * Detach all tasks we were using ptrace on. Called with tasklist held |
| 488 | * for writing, and returns with it held too. But note it can release | 488 | * for writing. |
| 489 | * and reacquire the lock. | ||
| 490 | */ | 489 | */ |
| 491 | void exit_ptrace(struct task_struct *tracer) | 490 | void exit_ptrace(struct task_struct *tracer, struct list_head *dead) |
| 492 | __releases(&tasklist_lock) | ||
| 493 | __acquires(&tasklist_lock) | ||
| 494 | { | 491 | { |
| 495 | struct task_struct *p, *n; | 492 | struct task_struct *p, *n; |
| 496 | LIST_HEAD(ptrace_dead); | ||
| 497 | |||
| 498 | if (likely(list_empty(&tracer->ptraced))) | ||
| 499 | return; | ||
| 500 | 493 | ||
| 501 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { | 494 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { |
| 502 | if (unlikely(p->ptrace & PT_EXITKILL)) | 495 | if (unlikely(p->ptrace & PT_EXITKILL)) |
| 503 | send_sig_info(SIGKILL, SEND_SIG_FORCED, p); | 496 | send_sig_info(SIGKILL, SEND_SIG_FORCED, p); |
| 504 | 497 | ||
| 505 | if (__ptrace_detach(tracer, p)) | 498 | if (__ptrace_detach(tracer, p)) |
| 506 | list_add(&p->ptrace_entry, &ptrace_dead); | 499 | list_add(&p->ptrace_entry, dead); |
| 507 | } | ||
| 508 | |||
| 509 | write_unlock_irq(&tasklist_lock); | ||
| 510 | BUG_ON(!list_empty(&tracer->ptraced)); | ||
| 511 | |||
| 512 | list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { | ||
| 513 | list_del_init(&p->ptrace_entry); | ||
| 514 | release_task(p); | ||
| 515 | } | 500 | } |
| 516 | |||
| 517 | write_lock_irq(&tasklist_lock); | ||
| 518 | } | 501 | } |
| 519 | 502 | ||
| 520 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) | 503 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) |
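exit_ptrace() no longer drops and retakes tasklist_lock itself; it only moves detached tracees onto a caller-supplied list, and the caller releases them after unlocking. A hedged sketch of the expected calling pattern (simplified from how the exit path is meant to use it, not a verbatim copy of the caller):

	LIST_HEAD(dead);
	struct task_struct *p, *n;

	write_lock_irq(&tasklist_lock);
	if (!list_empty(&tracer->ptraced))
		exit_ptrace(tracer, &dead);
	/* ... any other teardown that needs the lock ... */
	write_unlock_irq(&tasklist_lock);

	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
		list_del_init(&p->ptrace_entry);
		release_task(p);
	}

Calling release_task() outside the lock is exactly what the old version had to drop and reacquire tasklist_lock for.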
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 807ccfbf69b3..e6fae503d1bc 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | obj-y += update.o srcu.o | 1 | obj-y += update.o srcu.o |
| 2 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 2 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
| 3 | obj-$(CONFIG_TREE_RCU) += tree.o | 3 | obj-$(CONFIG_TREE_RCU) += tree.o |
| 4 | obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o | 4 | obj-$(CONFIG_PREEMPT_RCU) += tree.o |
| 5 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o | 5 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o |
| 6 | obj-$(CONFIG_TINY_RCU) += tiny.o | 6 | obj-$(CONFIG_TINY_RCU) += tiny.o |
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ff1a6de62f17..07bb02eda844 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
| @@ -135,4 +135,6 @@ int rcu_jiffies_till_stall_check(void); | |||
| 135 | */ | 135 | */ |
| 136 | #define TPS(x) tracepoint_string(x) | 136 | #define TPS(x) tracepoint_string(x) |
| 137 | 137 | ||
| 138 | void rcu_early_boot_tests(void); | ||
| 139 | |||
| 138 | #endif /* __LINUX_RCU_H */ | 140 | #endif /* __LINUX_RCU_H */ |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 948a7693748e..4d559baf06e0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -49,11 +49,19 @@ | |||
| 49 | #include <linux/trace_clock.h> | 49 | #include <linux/trace_clock.h> |
| 50 | #include <asm/byteorder.h> | 50 | #include <asm/byteorder.h> |
| 51 | #include <linux/torture.h> | 51 | #include <linux/torture.h> |
| 52 | #include <linux/vmalloc.h> | ||
| 52 | 53 | ||
| 53 | MODULE_LICENSE("GPL"); | 54 | MODULE_LICENSE("GPL"); |
| 54 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); | 55 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); |
| 55 | 56 | ||
| 56 | 57 | ||
| 58 | torture_param(int, cbflood_inter_holdoff, HZ, | ||
| 59 | "Holdoff between floods (jiffies)"); | ||
| 60 | torture_param(int, cbflood_intra_holdoff, 1, | ||
| 61 | "Holdoff between bursts (jiffies)"); | ||
| 62 | torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable"); | ||
| 63 | torture_param(int, cbflood_n_per_burst, 20000, | ||
| 64 | "# callbacks per burst in flood"); | ||
| 57 | torture_param(int, fqs_duration, 0, | 65 | torture_param(int, fqs_duration, 0, |
| 58 | "Duration of fqs bursts (us), 0 to disable"); | 66 | "Duration of fqs bursts (us), 0 to disable"); |
| 59 | torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); | 67 | torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); |
| @@ -96,10 +104,12 @@ module_param(torture_type, charp, 0444); | |||
| 96 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); | 104 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); |
| 97 | 105 | ||
| 98 | static int nrealreaders; | 106 | static int nrealreaders; |
| 107 | static int ncbflooders; | ||
| 99 | static struct task_struct *writer_task; | 108 | static struct task_struct *writer_task; |
| 100 | static struct task_struct **fakewriter_tasks; | 109 | static struct task_struct **fakewriter_tasks; |
| 101 | static struct task_struct **reader_tasks; | 110 | static struct task_struct **reader_tasks; |
| 102 | static struct task_struct *stats_task; | 111 | static struct task_struct *stats_task; |
| 112 | static struct task_struct **cbflood_task; | ||
| 103 | static struct task_struct *fqs_task; | 113 | static struct task_struct *fqs_task; |
| 104 | static struct task_struct *boost_tasks[NR_CPUS]; | 114 | static struct task_struct *boost_tasks[NR_CPUS]; |
| 105 | static struct task_struct *stall_task; | 115 | static struct task_struct *stall_task; |
| @@ -138,6 +148,7 @@ static long n_rcu_torture_boosts; | |||
| 138 | static long n_rcu_torture_timers; | 148 | static long n_rcu_torture_timers; |
| 139 | static long n_barrier_attempts; | 149 | static long n_barrier_attempts; |
| 140 | static long n_barrier_successes; | 150 | static long n_barrier_successes; |
| 151 | static atomic_long_t n_cbfloods; | ||
| 141 | static struct list_head rcu_torture_removed; | 152 | static struct list_head rcu_torture_removed; |
| 142 | 153 | ||
| 143 | static int rcu_torture_writer_state; | 154 | static int rcu_torture_writer_state; |
| @@ -157,9 +168,9 @@ static int rcu_torture_writer_state; | |||
| 157 | #else | 168 | #else |
| 158 | #define RCUTORTURE_RUNNABLE_INIT 0 | 169 | #define RCUTORTURE_RUNNABLE_INIT 0 |
| 159 | #endif | 170 | #endif |
| 160 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 171 | static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; |
| 161 | module_param(rcutorture_runnable, int, 0444); | 172 | module_param(torture_runnable, int, 0444); |
| 162 | MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); | 173 | MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); |
| 163 | 174 | ||
| 164 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) | 175 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) |
| 165 | #define rcu_can_boost() 1 | 176 | #define rcu_can_boost() 1 |
| @@ -182,7 +193,7 @@ static u64 notrace rcu_trace_clock_local(void) | |||
| 182 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | 193 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
| 183 | 194 | ||
| 184 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 195 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
| 185 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 196 | static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
| 186 | /* and boost task create/destroy. */ | 197 | /* and boost task create/destroy. */ |
| 187 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ | 198 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ |
| 188 | static bool barrier_phase; /* Test phase. */ | 199 | static bool barrier_phase; /* Test phase. */ |
| @@ -242,7 +253,7 @@ struct rcu_torture_ops { | |||
| 242 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 253 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
| 243 | void (*cb_barrier)(void); | 254 | void (*cb_barrier)(void); |
| 244 | void (*fqs)(void); | 255 | void (*fqs)(void); |
| 245 | void (*stats)(char *page); | 256 | void (*stats)(void); |
| 246 | int irq_capable; | 257 | int irq_capable; |
| 247 | int can_boost; | 258 | int can_boost; |
| 248 | const char *name; | 259 | const char *name; |
| @@ -525,21 +536,21 @@ static void srcu_torture_barrier(void) | |||
| 525 | srcu_barrier(&srcu_ctl); | 536 | srcu_barrier(&srcu_ctl); |
| 526 | } | 537 | } |
| 527 | 538 | ||
| 528 | static void srcu_torture_stats(char *page) | 539 | static void srcu_torture_stats(void) |
| 529 | { | 540 | { |
| 530 | int cpu; | 541 | int cpu; |
| 531 | int idx = srcu_ctl.completed & 0x1; | 542 | int idx = srcu_ctl.completed & 0x1; |
| 532 | 543 | ||
| 533 | page += sprintf(page, "%s%s per-CPU(idx=%d):", | 544 | pr_alert("%s%s per-CPU(idx=%d):", |
| 534 | torture_type, TORTURE_FLAG, idx); | 545 | torture_type, TORTURE_FLAG, idx); |
| 535 | for_each_possible_cpu(cpu) { | 546 | for_each_possible_cpu(cpu) { |
| 536 | long c0, c1; | 547 | long c0, c1; |
| 537 | 548 | ||
| 538 | c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; | 549 | c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; |
| 539 | c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; | 550 | c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; |
| 540 | page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); | 551 | pr_cont(" %d(%ld,%ld)", cpu, c0, c1); |
| 541 | } | 552 | } |
| 542 | sprintf(page, "\n"); | 553 | pr_cont("\n"); |
| 543 | } | 554 | } |
| 544 | 555 | ||
| 545 | static void srcu_torture_synchronize_expedited(void) | 556 | static void srcu_torture_synchronize_expedited(void) |
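srcu_torture_stats() (and, further below, rcu_torture_stats_print()) stop formatting into a caller-supplied page buffer and instead print directly with pr_alert()/pr_cont(): the first call opens a line at alert level and pr_cont() appends fragments to it. A small hedged example of the same pattern (demo_count is a hypothetical per-CPU counter):

	static DEFINE_PER_CPU(long, demo_count);

	static void demo_print_counts(void)
	{
		int cpu;

		pr_alert("demo: per-CPU counters:");
		for_each_possible_cpu(cpu)
			pr_cont(" %d(%ld)", cpu, per_cpu(demo_count, cpu));
		pr_cont("\n");
	}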
| @@ -601,6 +612,52 @@ static struct rcu_torture_ops sched_ops = { | |||
| 601 | .name = "sched" | 612 | .name = "sched" |
| 602 | }; | 613 | }; |
| 603 | 614 | ||
| 615 | #ifdef CONFIG_TASKS_RCU | ||
| 616 | |||
| 617 | /* | ||
| 618 | * Definitions for RCU-tasks torture testing. | ||
| 619 | */ | ||
| 620 | |||
| 621 | static int tasks_torture_read_lock(void) | ||
| 622 | { | ||
| 623 | return 0; | ||
| 624 | } | ||
| 625 | |||
| 626 | static void tasks_torture_read_unlock(int idx) | ||
| 627 | { | ||
| 628 | } | ||
| 629 | |||
| 630 | static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) | ||
| 631 | { | ||
| 632 | call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); | ||
| 633 | } | ||
| 634 | |||
| 635 | static struct rcu_torture_ops tasks_ops = { | ||
| 636 | .ttype = RCU_TASKS_FLAVOR, | ||
| 637 | .init = rcu_sync_torture_init, | ||
| 638 | .readlock = tasks_torture_read_lock, | ||
| 639 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
| 640 | .readunlock = tasks_torture_read_unlock, | ||
| 641 | .completed = rcu_no_completed, | ||
| 642 | .deferred_free = rcu_tasks_torture_deferred_free, | ||
| 643 | .sync = synchronize_rcu_tasks, | ||
| 644 | .exp_sync = synchronize_rcu_tasks, | ||
| 645 | .call = call_rcu_tasks, | ||
| 646 | .cb_barrier = rcu_barrier_tasks, | ||
| 647 | .fqs = NULL, | ||
| 648 | .stats = NULL, | ||
| 649 | .irq_capable = 1, | ||
| 650 | .name = "tasks" | ||
| 651 | }; | ||
| 652 | |||
| 653 | #define RCUTORTURE_TASKS_OPS &tasks_ops, | ||
| 654 | |||
| 655 | #else /* #ifdef CONFIG_TASKS_RCU */ | ||
| 656 | |||
| 657 | #define RCUTORTURE_TASKS_OPS | ||
| 658 | |||
| 659 | #endif /* #else #ifdef CONFIG_TASKS_RCU */ | ||
| 660 | |||
| 604 | /* | 661 | /* |
| 605 | * RCU torture priority-boost testing. Runs one real-time thread per | 662 | * RCU torture priority-boost testing. Runs one real-time thread per |
| 606 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | 663 | * CPU for moderate bursts, repeatedly registering RCU callbacks and |
| @@ -667,7 +724,7 @@ static int rcu_torture_boost(void *arg) | |||
| 667 | } | 724 | } |
| 668 | call_rcu_time = jiffies; | 725 | call_rcu_time = jiffies; |
| 669 | } | 726 | } |
| 670 | cond_resched(); | 727 | cond_resched_rcu_qs(); |
| 671 | stutter_wait("rcu_torture_boost"); | 728 | stutter_wait("rcu_torture_boost"); |
| 672 | if (torture_must_stop()) | 729 | if (torture_must_stop()) |
| 673 | goto checkwait; | 730 | goto checkwait; |
| @@ -707,6 +764,59 @@ checkwait: stutter_wait("rcu_torture_boost"); | |||
| 707 | return 0; | 764 | return 0; |
| 708 | } | 765 | } |
| 709 | 766 | ||
| 767 | static void rcu_torture_cbflood_cb(struct rcu_head *rhp) | ||
| 768 | { | ||
| 769 | } | ||
| 770 | |||
| 771 | /* | ||
| 772 | * RCU torture callback-flood kthread. Repeatedly induces bursts of calls | ||
| 773 | * to call_rcu() or analogous, increasing the probability of occurrence | ||
| 774 | * of callback-overflow corner cases. | ||
| 775 | */ | ||
| 776 | static int | ||
| 777 | rcu_torture_cbflood(void *arg) | ||
| 778 | { | ||
| 779 | int err = 1; | ||
| 780 | int i; | ||
| 781 | int j; | ||
| 782 | struct rcu_head *rhp; | ||
| 783 | |||
| 784 | if (cbflood_n_per_burst > 0 && | ||
| 785 | cbflood_inter_holdoff > 0 && | ||
| 786 | cbflood_intra_holdoff > 0 && | ||
| 787 | cur_ops->call && | ||
| 788 | cur_ops->cb_barrier) { | ||
| 789 | rhp = vmalloc(sizeof(*rhp) * | ||
| 790 | cbflood_n_burst * cbflood_n_per_burst); | ||
| 791 | err = !rhp; | ||
| 792 | } | ||
| 793 | if (err) { | ||
| 794 | VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); | ||
| 795 | while (!torture_must_stop()) | ||
| 796 | schedule_timeout_interruptible(HZ); | ||
| 797 | return 0; | ||
| 798 | } | ||
| 799 | VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); | ||
| 800 | do { | ||
| 801 | schedule_timeout_interruptible(cbflood_inter_holdoff); | ||
| 802 | atomic_long_inc(&n_cbfloods); | ||
| 803 | WARN_ON(signal_pending(current)); | ||
| 804 | for (i = 0; i < cbflood_n_burst; i++) { | ||
| 805 | for (j = 0; j < cbflood_n_per_burst; j++) { | ||
| 806 | cur_ops->call(&rhp[i * cbflood_n_per_burst + j], | ||
| 807 | rcu_torture_cbflood_cb); | ||
| 808 | } | ||
| 809 | schedule_timeout_interruptible(cbflood_intra_holdoff); | ||
| 810 | WARN_ON(signal_pending(current)); | ||
| 811 | } | ||
| 812 | cur_ops->cb_barrier(); | ||
| 813 | stutter_wait("rcu_torture_cbflood"); | ||
| 814 | } while (!torture_must_stop()); | ||
| 815 | vfree(rhp); | ||
| 816 | torture_kthread_stopping("rcu_torture_cbflood"); | ||
| 817 | return 0; | ||
| 818 | } | ||
| 819 | |||
| 710 | /* | 820 | /* |
| 711 | * RCU torture force-quiescent-state kthread. Repeatedly induces | 821 | * RCU torture force-quiescent-state kthread. Repeatedly induces |
| 712 | * bursts of calls to force_quiescent_state(), increasing the probability | 822 | * bursts of calls to force_quiescent_state(), increasing the probability |
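The new rcu_torture_cbflood() kthread above posts large bursts of callbacks and then waits for them with the flavor's cb_barrier() before reusing the rcu_head array. The essential pairing it depends on is shown in this hedged sketch (demo_cb()/demo_flood() are illustrative names; the real code goes through cur_ops->call and cur_ops->cb_barrier):

	static void demo_cb(struct rcu_head *rhp)
	{
		/* nothing to do: each invocation just proves a grace period elapsed */
	}

	static void demo_flood(struct rcu_head *rhp_array, int n)
	{
		int i;

		for (i = 0; i < n; i++)
			call_rcu(&rhp_array[i], demo_cb);

		rcu_barrier();	/* all demo_cb() calls have run; rhp_array may be reused */
	}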
| @@ -1019,7 +1129,7 @@ rcu_torture_reader(void *arg) | |||
| 1019 | __this_cpu_inc(rcu_torture_batch[completed]); | 1129 | __this_cpu_inc(rcu_torture_batch[completed]); |
| 1020 | preempt_enable(); | 1130 | preempt_enable(); |
| 1021 | cur_ops->readunlock(idx); | 1131 | cur_ops->readunlock(idx); |
| 1022 | cond_resched(); | 1132 | cond_resched_rcu_qs(); |
| 1023 | stutter_wait("rcu_torture_reader"); | 1133 | stutter_wait("rcu_torture_reader"); |
| 1024 | } while (!torture_must_stop()); | 1134 | } while (!torture_must_stop()); |
| 1025 | if (irqreader && cur_ops->irq_capable) { | 1135 | if (irqreader && cur_ops->irq_capable) { |
| @@ -1031,10 +1141,15 @@ rcu_torture_reader(void *arg) | |||
| 1031 | } | 1141 | } |
| 1032 | 1142 | ||
| 1033 | /* | 1143 | /* |
| 1034 | * Create an RCU-torture statistics message in the specified buffer. | 1144 | * Print torture statistics. Caller must ensure that there is only |
| 1145 | * one call to this function at a given time!!! This is normally | ||
| 1146 | * accomplished by relying on the module system to only have one copy | ||
| 1147 | * of the module loaded, and then by giving the rcu_torture_stats | ||
| 1148 | * kthread full control (or the init/cleanup functions when rcu_torture_stats | ||
| 1149 | * thread is not running). | ||
| 1035 | */ | 1150 | */ |
| 1036 | static void | 1151 | static void |
| 1037 | rcu_torture_printk(char *page) | 1152 | rcu_torture_stats_print(void) |
| 1038 | { | 1153 | { |
| 1039 | int cpu; | 1154 | int cpu; |
| 1040 | int i; | 1155 | int i; |
| @@ -1052,55 +1167,61 @@ rcu_torture_printk(char *page) | |||
| 1052 | if (pipesummary[i] != 0) | 1167 | if (pipesummary[i] != 0) |
| 1053 | break; | 1168 | break; |
| 1054 | } | 1169 | } |
| 1055 | page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); | 1170 | |
| 1056 | page += sprintf(page, | 1171 | pr_alert("%s%s ", torture_type, TORTURE_FLAG); |
| 1057 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", | 1172 | pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", |
| 1058 | rcu_torture_current, | 1173 | rcu_torture_current, |
| 1059 | rcu_torture_current_version, | 1174 | rcu_torture_current_version, |
| 1060 | list_empty(&rcu_torture_freelist), | 1175 | list_empty(&rcu_torture_freelist), |
| 1061 | atomic_read(&n_rcu_torture_alloc), | 1176 | atomic_read(&n_rcu_torture_alloc), |
| 1062 | atomic_read(&n_rcu_torture_alloc_fail), | 1177 | atomic_read(&n_rcu_torture_alloc_fail), |
| 1063 | atomic_read(&n_rcu_torture_free)); | 1178 | atomic_read(&n_rcu_torture_free)); |
| 1064 | page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", | 1179 | pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", |
| 1065 | atomic_read(&n_rcu_torture_mberror), | 1180 | atomic_read(&n_rcu_torture_mberror), |
| 1066 | n_rcu_torture_boost_ktrerror, | 1181 | n_rcu_torture_boost_ktrerror, |
| 1067 | n_rcu_torture_boost_rterror); | 1182 | n_rcu_torture_boost_rterror); |
| 1068 | page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", | 1183 | pr_cont("rtbf: %ld rtb: %ld nt: %ld ", |
| 1069 | n_rcu_torture_boost_failure, | 1184 | n_rcu_torture_boost_failure, |
| 1070 | n_rcu_torture_boosts, | 1185 | n_rcu_torture_boosts, |
| 1071 | n_rcu_torture_timers); | 1186 | n_rcu_torture_timers); |
| 1072 | page = torture_onoff_stats(page); | 1187 | torture_onoff_stats(); |
| 1073 | page += sprintf(page, "barrier: %ld/%ld:%ld", | 1188 | pr_cont("barrier: %ld/%ld:%ld ", |
| 1074 | n_barrier_successes, | 1189 | n_barrier_successes, |
| 1075 | n_barrier_attempts, | 1190 | n_barrier_attempts, |
| 1076 | n_rcu_torture_barrier_error); | 1191 | n_rcu_torture_barrier_error); |
| 1077 | page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); | 1192 | pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods)); |
| 1193 | |||
| 1194 | pr_alert("%s%s ", torture_type, TORTURE_FLAG); | ||
| 1078 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1195 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
| 1079 | n_rcu_torture_barrier_error != 0 || | 1196 | n_rcu_torture_barrier_error != 0 || |
| 1080 | n_rcu_torture_boost_ktrerror != 0 || | 1197 | n_rcu_torture_boost_ktrerror != 0 || |
| 1081 | n_rcu_torture_boost_rterror != 0 || | 1198 | n_rcu_torture_boost_rterror != 0 || |
| 1082 | n_rcu_torture_boost_failure != 0 || | 1199 | n_rcu_torture_boost_failure != 0 || |
| 1083 | i > 1) { | 1200 | i > 1) { |
| 1084 | page += sprintf(page, "!!! "); | 1201 | pr_cont("%s", "!!! "); |
| 1085 | atomic_inc(&n_rcu_torture_error); | 1202 | atomic_inc(&n_rcu_torture_error); |
| 1086 | WARN_ON_ONCE(1); | 1203 | WARN_ON_ONCE(1); |
| 1087 | } | 1204 | } |
| 1088 | page += sprintf(page, "Reader Pipe: "); | 1205 | pr_cont("Reader Pipe: "); |
| 1089 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1206 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
| 1090 | page += sprintf(page, " %ld", pipesummary[i]); | 1207 | pr_cont(" %ld", pipesummary[i]); |
| 1091 | page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); | 1208 | pr_cont("\n"); |
| 1092 | page += sprintf(page, "Reader Batch: "); | 1209 | |
| 1210 | pr_alert("%s%s ", torture_type, TORTURE_FLAG); | ||
| 1211 | pr_cont("Reader Batch: "); | ||
| 1093 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1212 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
| 1094 | page += sprintf(page, " %ld", batchsummary[i]); | 1213 | pr_cont(" %ld", batchsummary[i]); |
| 1095 | page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); | 1214 | pr_cont("\n"); |
| 1096 | page += sprintf(page, "Free-Block Circulation: "); | 1215 | |
| 1216 | pr_alert("%s%s ", torture_type, TORTURE_FLAG); | ||
| 1217 | pr_cont("Free-Block Circulation: "); | ||
| 1097 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 1218 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
| 1098 | page += sprintf(page, " %d", | 1219 | pr_cont(" %d", atomic_read(&rcu_torture_wcount[i])); |
| 1099 | atomic_read(&rcu_torture_wcount[i])); | ||
| 1100 | } | 1220 | } |
| 1101 | page += sprintf(page, "\n"); | 1221 | pr_cont("\n"); |
| 1222 | |||
| 1102 | if (cur_ops->stats) | 1223 | if (cur_ops->stats) |
| 1103 | cur_ops->stats(page); | 1224 | cur_ops->stats(); |
| 1104 | if (rtcv_snap == rcu_torture_current_version && | 1225 | if (rtcv_snap == rcu_torture_current_version && |
| 1105 | rcu_torture_current != NULL) { | 1226 | rcu_torture_current != NULL) { |
| 1106 | int __maybe_unused flags; | 1227 | int __maybe_unused flags; |
| @@ -1109,10 +1230,9 @@ rcu_torture_printk(char *page) | |||
| 1109 | 1230 | ||
| 1110 | rcutorture_get_gp_data(cur_ops->ttype, | 1231 | rcutorture_get_gp_data(cur_ops->ttype, |
| 1111 | &flags, &gpnum, &completed); | 1232 | &flags, &gpnum, &completed); |
| 1112 | page += sprintf(page, | 1233 | pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", |
| 1113 | "??? Writer stall state %d g%lu c%lu f%#x\n", | 1234 | rcu_torture_writer_state, |
| 1114 | rcu_torture_writer_state, | 1235 | gpnum, completed, flags); |
| 1115 | gpnum, completed, flags); | ||
| 1116 | show_rcu_gp_kthreads(); | 1236 | show_rcu_gp_kthreads(); |
| 1117 | rcutorture_trace_dump(); | 1237 | rcutorture_trace_dump(); |
| 1118 | } | 1238 | } |
| @@ -1120,30 +1240,6 @@ rcu_torture_printk(char *page) | |||
| 1120 | } | 1240 | } |
| 1121 | 1241 | ||
| 1122 | /* | 1242 | /* |
| 1123 | * Print torture statistics. Caller must ensure that there is only | ||
| 1124 | * one call to this function at a given time!!! This is normally | ||
| 1125 | * accomplished by relying on the module system to only have one copy | ||
| 1126 | * of the module loaded, and then by giving the rcu_torture_stats | ||
| 1127 | * kthread full control (or the init/cleanup functions when rcu_torture_stats | ||
| 1128 | * thread is not running). | ||
| 1129 | */ | ||
| 1130 | static void | ||
| 1131 | rcu_torture_stats_print(void) | ||
| 1132 | { | ||
| 1133 | int size = nr_cpu_ids * 200 + 8192; | ||
| 1134 | char *buf; | ||
| 1135 | |||
| 1136 | buf = kmalloc(size, GFP_KERNEL); | ||
| 1137 | if (!buf) { | ||
| 1138 | pr_err("rcu-torture: Out of memory, need: %d", size); | ||
| 1139 | return; | ||
| 1140 | } | ||
| 1141 | rcu_torture_printk(buf); | ||
| 1142 | pr_alert("%s", buf); | ||
| 1143 | kfree(buf); | ||
| 1144 | } | ||
| 1145 | |||
| 1146 | /* | ||
| 1147 | * Periodically prints torture statistics, if periodic statistics printing | 1243 | * Periodically prints torture statistics, if periodic statistics printing |
| 1148 | * was specified via the stat_interval module parameter. | 1244 | * was specified via the stat_interval module parameter. |
| 1149 | */ | 1245 | */ |
| @@ -1295,7 +1391,8 @@ static int rcu_torture_barrier_cbs(void *arg) | |||
| 1295 | if (atomic_dec_and_test(&barrier_cbs_count)) | 1391 | if (atomic_dec_and_test(&barrier_cbs_count)) |
| 1296 | wake_up(&barrier_wq); | 1392 | wake_up(&barrier_wq); |
| 1297 | } while (!torture_must_stop()); | 1393 | } while (!torture_must_stop()); |
| 1298 | cur_ops->cb_barrier(); | 1394 | if (cur_ops->cb_barrier != NULL) |
| 1395 | cur_ops->cb_barrier(); | ||
| 1299 | destroy_rcu_head_on_stack(&rcu); | 1396 | destroy_rcu_head_on_stack(&rcu); |
| 1300 | torture_kthread_stopping("rcu_torture_barrier_cbs"); | 1397 | torture_kthread_stopping("rcu_torture_barrier_cbs"); |
| 1301 | return 0; | 1398 | return 0; |
| @@ -1418,7 +1515,7 @@ rcu_torture_cleanup(void) | |||
| 1418 | int i; | 1515 | int i; |
| 1419 | 1516 | ||
| 1420 | rcutorture_record_test_transition(); | 1517 | rcutorture_record_test_transition(); |
| 1421 | if (torture_cleanup()) { | 1518 | if (torture_cleanup_begin()) { |
| 1422 | if (cur_ops->cb_barrier != NULL) | 1519 | if (cur_ops->cb_barrier != NULL) |
| 1423 | cur_ops->cb_barrier(); | 1520 | cur_ops->cb_barrier(); |
| 1424 | return; | 1521 | return; |
| @@ -1447,6 +1544,8 @@ rcu_torture_cleanup(void) | |||
| 1447 | 1544 | ||
| 1448 | torture_stop_kthread(rcu_torture_stats, stats_task); | 1545 | torture_stop_kthread(rcu_torture_stats, stats_task); |
| 1449 | torture_stop_kthread(rcu_torture_fqs, fqs_task); | 1546 | torture_stop_kthread(rcu_torture_fqs, fqs_task); |
| 1547 | for (i = 0; i < ncbflooders; i++) | ||
| 1548 | torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); | ||
| 1450 | if ((test_boost == 1 && cur_ops->can_boost) || | 1549 | if ((test_boost == 1 && cur_ops->can_boost) || |
| 1451 | test_boost == 2) { | 1550 | test_boost == 2) { |
| 1452 | unregister_cpu_notifier(&rcutorture_cpu_nb); | 1551 | unregister_cpu_notifier(&rcutorture_cpu_nb); |
| @@ -1468,6 +1567,7 @@ rcu_torture_cleanup(void) | |||
| 1468 | "End of test: RCU_HOTPLUG"); | 1567 | "End of test: RCU_HOTPLUG"); |
| 1469 | else | 1568 | else |
| 1470 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); | 1569 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
| 1570 | torture_cleanup_end(); | ||
| 1471 | } | 1571 | } |
| 1472 | 1572 | ||
| 1473 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | 1573 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD |
| @@ -1534,9 +1634,10 @@ rcu_torture_init(void) | |||
| 1534 | int firsterr = 0; | 1634 | int firsterr = 0; |
| 1535 | static struct rcu_torture_ops *torture_ops[] = { | 1635 | static struct rcu_torture_ops *torture_ops[] = { |
| 1536 | &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, | 1636 | &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, |
| 1637 | RCUTORTURE_TASKS_OPS | ||
| 1537 | }; | 1638 | }; |
| 1538 | 1639 | ||
| 1539 | if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) | 1640 | if (!torture_init_begin(torture_type, verbose, &torture_runnable)) |
| 1540 | return -EBUSY; | 1641 | return -EBUSY; |
| 1541 | 1642 | ||
| 1542 | /* Process args and tell the world that the torturer is on the job. */ | 1643 | /* Process args and tell the world that the torturer is on the job. */ |
| @@ -1693,6 +1794,24 @@ rcu_torture_init(void) | |||
| 1693 | goto unwind; | 1794 | goto unwind; |
| 1694 | if (object_debug) | 1795 | if (object_debug) |
| 1695 | rcu_test_debug_objects(); | 1796 | rcu_test_debug_objects(); |
| 1797 | if (cbflood_n_burst > 0) { | ||
| 1798 | /* Create the cbflood threads */ | ||
| 1799 | ncbflooders = (num_online_cpus() + 3) / 4; | ||
| 1800 | cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task), | ||
| 1801 | GFP_KERNEL); | ||
| 1802 | if (!cbflood_task) { | ||
| 1803 | VERBOSE_TOROUT_ERRSTRING("out of memory"); | ||
| 1804 | firsterr = -ENOMEM; | ||
| 1805 | goto unwind; | ||
| 1806 | } | ||
| 1807 | for (i = 0; i < ncbflooders; i++) { | ||
| 1808 | firsterr = torture_create_kthread(rcu_torture_cbflood, | ||
| 1809 | NULL, | ||
| 1810 | cbflood_task[i]); | ||
| 1811 | if (firsterr) | ||
| 1812 | goto unwind; | ||
| 1813 | } | ||
| 1814 | } | ||
| 1696 | rcutorture_record_test_transition(); | 1815 | rcutorture_record_test_transition(); |
| 1697 | torture_init_end(); | 1816 | torture_init_end(); |
| 1698 | return 0; | 1817 | return 0; |
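The init-path hunk above spawns one callback-flood kthread per four online CPUs when cbflood_n_burst is set, and the cleanup hunk stops them again. As a rough, self-contained illustration of that sizing-and-spawn pattern (everything prefixed sketch_ is invented for the example; the real code goes through the torture_create_kthread()/torture_stop_kthread() wrappers):

```c
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/sched.h>

static struct task_struct **sketch_tasks;
static int sketch_ntasks;

static int sketch_flood_fn(void *arg)
{
	/* Placeholder body: a real flooder would post callbacks here. */
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static int sketch_start_flooders(void)
{
	int i;

	sketch_ntasks = (num_online_cpus() + 3) / 4;	/* one per 4 CPUs */
	sketch_tasks = kcalloc(sketch_ntasks, sizeof(*sketch_tasks),
			       GFP_KERNEL);
	if (!sketch_tasks)
		return -ENOMEM;
	for (i = 0; i < sketch_ntasks; i++) {
		sketch_tasks[i] = kthread_run(sketch_flood_fn, NULL,
					      "sketch_flood/%d", i);
		if (IS_ERR(sketch_tasks[i]))
			return PTR_ERR(sketch_tasks[i]); /* unwinding of earlier threads omitted */
	}
	return 0;
}
```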
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d9efcc13008c..0db5649f8817 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -51,7 +51,7 @@ static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | |||
| 51 | 51 | ||
| 52 | #include "tiny_plugin.h" | 52 | #include "tiny_plugin.h" |
| 53 | 53 | ||
| 54 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ | 54 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */ |
| 55 | static void rcu_idle_enter_common(long long newval) | 55 | static void rcu_idle_enter_common(long long newval) |
| 56 | { | 56 | { |
| 57 | if (newval) { | 57 | if (newval) { |
| @@ -62,7 +62,7 @@ static void rcu_idle_enter_common(long long newval) | |||
| 62 | } | 62 | } |
| 63 | RCU_TRACE(trace_rcu_dyntick(TPS("Start"), | 63 | RCU_TRACE(trace_rcu_dyntick(TPS("Start"), |
| 64 | rcu_dynticks_nesting, newval)); | 64 | rcu_dynticks_nesting, newval)); |
| 65 | if (!is_idle_task(current)) { | 65 | if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { |
| 66 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); | 66 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); |
| 67 | 67 | ||
| 68 | RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), | 68 | RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), |
| @@ -72,7 +72,7 @@ static void rcu_idle_enter_common(long long newval) | |||
| 72 | current->pid, current->comm, | 72 | current->pid, current->comm, |
| 73 | idle->pid, idle->comm); /* must be idle task! */ | 73 | idle->pid, idle->comm); /* must be idle task! */ |
| 74 | } | 74 | } |
| 75 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ | 75 | rcu_sched_qs(); /* implies rcu_bh_inc() */ |
| 76 | barrier(); | 76 | barrier(); |
| 77 | rcu_dynticks_nesting = newval; | 77 | rcu_dynticks_nesting = newval; |
| 78 | } | 78 | } |
| @@ -114,7 +114,7 @@ void rcu_irq_exit(void) | |||
| 114 | } | 114 | } |
| 115 | EXPORT_SYMBOL_GPL(rcu_irq_exit); | 115 | EXPORT_SYMBOL_GPL(rcu_irq_exit); |
| 116 | 116 | ||
| 117 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ | 117 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */ |
| 118 | static void rcu_idle_exit_common(long long oldval) | 118 | static void rcu_idle_exit_common(long long oldval) |
| 119 | { | 119 | { |
| 120 | if (oldval) { | 120 | if (oldval) { |
| @@ -123,7 +123,7 @@ static void rcu_idle_exit_common(long long oldval) | |||
| 123 | return; | 123 | return; |
| 124 | } | 124 | } |
| 125 | RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); | 125 | RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); |
| 126 | if (!is_idle_task(current)) { | 126 | if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { |
| 127 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); | 127 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); |
| 128 | 128 | ||
| 129 | RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), | 129 | RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), |
| @@ -217,7 +217,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | |||
| 217 | * are at it, given that any rcu quiescent state is also an rcu_bh | 217 | * are at it, given that any rcu quiescent state is also an rcu_bh |
| 218 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. | 218 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. |
| 219 | */ | 219 | */ |
| 220 | void rcu_sched_qs(int cpu) | 220 | void rcu_sched_qs(void) |
| 221 | { | 221 | { |
| 222 | unsigned long flags; | 222 | unsigned long flags; |
| 223 | 223 | ||
| @@ -231,7 +231,7 @@ void rcu_sched_qs(int cpu) | |||
| 231 | /* | 231 | /* |
| 232 | * Record an rcu_bh quiescent state. | 232 | * Record an rcu_bh quiescent state. |
| 233 | */ | 233 | */ |
| 234 | void rcu_bh_qs(int cpu) | 234 | void rcu_bh_qs(void) |
| 235 | { | 235 | { |
| 236 | unsigned long flags; | 236 | unsigned long flags; |
| 237 | 237 | ||
| @@ -247,13 +247,15 @@ void rcu_bh_qs(int cpu) | |||
| 247 | * be called from hardirq context. It is normally called from the | 247 | * be called from hardirq context. It is normally called from the |
| 248 | * scheduling-clock interrupt. | 248 | * scheduling-clock interrupt. |
| 249 | */ | 249 | */ |
| 250 | void rcu_check_callbacks(int cpu, int user) | 250 | void rcu_check_callbacks(int user) |
| 251 | { | 251 | { |
| 252 | RCU_TRACE(check_cpu_stalls()); | 252 | RCU_TRACE(check_cpu_stalls()); |
| 253 | if (user || rcu_is_cpu_rrupt_from_idle()) | 253 | if (user || rcu_is_cpu_rrupt_from_idle()) |
| 254 | rcu_sched_qs(cpu); | 254 | rcu_sched_qs(); |
| 255 | else if (!in_softirq()) | 255 | else if (!in_softirq()) |
| 256 | rcu_bh_qs(cpu); | 256 | rcu_bh_qs(); |
| 257 | if (user) | ||
| 258 | rcu_note_voluntary_context_switch(current); | ||
| 257 | } | 259 | } |
| 258 | 260 | ||
| 259 | /* | 261 | /* |
| @@ -378,7 +380,9 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
| 378 | } | 380 | } |
| 379 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 381 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
| 380 | 382 | ||
| 381 | void rcu_init(void) | 383 | void __init rcu_init(void) |
| 382 | { | 384 | { |
| 383 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 385 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
| 386 | |||
| 387 | rcu_early_boot_tests(); | ||
| 384 | } | 388 | } |
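Throughout tiny.c, and in tree.c below, the quiescent-state hooks drop their "int cpu" parameter: these functions only ever run on the CPU whose state they record, so they can use the this-CPU accessors instead of a caller-supplied index. A minimal sketch of the old and new shapes, with placeholder structure and field names rather than the kernel's:

```c
#include <linux/percpu.h>

struct sketch_qs_data {
	int passed_quiesce;
	unsigned long gpnum;
};
static DEFINE_PER_CPU(struct sketch_qs_data, sketch_qs);

/* Old shape: the caller names the CPU explicitly. */
static void sketch_qs_old(int cpu)
{
	struct sketch_qs_data *p = &per_cpu(sketch_qs, cpu);

	if (!p->passed_quiesce)
		p->passed_quiesce = 1;
}

/* New shape: operate on the local CPU's data directly. */
static void sketch_qs_new(void)
{
	if (!__this_cpu_read(sketch_qs.passed_quiesce))
		__this_cpu_write(sketch_qs.passed_quiesce, 1);
}
```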
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1b70cb6fbe3c..7680fc275036 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -79,9 +79,18 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | |||
| 79 | * the tracing userspace tools to be able to decipher the string | 79 | * the tracing userspace tools to be able to decipher the string |
| 80 | * address to the matching string. | 80 | * address to the matching string. |
| 81 | */ | 81 | */ |
| 82 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | 82 | #ifdef CONFIG_TRACING |
| 83 | # define DEFINE_RCU_TPS(sname) \ | ||
| 83 | static char sname##_varname[] = #sname; \ | 84 | static char sname##_varname[] = #sname; \ |
| 84 | static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ | 85 | static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; |
| 86 | # define RCU_STATE_NAME(sname) sname##_varname | ||
| 87 | #else | ||
| 88 | # define DEFINE_RCU_TPS(sname) | ||
| 89 | # define RCU_STATE_NAME(sname) __stringify(sname) | ||
| 90 | #endif | ||
| 91 | |||
| 92 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | ||
| 93 | DEFINE_RCU_TPS(sname) \ | ||
| 85 | struct rcu_state sname##_state = { \ | 94 | struct rcu_state sname##_state = { \ |
| 86 | .level = { &sname##_state.node[0] }, \ | 95 | .level = { &sname##_state.node[0] }, \ |
| 87 | .call = cr, \ | 96 | .call = cr, \ |
| @@ -93,10 +102,10 @@ struct rcu_state sname##_state = { \ | |||
| 93 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 102 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
| 94 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 103 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
| 95 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | 104 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ |
| 96 | .name = sname##_varname, \ | 105 | .name = RCU_STATE_NAME(sname), \ |
| 97 | .abbr = sabbr, \ | 106 | .abbr = sabbr, \ |
| 98 | }; \ | 107 | }; \ |
| 99 | DEFINE_PER_CPU(struct rcu_data, sname##_data) | 108 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data) |
| 100 | 109 | ||
| 101 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | 110 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); |
| 102 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 111 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); |
| @@ -143,19 +152,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); | |||
| 143 | */ | 152 | */ |
| 144 | static int rcu_scheduler_fully_active __read_mostly; | 153 | static int rcu_scheduler_fully_active __read_mostly; |
| 145 | 154 | ||
| 146 | #ifdef CONFIG_RCU_BOOST | ||
| 147 | |||
| 148 | /* | ||
| 149 | * Control variables for per-CPU and per-rcu_node kthreads. These | ||
| 150 | * handle all flavors of RCU. | ||
| 151 | */ | ||
| 152 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | ||
| 153 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
| 154 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
| 155 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | ||
| 156 | |||
| 157 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 158 | |||
| 159 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | 155 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); |
| 160 | static void invoke_rcu_core(void); | 156 | static void invoke_rcu_core(void); |
| 161 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 157 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
| @@ -188,22 +184,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) | |||
| 188 | * one since the start of the grace period, this just sets a flag. | 184 | * one since the start of the grace period, this just sets a flag. |
| 189 | * The caller must have disabled preemption. | 185 | * The caller must have disabled preemption. |
| 190 | */ | 186 | */ |
| 191 | void rcu_sched_qs(int cpu) | 187 | void rcu_sched_qs(void) |
| 192 | { | 188 | { |
| 193 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); | 189 | if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { |
| 194 | 190 | trace_rcu_grace_period(TPS("rcu_sched"), | |
| 195 | if (rdp->passed_quiesce == 0) | 191 | __this_cpu_read(rcu_sched_data.gpnum), |
| 196 | trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); | 192 | TPS("cpuqs")); |
| 197 | rdp->passed_quiesce = 1; | 193 | __this_cpu_write(rcu_sched_data.passed_quiesce, 1); |
| 194 | } | ||
| 198 | } | 195 | } |
| 199 | 196 | ||
| 200 | void rcu_bh_qs(int cpu) | 197 | void rcu_bh_qs(void) |
| 201 | { | 198 | { |
| 202 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | 199 | if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { |
| 203 | 200 | trace_rcu_grace_period(TPS("rcu_bh"), | |
| 204 | if (rdp->passed_quiesce == 0) | 201 | __this_cpu_read(rcu_bh_data.gpnum), |
| 205 | trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); | 202 | TPS("cpuqs")); |
| 206 | rdp->passed_quiesce = 1; | 203 | __this_cpu_write(rcu_bh_data.passed_quiesce, 1); |
| 204 | } | ||
| 207 | } | 205 | } |
| 208 | 206 | ||
| 209 | static DEFINE_PER_CPU(int, rcu_sched_qs_mask); | 207 | static DEFINE_PER_CPU(int, rcu_sched_qs_mask); |
| @@ -275,11 +273,11 @@ static void rcu_momentary_dyntick_idle(void) | |||
| 275 | * and requires special handling for preemptible RCU. | 273 | * and requires special handling for preemptible RCU. |
| 276 | * The caller must have disabled preemption. | 274 | * The caller must have disabled preemption. |
| 277 | */ | 275 | */ |
| 278 | void rcu_note_context_switch(int cpu) | 276 | void rcu_note_context_switch(void) |
| 279 | { | 277 | { |
| 280 | trace_rcu_utilization(TPS("Start context switch")); | 278 | trace_rcu_utilization(TPS("Start context switch")); |
| 281 | rcu_sched_qs(cpu); | 279 | rcu_sched_qs(); |
| 282 | rcu_preempt_note_context_switch(cpu); | 280 | rcu_preempt_note_context_switch(); |
| 283 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | 281 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) |
| 284 | rcu_momentary_dyntick_idle(); | 282 | rcu_momentary_dyntick_idle(); |
| 285 | trace_rcu_utilization(TPS("End context switch")); | 283 | trace_rcu_utilization(TPS("End context switch")); |
| @@ -314,7 +312,7 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 314 | unsigned long *maxj), | 312 | unsigned long *maxj), |
| 315 | bool *isidle, unsigned long *maxj); | 313 | bool *isidle, unsigned long *maxj); |
| 316 | static void force_quiescent_state(struct rcu_state *rsp); | 314 | static void force_quiescent_state(struct rcu_state *rsp); |
| 317 | static int rcu_pending(int cpu); | 315 | static int rcu_pending(void); |
| 318 | 316 | ||
| 319 | /* | 317 | /* |
| 320 | * Return the number of RCU-sched batches processed thus far for debug & stats. | 318 | * Return the number of RCU-sched batches processed thus far for debug & stats. |
| @@ -499,11 +497,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 499 | * we really have entered idle, and must do the appropriate accounting. | 497 | * we really have entered idle, and must do the appropriate accounting. |
| 500 | * The caller must have disabled interrupts. | 498 | * The caller must have disabled interrupts. |
| 501 | */ | 499 | */ |
| 502 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | 500 | static void rcu_eqs_enter_common(long long oldval, bool user) |
| 503 | bool user) | ||
| 504 | { | 501 | { |
| 505 | struct rcu_state *rsp; | 502 | struct rcu_state *rsp; |
| 506 | struct rcu_data *rdp; | 503 | struct rcu_data *rdp; |
| 504 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 507 | 505 | ||
| 508 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); | 506 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); |
| 509 | if (!user && !is_idle_task(current)) { | 507 | if (!user && !is_idle_task(current)) { |
| @@ -520,12 +518,13 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | |||
| 520 | rdp = this_cpu_ptr(rsp->rda); | 518 | rdp = this_cpu_ptr(rsp->rda); |
| 521 | do_nocb_deferred_wakeup(rdp); | 519 | do_nocb_deferred_wakeup(rdp); |
| 522 | } | 520 | } |
| 523 | rcu_prepare_for_idle(smp_processor_id()); | 521 | rcu_prepare_for_idle(); |
| 524 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 522 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
| 525 | smp_mb__before_atomic(); /* See above. */ | 523 | smp_mb__before_atomic(); /* See above. */ |
| 526 | atomic_inc(&rdtp->dynticks); | 524 | atomic_inc(&rdtp->dynticks); |
| 527 | smp_mb__after_atomic(); /* Force ordering with next sojourn. */ | 525 | smp_mb__after_atomic(); /* Force ordering with next sojourn. */ |
| 528 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 526 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
| 527 | rcu_dynticks_task_enter(); | ||
| 529 | 528 | ||
| 530 | /* | 529 | /* |
| 531 | * It is illegal to enter an extended quiescent state while | 530 | * It is illegal to enter an extended quiescent state while |
| @@ -553,7 +552,7 @@ static void rcu_eqs_enter(bool user) | |||
| 553 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); | 552 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); |
| 554 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { | 553 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { |
| 555 | rdtp->dynticks_nesting = 0; | 554 | rdtp->dynticks_nesting = 0; |
| 556 | rcu_eqs_enter_common(rdtp, oldval, user); | 555 | rcu_eqs_enter_common(oldval, user); |
| 557 | } else { | 556 | } else { |
| 558 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | 557 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; |
| 559 | } | 558 | } |
| @@ -577,7 +576,7 @@ void rcu_idle_enter(void) | |||
| 577 | 576 | ||
| 578 | local_irq_save(flags); | 577 | local_irq_save(flags); |
| 579 | rcu_eqs_enter(false); | 578 | rcu_eqs_enter(false); |
| 580 | rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); | 579 | rcu_sysidle_enter(0); |
| 581 | local_irq_restore(flags); | 580 | local_irq_restore(flags); |
| 582 | } | 581 | } |
| 583 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 582 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
| @@ -627,8 +626,8 @@ void rcu_irq_exit(void) | |||
| 627 | if (rdtp->dynticks_nesting) | 626 | if (rdtp->dynticks_nesting) |
| 628 | trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); | 627 | trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); |
| 629 | else | 628 | else |
| 630 | rcu_eqs_enter_common(rdtp, oldval, true); | 629 | rcu_eqs_enter_common(oldval, true); |
| 631 | rcu_sysidle_enter(rdtp, 1); | 630 | rcu_sysidle_enter(1); |
| 632 | local_irq_restore(flags); | 631 | local_irq_restore(flags); |
| 633 | } | 632 | } |
| 634 | 633 | ||
| @@ -639,15 +638,17 @@ void rcu_irq_exit(void) | |||
| 639 | * we really have exited idle, and must do the appropriate accounting. | 638 | * we really have exited idle, and must do the appropriate accounting. |
| 640 | * The caller must have disabled interrupts. | 639 | * The caller must have disabled interrupts. |
| 641 | */ | 640 | */ |
| 642 | static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | 641 | static void rcu_eqs_exit_common(long long oldval, int user) |
| 643 | int user) | ||
| 644 | { | 642 | { |
| 643 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 644 | |||
| 645 | rcu_dynticks_task_exit(); | ||
| 645 | smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ | 646 | smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ |
| 646 | atomic_inc(&rdtp->dynticks); | 647 | atomic_inc(&rdtp->dynticks); |
| 647 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 648 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ |
| 648 | smp_mb__after_atomic(); /* See above. */ | 649 | smp_mb__after_atomic(); /* See above. */ |
| 649 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 650 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
| 650 | rcu_cleanup_after_idle(smp_processor_id()); | 651 | rcu_cleanup_after_idle(); |
| 651 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); | 652 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); |
| 652 | if (!user && !is_idle_task(current)) { | 653 | if (!user && !is_idle_task(current)) { |
| 653 | struct task_struct *idle __maybe_unused = | 654 | struct task_struct *idle __maybe_unused = |
| @@ -678,7 +679,7 @@ static void rcu_eqs_exit(bool user) | |||
| 678 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | 679 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; |
| 679 | } else { | 680 | } else { |
| 680 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 681 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
| 681 | rcu_eqs_exit_common(rdtp, oldval, user); | 682 | rcu_eqs_exit_common(oldval, user); |
| 682 | } | 683 | } |
| 683 | } | 684 | } |
| 684 | 685 | ||
| @@ -699,7 +700,7 @@ void rcu_idle_exit(void) | |||
| 699 | 700 | ||
| 700 | local_irq_save(flags); | 701 | local_irq_save(flags); |
| 701 | rcu_eqs_exit(false); | 702 | rcu_eqs_exit(false); |
| 702 | rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); | 703 | rcu_sysidle_exit(0); |
| 703 | local_irq_restore(flags); | 704 | local_irq_restore(flags); |
| 704 | } | 705 | } |
| 705 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 706 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
| @@ -750,8 +751,8 @@ void rcu_irq_enter(void) | |||
| 750 | if (oldval) | 751 | if (oldval) |
| 751 | trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); | 752 | trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); |
| 752 | else | 753 | else |
| 753 | rcu_eqs_exit_common(rdtp, oldval, true); | 754 | rcu_eqs_exit_common(oldval, true); |
| 754 | rcu_sysidle_exit(rdtp, 1); | 755 | rcu_sysidle_exit(1); |
| 755 | local_irq_restore(flags); | 756 | local_irq_restore(flags); |
| 756 | } | 757 | } |
| 757 | 758 | ||
| @@ -819,7 +820,7 @@ bool notrace __rcu_is_watching(void) | |||
| 819 | */ | 820 | */ |
| 820 | bool notrace rcu_is_watching(void) | 821 | bool notrace rcu_is_watching(void) |
| 821 | { | 822 | { |
| 822 | int ret; | 823 | bool ret; |
| 823 | 824 | ||
| 824 | preempt_disable(); | 825 | preempt_disable(); |
| 825 | ret = __rcu_is_watching(); | 826 | ret = __rcu_is_watching(); |
| @@ -1647,7 +1648,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1647 | rnp->level, rnp->grplo, | 1648 | rnp->level, rnp->grplo, |
| 1648 | rnp->grphi, rnp->qsmask); | 1649 | rnp->grphi, rnp->qsmask); |
| 1649 | raw_spin_unlock_irq(&rnp->lock); | 1650 | raw_spin_unlock_irq(&rnp->lock); |
| 1650 | cond_resched(); | 1651 | cond_resched_rcu_qs(); |
| 1651 | } | 1652 | } |
| 1652 | 1653 | ||
| 1653 | mutex_unlock(&rsp->onoff_mutex); | 1654 | mutex_unlock(&rsp->onoff_mutex); |
| @@ -1668,7 +1669,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
| 1668 | if (fqs_state == RCU_SAVE_DYNTICK) { | 1669 | if (fqs_state == RCU_SAVE_DYNTICK) { |
| 1669 | /* Collect dyntick-idle snapshots. */ | 1670 | /* Collect dyntick-idle snapshots. */ |
| 1670 | if (is_sysidle_rcu_state(rsp)) { | 1671 | if (is_sysidle_rcu_state(rsp)) { |
| 1671 | isidle = 1; | 1672 | isidle = true; |
| 1672 | maxj = jiffies - ULONG_MAX / 4; | 1673 | maxj = jiffies - ULONG_MAX / 4; |
| 1673 | } | 1674 | } |
| 1674 | force_qs_rnp(rsp, dyntick_save_progress_counter, | 1675 | force_qs_rnp(rsp, dyntick_save_progress_counter, |
| @@ -1677,14 +1678,15 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
| 1677 | fqs_state = RCU_FORCE_QS; | 1678 | fqs_state = RCU_FORCE_QS; |
| 1678 | } else { | 1679 | } else { |
| 1679 | /* Handle dyntick-idle and offline CPUs. */ | 1680 | /* Handle dyntick-idle and offline CPUs. */ |
| 1680 | isidle = 0; | 1681 | isidle = false; |
| 1681 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | 1682 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); |
| 1682 | } | 1683 | } |
| 1683 | /* Clear flag to prevent immediate re-entry. */ | 1684 | /* Clear flag to prevent immediate re-entry. */ |
| 1684 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 1685 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
| 1685 | raw_spin_lock_irq(&rnp->lock); | 1686 | raw_spin_lock_irq(&rnp->lock); |
| 1686 | smp_mb__after_unlock_lock(); | 1687 | smp_mb__after_unlock_lock(); |
| 1687 | ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; | 1688 | ACCESS_ONCE(rsp->gp_flags) = |
| 1689 | ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS; | ||
| 1688 | raw_spin_unlock_irq(&rnp->lock); | 1690 | raw_spin_unlock_irq(&rnp->lock); |
| 1689 | } | 1691 | } |
| 1690 | return fqs_state; | 1692 | return fqs_state; |
| @@ -1736,7 +1738,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1736 | /* smp_mb() provided by prior unlock-lock pair. */ | 1738 | /* smp_mb() provided by prior unlock-lock pair. */ |
| 1737 | nocb += rcu_future_gp_cleanup(rsp, rnp); | 1739 | nocb += rcu_future_gp_cleanup(rsp, rnp); |
| 1738 | raw_spin_unlock_irq(&rnp->lock); | 1740 | raw_spin_unlock_irq(&rnp->lock); |
| 1739 | cond_resched(); | 1741 | cond_resched_rcu_qs(); |
| 1740 | } | 1742 | } |
| 1741 | rnp = rcu_get_root(rsp); | 1743 | rnp = rcu_get_root(rsp); |
| 1742 | raw_spin_lock_irq(&rnp->lock); | 1744 | raw_spin_lock_irq(&rnp->lock); |
| @@ -1785,8 +1787,8 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1785 | /* Locking provides needed memory barrier. */ | 1787 | /* Locking provides needed memory barrier. */ |
| 1786 | if (rcu_gp_init(rsp)) | 1788 | if (rcu_gp_init(rsp)) |
| 1787 | break; | 1789 | break; |
| 1788 | cond_resched(); | 1790 | cond_resched_rcu_qs(); |
| 1789 | flush_signals(current); | 1791 | WARN_ON(signal_pending(current)); |
| 1790 | trace_rcu_grace_period(rsp->name, | 1792 | trace_rcu_grace_period(rsp->name, |
| 1791 | ACCESS_ONCE(rsp->gpnum), | 1793 | ACCESS_ONCE(rsp->gpnum), |
| 1792 | TPS("reqwaitsig")); | 1794 | TPS("reqwaitsig")); |
| @@ -1828,11 +1830,11 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1828 | trace_rcu_grace_period(rsp->name, | 1830 | trace_rcu_grace_period(rsp->name, |
| 1829 | ACCESS_ONCE(rsp->gpnum), | 1831 | ACCESS_ONCE(rsp->gpnum), |
| 1830 | TPS("fqsend")); | 1832 | TPS("fqsend")); |
| 1831 | cond_resched(); | 1833 | cond_resched_rcu_qs(); |
| 1832 | } else { | 1834 | } else { |
| 1833 | /* Deal with stray signal. */ | 1835 | /* Deal with stray signal. */ |
| 1834 | cond_resched(); | 1836 | cond_resched_rcu_qs(); |
| 1835 | flush_signals(current); | 1837 | WARN_ON(signal_pending(current)); |
| 1836 | trace_rcu_grace_period(rsp->name, | 1838 | trace_rcu_grace_period(rsp->name, |
| 1837 | ACCESS_ONCE(rsp->gpnum), | 1839 | ACCESS_ONCE(rsp->gpnum), |
| 1838 | TPS("fqswaitsig")); | 1840 | TPS("fqswaitsig")); |
| @@ -1928,7 +1930,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
| 1928 | { | 1930 | { |
| 1929 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 1931 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
| 1930 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | 1932 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); |
| 1931 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ | 1933 | rcu_gp_kthread_wake(rsp); |
| 1932 | } | 1934 | } |
| 1933 | 1935 | ||
| 1934 | /* | 1936 | /* |
| @@ -2210,8 +2212,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
| 2210 | /* Adjust any no-longer-needed kthreads. */ | 2212 | /* Adjust any no-longer-needed kthreads. */ |
| 2211 | rcu_boost_kthread_setaffinity(rnp, -1); | 2213 | rcu_boost_kthread_setaffinity(rnp, -1); |
| 2212 | 2214 | ||
| 2213 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ | ||
| 2214 | |||
| 2215 | /* Exclude any attempts to start a new grace period. */ | 2215 | /* Exclude any attempts to start a new grace period. */ |
| 2216 | mutex_lock(&rsp->onoff_mutex); | 2216 | mutex_lock(&rsp->onoff_mutex); |
| 2217 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); | 2217 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); |
| @@ -2375,7 +2375,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2375 | * invoked from the scheduling-clock interrupt. If rcu_pending returns | 2375 | * invoked from the scheduling-clock interrupt. If rcu_pending returns |
| 2376 | * false, there is no point in invoking rcu_check_callbacks(). | 2376 | * false, there is no point in invoking rcu_check_callbacks(). |
| 2377 | */ | 2377 | */ |
| 2378 | void rcu_check_callbacks(int cpu, int user) | 2378 | void rcu_check_callbacks(int user) |
| 2379 | { | 2379 | { |
| 2380 | trace_rcu_utilization(TPS("Start scheduler-tick")); | 2380 | trace_rcu_utilization(TPS("Start scheduler-tick")); |
| 2381 | increment_cpu_stall_ticks(); | 2381 | increment_cpu_stall_ticks(); |
| @@ -2393,8 +2393,8 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 2393 | * at least not while the corresponding CPU is online. | 2393 | * at least not while the corresponding CPU is online. |
| 2394 | */ | 2394 | */ |
| 2395 | 2395 | ||
| 2396 | rcu_sched_qs(cpu); | 2396 | rcu_sched_qs(); |
| 2397 | rcu_bh_qs(cpu); | 2397 | rcu_bh_qs(); |
| 2398 | 2398 | ||
| 2399 | } else if (!in_softirq()) { | 2399 | } else if (!in_softirq()) { |
| 2400 | 2400 | ||
| @@ -2405,11 +2405,13 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 2405 | * critical section, so note it. | 2405 | * critical section, so note it. |
| 2406 | */ | 2406 | */ |
| 2407 | 2407 | ||
| 2408 | rcu_bh_qs(cpu); | 2408 | rcu_bh_qs(); |
| 2409 | } | 2409 | } |
| 2410 | rcu_preempt_check_callbacks(cpu); | 2410 | rcu_preempt_check_callbacks(); |
| 2411 | if (rcu_pending(cpu)) | 2411 | if (rcu_pending()) |
| 2412 | invoke_rcu_core(); | 2412 | invoke_rcu_core(); |
| 2413 | if (user) | ||
| 2414 | rcu_note_voluntary_context_switch(current); | ||
| 2413 | trace_rcu_utilization(TPS("End scheduler-tick")); | 2415 | trace_rcu_utilization(TPS("End scheduler-tick")); |
| 2414 | } | 2416 | } |
| 2415 | 2417 | ||
| @@ -2432,7 +2434,7 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2432 | struct rcu_node *rnp; | 2434 | struct rcu_node *rnp; |
| 2433 | 2435 | ||
| 2434 | rcu_for_each_leaf_node(rsp, rnp) { | 2436 | rcu_for_each_leaf_node(rsp, rnp) { |
| 2435 | cond_resched(); | 2437 | cond_resched_rcu_qs(); |
| 2436 | mask = 0; | 2438 | mask = 0; |
| 2437 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2439 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 2438 | smp_mb__after_unlock_lock(); | 2440 | smp_mb__after_unlock_lock(); |
| @@ -2449,7 +2451,7 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2449 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 2451 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { |
| 2450 | if ((rnp->qsmask & bit) != 0) { | 2452 | if ((rnp->qsmask & bit) != 0) { |
| 2451 | if ((rnp->qsmaskinit & bit) != 0) | 2453 | if ((rnp->qsmaskinit & bit) != 0) |
| 2452 | *isidle = 0; | 2454 | *isidle = false; |
| 2453 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | 2455 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) |
| 2454 | mask |= bit; | 2456 | mask |= bit; |
| 2455 | } | 2457 | } |
| @@ -2505,9 +2507,10 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
| 2505 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | 2507 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
| 2506 | return; /* Someone beat us to it. */ | 2508 | return; /* Someone beat us to it. */ |
| 2507 | } | 2509 | } |
| 2508 | ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; | 2510 | ACCESS_ONCE(rsp->gp_flags) = |
| 2511 | ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS; | ||
| 2509 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | 2512 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
| 2510 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ | 2513 | rcu_gp_kthread_wake(rsp); |
| 2511 | } | 2514 | } |
| 2512 | 2515 | ||
| 2513 | /* | 2516 | /* |
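This hunk and the earlier rcu_gp_fqs() hunk rewrite the compound assignments on rsp->gp_flags as an explicit load, modify, and store. Presumably the point is to keep each ACCESS_ONCE() use a single volatile read or a single volatile write rather than a compound read-modify-write; that reading of the motivation is this note's assumption, not a statement from the commit. The pattern, sketched with a local macro and a stand-in flags word:

```c
/* Local stand-in for the kernel's ACCESS_ONCE(), for illustration only. */
#define SKETCH_ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

static unsigned long sketch_gp_flags;

static void sketch_set_flag(unsigned long bit)
{
	/* Instead of: SKETCH_ACCESS_ONCE(sketch_gp_flags) |= bit; */
	SKETCH_ACCESS_ONCE(sketch_gp_flags) =
		SKETCH_ACCESS_ONCE(sketch_gp_flags) | bit;
}

static void sketch_clear_flag(unsigned long bit)
{
	/* Likewise for the &= ~bit form used in rcu_gp_fqs(). */
	SKETCH_ACCESS_ONCE(sketch_gp_flags) =
		SKETCH_ACCESS_ONCE(sketch_gp_flags) & ~bit;
}
```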
| @@ -2925,11 +2928,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data) | |||
| 2925 | * restructure your code to batch your updates, and then use a single | 2928 | * restructure your code to batch your updates, and then use a single |
| 2926 | * synchronize_sched() instead. | 2929 | * synchronize_sched() instead. |
| 2927 | * | 2930 | * |
| 2928 | * Note that it is illegal to call this function while holding any lock | ||
| 2929 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | ||
| 2930 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
| 2931 | * these restriction will result in deadlock. | ||
| 2932 | * | ||
| 2933 | * This implementation can be thought of as an application of ticket | 2931 | * This implementation can be thought of as an application of ticket |
| 2934 | * locking to RCU, with sync_sched_expedited_started and | 2932 | * locking to RCU, with sync_sched_expedited_started and |
| 2935 | * sync_sched_expedited_done taking on the roles of the halves | 2933 | * sync_sched_expedited_done taking on the roles of the halves |
| @@ -2953,6 +2951,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data) | |||
| 2953 | */ | 2951 | */ |
| 2954 | void synchronize_sched_expedited(void) | 2952 | void synchronize_sched_expedited(void) |
| 2955 | { | 2953 | { |
| 2954 | cpumask_var_t cm; | ||
| 2955 | bool cma = false; | ||
| 2956 | int cpu; | ||
| 2956 | long firstsnap, s, snap; | 2957 | long firstsnap, s, snap; |
| 2957 | int trycount = 0; | 2958 | int trycount = 0; |
| 2958 | struct rcu_state *rsp = &rcu_sched_state; | 2959 | struct rcu_state *rsp = &rcu_sched_state; |
| @@ -2979,14 +2980,34 @@ void synchronize_sched_expedited(void) | |||
| 2979 | */ | 2980 | */ |
| 2980 | snap = atomic_long_inc_return(&rsp->expedited_start); | 2981 | snap = atomic_long_inc_return(&rsp->expedited_start); |
| 2981 | firstsnap = snap; | 2982 | firstsnap = snap; |
| 2982 | get_online_cpus(); | 2983 | if (!try_get_online_cpus()) { |
| 2984 | /* CPU hotplug operation in flight, fall back to normal GP. */ | ||
| 2985 | wait_rcu_gp(call_rcu_sched); | ||
| 2986 | atomic_long_inc(&rsp->expedited_normal); | ||
| 2987 | return; | ||
| 2988 | } | ||
| 2983 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); | 2989 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); |
| 2984 | 2990 | ||
| 2991 | /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ | ||
| 2992 | cma = zalloc_cpumask_var(&cm, GFP_KERNEL); | ||
| 2993 | if (cma) { | ||
| 2994 | cpumask_copy(cm, cpu_online_mask); | ||
| 2995 | cpumask_clear_cpu(raw_smp_processor_id(), cm); | ||
| 2996 | for_each_cpu(cpu, cm) { | ||
| 2997 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
| 2998 | |||
| 2999 | if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | ||
| 3000 | cpumask_clear_cpu(cpu, cm); | ||
| 3001 | } | ||
| 3002 | if (cpumask_weight(cm) == 0) | ||
| 3003 | goto all_cpus_idle; | ||
| 3004 | } | ||
| 3005 | |||
| 2985 | /* | 3006 | /* |
| 2986 | * Each pass through the following loop attempts to force a | 3007 | * Each pass through the following loop attempts to force a |
| 2987 | * context switch on each CPU. | 3008 | * context switch on each CPU. |
| 2988 | */ | 3009 | */ |
| 2989 | while (try_stop_cpus(cpu_online_mask, | 3010 | while (try_stop_cpus(cma ? cm : cpu_online_mask, |
| 2990 | synchronize_sched_expedited_cpu_stop, | 3011 | synchronize_sched_expedited_cpu_stop, |
| 2991 | NULL) == -EAGAIN) { | 3012 | NULL) == -EAGAIN) { |
| 2992 | put_online_cpus(); | 3013 | put_online_cpus(); |
| @@ -2998,6 +3019,7 @@ void synchronize_sched_expedited(void) | |||
| 2998 | /* ensure test happens before caller kfree */ | 3019 | /* ensure test happens before caller kfree */ |
| 2999 | smp_mb__before_atomic(); /* ^^^ */ | 3020 | smp_mb__before_atomic(); /* ^^^ */ |
| 3000 | atomic_long_inc(&rsp->expedited_workdone1); | 3021 | atomic_long_inc(&rsp->expedited_workdone1); |
| 3022 | free_cpumask_var(cm); | ||
| 3001 | return; | 3023 | return; |
| 3002 | } | 3024 | } |
| 3003 | 3025 | ||
| @@ -3007,6 +3029,7 @@ void synchronize_sched_expedited(void) | |||
| 3007 | } else { | 3029 | } else { |
| 3008 | wait_rcu_gp(call_rcu_sched); | 3030 | wait_rcu_gp(call_rcu_sched); |
| 3009 | atomic_long_inc(&rsp->expedited_normal); | 3031 | atomic_long_inc(&rsp->expedited_normal); |
| 3032 | free_cpumask_var(cm); | ||
| 3010 | return; | 3033 | return; |
| 3011 | } | 3034 | } |
| 3012 | 3035 | ||
| @@ -3016,6 +3039,7 @@ void synchronize_sched_expedited(void) | |||
| 3016 | /* ensure test happens before caller kfree */ | 3039 | /* ensure test happens before caller kfree */ |
| 3017 | smp_mb__before_atomic(); /* ^^^ */ | 3040 | smp_mb__before_atomic(); /* ^^^ */ |
| 3018 | atomic_long_inc(&rsp->expedited_workdone2); | 3041 | atomic_long_inc(&rsp->expedited_workdone2); |
| 3042 | free_cpumask_var(cm); | ||
| 3019 | return; | 3043 | return; |
| 3020 | } | 3044 | } |
| 3021 | 3045 | ||
| @@ -3026,12 +3050,21 @@ void synchronize_sched_expedited(void) | |||
| 3026 | * and they started after our first try, so their grace | 3050 | * and they started after our first try, so their grace |
| 3027 | * period works for us. | 3051 | * period works for us. |
| 3028 | */ | 3052 | */ |
| 3029 | get_online_cpus(); | 3053 | if (!try_get_online_cpus()) { |
| 3054 | /* CPU hotplug operation in flight, use normal GP. */ | ||
| 3055 | wait_rcu_gp(call_rcu_sched); | ||
| 3056 | atomic_long_inc(&rsp->expedited_normal); | ||
| 3057 | free_cpumask_var(cm); | ||
| 3058 | return; | ||
| 3059 | } | ||
| 3030 | snap = atomic_long_read(&rsp->expedited_start); | 3060 | snap = atomic_long_read(&rsp->expedited_start); |
| 3031 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | 3061 | smp_mb(); /* ensure read is before try_stop_cpus(). */ |
| 3032 | } | 3062 | } |
| 3033 | atomic_long_inc(&rsp->expedited_stoppedcpus); | 3063 | atomic_long_inc(&rsp->expedited_stoppedcpus); |
| 3034 | 3064 | ||
| 3065 | all_cpus_idle: | ||
| 3066 | free_cpumask_var(cm); | ||
| 3067 | |||
| 3035 | /* | 3068 | /* |
| 3036 | * Everyone up to our most recent fetch is covered by our grace | 3069 | * Everyone up to our most recent fetch is covered by our grace |
| 3037 | * period. Update the counter, but only if our work is still | 3070 | * period. Update the counter, but only if our work is still |
| @@ -3123,12 +3156,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 3123 | * by the current CPU, returning 1 if so. This function is part of the | 3156 | * by the current CPU, returning 1 if so. This function is part of the |
| 3124 | * RCU implementation; it is -not- an exported member of the RCU API. | 3157 | * RCU implementation; it is -not- an exported member of the RCU API. |
| 3125 | */ | 3158 | */ |
| 3126 | static int rcu_pending(int cpu) | 3159 | static int rcu_pending(void) |
| 3127 | { | 3160 | { |
| 3128 | struct rcu_state *rsp; | 3161 | struct rcu_state *rsp; |
| 3129 | 3162 | ||
| 3130 | for_each_rcu_flavor(rsp) | 3163 | for_each_rcu_flavor(rsp) |
| 3131 | if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) | 3164 | if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda))) |
| 3132 | return 1; | 3165 | return 1; |
| 3133 | return 0; | 3166 | return 0; |
| 3134 | } | 3167 | } |
| @@ -3138,7 +3171,7 @@ static int rcu_pending(int cpu) | |||
| 3138 | * non-NULL, store an indication of whether all callbacks are lazy. | 3171 | * non-NULL, store an indication of whether all callbacks are lazy. |
| 3139 | * (If there are no callbacks, all of them are deemed to be lazy.) | 3172 | * (If there are no callbacks, all of them are deemed to be lazy.) |
| 3140 | */ | 3173 | */ |
| 3141 | static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | 3174 | static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) |
| 3142 | { | 3175 | { |
| 3143 | bool al = true; | 3176 | bool al = true; |
| 3144 | bool hc = false; | 3177 | bool hc = false; |
| @@ -3146,7 +3179,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | |||
| 3146 | struct rcu_state *rsp; | 3179 | struct rcu_state *rsp; |
| 3147 | 3180 | ||
| 3148 | for_each_rcu_flavor(rsp) { | 3181 | for_each_rcu_flavor(rsp) { |
| 3149 | rdp = per_cpu_ptr(rsp->rda, cpu); | 3182 | rdp = this_cpu_ptr(rsp->rda); |
| 3150 | if (!rdp->nxtlist) | 3183 | if (!rdp->nxtlist) |
| 3151 | continue; | 3184 | continue; |
| 3152 | hc = true; | 3185 | hc = true; |
| @@ -3279,11 +3312,16 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
| 3279 | continue; | 3312 | continue; |
| 3280 | rdp = per_cpu_ptr(rsp->rda, cpu); | 3313 | rdp = per_cpu_ptr(rsp->rda, cpu); |
| 3281 | if (rcu_is_nocb_cpu(cpu)) { | 3314 | if (rcu_is_nocb_cpu(cpu)) { |
| 3282 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, | 3315 | if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { |
| 3283 | rsp->n_barrier_done); | 3316 | _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, |
| 3284 | atomic_inc(&rsp->barrier_cpu_count); | 3317 | rsp->n_barrier_done); |
| 3285 | __call_rcu(&rdp->barrier_head, rcu_barrier_callback, | 3318 | } else { |
| 3286 | rsp, cpu, 0); | 3319 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, |
| 3320 | rsp->n_barrier_done); | ||
| 3321 | atomic_inc(&rsp->barrier_cpu_count); | ||
| 3322 | __call_rcu(&rdp->barrier_head, | ||
| 3323 | rcu_barrier_callback, rsp, cpu, 0); | ||
| 3324 | } | ||
| 3287 | } else if (ACCESS_ONCE(rdp->qlen)) { | 3325 | } else if (ACCESS_ONCE(rdp->qlen)) { |
| 3288 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, | 3326 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
| 3289 | rsp->n_barrier_done); | 3327 | rsp->n_barrier_done); |
| @@ -3442,6 +3480,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 3442 | case CPU_UP_PREPARE_FROZEN: | 3480 | case CPU_UP_PREPARE_FROZEN: |
| 3443 | rcu_prepare_cpu(cpu); | 3481 | rcu_prepare_cpu(cpu); |
| 3444 | rcu_prepare_kthreads(cpu); | 3482 | rcu_prepare_kthreads(cpu); |
| 3483 | rcu_spawn_all_nocb_kthreads(cpu); | ||
| 3445 | break; | 3484 | break; |
| 3446 | case CPU_ONLINE: | 3485 | case CPU_ONLINE: |
| 3447 | case CPU_DOWN_FAILED: | 3486 | case CPU_DOWN_FAILED: |
| @@ -3459,8 +3498,10 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 3459 | case CPU_DEAD_FROZEN: | 3498 | case CPU_DEAD_FROZEN: |
| 3460 | case CPU_UP_CANCELED: | 3499 | case CPU_UP_CANCELED: |
| 3461 | case CPU_UP_CANCELED_FROZEN: | 3500 | case CPU_UP_CANCELED_FROZEN: |
| 3462 | for_each_rcu_flavor(rsp) | 3501 | for_each_rcu_flavor(rsp) { |
| 3463 | rcu_cleanup_dead_cpu(cpu, rsp); | 3502 | rcu_cleanup_dead_cpu(cpu, rsp); |
| 3503 | do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); | ||
| 3504 | } | ||
| 3464 | break; | 3505 | break; |
| 3465 | default: | 3506 | default: |
| 3466 | break; | 3507 | break; |
| @@ -3489,7 +3530,7 @@ static int rcu_pm_notify(struct notifier_block *self, | |||
| 3489 | } | 3530 | } |
| 3490 | 3531 | ||
| 3491 | /* | 3532 | /* |
| 3492 | * Spawn the kthread that handles this RCU flavor's grace periods. | 3533 | * Spawn the kthreads that handle each RCU flavor's grace periods. |
| 3493 | */ | 3534 | */ |
| 3494 | static int __init rcu_spawn_gp_kthread(void) | 3535 | static int __init rcu_spawn_gp_kthread(void) |
| 3495 | { | 3536 | { |
| @@ -3498,6 +3539,7 @@ static int __init rcu_spawn_gp_kthread(void) | |||
| 3498 | struct rcu_state *rsp; | 3539 | struct rcu_state *rsp; |
| 3499 | struct task_struct *t; | 3540 | struct task_struct *t; |
| 3500 | 3541 | ||
| 3542 | rcu_scheduler_fully_active = 1; | ||
| 3501 | for_each_rcu_flavor(rsp) { | 3543 | for_each_rcu_flavor(rsp) { |
| 3502 | t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); | 3544 | t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); |
| 3503 | BUG_ON(IS_ERR(t)); | 3545 | BUG_ON(IS_ERR(t)); |
| @@ -3505,8 +3547,9 @@ static int __init rcu_spawn_gp_kthread(void) | |||
| 3505 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3547 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 3506 | rsp->gp_kthread = t; | 3548 | rsp->gp_kthread = t; |
| 3507 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 3549 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 3508 | rcu_spawn_nocb_kthreads(rsp); | ||
| 3509 | } | 3550 | } |
| 3551 | rcu_spawn_nocb_kthreads(); | ||
| 3552 | rcu_spawn_boost_kthreads(); | ||
| 3510 | return 0; | 3553 | return 0; |
| 3511 | } | 3554 | } |
| 3512 | early_initcall(rcu_spawn_gp_kthread); | 3555 | early_initcall(rcu_spawn_gp_kthread); |
| @@ -3738,6 +3781,8 @@ void __init rcu_init(void) | |||
| 3738 | pm_notifier(rcu_pm_notify, 0); | 3781 | pm_notifier(rcu_pm_notify, 0); |
| 3739 | for_each_online_cpu(cpu) | 3782 | for_each_online_cpu(cpu) |
| 3740 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 3783 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
| 3784 | |||
| 3785 | rcu_early_boot_tests(); | ||
| 3741 | } | 3786 | } |
| 3742 | 3787 | ||
| 3743 | #include "tree_plugin.h" | 3788 | #include "tree_plugin.h" |
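The synchronize_sched_expedited() hunks above add two behaviours: if a CPU-hotplug operation is in flight (try_get_online_cpus() fails), fall back to a normal grace period; otherwise copy cpu_online_mask, drop the current CPU and every dyntick-idle CPU, and stop only the CPUs that remain. A rough sketch of that mask-filtering step, with an invented idleness predicate standing in for the dynticks-counter test:

```c
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/smp.h>

/* Stand-in for the dynticks check; always reports "not idle" here. */
static bool sketch_cpu_is_idle(int cpu)
{
	return false;
}

/*
 * Build the set of CPUs that still need a forced context switch.
 * Returns 0 with *cmp populated, 1 if every remote CPU is idle, or
 * -ENOMEM if no mask could be allocated (the kernel code then simply
 * falls back to using the full online mask).
 */
static int sketch_build_target_mask(cpumask_var_t *cmp)
{
	int cpu;

	if (!zalloc_cpumask_var(cmp, GFP_KERNEL))
		return -ENOMEM;
	cpumask_copy(*cmp, cpu_online_mask);
	cpumask_clear_cpu(raw_smp_processor_id(), *cmp);
	for_each_cpu(cpu, *cmp)
		if (sketch_cpu_is_idle(cpu))
			cpumask_clear_cpu(cpu, *cmp);
	return cpumask_weight(*cmp) ? 0 : 1;
}
```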
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6a86eb7bac45..8e7b1843896e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -139,7 +139,7 @@ struct rcu_node { | |||
| 139 | unsigned long expmask; /* Groups that have ->blkd_tasks */ | 139 | unsigned long expmask; /* Groups that have ->blkd_tasks */ |
| 140 | /* elements that need to drain to allow the */ | 140 | /* elements that need to drain to allow the */ |
| 141 | /* current expedited grace period to */ | 141 | /* current expedited grace period to */ |
| 142 | /* complete (only for TREE_PREEMPT_RCU). */ | 142 | /* complete (only for PREEMPT_RCU). */ |
| 143 | unsigned long qsmaskinit; | 143 | unsigned long qsmaskinit; |
| 144 | /* Per-GP initial value for qsmask & expmask. */ | 144 | /* Per-GP initial value for qsmask & expmask. */ |
| 145 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 145 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ |
| @@ -350,7 +350,7 @@ struct rcu_data { | |||
| 350 | int nocb_p_count_lazy; /* (approximate). */ | 350 | int nocb_p_count_lazy; /* (approximate). */ |
| 351 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | 351 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ |
| 352 | struct task_struct *nocb_kthread; | 352 | struct task_struct *nocb_kthread; |
| 353 | bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ | 353 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ |
| 354 | 354 | ||
| 355 | /* The following fields are used by the leader, hence own cacheline. */ | 355 | /* The following fields are used by the leader, hence own cacheline. */ |
| 356 | struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; | 356 | struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; |
| @@ -383,6 +383,11 @@ struct rcu_data { | |||
| 383 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ | 383 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ |
| 384 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK | 384 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK |
| 385 | 385 | ||
| 386 | /* Values for nocb_defer_wakeup field in struct rcu_data. */ | ||
| 387 | #define RCU_NOGP_WAKE_NOT 0 | ||
| 388 | #define RCU_NOGP_WAKE 1 | ||
| 389 | #define RCU_NOGP_WAKE_FORCE 2 | ||
| 390 | |||
| 386 | #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) | 391 | #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) |
| 387 | /* For jiffies_till_first_fqs and */ | 392 | /* For jiffies_till_first_fqs and */ |
| 388 | /* and jiffies_till_next_fqs. */ | 393 | /* and jiffies_till_next_fqs. */ |
| @@ -525,10 +530,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); | |||
| 525 | extern struct rcu_state rcu_bh_state; | 530 | extern struct rcu_state rcu_bh_state; |
| 526 | DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); | 531 | DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); |
| 527 | 532 | ||
| 528 | #ifdef CONFIG_TREE_PREEMPT_RCU | 533 | #ifdef CONFIG_PREEMPT_RCU |
| 529 | extern struct rcu_state rcu_preempt_state; | 534 | extern struct rcu_state rcu_preempt_state; |
| 530 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | 535 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); |
| 531 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 536 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ |
| 532 | 537 | ||
| 533 | #ifdef CONFIG_RCU_BOOST | 538 | #ifdef CONFIG_RCU_BOOST |
| 534 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | 539 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); |
| @@ -542,7 +547,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
| 542 | /* Forward declarations for rcutree_plugin.h */ | 547 | /* Forward declarations for rcutree_plugin.h */ |
| 543 | static void rcu_bootup_announce(void); | 548 | static void rcu_bootup_announce(void); |
| 544 | long rcu_batches_completed(void); | 549 | long rcu_batches_completed(void); |
| 545 | static void rcu_preempt_note_context_switch(int cpu); | 550 | static void rcu_preempt_note_context_switch(void); |
| 546 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 551 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
| 547 | #ifdef CONFIG_HOTPLUG_CPU | 552 | #ifdef CONFIG_HOTPLUG_CPU |
| 548 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 553 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
| @@ -556,12 +561,12 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 556 | struct rcu_node *rnp, | 561 | struct rcu_node *rnp, |
| 557 | struct rcu_data *rdp); | 562 | struct rcu_data *rdp); |
| 558 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 563 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 559 | static void rcu_preempt_check_callbacks(int cpu); | 564 | static void rcu_preempt_check_callbacks(void); |
| 560 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 565 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
| 561 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) | 566 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) |
| 562 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | 567 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
| 563 | bool wake); | 568 | bool wake); |
| 564 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ | 569 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */ |
| 565 | static void __init __rcu_init_preempt(void); | 570 | static void __init __rcu_init_preempt(void); |
| 566 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 571 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
| 567 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 572 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
| @@ -572,15 +577,17 @@ static void rcu_preempt_do_callbacks(void); | |||
| 572 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 577 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, |
| 573 | struct rcu_node *rnp); | 578 | struct rcu_node *rnp); |
| 574 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 579 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 580 | static void __init rcu_spawn_boost_kthreads(void); | ||
| 575 | static void rcu_prepare_kthreads(int cpu); | 581 | static void rcu_prepare_kthreads(int cpu); |
| 576 | static void rcu_cleanup_after_idle(int cpu); | 582 | static void rcu_cleanup_after_idle(void); |
| 577 | static void rcu_prepare_for_idle(int cpu); | 583 | static void rcu_prepare_for_idle(void); |
| 578 | static void rcu_idle_count_callbacks_posted(void); | 584 | static void rcu_idle_count_callbacks_posted(void); |
| 579 | static void print_cpu_stall_info_begin(void); | 585 | static void print_cpu_stall_info_begin(void); |
| 580 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 586 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); |
| 581 | static void print_cpu_stall_info_end(void); | 587 | static void print_cpu_stall_info_end(void); |
| 582 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | 588 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); |
| 583 | static void increment_cpu_stall_ticks(void); | 589 | static void increment_cpu_stall_ticks(void); |
| 590 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); | ||
| 584 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); | 591 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); |
| 585 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); | 592 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); |
| 586 | static void rcu_init_one_nocb(struct rcu_node *rnp); | 593 | static void rcu_init_one_nocb(struct rcu_node *rnp); |
| @@ -589,14 +596,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
| 589 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | 596 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, |
| 590 | struct rcu_data *rdp, | 597 | struct rcu_data *rdp, |
| 591 | unsigned long flags); | 598 | unsigned long flags); |
| 592 | static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); | 599 | static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); |
| 593 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp); | 600 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp); |
| 594 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | 601 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); |
| 595 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 602 | static void rcu_spawn_all_nocb_kthreads(int cpu); |
| 603 | static void __init rcu_spawn_nocb_kthreads(void); | ||
| 604 | #ifdef CONFIG_RCU_NOCB_CPU | ||
| 605 | static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); | ||
| 606 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 596 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu); | 607 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu); |
| 597 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 608 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
| 598 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); | 609 | static void rcu_sysidle_enter(int irq); |
| 599 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); | 610 | static void rcu_sysidle_exit(int irq); |
| 600 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | 611 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, |
| 601 | unsigned long *maxj); | 612 | unsigned long *maxj); |
| 602 | static bool is_sysidle_rcu_state(struct rcu_state *rsp); | 613 | static bool is_sysidle_rcu_state(struct rcu_state *rsp); |
| @@ -605,6 +616,8 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | |||
| 605 | static void rcu_bind_gp_kthread(void); | 616 | static void rcu_bind_gp_kthread(void); |
| 606 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); | 617 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); |
| 607 | static bool rcu_nohz_full_cpu(struct rcu_state *rsp); | 618 | static bool rcu_nohz_full_cpu(struct rcu_state *rsp); |
| 619 | static void rcu_dynticks_task_enter(void); | ||
| 620 | static void rcu_dynticks_task_exit(void); | ||
| 608 | 621 | ||
| 609 | #endif /* #ifndef RCU_TREE_NONCORE */ | 622 | #endif /* #ifndef RCU_TREE_NONCORE */ |
| 610 | 623 | ||
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a7997e272564..3ec85cb5d544 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -30,14 +30,24 @@ | |||
| 30 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
| 31 | #include "../time/tick-internal.h" | 31 | #include "../time/tick-internal.h" |
| 32 | 32 | ||
| 33 | #define RCU_KTHREAD_PRIO 1 | ||
| 34 | |||
| 35 | #ifdef CONFIG_RCU_BOOST | 33 | #ifdef CONFIG_RCU_BOOST |
| 34 | |||
| 36 | #include "../locking/rtmutex_common.h" | 35 | #include "../locking/rtmutex_common.h" |
| 37 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | 36 | |
| 38 | #else | 37 | /* rcuc/rcub kthread realtime priority */ |
| 39 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO | 38 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; |
| 40 | #endif | 39 | module_param(kthread_prio, int, 0644); |
| 40 | |||
| 41 | /* | ||
| 42 | * Control variables for per-CPU and per-rcu_node kthreads. These | ||
| 43 | * handle all flavors of RCU. | ||
| 44 | */ | ||
| 45 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | ||
| 46 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
| 47 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
| 48 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | ||
| 49 | |||
| 50 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 41 | 51 | ||
| 42 | #ifdef CONFIG_RCU_NOCB_CPU | 52 | #ifdef CONFIG_RCU_NOCB_CPU |
| 43 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ | 53 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ |
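The hunk above drops the compile-time RCU_KTHREAD_PRIO/RCU_BOOST_PRIO macros in favor of a single kthread_prio module parameter (defaulting to CONFIG_RCU_KTHREAD_PRIO), which later hunks feed to sched_setscheduler_nocheck(). A rough user-space sketch of the same "compile-time default, runtime override" idea — the KTHREAD_PRIO environment variable stands in for the module parameter and is not part of the kernel change:

```c
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

static int kthread_prio = 1;		/* default, like CONFIG_RCU_KTHREAD_PRIO=1 */

int main(void)
{
	const char *arg = getenv("KTHREAD_PRIO");	/* stand-in for module_param() */
	struct sched_param sp;

	if (arg)
		kthread_prio = atoi(arg);

	sp.sched_priority = kthread_prio;
	if (sched_setscheduler(0, SCHED_FIFO, &sp))	/* needs CAP_SYS_NICE */
		perror("sched_setscheduler");
	else
		printf("running SCHED_FIFO at priority %d\n", kthread_prio);
	return 0;
}
```

Run it with elevated privileges (or CAP_SYS_NICE) for the SCHED_FIFO request to succeed.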
| @@ -72,9 +82,6 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 72 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | 82 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE |
| 73 | pr_info("\tRCU torture testing starts during boot.\n"); | 83 | pr_info("\tRCU torture testing starts during boot.\n"); |
| 74 | #endif | 84 | #endif |
| 75 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) | ||
| 76 | pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); | ||
| 77 | #endif | ||
| 78 | #if defined(CONFIG_RCU_CPU_STALL_INFO) | 85 | #if defined(CONFIG_RCU_CPU_STALL_INFO) |
| 79 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); | 86 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); |
| 80 | #endif | 87 | #endif |
| @@ -85,36 +92,12 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 85 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 92 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); |
| 86 | if (nr_cpu_ids != NR_CPUS) | 93 | if (nr_cpu_ids != NR_CPUS) |
| 87 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 94 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); |
| 88 | #ifdef CONFIG_RCU_NOCB_CPU | 95 | #ifdef CONFIG_RCU_BOOST |
| 89 | #ifndef CONFIG_RCU_NOCB_CPU_NONE | 96 | pr_info("\tRCU kthread priority: %d.\n", kthread_prio); |
| 90 | if (!have_rcu_nocb_mask) { | 97 | #endif |
| 91 | zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); | ||
| 92 | have_rcu_nocb_mask = true; | ||
| 93 | } | ||
| 94 | #ifdef CONFIG_RCU_NOCB_CPU_ZERO | ||
| 95 | pr_info("\tOffload RCU callbacks from CPU 0\n"); | ||
| 96 | cpumask_set_cpu(0, rcu_nocb_mask); | ||
| 97 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | ||
| 98 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | ||
| 99 | pr_info("\tOffload RCU callbacks from all CPUs\n"); | ||
| 100 | cpumask_copy(rcu_nocb_mask, cpu_possible_mask); | ||
| 101 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | ||
| 102 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | ||
| 103 | if (have_rcu_nocb_mask) { | ||
| 104 | if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { | ||
| 105 | pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); | ||
| 106 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | ||
| 107 | rcu_nocb_mask); | ||
| 108 | } | ||
| 109 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | ||
| 110 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); | ||
| 111 | if (rcu_nocb_poll) | ||
| 112 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | ||
| 113 | } | ||
| 114 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 115 | } | 98 | } |
| 116 | 99 | ||
| 117 | #ifdef CONFIG_TREE_PREEMPT_RCU | 100 | #ifdef CONFIG_PREEMPT_RCU |
| 118 | 101 | ||
| 119 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | 102 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); |
| 120 | static struct rcu_state *rcu_state_p = &rcu_preempt_state; | 103 | static struct rcu_state *rcu_state_p = &rcu_preempt_state; |
| @@ -134,7 +117,7 @@ static void __init rcu_bootup_announce(void) | |||
| 134 | * Return the number of RCU-preempt batches processed thus far | 117 | * Return the number of RCU-preempt batches processed thus far |
| 135 | * for debug and statistics. | 118 | * for debug and statistics. |
| 136 | */ | 119 | */ |
| 137 | long rcu_batches_completed_preempt(void) | 120 | static long rcu_batches_completed_preempt(void) |
| 138 | { | 121 | { |
| 139 | return rcu_preempt_state.completed; | 122 | return rcu_preempt_state.completed; |
| 140 | } | 123 | } |
| @@ -155,18 +138,19 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); | |||
| 155 | * not in a quiescent state. There might be any number of tasks blocked | 138 | * not in a quiescent state. There might be any number of tasks blocked |
| 156 | * while in an RCU read-side critical section. | 139 | * while in an RCU read-side critical section. |
| 157 | * | 140 | * |
| 158 | * Unlike the other rcu_*_qs() functions, callers to this function | 141 | * As with the other rcu_*_qs() functions, callers to this function |
| 159 | * must disable irqs in order to protect the assignment to | 142 | * must disable preemption. |
| 160 | * ->rcu_read_unlock_special. | 143 | */ |
| 161 | */ | 144 | static void rcu_preempt_qs(void) |
| 162 | static void rcu_preempt_qs(int cpu) | 145 | { |
| 163 | { | 146 | if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) { |
| 164 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 147 | trace_rcu_grace_period(TPS("rcu_preempt"), |
| 165 | 148 | __this_cpu_read(rcu_preempt_data.gpnum), | |
| 166 | if (rdp->passed_quiesce == 0) | 149 | TPS("cpuqs")); |
| 167 | trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); | 150 | __this_cpu_write(rcu_preempt_data.passed_quiesce, 1); |
| 168 | rdp->passed_quiesce = 1; | 151 | barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ |
| 169 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | 152 | current->rcu_read_unlock_special.b.need_qs = false; |
| 153 | } | ||
| 170 | } | 154 | } |
| 171 | 155 | ||
| 172 | /* | 156 | /* |
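This and the following hunks replace the RCU_READ_UNLOCK_* bit flags with a union rcu_special, so individual conditions can be set by name (.b.need_qs, .b.blocked) while .s still gives a cheap "any special handling pending?" test. A standalone sketch of that pattern — the field layout here is a reconstruction for illustration, not necessarily the kernel's exact definition:

```c
#include <stdbool.h>
#include <stdio.h>

/* Reconstruction of the union-of-flags pattern; real field widths may differ. */
union rcu_special_like {
	struct {
		bool blocked;		/* was RCU_READ_UNLOCK_BLOCKED */
		bool need_qs;		/* was RCU_READ_UNLOCK_NEED_QS */
	} b;
	short s;			/* covers both flag bytes at once */
};

int main(void)
{
	union rcu_special_like special = { .s = 0 };

	special.b.blocked = true;			/* old: special |= BLOCKED bit */
	printf("any special work? %d\n", special.s != 0);

	special.b.blocked = false;
	special.b.need_qs = true;			/* old: special |= NEED_QS bit */
	printf("need qs? %d, any special work? %d\n",
	       special.b.need_qs, special.s != 0);
	return 0;
}
```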
| @@ -182,7 +166,7 @@ static void rcu_preempt_qs(int cpu) | |||
| 182 | * | 166 | * |
| 183 | * Caller must disable preemption. | 167 | * Caller must disable preemption. |
| 184 | */ | 168 | */ |
| 185 | static void rcu_preempt_note_context_switch(int cpu) | 169 | static void rcu_preempt_note_context_switch(void) |
| 186 | { | 170 | { |
| 187 | struct task_struct *t = current; | 171 | struct task_struct *t = current; |
| 188 | unsigned long flags; | 172 | unsigned long flags; |
| @@ -190,14 +174,14 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 190 | struct rcu_node *rnp; | 174 | struct rcu_node *rnp; |
| 191 | 175 | ||
| 192 | if (t->rcu_read_lock_nesting > 0 && | 176 | if (t->rcu_read_lock_nesting > 0 && |
| 193 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 177 | !t->rcu_read_unlock_special.b.blocked) { |
| 194 | 178 | ||
| 195 | /* Possibly blocking in an RCU read-side critical section. */ | 179 | /* Possibly blocking in an RCU read-side critical section. */ |
| 196 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); | 180 | rdp = this_cpu_ptr(rcu_preempt_state.rda); |
| 197 | rnp = rdp->mynode; | 181 | rnp = rdp->mynode; |
| 198 | raw_spin_lock_irqsave(&rnp->lock, flags); | 182 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 199 | smp_mb__after_unlock_lock(); | 183 | smp_mb__after_unlock_lock(); |
| 200 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 184 | t->rcu_read_unlock_special.b.blocked = true; |
| 201 | t->rcu_blocked_node = rnp; | 185 | t->rcu_blocked_node = rnp; |
| 202 | 186 | ||
| 203 | /* | 187 | /* |
| @@ -239,7 +223,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 239 | : rnp->gpnum + 1); | 223 | : rnp->gpnum + 1); |
| 240 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 224 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 241 | } else if (t->rcu_read_lock_nesting < 0 && | 225 | } else if (t->rcu_read_lock_nesting < 0 && |
| 242 | t->rcu_read_unlock_special) { | 226 | t->rcu_read_unlock_special.s) { |
| 243 | 227 | ||
| 244 | /* | 228 | /* |
| 245 | * Complete exit from RCU read-side critical section on | 229 | * Complete exit from RCU read-side critical section on |
| @@ -257,9 +241,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 257 | * grace period, then the fact that the task has been enqueued | 241 | * grace period, then the fact that the task has been enqueued |
| 258 | * means that we continue to block the current grace period. | 242 | * means that we continue to block the current grace period. |
| 259 | */ | 243 | */ |
| 260 | local_irq_save(flags); | 244 | rcu_preempt_qs(); |
| 261 | rcu_preempt_qs(cpu); | ||
| 262 | local_irq_restore(flags); | ||
| 263 | } | 245 | } |
| 264 | 246 | ||
| 265 | /* | 247 | /* |
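Several hunks in this file drop an explicit cpu argument and switch from per_cpu_ptr(p, cpu) to this_cpu_ptr(p) or __this_cpu_read(): the callers always run on the CPU whose data they touch, so passing the CPU number around is redundant. A loose user-space analogue uses thread-local storage instead of indexing shared state by an explicit id (the names below are invented):

```c
#include <pthread.h>
#include <stdio.h>

static __thread int quiesce_count;	/* analogue of a per-CPU counter */

static void note_quiescent_state(void)	/* no "which CPU/thread?" argument */
{
	quiesce_count++;		/* analogue of __this_cpu_write()/read() */
}

static void *worker(void *arg)
{
	for (int i = 0; i < 3; i++)
		note_quiescent_state();
	printf("worker %d saw %d quiescent states\n",
	       (int)(long)arg, quiesce_count);
	return NULL;
}

int main(void)
{
	pthread_t t[2];

	for (long i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	return 0;
}
```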
| @@ -340,7 +322,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 340 | bool drop_boost_mutex = false; | 322 | bool drop_boost_mutex = false; |
| 341 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 323 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 342 | struct rcu_node *rnp; | 324 | struct rcu_node *rnp; |
| 343 | int special; | 325 | union rcu_special special; |
| 344 | 326 | ||
| 345 | /* NMI handlers cannot block and cannot safely manipulate state. */ | 327 | /* NMI handlers cannot block and cannot safely manipulate state. */ |
| 346 | if (in_nmi()) | 328 | if (in_nmi()) |
| @@ -350,12 +332,13 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 350 | 332 | ||
| 351 | /* | 333 | /* |
| 352 | * If RCU core is waiting for this CPU to exit critical section, | 334 | * If RCU core is waiting for this CPU to exit critical section, |
| 353 | * let it know that we have done so. | 335 | * let it know that we have done so. Because irqs are disabled, |
| 336 | * t->rcu_read_unlock_special cannot change. | ||
| 354 | */ | 337 | */ |
| 355 | special = t->rcu_read_unlock_special; | 338 | special = t->rcu_read_unlock_special; |
| 356 | if (special & RCU_READ_UNLOCK_NEED_QS) { | 339 | if (special.b.need_qs) { |
| 357 | rcu_preempt_qs(smp_processor_id()); | 340 | rcu_preempt_qs(); |
| 358 | if (!t->rcu_read_unlock_special) { | 341 | if (!t->rcu_read_unlock_special.s) { |
| 359 | local_irq_restore(flags); | 342 | local_irq_restore(flags); |
| 360 | return; | 343 | return; |
| 361 | } | 344 | } |
| @@ -368,8 +351,8 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 368 | } | 351 | } |
| 369 | 352 | ||
| 370 | /* Clean up if blocked during RCU read-side critical section. */ | 353 | /* Clean up if blocked during RCU read-side critical section. */ |
| 371 | if (special & RCU_READ_UNLOCK_BLOCKED) { | 354 | if (special.b.blocked) { |
| 372 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; | 355 | t->rcu_read_unlock_special.b.blocked = false; |
| 373 | 356 | ||
| 374 | /* | 357 | /* |
| 375 | * Remove this task from the list it blocked on. The | 358 | * Remove this task from the list it blocked on. The |
| @@ -442,8 +425,6 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 442 | } | 425 | } |
| 443 | } | 426 | } |
| 444 | 427 | ||
| 445 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | ||
| 446 | |||
| 447 | /* | 428 | /* |
| 448 | * Dump detailed information for all tasks blocking the current RCU | 429 | * Dump detailed information for all tasks blocking the current RCU |
| 449 | * grace period on the specified rcu_node structure. | 430 | * grace period on the specified rcu_node structure. |
| @@ -478,14 +459,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
| 478 | rcu_print_detail_task_stall_rnp(rnp); | 459 | rcu_print_detail_task_stall_rnp(rnp); |
| 479 | } | 460 | } |
| 480 | 461 | ||
| 481 | #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
| 482 | |||
| 483 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
| 484 | { | ||
| 485 | } | ||
| 486 | |||
| 487 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
| 488 | |||
| 489 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 462 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
| 490 | 463 | ||
| 491 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | 464 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) |
| @@ -648,17 +621,18 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 648 | * | 621 | * |
| 649 | * Caller must disable hard irqs. | 622 | * Caller must disable hard irqs. |
| 650 | */ | 623 | */ |
| 651 | static void rcu_preempt_check_callbacks(int cpu) | 624 | static void rcu_preempt_check_callbacks(void) |
| 652 | { | 625 | { |
| 653 | struct task_struct *t = current; | 626 | struct task_struct *t = current; |
| 654 | 627 | ||
| 655 | if (t->rcu_read_lock_nesting == 0) { | 628 | if (t->rcu_read_lock_nesting == 0) { |
| 656 | rcu_preempt_qs(cpu); | 629 | rcu_preempt_qs(); |
| 657 | return; | 630 | return; |
| 658 | } | 631 | } |
| 659 | if (t->rcu_read_lock_nesting > 0 && | 632 | if (t->rcu_read_lock_nesting > 0 && |
| 660 | per_cpu(rcu_preempt_data, cpu).qs_pending) | 633 | __this_cpu_read(rcu_preempt_data.qs_pending) && |
| 661 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | 634 | !__this_cpu_read(rcu_preempt_data.passed_quiesce)) |
| 635 | t->rcu_read_unlock_special.b.need_qs = true; | ||
| 662 | } | 636 | } |
| 663 | 637 | ||
| 664 | #ifdef CONFIG_RCU_BOOST | 638 | #ifdef CONFIG_RCU_BOOST |
| @@ -819,11 +793,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 819 | * In fact, if you are using synchronize_rcu_expedited() in a loop, | 793 | * In fact, if you are using synchronize_rcu_expedited() in a loop, |
| 820 | * please restructure your code to batch your updates, and then Use a | 794 | * please restructure your code to batch your updates, and then Use a |
| 821 | * single synchronize_rcu() instead. | 795 | * single synchronize_rcu() instead. |
| 822 | * | ||
| 823 | * Note that it is illegal to call this function while holding any lock | ||
| 824 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | ||
| 825 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
| 826 | * these restriction will result in deadlock. | ||
| 827 | */ | 796 | */ |
| 828 | void synchronize_rcu_expedited(void) | 797 | void synchronize_rcu_expedited(void) |
| 829 | { | 798 | { |
| @@ -845,7 +814,11 @@ void synchronize_rcu_expedited(void) | |||
| 845 | * being boosted. This simplifies the process of moving tasks | 814 | * being boosted. This simplifies the process of moving tasks |
| 846 | * from leaf to root rcu_node structures. | 815 | * from leaf to root rcu_node structures. |
| 847 | */ | 816 | */ |
| 848 | get_online_cpus(); | 817 | if (!try_get_online_cpus()) { |
| 818 | /* CPU-hotplug operation in flight, fall back to normal GP. */ | ||
| 819 | wait_rcu_gp(call_rcu); | ||
| 820 | return; | ||
| 821 | } | ||
| 849 | 822 | ||
| 850 | /* | 823 | /* |
| 851 | * Acquire lock, falling back to synchronize_rcu() if too many | 824 | * Acquire lock, falling back to synchronize_rcu() if too many |
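The hunk above replaces the unconditional get_online_cpus() with try_get_online_cpus(): if a CPU-hotplug operation holds the lock, synchronize_rcu_expedited() now falls back to a normal grace period rather than risking the deadlock the deleted comment warned about. Minimal trylock-with-fallback sketch; the mutex and helper names are stand-ins, not kernel APIs:

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

static void fast_path(void) { puts("run expedited grace period"); }
static void slow_path(void) { puts("fall back to a normal grace period"); }

static void synchronize_expedited_like(void)
{
	if (pthread_mutex_trylock(&hotplug_lock)) {
		/* Lock busy: a "hotplug" operation is in flight, take the safe path. */
		slow_path();
		return;
	}
	fast_path();
	pthread_mutex_unlock(&hotplug_lock);
}

int main(void)
{
	synchronize_expedited_like();		/* lock free: fast path */

	pthread_mutex_lock(&hotplug_lock);	/* pretend hotplug is running */
	synchronize_expedited_like();		/* lock busy: falls back */
	pthread_mutex_unlock(&hotplug_lock);
	return 0;
}
```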
| @@ -897,7 +870,8 @@ void synchronize_rcu_expedited(void) | |||
| 897 | 870 | ||
| 898 | /* Clean up and exit. */ | 871 | /* Clean up and exit. */ |
| 899 | smp_mb(); /* ensure expedited GP seen before counter increment. */ | 872 | smp_mb(); /* ensure expedited GP seen before counter increment. */ |
| 900 | ACCESS_ONCE(sync_rcu_preempt_exp_count)++; | 873 | ACCESS_ONCE(sync_rcu_preempt_exp_count) = |
| 874 | sync_rcu_preempt_exp_count + 1; | ||
| 901 | unlock_mb_ret: | 875 | unlock_mb_ret: |
| 902 | mutex_unlock(&sync_rcu_preempt_exp_mutex); | 876 | mutex_unlock(&sync_rcu_preempt_exp_mutex); |
| 903 | mb_ret: | 877 | mb_ret: |
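The ACCESS_ONCE() change above (repeated later for the nocb_p_count counters) avoids applying ++ or -= through the volatile cast; the new form does an ordinary read and a single marked store of the result. Sketch with a user-space reconstruction of the macro, shown only to illustrate the two access patterns:

```c
#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long sync_count;

int main(void)
{
	/* Old pattern: ++ through the cast is a volatile read-modify-write. */
	ACCESS_ONCE(sync_count)++;

	/* New pattern: plain read, then one marked store of the new value. */
	ACCESS_ONCE(sync_count) = sync_count + 1;

	printf("count = %lu\n", sync_count);
	return 0;
}
```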
| @@ -941,11 +915,11 @@ void exit_rcu(void) | |||
| 941 | return; | 915 | return; |
| 942 | t->rcu_read_lock_nesting = 1; | 916 | t->rcu_read_lock_nesting = 1; |
| 943 | barrier(); | 917 | barrier(); |
| 944 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | 918 | t->rcu_read_unlock_special.b.blocked = true; |
| 945 | __rcu_read_unlock(); | 919 | __rcu_read_unlock(); |
| 946 | } | 920 | } |
| 947 | 921 | ||
| 948 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 922 | #else /* #ifdef CONFIG_PREEMPT_RCU */ |
| 949 | 923 | ||
| 950 | static struct rcu_state *rcu_state_p = &rcu_sched_state; | 924 | static struct rcu_state *rcu_state_p = &rcu_sched_state; |
| 951 | 925 | ||
| @@ -971,7 +945,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); | |||
| 971 | * Because preemptible RCU does not exist, we never have to check for | 945 | * Because preemptible RCU does not exist, we never have to check for |
| 972 | * CPUs being in quiescent states. | 946 | * CPUs being in quiescent states. |
| 973 | */ | 947 | */ |
| 974 | static void rcu_preempt_note_context_switch(int cpu) | 948 | static void rcu_preempt_note_context_switch(void) |
| 975 | { | 949 | { |
| 976 | } | 950 | } |
| 977 | 951 | ||
| @@ -1043,7 +1017,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 1043 | * Because preemptible RCU does not exist, it never has any callbacks | 1017 | * Because preemptible RCU does not exist, it never has any callbacks |
| 1044 | * to check. | 1018 | * to check. |
| 1045 | */ | 1019 | */ |
| 1046 | static void rcu_preempt_check_callbacks(int cpu) | 1020 | static void rcu_preempt_check_callbacks(void) |
| 1047 | { | 1021 | { |
| 1048 | } | 1022 | } |
| 1049 | 1023 | ||
| @@ -1096,7 +1070,7 @@ void exit_rcu(void) | |||
| 1096 | { | 1070 | { |
| 1097 | } | 1071 | } |
| 1098 | 1072 | ||
| 1099 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1073 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ |
| 1100 | 1074 | ||
| 1101 | #ifdef CONFIG_RCU_BOOST | 1075 | #ifdef CONFIG_RCU_BOOST |
| 1102 | 1076 | ||
| @@ -1352,7 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
| 1352 | smp_mb__after_unlock_lock(); | 1326 | smp_mb__after_unlock_lock(); |
| 1353 | rnp->boost_kthread_task = t; | 1327 | rnp->boost_kthread_task = t; |
| 1354 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1328 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1355 | sp.sched_priority = RCU_BOOST_PRIO; | 1329 | sp.sched_priority = kthread_prio; |
| 1356 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | 1330 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); |
| 1357 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | 1331 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ |
| 1358 | return 0; | 1332 | return 0; |
| @@ -1369,7 +1343,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) | |||
| 1369 | { | 1343 | { |
| 1370 | struct sched_param sp; | 1344 | struct sched_param sp; |
| 1371 | 1345 | ||
| 1372 | sp.sched_priority = RCU_KTHREAD_PRIO; | 1346 | sp.sched_priority = kthread_prio; |
| 1373 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | 1347 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); |
| 1374 | } | 1348 | } |
| 1375 | 1349 | ||
| @@ -1462,14 +1436,13 @@ static struct smp_hotplug_thread rcu_cpu_thread_spec = { | |||
| 1462 | }; | 1436 | }; |
| 1463 | 1437 | ||
| 1464 | /* | 1438 | /* |
| 1465 | * Spawn all kthreads -- called as soon as the scheduler is running. | 1439 | * Spawn boost kthreads -- called as soon as the scheduler is running. |
| 1466 | */ | 1440 | */ |
| 1467 | static int __init rcu_spawn_kthreads(void) | 1441 | static void __init rcu_spawn_boost_kthreads(void) |
| 1468 | { | 1442 | { |
| 1469 | struct rcu_node *rnp; | 1443 | struct rcu_node *rnp; |
| 1470 | int cpu; | 1444 | int cpu; |
| 1471 | 1445 | ||
| 1472 | rcu_scheduler_fully_active = 1; | ||
| 1473 | for_each_possible_cpu(cpu) | 1446 | for_each_possible_cpu(cpu) |
| 1474 | per_cpu(rcu_cpu_has_work, cpu) = 0; | 1447 | per_cpu(rcu_cpu_has_work, cpu) = 0; |
| 1475 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); | 1448 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); |
| @@ -1479,9 +1452,7 @@ static int __init rcu_spawn_kthreads(void) | |||
| 1479 | rcu_for_each_leaf_node(rcu_state_p, rnp) | 1452 | rcu_for_each_leaf_node(rcu_state_p, rnp) |
| 1480 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); | 1453 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); |
| 1481 | } | 1454 | } |
| 1482 | return 0; | ||
| 1483 | } | 1455 | } |
| 1484 | early_initcall(rcu_spawn_kthreads); | ||
| 1485 | 1456 | ||
| 1486 | static void rcu_prepare_kthreads(int cpu) | 1457 | static void rcu_prepare_kthreads(int cpu) |
| 1487 | { | 1458 | { |
| @@ -1519,12 +1490,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |||
| 1519 | { | 1490 | { |
| 1520 | } | 1491 | } |
| 1521 | 1492 | ||
| 1522 | static int __init rcu_scheduler_really_started(void) | 1493 | static void __init rcu_spawn_boost_kthreads(void) |
| 1523 | { | 1494 | { |
| 1524 | rcu_scheduler_fully_active = 1; | ||
| 1525 | return 0; | ||
| 1526 | } | 1495 | } |
| 1527 | early_initcall(rcu_scheduler_really_started); | ||
| 1528 | 1496 | ||
| 1529 | static void rcu_prepare_kthreads(int cpu) | 1497 | static void rcu_prepare_kthreads(int cpu) |
| 1530 | { | 1498 | { |
| @@ -1544,10 +1512,10 @@ static void rcu_prepare_kthreads(int cpu) | |||
| 1544 | * any flavor of RCU. | 1512 | * any flavor of RCU. |
| 1545 | */ | 1513 | */ |
| 1546 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1514 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
| 1547 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | 1515 | int rcu_needs_cpu(unsigned long *delta_jiffies) |
| 1548 | { | 1516 | { |
| 1549 | *delta_jiffies = ULONG_MAX; | 1517 | *delta_jiffies = ULONG_MAX; |
| 1550 | return rcu_cpu_has_callbacks(cpu, NULL); | 1518 | return rcu_cpu_has_callbacks(NULL); |
| 1551 | } | 1519 | } |
| 1552 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ | 1520 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ |
| 1553 | 1521 | ||
| @@ -1555,7 +1523,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | |||
| 1555 | * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up | 1523 | * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up |
| 1556 | * after it. | 1524 | * after it. |
| 1557 | */ | 1525 | */ |
| 1558 | static void rcu_cleanup_after_idle(int cpu) | 1526 | static void rcu_cleanup_after_idle(void) |
| 1559 | { | 1527 | { |
| 1560 | } | 1528 | } |
| 1561 | 1529 | ||
| @@ -1563,7 +1531,7 @@ static void rcu_cleanup_after_idle(int cpu) | |||
| 1563 | * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, | 1531 | * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, |
| 1564 | * is nothing. | 1532 | * is nothing. |
| 1565 | */ | 1533 | */ |
| 1566 | static void rcu_prepare_for_idle(int cpu) | 1534 | static void rcu_prepare_for_idle(void) |
| 1567 | { | 1535 | { |
| 1568 | } | 1536 | } |
| 1569 | 1537 | ||
| @@ -1625,7 +1593,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) | |||
| 1625 | 1593 | ||
| 1626 | /* Exit early if we advanced recently. */ | 1594 | /* Exit early if we advanced recently. */ |
| 1627 | if (jiffies == rdtp->last_advance_all) | 1595 | if (jiffies == rdtp->last_advance_all) |
| 1628 | return 0; | 1596 | return false; |
| 1629 | rdtp->last_advance_all = jiffies; | 1597 | rdtp->last_advance_all = jiffies; |
| 1630 | 1598 | ||
| 1631 | for_each_rcu_flavor(rsp) { | 1599 | for_each_rcu_flavor(rsp) { |
| @@ -1656,15 +1624,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) | |||
| 1656 | * The caller must have disabled interrupts. | 1624 | * The caller must have disabled interrupts. |
| 1657 | */ | 1625 | */ |
| 1658 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1626 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
| 1659 | int rcu_needs_cpu(int cpu, unsigned long *dj) | 1627 | int rcu_needs_cpu(unsigned long *dj) |
| 1660 | { | 1628 | { |
| 1661 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1629 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 1662 | 1630 | ||
| 1663 | /* Snapshot to detect later posting of non-lazy callback. */ | 1631 | /* Snapshot to detect later posting of non-lazy callback. */ |
| 1664 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | 1632 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; |
| 1665 | 1633 | ||
| 1666 | /* If no callbacks, RCU doesn't need the CPU. */ | 1634 | /* If no callbacks, RCU doesn't need the CPU. */ |
| 1667 | if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { | 1635 | if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { |
| 1668 | *dj = ULONG_MAX; | 1636 | *dj = ULONG_MAX; |
| 1669 | return 0; | 1637 | return 0; |
| 1670 | } | 1638 | } |
| @@ -1698,12 +1666,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) | |||
| 1698 | * | 1666 | * |
| 1699 | * The caller must have disabled interrupts. | 1667 | * The caller must have disabled interrupts. |
| 1700 | */ | 1668 | */ |
| 1701 | static void rcu_prepare_for_idle(int cpu) | 1669 | static void rcu_prepare_for_idle(void) |
| 1702 | { | 1670 | { |
| 1703 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1671 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
| 1704 | bool needwake; | 1672 | bool needwake; |
| 1705 | struct rcu_data *rdp; | 1673 | struct rcu_data *rdp; |
| 1706 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1674 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 1707 | struct rcu_node *rnp; | 1675 | struct rcu_node *rnp; |
| 1708 | struct rcu_state *rsp; | 1676 | struct rcu_state *rsp; |
| 1709 | int tne; | 1677 | int tne; |
| @@ -1711,7 +1679,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1711 | /* Handle nohz enablement switches conservatively. */ | 1679 | /* Handle nohz enablement switches conservatively. */ |
| 1712 | tne = ACCESS_ONCE(tick_nohz_active); | 1680 | tne = ACCESS_ONCE(tick_nohz_active); |
| 1713 | if (tne != rdtp->tick_nohz_enabled_snap) { | 1681 | if (tne != rdtp->tick_nohz_enabled_snap) { |
| 1714 | if (rcu_cpu_has_callbacks(cpu, NULL)) | 1682 | if (rcu_cpu_has_callbacks(NULL)) |
| 1715 | invoke_rcu_core(); /* force nohz to see update. */ | 1683 | invoke_rcu_core(); /* force nohz to see update. */ |
| 1716 | rdtp->tick_nohz_enabled_snap = tne; | 1684 | rdtp->tick_nohz_enabled_snap = tne; |
| 1717 | return; | 1685 | return; |
| @@ -1720,7 +1688,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1720 | return; | 1688 | return; |
| 1721 | 1689 | ||
| 1722 | /* If this is a no-CBs CPU, no callbacks, just return. */ | 1690 | /* If this is a no-CBs CPU, no callbacks, just return. */ |
| 1723 | if (rcu_is_nocb_cpu(cpu)) | 1691 | if (rcu_is_nocb_cpu(smp_processor_id())) |
| 1724 | return; | 1692 | return; |
| 1725 | 1693 | ||
| 1726 | /* | 1694 | /* |
| @@ -1744,7 +1712,7 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1744 | return; | 1712 | return; |
| 1745 | rdtp->last_accelerate = jiffies; | 1713 | rdtp->last_accelerate = jiffies; |
| 1746 | for_each_rcu_flavor(rsp) { | 1714 | for_each_rcu_flavor(rsp) { |
| 1747 | rdp = per_cpu_ptr(rsp->rda, cpu); | 1715 | rdp = this_cpu_ptr(rsp->rda); |
| 1748 | if (!*rdp->nxttail[RCU_DONE_TAIL]) | 1716 | if (!*rdp->nxttail[RCU_DONE_TAIL]) |
| 1749 | continue; | 1717 | continue; |
| 1750 | rnp = rdp->mynode; | 1718 | rnp = rdp->mynode; |
| @@ -1763,10 +1731,10 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1763 | * any grace periods that elapsed while the CPU was idle, and if any | 1731 | * any grace periods that elapsed while the CPU was idle, and if any |
| 1764 | * callbacks are now ready to invoke, initiate invocation. | 1732 | * callbacks are now ready to invoke, initiate invocation. |
| 1765 | */ | 1733 | */ |
| 1766 | static void rcu_cleanup_after_idle(int cpu) | 1734 | static void rcu_cleanup_after_idle(void) |
| 1767 | { | 1735 | { |
| 1768 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1736 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
| 1769 | if (rcu_is_nocb_cpu(cpu)) | 1737 | if (rcu_is_nocb_cpu(smp_processor_id())) |
| 1770 | return; | 1738 | return; |
| 1771 | if (rcu_try_advance_all_cbs()) | 1739 | if (rcu_try_advance_all_cbs()) |
| 1772 | invoke_rcu_core(); | 1740 | invoke_rcu_core(); |
| @@ -1848,7 +1816,7 @@ static int rcu_oom_notify(struct notifier_block *self, | |||
| 1848 | get_online_cpus(); | 1816 | get_online_cpus(); |
| 1849 | for_each_online_cpu(cpu) { | 1817 | for_each_online_cpu(cpu) { |
| 1850 | smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); | 1818 | smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); |
| 1851 | cond_resched(); | 1819 | cond_resched_rcu_qs(); |
| 1852 | } | 1820 | } |
| 1853 | put_online_cpus(); | 1821 | put_online_cpus(); |
| 1854 | 1822 | ||
| @@ -2075,13 +2043,40 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) | |||
| 2075 | if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) | 2043 | if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) |
| 2076 | return; | 2044 | return; |
| 2077 | if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { | 2045 | if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { |
| 2078 | /* Prior xchg orders against prior callback enqueue. */ | 2046 | /* Prior smp_mb__after_atomic() orders against prior enqueue. */ |
| 2079 | ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; | 2047 | ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; |
| 2080 | wake_up(&rdp_leader->nocb_wq); | 2048 | wake_up(&rdp_leader->nocb_wq); |
| 2081 | } | 2049 | } |
| 2082 | } | 2050 | } |
| 2083 | 2051 | ||
| 2084 | /* | 2052 | /* |
| 2053 | * Does the specified CPU need an RCU callback for the specified flavor | ||
| 2054 | * of rcu_barrier()? | ||
| 2055 | */ | ||
| 2056 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | ||
| 2057 | { | ||
| 2058 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 2059 | struct rcu_head *rhp; | ||
| 2060 | |||
| 2061 | /* No-CBs CPUs might have callbacks on any of three lists. */ | ||
| 2062 | rhp = ACCESS_ONCE(rdp->nocb_head); | ||
| 2063 | if (!rhp) | ||
| 2064 | rhp = ACCESS_ONCE(rdp->nocb_gp_head); | ||
| 2065 | if (!rhp) | ||
| 2066 | rhp = ACCESS_ONCE(rdp->nocb_follower_head); | ||
| 2067 | |||
| 2068 | /* Having no rcuo kthread but CBs after scheduler starts is bad! */ | ||
| 2069 | if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { | ||
| 2070 | /* RCU callback enqueued before CPU first came online??? */ | ||
| 2071 | pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", | ||
| 2072 | cpu, rhp->func); | ||
| 2073 | WARN_ON_ONCE(1); | ||
| 2074 | } | ||
| 2075 | |||
| 2076 | return !!rhp; | ||
| 2077 | } | ||
| 2078 | |||
| 2079 | /* | ||
| 2085 | * Enqueue the specified string of rcu_head structures onto the specified | 2080 | * Enqueue the specified string of rcu_head structures onto the specified |
| 2086 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the | 2081 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the |
| 2087 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy | 2082 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy |
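The new rcu_nocb_cpu_needs_barrier() above answers one question for rcu_barrier(): is any callback queued on any of the three no-CBs lists for this CPU? A minimal sketch of that first-non-empty-list check, with invented structure names:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct cb { struct cb *next; };

struct nocb_lists {			/* invented; groups the three list heads */
	struct cb *head;		/* newly enqueued callbacks */
	struct cb *gp_head;		/* waiting for a grace period */
	struct cb *follower_head;	/* ready to be invoked */
};

static bool needs_barrier(const struct nocb_lists *l)
{
	const struct cb *rhp = l->head;

	if (!rhp)
		rhp = l->gp_head;
	if (!rhp)
		rhp = l->follower_head;
	return rhp != NULL;		/* any non-empty list => barrier must wait */
}

int main(void)
{
	struct cb one = { .next = NULL };
	struct nocb_lists idle = { 0 };
	struct nocb_lists busy = { .gp_head = &one };

	printf("idle CPU needs barrier callback: %d\n", needs_barrier(&idle));
	printf("busy CPU needs barrier callback: %d\n", needs_barrier(&busy));
	return 0;
}
```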
| @@ -2104,6 +2099,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
| 2104 | ACCESS_ONCE(*old_rhpp) = rhp; | 2099 | ACCESS_ONCE(*old_rhpp) = rhp; |
| 2105 | atomic_long_add(rhcount, &rdp->nocb_q_count); | 2100 | atomic_long_add(rhcount, &rdp->nocb_q_count); |
| 2106 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); | 2101 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); |
| 2102 | smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ | ||
| 2107 | 2103 | ||
| 2108 | /* If we are not being polled and there is a kthread, awaken it ... */ | 2104 | /* If we are not being polled and there is a kthread, awaken it ... */ |
| 2109 | t = ACCESS_ONCE(rdp->nocb_kthread); | 2105 | t = ACCESS_ONCE(rdp->nocb_kthread); |
| @@ -2120,16 +2116,23 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
| 2120 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2116 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
| 2121 | TPS("WakeEmpty")); | 2117 | TPS("WakeEmpty")); |
| 2122 | } else { | 2118 | } else { |
| 2123 | rdp->nocb_defer_wakeup = true; | 2119 | rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; |
| 2124 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2120 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
| 2125 | TPS("WakeEmptyIsDeferred")); | 2121 | TPS("WakeEmptyIsDeferred")); |
| 2126 | } | 2122 | } |
| 2127 | rdp->qlen_last_fqs_check = 0; | 2123 | rdp->qlen_last_fqs_check = 0; |
| 2128 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | 2124 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { |
| 2129 | /* ... or if many callbacks queued. */ | 2125 | /* ... or if many callbacks queued. */ |
| 2130 | wake_nocb_leader(rdp, true); | 2126 | if (!irqs_disabled_flags(flags)) { |
| 2127 | wake_nocb_leader(rdp, true); | ||
| 2128 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2129 | TPS("WakeOvf")); | ||
| 2130 | } else { | ||
| 2131 | rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; | ||
| 2132 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2133 | TPS("WakeOvfIsDeferred")); | ||
| 2134 | } | ||
| 2131 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | 2135 | rdp->qlen_last_fqs_check = LONG_MAX / 2; |
| 2132 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); | ||
| 2133 | } else { | 2136 | } else { |
| 2134 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); | 2137 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); |
| 2135 | } | 2138 | } |
| @@ -2150,7 +2153,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
| 2150 | { | 2153 | { |
| 2151 | 2154 | ||
| 2152 | if (!rcu_is_nocb_cpu(rdp->cpu)) | 2155 | if (!rcu_is_nocb_cpu(rdp->cpu)) |
| 2153 | return 0; | 2156 | return false; |
| 2154 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); | 2157 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); |
| 2155 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | 2158 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) |
| 2156 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, | 2159 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, |
| @@ -2161,7 +2164,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
| 2161 | trace_rcu_callback(rdp->rsp->name, rhp, | 2164 | trace_rcu_callback(rdp->rsp->name, rhp, |
| 2162 | -atomic_long_read(&rdp->nocb_q_count_lazy), | 2165 | -atomic_long_read(&rdp->nocb_q_count_lazy), |
| 2163 | -atomic_long_read(&rdp->nocb_q_count)); | 2166 | -atomic_long_read(&rdp->nocb_q_count)); |
| 2164 | return 1; | 2167 | |
| 2168 | /* | ||
| 2169 | * If called from an extended quiescent state with interrupts | ||
| 2170 | * disabled, invoke the RCU core in order to allow the idle-entry | ||
| 2171 | * deferred-wakeup check to function. | ||
| 2172 | */ | ||
| 2173 | if (irqs_disabled_flags(flags) && | ||
| 2174 | !rcu_is_watching() && | ||
| 2175 | cpu_online(smp_processor_id())) | ||
| 2176 | invoke_rcu_core(); | ||
| 2177 | |||
| 2178 | return true; | ||
| 2165 | } | 2179 | } |
| 2166 | 2180 | ||
| 2167 | /* | 2181 | /* |
| @@ -2177,7 +2191,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
| 2177 | 2191 | ||
| 2178 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | 2192 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ |
| 2179 | if (!rcu_is_nocb_cpu(smp_processor_id())) | 2193 | if (!rcu_is_nocb_cpu(smp_processor_id())) |
| 2180 | return 0; | 2194 | return false; |
| 2181 | rsp->qlen = 0; | 2195 | rsp->qlen = 0; |
| 2182 | rsp->qlen_lazy = 0; | 2196 | rsp->qlen_lazy = 0; |
| 2183 | 2197 | ||
| @@ -2196,7 +2210,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
| 2196 | rsp->orphan_nxtlist = NULL; | 2210 | rsp->orphan_nxtlist = NULL; |
| 2197 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | 2211 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; |
| 2198 | } | 2212 | } |
| 2199 | return 1; | 2213 | return true; |
| 2200 | } | 2214 | } |
| 2201 | 2215 | ||
| 2202 | /* | 2216 | /* |
| @@ -2229,7 +2243,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
| 2229 | (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); | 2243 | (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); |
| 2230 | if (likely(d)) | 2244 | if (likely(d)) |
| 2231 | break; | 2245 | break; |
| 2232 | flush_signals(current); | 2246 | WARN_ON(signal_pending(current)); |
| 2233 | trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); | 2247 | trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); |
| 2234 | } | 2248 | } |
| 2235 | trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); | 2249 | trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); |
| @@ -2288,7 +2302,7 @@ wait_again: | |||
| 2288 | if (!rcu_nocb_poll) | 2302 | if (!rcu_nocb_poll) |
| 2289 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, | 2303 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, |
| 2290 | "WokeEmpty"); | 2304 | "WokeEmpty"); |
| 2291 | flush_signals(current); | 2305 | WARN_ON(signal_pending(current)); |
| 2292 | schedule_timeout_interruptible(1); | 2306 | schedule_timeout_interruptible(1); |
| 2293 | 2307 | ||
| 2294 | /* Rescan in case we were a victim of memory ordering. */ | 2308 | /* Rescan in case we were a victim of memory ordering. */ |
| @@ -2327,6 +2341,7 @@ wait_again: | |||
| 2327 | atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); | 2341 | atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); |
| 2328 | atomic_long_add(rdp->nocb_gp_count_lazy, | 2342 | atomic_long_add(rdp->nocb_gp_count_lazy, |
| 2329 | &rdp->nocb_follower_count_lazy); | 2343 | &rdp->nocb_follower_count_lazy); |
| 2344 | smp_mb__after_atomic(); /* Store *tail before wakeup. */ | ||
| 2330 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { | 2345 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { |
| 2331 | /* | 2346 | /* |
| 2332 | * List was empty, wake up the follower. | 2347 | * List was empty, wake up the follower. |
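The smp_mb__after_atomic() added above orders "store the callbacks onto the follower list" before "test whether the follower must be woken". The kernel pairs a full barrier with a waitqueue wakeup; the rough user-space analogue below uses a C11 release store and acquire load to convey the same publish-then-signal requirement (busy-waiting only to keep the sketch short):

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int follower_list;		/* stands in for the follower callback list */
static atomic_int work_posted;		/* stands in for "list became non-empty" */

static void *follower(void *arg)
{
	(void)arg;
	/* Kernel side sleeps on a waitqueue; a spin keeps the sketch short. */
	while (!atomic_load_explicit(&work_posted, memory_order_acquire))
		;
	printf("follower sees %d callbacks\n", follower_list);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, follower, NULL);
	follower_list = 3;		/* publish the work... */
	atomic_store_explicit(&work_posted, 1, memory_order_release);
	pthread_join(t, NULL);		/* ...then let the follower observe it */
	return 0;
}
```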
| @@ -2367,7 +2382,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) | |||
| 2367 | if (!rcu_nocb_poll) | 2382 | if (!rcu_nocb_poll) |
| 2368 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2383 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
| 2369 | "WokeEmpty"); | 2384 | "WokeEmpty"); |
| 2370 | flush_signals(current); | 2385 | WARN_ON(signal_pending(current)); |
| 2371 | schedule_timeout_interruptible(1); | 2386 | schedule_timeout_interruptible(1); |
| 2372 | } | 2387 | } |
| 2373 | } | 2388 | } |
| @@ -2428,15 +2443,16 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2428 | list = next; | 2443 | list = next; |
| 2429 | } | 2444 | } |
| 2430 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | 2445 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); |
| 2431 | ACCESS_ONCE(rdp->nocb_p_count) -= c; | 2446 | ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; |
| 2432 | ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; | 2447 | ACCESS_ONCE(rdp->nocb_p_count_lazy) = |
| 2448 | rdp->nocb_p_count_lazy - cl; | ||
| 2433 | rdp->n_nocbs_invoked += c; | 2449 | rdp->n_nocbs_invoked += c; |
| 2434 | } | 2450 | } |
| 2435 | return 0; | 2451 | return 0; |
| 2436 | } | 2452 | } |
| 2437 | 2453 | ||
| 2438 | /* Is a deferred wakeup of rcu_nocb_kthread() required? */ | 2454 | /* Is a deferred wakeup of rcu_nocb_kthread() required? */ |
| 2439 | static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) | 2455 | static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) |
| 2440 | { | 2456 | { |
| 2441 | return ACCESS_ONCE(rdp->nocb_defer_wakeup); | 2457 | return ACCESS_ONCE(rdp->nocb_defer_wakeup); |
| 2442 | } | 2458 | } |
| @@ -2444,11 +2460,79 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) | |||
| 2444 | /* Do a deferred wakeup of rcu_nocb_kthread(). */ | 2460 | /* Do a deferred wakeup of rcu_nocb_kthread(). */ |
| 2445 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp) | 2461 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp) |
| 2446 | { | 2462 | { |
| 2463 | int ndw; | ||
| 2464 | |||
| 2447 | if (!rcu_nocb_need_deferred_wakeup(rdp)) | 2465 | if (!rcu_nocb_need_deferred_wakeup(rdp)) |
| 2448 | return; | 2466 | return; |
| 2449 | ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; | 2467 | ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup); |
| 2450 | wake_nocb_leader(rdp, false); | 2468 | ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT; |
| 2451 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); | 2469 | wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); |
| 2470 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); | ||
| 2471 | } | ||
| 2472 | |||
| 2473 | void __init rcu_init_nohz(void) | ||
| 2474 | { | ||
| 2475 | int cpu; | ||
| 2476 | bool need_rcu_nocb_mask = true; | ||
| 2477 | struct rcu_state *rsp; | ||
| 2478 | |||
| 2479 | #ifdef CONFIG_RCU_NOCB_CPU_NONE | ||
| 2480 | need_rcu_nocb_mask = false; | ||
| 2481 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | ||
| 2482 | |||
| 2483 | #if defined(CONFIG_NO_HZ_FULL) | ||
| 2484 | if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) | ||
| 2485 | need_rcu_nocb_mask = true; | ||
| 2486 | #endif /* #if defined(CONFIG_NO_HZ_FULL) */ | ||
| 2487 | |||
| 2488 | if (!have_rcu_nocb_mask && need_rcu_nocb_mask) { | ||
| 2489 | if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { | ||
| 2490 | pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); | ||
| 2491 | return; | ||
| 2492 | } | ||
| 2493 | have_rcu_nocb_mask = true; | ||
| 2494 | } | ||
| 2495 | if (!have_rcu_nocb_mask) | ||
| 2496 | return; | ||
| 2497 | |||
| 2498 | #ifdef CONFIG_RCU_NOCB_CPU_ZERO | ||
| 2499 | pr_info("\tOffload RCU callbacks from CPU 0\n"); | ||
| 2500 | cpumask_set_cpu(0, rcu_nocb_mask); | ||
| 2501 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | ||
| 2502 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | ||
| 2503 | pr_info("\tOffload RCU callbacks from all CPUs\n"); | ||
| 2504 | cpumask_copy(rcu_nocb_mask, cpu_possible_mask); | ||
| 2505 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | ||
| 2506 | #if defined(CONFIG_NO_HZ_FULL) | ||
| 2507 | if (tick_nohz_full_running) | ||
| 2508 | cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); | ||
| 2509 | #endif /* #if defined(CONFIG_NO_HZ_FULL) */ | ||
| 2510 | |||
| 2511 | if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { | ||
| 2512 | pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); | ||
| 2513 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | ||
| 2514 | rcu_nocb_mask); | ||
| 2515 | } | ||
| 2516 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | ||
| 2517 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); | ||
| 2518 | if (rcu_nocb_poll) | ||
| 2519 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | ||
| 2520 | |||
| 2521 | for_each_rcu_flavor(rsp) { | ||
| 2522 | for_each_cpu(cpu, rcu_nocb_mask) { | ||
| 2523 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 2524 | |||
| 2525 | /* | ||
| 2526 | * If there are early callbacks, they will need | ||
| 2527 | * to be moved to the nocb lists. | ||
| 2528 | */ | ||
| 2529 | WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] != | ||
| 2530 | &rdp->nxtlist && | ||
| 2531 | rdp->nxttail[RCU_NEXT_TAIL] != NULL); | ||
| 2532 | init_nocb_callback_list(rdp); | ||
| 2533 | } | ||
| 2534 | rcu_organize_nocb_kthreads(rsp); | ||
| 2535 | } | ||
| 2452 | } | 2536 | } |
| 2453 | 2537 | ||
| 2454 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ | 2538 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ |
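The new rcu_init_nohz() above centralizes what rcu_bootup_announce_oddness() used to do: allocate the no-CBs mask on demand, seed it from the Kconfig choice, OR in the nohz_full= CPUs, and trim anything outside cpu_possible_mask. Sketch of that mask construction with a plain bitmask standing in for cpumask_var_t (the CPU sets are arbitrary):

```c
#include <stdio.h>

int main(void)
{
	unsigned int possible_mask = 0x0f;	/* CPUs 0-3 exist */
	unsigned int nohz_full_mask = 0x16;	/* nohz_full=1,2,4 (CPU 4 bogus) */
	unsigned int nocb_mask = 0;

	nocb_mask |= 1u << 0;			/* CONFIG_RCU_NOCB_CPU_ZERO case */
	nocb_mask |= nohz_full_mask;		/* nohz_full CPUs are always offloaded */

	if (nocb_mask & ~possible_mask) {	/* cpumask_subset() check */
		printf("note: rcu_nocbs= contains nonexistent CPUs\n");
		nocb_mask &= possible_mask;	/* cpumask_and() trim */
	}
	printf("offload RCU callbacks from CPUs: 0x%x\n", nocb_mask);
	return 0;
}
```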
| @@ -2459,15 +2543,89 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | |||
| 2459 | rdp->nocb_follower_tail = &rdp->nocb_follower_head; | 2543 | rdp->nocb_follower_tail = &rdp->nocb_follower_head; |
| 2460 | } | 2544 | } |
| 2461 | 2545 | ||
| 2546 | /* | ||
| 2547 | * If the specified CPU is a no-CBs CPU that does not already have its | ||
| 2548 | * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are | ||
| 2549 | * brought online out of order, this can require re-organizing the | ||
| 2550 | * leader-follower relationships. | ||
| 2551 | */ | ||
| 2552 | static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) | ||
| 2553 | { | ||
| 2554 | struct rcu_data *rdp; | ||
| 2555 | struct rcu_data *rdp_last; | ||
| 2556 | struct rcu_data *rdp_old_leader; | ||
| 2557 | struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu); | ||
| 2558 | struct task_struct *t; | ||
| 2559 | |||
| 2560 | /* | ||
| 2561 | * If this isn't a no-CBs CPU or if it already has an rcuo kthread, | ||
| 2562 | * then nothing to do. | ||
| 2563 | */ | ||
| 2564 | if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) | ||
| 2565 | return; | ||
| 2566 | |||
| 2567 | /* If we didn't spawn the leader first, reorganize! */ | ||
| 2568 | rdp_old_leader = rdp_spawn->nocb_leader; | ||
| 2569 | if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { | ||
| 2570 | rdp_last = NULL; | ||
| 2571 | rdp = rdp_old_leader; | ||
| 2572 | do { | ||
| 2573 | rdp->nocb_leader = rdp_spawn; | ||
| 2574 | if (rdp_last && rdp != rdp_spawn) | ||
| 2575 | rdp_last->nocb_next_follower = rdp; | ||
| 2576 | if (rdp == rdp_spawn) { | ||
| 2577 | rdp = rdp->nocb_next_follower; | ||
| 2578 | } else { | ||
| 2579 | rdp_last = rdp; | ||
| 2580 | rdp = rdp->nocb_next_follower; | ||
| 2581 | rdp_last->nocb_next_follower = NULL; | ||
| 2582 | } | ||
| 2583 | } while (rdp); | ||
| 2584 | rdp_spawn->nocb_next_follower = rdp_old_leader; | ||
| 2585 | } | ||
| 2586 | |||
| 2587 | /* Spawn the kthread for this CPU and RCU flavor. */ | ||
| 2588 | t = kthread_run(rcu_nocb_kthread, rdp_spawn, | ||
| 2589 | "rcuo%c/%d", rsp->abbr, cpu); | ||
| 2590 | BUG_ON(IS_ERR(t)); | ||
| 2591 | ACCESS_ONCE(rdp_spawn->nocb_kthread) = t; | ||
| 2592 | } | ||
| 2593 | |||
| 2594 | /* | ||
| 2595 | * If the specified CPU is a no-CBs CPU that does not already have its | ||
| 2596 | * rcuo kthreads, spawn them. | ||
| 2597 | */ | ||
| 2598 | static void rcu_spawn_all_nocb_kthreads(int cpu) | ||
| 2599 | { | ||
| 2600 | struct rcu_state *rsp; | ||
| 2601 | |||
| 2602 | if (rcu_scheduler_fully_active) | ||
| 2603 | for_each_rcu_flavor(rsp) | ||
| 2604 | rcu_spawn_one_nocb_kthread(rsp, cpu); | ||
| 2605 | } | ||
| 2606 | |||
| 2607 | /* | ||
| 2608 | * Once the scheduler is running, spawn rcuo kthreads for all online | ||
| 2609 | * no-CBs CPUs. This assumes that the early_initcall()s happen before | ||
| 2610 | * non-boot CPUs come online -- if this changes, we will need to add | ||
| 2611 | * some mutual exclusion. | ||
| 2612 | */ | ||
| 2613 | static void __init rcu_spawn_nocb_kthreads(void) | ||
| 2614 | { | ||
| 2615 | int cpu; | ||
| 2616 | |||
| 2617 | for_each_online_cpu(cpu) | ||
| 2618 | rcu_spawn_all_nocb_kthreads(cpu); | ||
| 2619 | } | ||
| 2620 | |||
| 2462 | /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ | 2621 | /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ |
| 2463 | static int rcu_nocb_leader_stride = -1; | 2622 | static int rcu_nocb_leader_stride = -1; |
| 2464 | module_param(rcu_nocb_leader_stride, int, 0444); | 2623 | module_param(rcu_nocb_leader_stride, int, 0444); |
| 2465 | 2624 | ||
| 2466 | /* | 2625 | /* |
| 2467 | * Create a kthread for each RCU flavor for each no-CBs CPU. | 2626 | * Initialize leader-follower relationships for all no-CBs CPU. |
| 2468 | * Also initialize leader-follower relationships. | ||
| 2469 | */ | 2627 | */ |
| 2470 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | 2628 | static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) |
| 2471 | { | 2629 | { |
| 2472 | int cpu; | 2630 | int cpu; |
| 2473 | int ls = rcu_nocb_leader_stride; | 2631 | int ls = rcu_nocb_leader_stride; |
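rcu_spawn_one_nocb_kthread() in the hunk above handles CPUs coming online out of order: if the designated leader has no kthread yet, the newly spawned CPU takes over as leader and the rest of the group, old leader included, is re-linked behind it. A simplified standalone version of that relinking loop (no kthreads, no RCU flavors):

```c
#include <stdio.h>

struct rdp {				/* trimmed-down stand-in for struct rcu_data */
	int cpu;
	struct rdp *leader;
	struct rdp *next_follower;
};

/* If the planned leader has not been brought up yet, promote "spawn" to lead
 * the group and re-link everyone else, old leader included, behind it. */
static void promote_to_leader(struct rdp *spawn)
{
	struct rdp *old_leader = spawn->leader;
	struct rdp *last = NULL;
	struct rdp *rdp = old_leader;

	if (old_leader == spawn)
		return;				/* already the leader */

	do {
		rdp->leader = spawn;
		if (last && rdp != spawn)
			last->next_follower = rdp;
		if (rdp == spawn) {
			rdp = rdp->next_follower;
		} else {
			last = rdp;
			rdp = rdp->next_follower;
			last->next_follower = NULL;
		}
	} while (rdp);
	spawn->next_follower = old_leader;
}

int main(void)
{
	struct rdp cpu0 = { .cpu = 0 }, cpu1 = { .cpu = 1 }, cpu2 = { .cpu = 2 };

	/* Planned grouping: cpu0 leads cpu1 and cpu2... */
	cpu0.leader = &cpu0; cpu0.next_follower = &cpu1;
	cpu1.leader = &cpu0; cpu1.next_follower = &cpu2;
	cpu2.leader = &cpu0; cpu2.next_follower = NULL;

	/* ...but cpu2 comes online first, so it takes over as leader. */
	promote_to_leader(&cpu2);

	for (struct rdp *r = &cpu2; r; r = r->next_follower)
		printf("cpu%d (leader: cpu%d)\n", r->cpu, r->leader->cpu);
	return 0;
}
```

Running it shows cpu2 ending up as leader with cpu0 and cpu1 relinked as its followers.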
| @@ -2475,14 +2633,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | |||
| 2475 | struct rcu_data *rdp; | 2633 | struct rcu_data *rdp; |
| 2476 | struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ | 2634 | struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ |
| 2477 | struct rcu_data *rdp_prev = NULL; | 2635 | struct rcu_data *rdp_prev = NULL; |
| 2478 | struct task_struct *t; | ||
| 2479 | 2636 | ||
| 2480 | if (rcu_nocb_mask == NULL) | 2637 | if (!have_rcu_nocb_mask) |
| 2481 | return; | 2638 | return; |
| 2482 | #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) | ||
| 2483 | if (tick_nohz_full_running) | ||
| 2484 | cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); | ||
| 2485 | #endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */ | ||
| 2486 | if (ls == -1) { | 2639 | if (ls == -1) { |
| 2487 | ls = int_sqrt(nr_cpu_ids); | 2640 | ls = int_sqrt(nr_cpu_ids); |
| 2488 | rcu_nocb_leader_stride = ls; | 2641 | rcu_nocb_leader_stride = ls; |
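The default leader stride remains int_sqrt(nr_cpu_ids), keeping both the number of leaders and each leader's wakeup fan-out around sqrt(N). Simplified sketch of the grouping rule, assuming every CPU is offloaded (the kernel iterates rcu_nocb_mask and rounds up to the next stride boundary):

```c
#include <stdio.h>

int main(void)
{
	int nr_cpu_ids = 16;
	int ls = 1;

	while ((ls + 1) * (ls + 1) <= nr_cpu_ids)	/* int_sqrt(nr_cpu_ids) */
		ls++;

	for (int cpu = 0; cpu < nr_cpu_ids; cpu++) {
		if (cpu % ls == 0)
			printf("cpu %2d: group leader\n", cpu);
		else
			printf("cpu %2d: follower of cpu %d\n", cpu, cpu - cpu % ls);
	}
	return 0;
}
```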
| @@ -2505,27 +2658,27 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | |||
| 2505 | rdp_prev->nocb_next_follower = rdp; | 2658 | rdp_prev->nocb_next_follower = rdp; |
| 2506 | } | 2659 | } |
| 2507 | rdp_prev = rdp; | 2660 | rdp_prev = rdp; |
| 2508 | |||
| 2509 | /* Spawn the kthread for this CPU. */ | ||
| 2510 | t = kthread_run(rcu_nocb_kthread, rdp, | ||
| 2511 | "rcuo%c/%d", rsp->abbr, cpu); | ||
| 2512 | BUG_ON(IS_ERR(t)); | ||
| 2513 | ACCESS_ONCE(rdp->nocb_kthread) = t; | ||
| 2514 | } | 2661 | } |
| 2515 | } | 2662 | } |
| 2516 | 2663 | ||
| 2517 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ | 2664 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ |
| 2518 | static bool init_nocb_callback_list(struct rcu_data *rdp) | 2665 | static bool init_nocb_callback_list(struct rcu_data *rdp) |
| 2519 | { | 2666 | { |
| 2520 | if (rcu_nocb_mask == NULL || | 2667 | if (!rcu_is_nocb_cpu(rdp->cpu)) |
| 2521 | !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) | ||
| 2522 | return false; | 2668 | return false; |
| 2669 | |||
| 2523 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 2670 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; |
| 2524 | return true; | 2671 | return true; |
| 2525 | } | 2672 | } |
| 2526 | 2673 | ||
| 2527 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | 2674 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ |
| 2528 | 2675 | ||
| 2676 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | ||
| 2677 | { | ||
| 2678 | WARN_ON_ONCE(1); /* Should be dead code. */ | ||
| 2679 | return false; | ||
| 2680 | } | ||
| 2681 | |||
| 2529 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | 2682 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) |
| 2530 | { | 2683 | { |
| 2531 | } | 2684 | } |
| @@ -2541,21 +2694,21 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) | |||
| 2541 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 2694 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
| 2542 | bool lazy, unsigned long flags) | 2695 | bool lazy, unsigned long flags) |
| 2543 | { | 2696 | { |
| 2544 | return 0; | 2697 | return false; |
| 2545 | } | 2698 | } |
| 2546 | 2699 | ||
| 2547 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | 2700 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, |
| 2548 | struct rcu_data *rdp, | 2701 | struct rcu_data *rdp, |
| 2549 | unsigned long flags) | 2702 | unsigned long flags) |
| 2550 | { | 2703 | { |
| 2551 | return 0; | 2704 | return false; |
| 2552 | } | 2705 | } |
| 2553 | 2706 | ||
| 2554 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | 2707 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) |
| 2555 | { | 2708 | { |
| 2556 | } | 2709 | } |
| 2557 | 2710 | ||
| 2558 | static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) | 2711 | static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) |
| 2559 | { | 2712 | { |
| 2560 | return false; | 2713 | return false; |
| 2561 | } | 2714 | } |
| @@ -2564,7 +2717,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) | |||
| 2564 | { | 2717 | { |
| 2565 | } | 2718 | } |
| 2566 | 2719 | ||
| 2567 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | 2720 | static void rcu_spawn_all_nocb_kthreads(int cpu) |
| 2721 | { | ||
| 2722 | } | ||
| 2723 | |||
| 2724 | static void __init rcu_spawn_nocb_kthreads(void) | ||
| 2568 | { | 2725 | { |
| 2569 | } | 2726 | } |
| 2570 | 2727 | ||
| @@ -2595,16 +2752,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) | |||
| 2595 | 2752 | ||
| 2596 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 2753 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
| 2597 | 2754 | ||
| 2598 | /* | ||
| 2599 | * Define RCU flavor that holds sysidle state. This needs to be the | ||
| 2600 | * most active flavor of RCU. | ||
| 2601 | */ | ||
| 2602 | #ifdef CONFIG_PREEMPT_RCU | ||
| 2603 | static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; | ||
| 2604 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
| 2605 | static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; | ||
| 2606 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
| 2607 | |||
| 2608 | static int full_sysidle_state; /* Current system-idle state. */ | 2755 | static int full_sysidle_state; /* Current system-idle state. */ |
| 2609 | #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ | 2756 | #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ |
| 2610 | #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ | 2757 | #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ |
| @@ -2618,9 +2765,14 @@ static int full_sysidle_state; /* Current system-idle state. */ | |||
| 2618 | * to detect full-system idle states, not RCU quiescent states and grace | 2765 | * to detect full-system idle states, not RCU quiescent states and grace |
| 2619 | * periods. The caller must have disabled interrupts. | 2766 | * periods. The caller must have disabled interrupts. |
| 2620 | */ | 2767 | */ |
| 2621 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | 2768 | static void rcu_sysidle_enter(int irq) |
| 2622 | { | 2769 | { |
| 2623 | unsigned long j; | 2770 | unsigned long j; |
| 2771 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 2772 | |||
| 2773 | /* If there are no nohz_full= CPUs, no need to track this. */ | ||
| 2774 | if (!tick_nohz_full_enabled()) | ||
| 2775 | return; | ||
| 2624 | 2776 | ||
| 2625 | /* Adjust nesting, check for fully idle. */ | 2777 | /* Adjust nesting, check for fully idle. */ |
| 2626 | if (irq) { | 2778 | if (irq) { |
| @@ -2685,8 +2837,14 @@ void rcu_sysidle_force_exit(void) | |||
| 2685 | * usermode execution does -not- count as idle here! The caller must | 2837 | * usermode execution does -not- count as idle here! The caller must |
| 2686 | * have disabled interrupts. | 2838 | * have disabled interrupts. |
| 2687 | */ | 2839 | */ |
| 2688 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | 2840 | static void rcu_sysidle_exit(int irq) |
| 2689 | { | 2841 | { |
| 2842 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 2843 | |||
| 2844 | /* If there are no nohz_full= CPUs, no need to track this. */ | ||
| 2845 | if (!tick_nohz_full_enabled()) | ||
| 2846 | return; | ||
| 2847 | |||
| 2690 | /* Adjust nesting, check for already non-idle. */ | 2848 | /* Adjust nesting, check for already non-idle. */ |
| 2691 | if (irq) { | 2849 | if (irq) { |
| 2692 | rdtp->dynticks_idle_nesting++; | 2850 | rdtp->dynticks_idle_nesting++; |
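rcu_sysidle_enter()/rcu_sysidle_exit() now return immediately when no nohz_full= CPUs exist and pick up their rcu_dynticks structure with this_cpu_ptr(); the surrounding (unchanged) code tracks idle depth with dynticks_idle_nesting so interrupts taken from idle do not permanently mark the CPU non-idle. A very loose single-CPU sketch of the guard plus nesting-counter idea, with invented semantics:

```c
#include <stdbool.h>
#include <stdio.h>

static bool nohz_full_enabled = true;	/* stand-in for tick_nohz_full_enabled() */
static int idle_nesting = 1;		/* >0: CPU busy, 0: fully idle */

static void sysidle_enter(int irq)
{
	if (!nohz_full_enabled)
		return;			/* no nohz_full= CPUs: nothing to track */
	if (--idle_nesting == 0)
		printf("CPU fully idle%s\n", irq ? " (irq exit)" : "");
}

static void sysidle_exit(int irq)
{
	if (!nohz_full_enabled)
		return;
	if (idle_nesting++ == 0)
		printf("CPU leaving idle%s\n", irq ? " (irq entry)" : "");
}

int main(void)
{
	sysidle_enter(0);	/* idle loop entered */
	sysidle_exit(1);	/* interrupt arrives */
	sysidle_enter(1);	/* interrupt returns to idle */
	sysidle_exit(0);	/* a task wakes up */
	return 0;
}
```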
| @@ -2741,12 +2899,16 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | |||
| 2741 | unsigned long j; | 2899 | unsigned long j; |
| 2742 | struct rcu_dynticks *rdtp = rdp->dynticks; | 2900 | struct rcu_dynticks *rdtp = rdp->dynticks; |
| 2743 | 2901 | ||
| 2902 | /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ | ||
| 2903 | if (!tick_nohz_full_enabled()) | ||
| 2904 | return; | ||
| 2905 | |||
| 2744 | /* | 2906 | /* |
| 2745 | * If some other CPU has already reported non-idle, if this is | 2907 | * If some other CPU has already reported non-idle, if this is |
| 2746 | * not the flavor of RCU that tracks sysidle state, or if this | 2908 | * not the flavor of RCU that tracks sysidle state, or if this |
| 2747 | * is an offline or the timekeeping CPU, nothing to do. | 2909 | * is an offline or the timekeeping CPU, nothing to do. |
| 2748 | */ | 2910 | */ |
| 2749 | if (!*isidle || rdp->rsp != rcu_sysidle_state || | 2911 | if (!*isidle || rdp->rsp != rcu_state_p || |
| 2750 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | 2912 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) |
| 2751 | return; | 2913 | return; |
| 2752 | if (rcu_gp_in_progress(rdp->rsp)) | 2914 | if (rcu_gp_in_progress(rdp->rsp)) |
| @@ -2772,7 +2934,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | |||
| 2772 | */ | 2934 | */ |
| 2773 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | 2935 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) |
| 2774 | { | 2936 | { |
| 2775 | return rsp == rcu_sysidle_state; | 2937 | return rsp == rcu_state_p; |
| 2776 | } | 2938 | } |
| 2777 | 2939 | ||
| 2778 | /* | 2940 | /* |
| @@ -2850,7 +3012,7 @@ static void rcu_sysidle_cancel(void) | |||
| 2850 | static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, | 3012 | static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, |
| 2851 | unsigned long maxj, bool gpkt) | 3013 | unsigned long maxj, bool gpkt) |
| 2852 | { | 3014 | { |
| 2853 | if (rsp != rcu_sysidle_state) | 3015 | if (rsp != rcu_state_p) |
| 2854 | return; /* Wrong flavor, ignore. */ | 3016 | return; /* Wrong flavor, ignore. */ |
| 2855 | if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | 3017 | if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) |
| 2856 | return; /* Running state machine from timekeeping CPU. */ | 3018 | return; /* Running state machine from timekeeping CPU. */ |
| @@ -2867,6 +3029,10 @@ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, | |||
| 2867 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | 3029 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, |
| 2868 | unsigned long maxj) | 3030 | unsigned long maxj) |
| 2869 | { | 3031 | { |
| 3032 | /* If there are no nohz_full= CPUs, no need to track this. */ | ||
| 3033 | if (!tick_nohz_full_enabled()) | ||
| 3034 | return; | ||
| 3035 | |||
| 2870 | rcu_sysidle_report(rsp, isidle, maxj, true); | 3036 | rcu_sysidle_report(rsp, isidle, maxj, true); |
| 2871 | } | 3037 | } |
| 2872 | 3038 | ||
| @@ -2893,7 +3059,8 @@ static void rcu_sysidle_cb(struct rcu_head *rhp) | |||
| 2893 | 3059 | ||
| 2894 | /* | 3060 | /* |
| 2895 | * Check to see if the system is fully idle, other than the timekeeping CPU. | 3061 | * Check to see if the system is fully idle, other than the timekeeping CPU. |
| 2896 | * The caller must have disabled interrupts. | 3062 | * The caller must have disabled interrupts. This is not intended to be |
| 3063 | * called unless tick_nohz_full_enabled(). | ||
| 2897 | */ | 3064 | */ |
| 2898 | bool rcu_sys_is_idle(void) | 3065 | bool rcu_sys_is_idle(void) |
| 2899 | { | 3066 | { |
| @@ -2919,13 +3086,12 @@ bool rcu_sys_is_idle(void) | |||
| 2919 | 3086 | ||
| 2920 | /* Scan all the CPUs looking for nonidle CPUs. */ | 3087 | /* Scan all the CPUs looking for nonidle CPUs. */ |
| 2921 | for_each_possible_cpu(cpu) { | 3088 | for_each_possible_cpu(cpu) { |
| 2922 | rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); | 3089 | rdp = per_cpu_ptr(rcu_state_p->rda, cpu); |
| 2923 | rcu_sysidle_check_cpu(rdp, &isidle, &maxj); | 3090 | rcu_sysidle_check_cpu(rdp, &isidle, &maxj); |
| 2924 | if (!isidle) | 3091 | if (!isidle) |
| 2925 | break; | 3092 | break; |
| 2926 | } | 3093 | } |
| 2927 | rcu_sysidle_report(rcu_sysidle_state, | 3094 | rcu_sysidle_report(rcu_state_p, isidle, maxj, false); |
| 2928 | isidle, maxj, false); | ||
| 2929 | oldrss = rss; | 3095 | oldrss = rss; |
| 2930 | rss = ACCESS_ONCE(full_sysidle_state); | 3096 | rss = ACCESS_ONCE(full_sysidle_state); |
| 2931 | } | 3097 | } |
| @@ -2952,7 +3118,7 @@ bool rcu_sys_is_idle(void) | |||
| 2952 | * provided by the memory allocator. | 3118 | * provided by the memory allocator. |
| 2953 | */ | 3119 | */ |
| 2954 | if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && | 3120 | if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && |
| 2955 | !rcu_gp_in_progress(rcu_sysidle_state) && | 3121 | !rcu_gp_in_progress(rcu_state_p) && |
| 2956 | !rsh.inuse && xchg(&rsh.inuse, 1) == 0) | 3122 | !rsh.inuse && xchg(&rsh.inuse, 1) == 0) |
| 2957 | call_rcu(&rsh.rh, rcu_sysidle_cb); | 3123 | call_rcu(&rsh.rh, rcu_sysidle_cb); |
| 2958 | return false; | 3124 | return false; |
| @@ -2968,11 +3134,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | |||
| 2968 | 3134 | ||
| 2969 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 3135 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ |
| 2970 | 3136 | ||
| 2971 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | 3137 | static void rcu_sysidle_enter(int irq) |
| 2972 | { | 3138 | { |
| 2973 | } | 3139 | } |
| 2974 | 3140 | ||
| 2975 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | 3141 | static void rcu_sysidle_exit(int irq) |
| 2976 | { | 3142 | { |
| 2977 | } | 3143 | } |
| 2978 | 3144 | ||
| @@ -3036,3 +3202,19 @@ static void rcu_bind_gp_kthread(void) | |||
| 3036 | housekeeping_affine(current); | 3202 | housekeeping_affine(current); |
| 3037 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 3203 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ |
| 3038 | } | 3204 | } |
| 3205 | |||
| 3206 | /* Record the current task on dyntick-idle entry. */ | ||
| 3207 | static void rcu_dynticks_task_enter(void) | ||
| 3208 | { | ||
| 3209 | #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) | ||
| 3210 | ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id(); | ||
| 3211 | #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ | ||
| 3212 | } | ||
| 3213 | |||
| 3214 | /* Record no current task on dyntick-idle exit. */ | ||
| 3215 | static void rcu_dynticks_task_exit(void) | ||
| 3216 | { | ||
| 3217 | #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) | ||
| 3218 | ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1; | ||
| 3219 | #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ | ||
| 3220 | } | ||
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4056d7992a6c..e0d31a345ee6 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -47,6 +47,8 @@ | |||
| 47 | #include <linux/hardirq.h> | 47 | #include <linux/hardirq.h> |
| 48 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
| 49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
| 50 | #include <linux/kthread.h> | ||
| 51 | #include <linux/tick.h> | ||
| 50 | 52 | ||
| 51 | #define CREATE_TRACE_POINTS | 53 | #define CREATE_TRACE_POINTS |
| 52 | 54 | ||
| @@ -91,7 +93,7 @@ void __rcu_read_unlock(void) | |||
| 91 | barrier(); /* critical section before exit code. */ | 93 | barrier(); /* critical section before exit code. */ |
| 92 | t->rcu_read_lock_nesting = INT_MIN; | 94 | t->rcu_read_lock_nesting = INT_MIN; |
| 93 | barrier(); /* assign before ->rcu_read_unlock_special load */ | 95 | barrier(); /* assign before ->rcu_read_unlock_special load */ |
| 94 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 96 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s))) |
| 95 | rcu_read_unlock_special(t); | 97 | rcu_read_unlock_special(t); |
| 96 | barrier(); /* ->rcu_read_unlock_special load before assign */ | 98 | barrier(); /* ->rcu_read_unlock_special load before assign */ |
| 97 | t->rcu_read_lock_nesting = 0; | 99 | t->rcu_read_lock_nesting = 0; |
| @@ -137,6 +139,38 @@ int notrace debug_lockdep_rcu_enabled(void) | |||
| 137 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); | 139 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); |
| 138 | 140 | ||
| 139 | /** | 141 | /** |
| 142 | * rcu_read_lock_held() - might we be in RCU read-side critical section? | ||
| 143 | * | ||
| 144 | * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU | ||
| 145 | * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, | ||
| 146 | * this assumes we are in an RCU read-side critical section unless it can | ||
| 147 | * prove otherwise. This is useful for debug checks in functions that | ||
| 148 | * require that they be called within an RCU read-side critical section. | ||
| 149 | * | ||
| 150 | * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot | ||
| 151 | * and while lockdep is disabled. | ||
| 152 | * | ||
| 153 | * Note that rcu_read_lock() and the matching rcu_read_unlock() must | ||
| 154 | * occur in the same context, for example, it is illegal to invoke | ||
| 155 | * rcu_read_unlock() in process context if the matching rcu_read_lock() | ||
| 156 | * was invoked from within an irq handler. | ||
| 157 | * | ||
| 158 | * Note that rcu_read_lock() is disallowed if the CPU is either idle or | ||
| 159 | * offline from an RCU perspective, so check for those as well. | ||
| 160 | */ | ||
| 161 | int rcu_read_lock_held(void) | ||
| 162 | { | ||
| 163 | if (!debug_lockdep_rcu_enabled()) | ||
| 164 | return 1; | ||
| 165 | if (!rcu_is_watching()) | ||
| 166 | return 0; | ||
| 167 | if (!rcu_lockdep_current_cpu_online()) | ||
| 168 | return 0; | ||
| 169 | return lock_is_held(&rcu_lock_map); | ||
| 170 | } | ||
| 171 | EXPORT_SYMBOL_GPL(rcu_read_lock_held); | ||
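Usage sketch for rcu_read_lock_held() (the helper, table, and field names are illustrative, not part of this patch):

	/* Hypothetical RCU-protected lookup; my_table/my_node are illustrative. */
	static struct my_node *my_lookup(struct my_table *tbl, int key)
	{
		/* Under CONFIG_PROVE_RCU, complain if called outside rcu_read_lock(). */
		WARN_ON_ONCE(!rcu_read_lock_held());
		return rcu_dereference_check(tbl->slots[key], rcu_read_lock_held());
	}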
| 172 | |||
| 173 | /** | ||
| 140 | * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? | 174 | * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? |
| 141 | * | 175 | * |
| 142 | * Check for bottom half being disabled, which covers both the | 176 | * Check for bottom half being disabled, which covers both the |
| @@ -272,7 +306,7 @@ struct debug_obj_descr rcuhead_debug_descr = { | |||
| 272 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | 306 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); |
| 273 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 307 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
| 274 | 308 | ||
| 275 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) | 309 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) |
| 276 | void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, | 310 | void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, |
| 277 | unsigned long secs, | 311 | unsigned long secs, |
| 278 | unsigned long c_old, unsigned long c) | 312 | unsigned long c_old, unsigned long c) |
| @@ -347,3 +381,397 @@ static int __init check_cpu_stall_init(void) | |||
| 347 | early_initcall(check_cpu_stall_init); | 381 | early_initcall(check_cpu_stall_init); |
| 348 | 382 | ||
| 349 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | 383 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ |
| 384 | |||
| 385 | #ifdef CONFIG_TASKS_RCU | ||
| 386 | |||
| 387 | /* | ||
| 388 | * Simple variant of RCU whose quiescent states are voluntary context switch, | ||
| 389 | * user-space execution, and idle. As such, grace periods can take one good | ||
| 390 | * long time. There are no read-side primitives similar to rcu_read_lock() | ||
| 391 | * and rcu_read_unlock() because this implementation is intended to get | ||
| 392 | * the system into a safe state for some of the manipulations involved in | ||
| 393 | * tracing and the like. Finally, this implementation does not support | ||
| 394 | * high call_rcu_tasks() rates from multiple CPUs. If this is required, | ||
| 395 | * per-CPU callback lists will be needed. | ||
| 396 | */ | ||
| 397 | |||
| 398 | /* Global list of callbacks and associated lock. */ | ||
| 399 | static struct rcu_head *rcu_tasks_cbs_head; | ||
| 400 | static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; | ||
| 401 | static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); | ||
| 402 | static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); | ||
| 403 | |||
| 404 | /* Track exiting tasks in order to allow them to be waited for. */ | ||
| 405 | DEFINE_SRCU(tasks_rcu_exit_srcu); | ||
| 406 | |||
| 407 | /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ | ||
| 408 | static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; | ||
| 409 | module_param(rcu_task_stall_timeout, int, 0644); | ||
| 410 | |||
| 411 | static void rcu_spawn_tasks_kthread(void); | ||
| 412 | |||
| 413 | /* | ||
| 414 | * Post an RCU-tasks callback. First call must be from process context | ||
| 415 | * after the scheduler if fully operational. | ||
| 416 | */ | ||
| 417 | void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) | ||
| 418 | { | ||
| 419 | unsigned long flags; | ||
| 420 | bool needwake; | ||
| 421 | |||
| 422 | rhp->next = NULL; | ||
| 423 | rhp->func = func; | ||
| 424 | raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); | ||
| 425 | needwake = !rcu_tasks_cbs_head; | ||
| 426 | *rcu_tasks_cbs_tail = rhp; | ||
| 427 | rcu_tasks_cbs_tail = &rhp->next; | ||
| 428 | raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); | ||
| 429 | if (needwake) { | ||
| 430 | rcu_spawn_tasks_kthread(); | ||
| 431 | wake_up(&rcu_tasks_cbs_wq); | ||
| 432 | } | ||
| 433 | } | ||
| 434 | EXPORT_SYMBOL_GPL(call_rcu_tasks); | ||
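Usage sketch for call_rcu_tasks() (all names below are illustrative; the typical caller frees code that some task might still be executing):

	struct my_trampoline {
		struct rcu_head rh;
		void *text;
	};

	/* Runs once every task has voluntarily switched, gone idle, or hit userspace. */
	static void my_free_trampoline(struct rcu_head *rhp)
	{
		struct my_trampoline *tp = container_of(rhp, struct my_trampoline, rh);

		kfree(tp->text);
		kfree(tp);
	}

		/* After unhooking tp->text from every call site: */
		call_rcu_tasks(&tp->rh, my_free_trampoline);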
| 435 | |||
| 436 | /** | ||
| 437 | * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. | ||
| 438 | * | ||
| 439 | * Control will return to the caller some time after a full rcu-tasks | ||
| 440 | * grace period has elapsed, in other words after all currently | ||
| 441 | * executing rcu-tasks read-side critical sections have elapsed. These | ||
| 442 | * read-side critical sections are delimited by calls to schedule(), | ||
| 443 | * cond_resched_rcu_qs(), idle execution, userspace execution, calls | ||
| 444 | * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). | ||
| 445 | * | ||
| 446 | * This is a very specialized primitive, intended only for a few uses in | ||
| 447 | * tracing and other situations requiring manipulation of function | ||
| 448 | * preambles and profiling hooks. The synchronize_rcu_tasks() function | ||
| 449 | * is not (yet) intended for heavy use from multiple CPUs. | ||
| 450 | * | ||
| 451 | * Note that this guarantee implies further memory-ordering guarantees. | ||
| 452 | * On systems with more than one CPU, when synchronize_rcu_tasks() returns, | ||
| 453 | * each CPU is guaranteed to have executed a full memory barrier since the | ||
| 454 | * end of its last RCU-tasks read-side critical section whose beginning | ||
| 455 | * preceded the call to synchronize_rcu_tasks(). In addition, each CPU | ||
| 456 | * having an RCU-tasks read-side critical section that extends beyond | ||
| 457 | * the return from synchronize_rcu_tasks() is guaranteed to have executed | ||
| 458 | * a full memory barrier after the beginning of synchronize_rcu_tasks() | ||
| 459 | * and before the beginning of that RCU-tasks read-side critical section. | ||
| 460 | * Note that these guarantees include CPUs that are offline, idle, or | ||
| 461 | * executing in user mode, as well as CPUs that are executing in the kernel. | ||
| 462 | * | ||
| 463 | * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned | ||
| 464 | * to its caller on CPU B, then both CPU A and CPU B are guaranteed | ||
| 465 | * to have executed a full memory barrier during the execution of | ||
| 466 | * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU | ||
| 467 | * (but again only if the system has more than one CPU). | ||
| 468 | */ | ||
| 469 | void synchronize_rcu_tasks(void) | ||
| 470 | { | ||
| 471 | /* Complain if the scheduler has not started. */ | ||
| 472 | rcu_lockdep_assert(!rcu_scheduler_active, | ||
| 473 | "synchronize_rcu_tasks called too soon"); | ||
| 474 | |||
| 475 | /* Wait for the grace period. */ | ||
| 476 | wait_rcu_gp(call_rcu_tasks); | ||
| 477 | } | ||
| 478 | EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); | ||
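Usage sketch for synchronize_rcu_tasks(), following the usual unpublish/wait/reclaim pattern (the unhook and free helpers are hypothetical):

	unhook_my_handler(handler);		/* no new callers can enter it */
	synchronize_rcu_tasks();		/* wait out tasks already inside it */
	free_my_handler_text(handler);		/* now safe to release the text */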
| 479 | |||
| 480 | /** | ||
| 481 | * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. | ||
| 482 | * | ||
| 483 | * Although the current implementation is guaranteed to wait, it is not | ||
| 484 | * obligated to, for example, if there are no pending callbacks. | ||
| 485 | */ | ||
| 486 | void rcu_barrier_tasks(void) | ||
| 487 | { | ||
| 488 | /* There is only one callback queue, so this is easy. ;-) */ | ||
| 489 | synchronize_rcu_tasks(); | ||
| 490 | } | ||
| 491 | EXPORT_SYMBOL_GPL(rcu_barrier_tasks); | ||
| 492 | |||
| 493 | /* See if tasks are still holding out, complain if so. */ | ||
| 494 | static void check_holdout_task(struct task_struct *t, | ||
| 495 | bool needreport, bool *firstreport) | ||
| 496 | { | ||
| 497 | int cpu; | ||
| 498 | |||
| 499 | if (!ACCESS_ONCE(t->rcu_tasks_holdout) || | ||
| 500 | t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || | ||
| 501 | !ACCESS_ONCE(t->on_rq) || | ||
| 502 | (IS_ENABLED(CONFIG_NO_HZ_FULL) && | ||
| 503 | !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { | ||
| 504 | ACCESS_ONCE(t->rcu_tasks_holdout) = false; | ||
| 505 | list_del_init(&t->rcu_tasks_holdout_list); | ||
| 506 | put_task_struct(t); | ||
| 507 | return; | ||
| 508 | } | ||
| 509 | if (!needreport) | ||
| 510 | return; | ||
| 511 | if (*firstreport) { | ||
| 512 | pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); | ||
| 513 | *firstreport = false; | ||
| 514 | } | ||
| 515 | cpu = task_cpu(t); | ||
| 516 | pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", | ||
| 517 | t, ".I"[is_idle_task(t)], | ||
| 518 | "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], | ||
| 519 | t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, | ||
| 520 | t->rcu_tasks_idle_cpu, cpu); | ||
| 521 | sched_show_task(t); | ||
| 522 | } | ||
| 523 | |||
| 524 | /* RCU-tasks kthread that detects grace periods and invokes callbacks. */ | ||
| 525 | static int __noreturn rcu_tasks_kthread(void *arg) | ||
| 526 | { | ||
| 527 | unsigned long flags; | ||
| 528 | struct task_struct *g, *t; | ||
| 529 | unsigned long lastreport; | ||
| 530 | struct rcu_head *list; | ||
| 531 | struct rcu_head *next; | ||
| 532 | LIST_HEAD(rcu_tasks_holdouts); | ||
| 533 | |||
| 534 | /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ | ||
| 535 | housekeeping_affine(current); | ||
| 536 | |||
| 537 | /* | ||
| 538 | * Each pass through the following loop makes one check for | ||
| 539 | * newly arrived callbacks, and, if there are some, waits for | ||
| 540 | * one RCU-tasks grace period and then invokes the callbacks. | ||
| 541 | * This loop is terminated by the system going down. ;-) | ||
| 542 | */ | ||
| 543 | for (;;) { | ||
| 544 | |||
| 545 | /* Pick up any new callbacks. */ | ||
| 546 | raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); | ||
| 547 | list = rcu_tasks_cbs_head; | ||
| 548 | rcu_tasks_cbs_head = NULL; | ||
| 549 | rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; | ||
| 550 | raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); | ||
| 551 | |||
| 552 | /* If there were none, wait a bit and start over. */ | ||
| 553 | if (!list) { | ||
| 554 | wait_event_interruptible(rcu_tasks_cbs_wq, | ||
| 555 | rcu_tasks_cbs_head); | ||
| 556 | if (!rcu_tasks_cbs_head) { | ||
| 557 | WARN_ON(signal_pending(current)); | ||
| 558 | schedule_timeout_interruptible(HZ/10); | ||
| 559 | } | ||
| 560 | continue; | ||
| 561 | } | ||
| 562 | |||
| 563 | /* | ||
| 564 | * Wait for all pre-existing t->on_rq and t->nvcsw | ||
| 565 | * transitions to complete. Invoking synchronize_sched() | ||
| 566 | * suffices because all these transitions occur with | ||
| 567 | * interrupts disabled. Without this synchronize_sched(), | ||
| 568 | * a read-side critical section that started before the | ||
| 569 | * grace period might be incorrectly seen as having started | ||
| 570 | * after the grace period. | ||
| 571 | * | ||
| 572 | * This synchronize_sched() also dispenses with the | ||
| 573 | * need for a memory barrier on the first store to | ||
| 574 | * ->rcu_tasks_holdout, as it forces the store to happen | ||
| 575 | * after the beginning of the grace period. | ||
| 576 | */ | ||
| 577 | synchronize_sched(); | ||
| 578 | |||
| 579 | /* | ||
| 580 | * There were callbacks, so we need to wait for an | ||
| 581 | * RCU-tasks grace period. Start off by scanning | ||
| 582 | * the task list for tasks that are not already | ||
| 583 | * voluntarily blocked. Mark these tasks and make | ||
| 584 | * a list of them in rcu_tasks_holdouts. | ||
| 585 | */ | ||
| 586 | rcu_read_lock(); | ||
| 587 | for_each_process_thread(g, t) { | ||
| 588 | if (t != current && ACCESS_ONCE(t->on_rq) && | ||
| 589 | !is_idle_task(t)) { | ||
| 590 | get_task_struct(t); | ||
| 591 | t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw); | ||
| 592 | ACCESS_ONCE(t->rcu_tasks_holdout) = true; | ||
| 593 | list_add(&t->rcu_tasks_holdout_list, | ||
| 594 | &rcu_tasks_holdouts); | ||
| 595 | } | ||
| 596 | } | ||
| 597 | rcu_read_unlock(); | ||
| 598 | |||
| 599 | /* | ||
| 600 | * Wait for tasks that are in the process of exiting. | ||
| 601 | * This does only part of the job, ensuring that all | ||
| 602 | * tasks that were previously exiting reach the point | ||
| 603 | * where they have disabled preemption, allowing the | ||
| 604 | * later synchronize_sched() to finish the job. | ||
| 605 | */ | ||
| 606 | synchronize_srcu(&tasks_rcu_exit_srcu); | ||
| 607 | |||
| 608 | /* | ||
| 609 | * Each pass through the following loop scans the list | ||
| 610 | * of holdout tasks, removing any that are no longer | ||
| 611 | * holdouts. When the list is empty, we are done. | ||
| 612 | */ | ||
| 613 | lastreport = jiffies; | ||
| 614 | while (!list_empty(&rcu_tasks_holdouts)) { | ||
| 615 | bool firstreport; | ||
| 616 | bool needreport; | ||
| 617 | int rtst; | ||
| 618 | struct task_struct *t1; | ||
| 619 | |||
| 620 | schedule_timeout_interruptible(HZ); | ||
| 621 | rtst = ACCESS_ONCE(rcu_task_stall_timeout); | ||
| 622 | needreport = rtst > 0 && | ||
| 623 | time_after(jiffies, lastreport + rtst); | ||
| 624 | if (needreport) | ||
| 625 | lastreport = jiffies; | ||
| 626 | firstreport = true; | ||
| 627 | WARN_ON(signal_pending(current)); | ||
| 628 | list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, | ||
| 629 | rcu_tasks_holdout_list) { | ||
| 630 | check_holdout_task(t, needreport, &firstreport); | ||
| 631 | cond_resched(); | ||
| 632 | } | ||
| 633 | } | ||
| 634 | |||
| 635 | /* | ||
| 636 | * Because ->on_rq and ->nvcsw are not guaranteed | ||
| 637 | * to have full memory barriers prior to them in the | ||
| 638 | * schedule() path, memory reordering on other CPUs could | ||
| 639 | * cause their RCU-tasks read-side critical sections to | ||
| 640 | * extend past the end of the grace period. However, | ||
| 641 | * because these ->nvcsw updates are carried out with | ||
| 642 | * interrupts disabled, we can use synchronize_sched() | ||
| 643 | * to force the needed ordering on all such CPUs. | ||
| 644 | * | ||
| 645 | * This synchronize_sched() also confines all | ||
| 646 | * ->rcu_tasks_holdout accesses to be within the grace | ||
| 647 | * period, avoiding the need for memory barriers for | ||
| 648 | * ->rcu_tasks_holdout accesses. | ||
| 649 | * | ||
| 650 | * In addition, this synchronize_sched() waits for exiting | ||
| 651 | * tasks to complete their final preempt_disable() region | ||
| 652 | * of execution, cleaning up after the synchronize_srcu() | ||
| 653 | * above. | ||
| 654 | */ | ||
| 655 | synchronize_sched(); | ||
| 656 | |||
| 657 | /* Invoke the callbacks. */ | ||
| 658 | while (list) { | ||
| 659 | next = list->next; | ||
| 660 | local_bh_disable(); | ||
| 661 | list->func(list); | ||
| 662 | local_bh_enable(); | ||
| 663 | list = next; | ||
| 664 | cond_resched(); | ||
| 665 | } | ||
| 666 | schedule_timeout_uninterruptible(HZ/10); | ||
| 667 | } | ||
| 668 | } | ||
| 669 | |||
| 670 | /* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */ | ||
| 671 | static void rcu_spawn_tasks_kthread(void) | ||
| 672 | { | ||
| 673 | static DEFINE_MUTEX(rcu_tasks_kthread_mutex); | ||
| 674 | static struct task_struct *rcu_tasks_kthread_ptr; | ||
| 675 | struct task_struct *t; | ||
| 676 | |||
| 677 | if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) { | ||
| 678 | smp_mb(); /* Ensure caller sees full kthread. */ | ||
| 679 | return; | ||
| 680 | } | ||
| 681 | mutex_lock(&rcu_tasks_kthread_mutex); | ||
| 682 | if (rcu_tasks_kthread_ptr) { | ||
| 683 | mutex_unlock(&rcu_tasks_kthread_mutex); | ||
| 684 | return; | ||
| 685 | } | ||
| 686 | t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); | ||
| 687 | BUG_ON(IS_ERR(t)); | ||
| 688 | smp_mb(); /* Ensure others see full kthread. */ | ||
| 689 | ACCESS_ONCE(rcu_tasks_kthread_ptr) = t; | ||
| 690 | mutex_unlock(&rcu_tasks_kthread_mutex); | ||
| 691 | } | ||
| 692 | |||
| 693 | #endif /* #ifdef CONFIG_TASKS_RCU */ | ||
| 694 | |||
| 695 | #ifdef CONFIG_PROVE_RCU | ||
| 696 | |||
| 697 | /* | ||
| 698 | * Early boot self-test parameters, one for each RCU flavor. | ||
| 699 | */ | ||
| 700 | static bool rcu_self_test; | ||
| 701 | static bool rcu_self_test_bh; | ||
| 702 | static bool rcu_self_test_sched; | ||
| 703 | |||
| 704 | module_param(rcu_self_test, bool, 0444); | ||
| 705 | module_param(rcu_self_test_bh, bool, 0444); | ||
| 706 | module_param(rcu_self_test_sched, bool, 0444); | ||
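These are read-only (0444) built-in parameters, so they are meant to be set on the kernel command line; assuming the usual "rcupdate." parameter prefix for this file, for example:

	rcupdate.rcu_self_test=1 rcupdate.rcu_self_test_bh=1 rcupdate.rcu_self_test_sched=1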
| 707 | |||
| 708 | static int rcu_self_test_counter; | ||
| 709 | |||
| 710 | static void test_callback(struct rcu_head *r) | ||
| 711 | { | ||
| 712 | rcu_self_test_counter++; | ||
| 713 | pr_info("RCU test callback executed %d\n", rcu_self_test_counter); | ||
| 714 | } | ||
| 715 | |||
| 716 | static void early_boot_test_call_rcu(void) | ||
| 717 | { | ||
| 718 | static struct rcu_head head; | ||
| 719 | |||
| 720 | call_rcu(&head, test_callback); | ||
| 721 | } | ||
| 722 | |||
| 723 | static void early_boot_test_call_rcu_bh(void) | ||
| 724 | { | ||
| 725 | static struct rcu_head head; | ||
| 726 | |||
| 727 | call_rcu_bh(&head, test_callback); | ||
| 728 | } | ||
| 729 | |||
| 730 | static void early_boot_test_call_rcu_sched(void) | ||
| 731 | { | ||
| 732 | static struct rcu_head head; | ||
| 733 | |||
| 734 | call_rcu_sched(&head, test_callback); | ||
| 735 | } | ||
| 736 | |||
| 737 | void rcu_early_boot_tests(void) | ||
| 738 | { | ||
| 739 | pr_info("Running RCU self tests\n"); | ||
| 740 | |||
| 741 | if (rcu_self_test) | ||
| 742 | early_boot_test_call_rcu(); | ||
| 743 | if (rcu_self_test_bh) | ||
| 744 | early_boot_test_call_rcu_bh(); | ||
| 745 | if (rcu_self_test_sched) | ||
| 746 | early_boot_test_call_rcu_sched(); | ||
| 747 | } | ||
| 748 | |||
| 749 | static int rcu_verify_early_boot_tests(void) | ||
| 750 | { | ||
| 751 | int ret = 0; | ||
| 752 | int early_boot_test_counter = 0; | ||
| 753 | |||
| 754 | if (rcu_self_test) { | ||
| 755 | early_boot_test_counter++; | ||
| 756 | rcu_barrier(); | ||
| 757 | } | ||
| 758 | if (rcu_self_test_bh) { | ||
| 759 | early_boot_test_counter++; | ||
| 760 | rcu_barrier_bh(); | ||
| 761 | } | ||
| 762 | if (rcu_self_test_sched) { | ||
| 763 | early_boot_test_counter++; | ||
| 764 | rcu_barrier_sched(); | ||
| 765 | } | ||
| 766 | |||
| 767 | if (rcu_self_test_counter != early_boot_test_counter) { | ||
| 768 | WARN_ON(1); | ||
| 769 | ret = -1; | ||
| 770 | } | ||
| 771 | |||
| 772 | return ret; | ||
| 773 | } | ||
| 774 | late_initcall(rcu_verify_early_boot_tests); | ||
| 775 | #else | ||
| 776 | void rcu_early_boot_tests(void) {} | ||
| 777 | #endif /* CONFIG_PROVE_RCU */ | ||
diff --git a/kernel/reboot.c b/kernel/reboot.c index a3a9e240fcdb..5925f5ae8dff 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
| @@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb) | |||
| 104 | } | 104 | } |
| 105 | EXPORT_SYMBOL(unregister_reboot_notifier); | 105 | EXPORT_SYMBOL(unregister_reboot_notifier); |
| 106 | 106 | ||
| 107 | /* | ||
| 108 | * Notifier list for kernel code which wants to be called | ||
| 109 | * to restart the system. | ||
| 110 | */ | ||
| 111 | static ATOMIC_NOTIFIER_HEAD(restart_handler_list); | ||
| 112 | |||
| 113 | /** | ||
| 114 | * register_restart_handler - Register function to be called to reset | ||
| 115 | * the system | ||
| 116 | * @nb: Info about handler function to be called | ||
| 117 | * @nb->priority: Handler priority. Handlers should follow the | ||
| 118 | * following guidelines for setting priorities. | ||
| 119 | * 0: Restart handler of last resort, | ||
| 120 | * with limited restart capabilities | ||
| 121 | * 128: Default restart handler; use if no other | ||
| 122 | * restart handler is expected to be available, | ||
| 123 | * and/or if restart functionality is | ||
| 124 | * sufficient to restart the entire system | ||
| 125 | * 255: Highest priority restart handler, will | ||
| 126 | * preempt all other restart handlers | ||
| 127 | * | ||
| 128 | * Registers a function with code to be called to restart the | ||
| 129 | * system. | ||
| 130 | * | ||
| 131 | * Registered functions will be called from machine_restart as the last | ||
| 132 | * step of the restart sequence (if the architecture-specific | ||
| 133 | * machine_restart function calls do_kernel_restart - see below | ||
| 134 | * for details). | ||
| 135 | * Registered functions are expected to restart the system immediately. | ||
| 136 | * If more than one function is registered, the restart handler priority | ||
| 137 | * selects which function will be called first. | ||
| 138 | * | ||
| 139 | * Restart handlers are expected to be registered from non-architecture | ||
| 140 | * code, typically from drivers. A typical use case would be a system | ||
| 141 | * where restart functionality is provided through a watchdog. Multiple | ||
| 142 | * restart handlers may exist; for example, one restart handler might | ||
| 143 | * restart the entire system, while another only restarts the CPU. | ||
| 144 | * In such cases, the restart handler which only restarts part of the | ||
| 145 | * hardware is expected to register with low priority to ensure that | ||
| 146 | * it only runs if no other means to restart the system is available. | ||
| 147 | * | ||
| 148 | * Currently always returns zero, as atomic_notifier_chain_register() | ||
| 149 | * always returns zero. | ||
| 150 | */ | ||
| 151 | int register_restart_handler(struct notifier_block *nb) | ||
| 152 | { | ||
| 153 | return atomic_notifier_chain_register(&restart_handler_list, nb); | ||
| 154 | } | ||
| 155 | EXPORT_SYMBOL(register_restart_handler); | ||
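Usage sketch for register_restart_handler() (the watchdog driver, register names, and base pointer are hypothetical):

	static int my_wdt_restart(struct notifier_block *nb, unsigned long mode,
				  void *cmd)
	{
		/* Kick the (hypothetical) watchdog hardware to force a reset. */
		writel(MY_WDT_FORCE_RESET, my_wdt_base + MY_WDT_CTRL);
		return NOTIFY_DONE;
	}

	static struct notifier_block my_wdt_restart_nb = {
		.notifier_call	= my_wdt_restart,
		.priority	= 128,	/* default: can restart the whole system */
	};

		/* In the driver's probe(): */
		register_restart_handler(&my_wdt_restart_nb);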
| 156 | |||
| 157 | /** | ||
| 158 | * unregister_restart_handler - Unregister previously registered | ||
| 159 | * restart handler | ||
| 160 | * @nb: Hook to be unregistered | ||
| 161 | * | ||
| 162 | * Unregisters a previously registered restart handler function. | ||
| 163 | * | ||
| 164 | * Returns zero on success, or %-ENOENT on failure. | ||
| 165 | */ | ||
| 166 | int unregister_restart_handler(struct notifier_block *nb) | ||
| 167 | { | ||
| 168 | return atomic_notifier_chain_unregister(&restart_handler_list, nb); | ||
| 169 | } | ||
| 170 | EXPORT_SYMBOL(unregister_restart_handler); | ||
| 171 | |||
| 172 | /** | ||
| 173 | * do_kernel_restart - Execute kernel restart handler call chain | ||
| 174 | * | ||
| 175 | * Calls functions registered with register_restart_handler. | ||
| 176 | * | ||
| 177 | * Expected to be called from machine_restart as the last step of the restart | ||
| 178 | * sequence. | ||
| 179 | * | ||
| 180 | * Restarts the system immediately if a restart handler function has been | ||
| 181 | * registered. Otherwise does nothing. | ||
| 182 | */ | ||
| 183 | void do_kernel_restart(char *cmd) | ||
| 184 | { | ||
| 185 | atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd); | ||
| 186 | } | ||
| 187 | |||
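do_kernel_restart() is meant to be invoked from the architecture's machine_restart(); a rough sketch, with the arch-specific fallback left as a placeholder:

	void machine_restart(char *cmd)
	{
		/* Let handlers registered via register_restart_handler() try first. */
		do_kernel_restart(cmd);

		/* Fall back to whatever reset mechanism the architecture provides. */
		arch_reset_fallback();		/* placeholder */
	}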
| 107 | void migrate_to_reboot_cpu(void) | 188 | void migrate_to_reboot_cpu(void) |
| 108 | { | 189 | { |
| 109 | /* The boot cpu is always logical cpu 0 */ | 190 | /* The boot cpu is always logical cpu 0 */ |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c deleted file mode 100644 index e791130f85a7..000000000000 --- a/kernel/res_counter.c +++ /dev/null | |||
| @@ -1,211 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * resource cgroups | ||
| 3 | * | ||
| 4 | * Copyright 2007 OpenVZ SWsoft Inc | ||
| 5 | * | ||
| 6 | * Author: Pavel Emelianov <xemul@openvz.org> | ||
| 7 | * | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/types.h> | ||
| 11 | #include <linux/parser.h> | ||
| 12 | #include <linux/fs.h> | ||
| 13 | #include <linux/res_counter.h> | ||
| 14 | #include <linux/uaccess.h> | ||
| 15 | #include <linux/mm.h> | ||
| 16 | |||
| 17 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) | ||
| 18 | { | ||
| 19 | spin_lock_init(&counter->lock); | ||
| 20 | counter->limit = RES_COUNTER_MAX; | ||
| 21 | counter->soft_limit = RES_COUNTER_MAX; | ||
| 22 | counter->parent = parent; | ||
| 23 | } | ||
| 24 | |||
| 25 | static u64 res_counter_uncharge_locked(struct res_counter *counter, | ||
| 26 | unsigned long val) | ||
| 27 | { | ||
| 28 | if (WARN_ON(counter->usage < val)) | ||
| 29 | val = counter->usage; | ||
| 30 | |||
| 31 | counter->usage -= val; | ||
| 32 | return counter->usage; | ||
| 33 | } | ||
| 34 | |||
| 35 | static int res_counter_charge_locked(struct res_counter *counter, | ||
| 36 | unsigned long val, bool force) | ||
| 37 | { | ||
| 38 | int ret = 0; | ||
| 39 | |||
| 40 | if (counter->usage + val > counter->limit) { | ||
| 41 | counter->failcnt++; | ||
| 42 | ret = -ENOMEM; | ||
| 43 | if (!force) | ||
| 44 | return ret; | ||
| 45 | } | ||
| 46 | |||
| 47 | counter->usage += val; | ||
| 48 | if (counter->usage > counter->max_usage) | ||
| 49 | counter->max_usage = counter->usage; | ||
| 50 | return ret; | ||
| 51 | } | ||
| 52 | |||
| 53 | static int __res_counter_charge(struct res_counter *counter, unsigned long val, | ||
| 54 | struct res_counter **limit_fail_at, bool force) | ||
| 55 | { | ||
| 56 | int ret, r; | ||
| 57 | unsigned long flags; | ||
| 58 | struct res_counter *c, *u; | ||
| 59 | |||
| 60 | r = ret = 0; | ||
| 61 | *limit_fail_at = NULL; | ||
| 62 | local_irq_save(flags); | ||
| 63 | for (c = counter; c != NULL; c = c->parent) { | ||
| 64 | spin_lock(&c->lock); | ||
| 65 | r = res_counter_charge_locked(c, val, force); | ||
| 66 | spin_unlock(&c->lock); | ||
| 67 | if (r < 0 && !ret) { | ||
| 68 | ret = r; | ||
| 69 | *limit_fail_at = c; | ||
| 70 | if (!force) | ||
| 71 | break; | ||
| 72 | } | ||
| 73 | } | ||
| 74 | |||
| 75 | if (ret < 0 && !force) { | ||
| 76 | for (u = counter; u != c; u = u->parent) { | ||
| 77 | spin_lock(&u->lock); | ||
| 78 | res_counter_uncharge_locked(u, val); | ||
| 79 | spin_unlock(&u->lock); | ||
| 80 | } | ||
| 81 | } | ||
| 82 | local_irq_restore(flags); | ||
| 83 | |||
| 84 | return ret; | ||
| 85 | } | ||
| 86 | |||
| 87 | int res_counter_charge(struct res_counter *counter, unsigned long val, | ||
| 88 | struct res_counter **limit_fail_at) | ||
| 89 | { | ||
| 90 | return __res_counter_charge(counter, val, limit_fail_at, false); | ||
| 91 | } | ||
| 92 | |||
| 93 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | ||
| 94 | struct res_counter **limit_fail_at) | ||
| 95 | { | ||
| 96 | return __res_counter_charge(counter, val, limit_fail_at, true); | ||
| 97 | } | ||
| 98 | |||
| 99 | u64 res_counter_uncharge_until(struct res_counter *counter, | ||
| 100 | struct res_counter *top, | ||
| 101 | unsigned long val) | ||
| 102 | { | ||
| 103 | unsigned long flags; | ||
| 104 | struct res_counter *c; | ||
| 105 | u64 ret = 0; | ||
| 106 | |||
| 107 | local_irq_save(flags); | ||
| 108 | for (c = counter; c != top; c = c->parent) { | ||
| 109 | u64 r; | ||
| 110 | spin_lock(&c->lock); | ||
| 111 | r = res_counter_uncharge_locked(c, val); | ||
| 112 | if (c == counter) | ||
| 113 | ret = r; | ||
| 114 | spin_unlock(&c->lock); | ||
| 115 | } | ||
| 116 | local_irq_restore(flags); | ||
| 117 | return ret; | ||
| 118 | } | ||
| 119 | |||
| 120 | u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) | ||
| 121 | { | ||
| 122 | return res_counter_uncharge_until(counter, NULL, val); | ||
| 123 | } | ||
| 124 | |||
| 125 | static inline unsigned long long * | ||
| 126 | res_counter_member(struct res_counter *counter, int member) | ||
| 127 | { | ||
| 128 | switch (member) { | ||
| 129 | case RES_USAGE: | ||
| 130 | return &counter->usage; | ||
| 131 | case RES_MAX_USAGE: | ||
| 132 | return &counter->max_usage; | ||
| 133 | case RES_LIMIT: | ||
| 134 | return &counter->limit; | ||
| 135 | case RES_FAILCNT: | ||
| 136 | return &counter->failcnt; | ||
| 137 | case RES_SOFT_LIMIT: | ||
| 138 | return &counter->soft_limit; | ||
| 139 | }; | ||
| 140 | |||
| 141 | BUG(); | ||
| 142 | return NULL; | ||
| 143 | } | ||
| 144 | |||
| 145 | ssize_t res_counter_read(struct res_counter *counter, int member, | ||
| 146 | const char __user *userbuf, size_t nbytes, loff_t *pos, | ||
| 147 | int (*read_strategy)(unsigned long long val, char *st_buf)) | ||
| 148 | { | ||
| 149 | unsigned long long *val; | ||
| 150 | char buf[64], *s; | ||
| 151 | |||
| 152 | s = buf; | ||
| 153 | val = res_counter_member(counter, member); | ||
| 154 | if (read_strategy) | ||
| 155 | s += read_strategy(*val, s); | ||
| 156 | else | ||
| 157 | s += sprintf(s, "%llu\n", *val); | ||
| 158 | return simple_read_from_buffer((void __user *)userbuf, nbytes, | ||
| 159 | pos, buf, s - buf); | ||
| 160 | } | ||
| 161 | |||
| 162 | #if BITS_PER_LONG == 32 | ||
| 163 | u64 res_counter_read_u64(struct res_counter *counter, int member) | ||
| 164 | { | ||
| 165 | unsigned long flags; | ||
| 166 | u64 ret; | ||
| 167 | |||
| 168 | spin_lock_irqsave(&counter->lock, flags); | ||
| 169 | ret = *res_counter_member(counter, member); | ||
| 170 | spin_unlock_irqrestore(&counter->lock, flags); | ||
| 171 | |||
| 172 | return ret; | ||
| 173 | } | ||
| 174 | #else | ||
| 175 | u64 res_counter_read_u64(struct res_counter *counter, int member) | ||
| 176 | { | ||
| 177 | return *res_counter_member(counter, member); | ||
| 178 | } | ||
| 179 | #endif | ||
| 180 | |||
| 181 | int res_counter_memparse_write_strategy(const char *buf, | ||
| 182 | unsigned long long *resp) | ||
| 183 | { | ||
| 184 | char *end; | ||
| 185 | unsigned long long res; | ||
| 186 | |||
| 187 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ | ||
| 188 | if (*buf == '-') { | ||
| 189 | int rc = kstrtoull(buf + 1, 10, &res); | ||
| 190 | |||
| 191 | if (rc) | ||
| 192 | return rc; | ||
| 193 | if (res != 1) | ||
| 194 | return -EINVAL; | ||
| 195 | *resp = RES_COUNTER_MAX; | ||
| 196 | return 0; | ||
| 197 | } | ||
| 198 | |||
| 199 | res = memparse(buf, &end); | ||
| 200 | if (*end != '\0') | ||
| 201 | return -EINVAL; | ||
| 202 | |||
| 203 | if (PAGE_ALIGN(res) >= res) | ||
| 204 | res = PAGE_ALIGN(res); | ||
| 205 | else | ||
| 206 | res = RES_COUNTER_MAX; | ||
| 207 | |||
| 208 | *resp = res; | ||
| 209 | |||
| 210 | return 0; | ||
| 211 | } | ||
diff --git a/kernel/resource.c b/kernel/resource.c index 60c5a3856ab7..0bcebffc4e77 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -491,6 +491,42 @@ int __weak page_is_ram(unsigned long pfn) | |||
| 491 | } | 491 | } |
| 492 | EXPORT_SYMBOL_GPL(page_is_ram); | 492 | EXPORT_SYMBOL_GPL(page_is_ram); |
| 493 | 493 | ||
| 494 | /* | ||
| 495 | * Search for a resource entry that fully contains the specified region. | ||
| 496 | * If found, return 1 if it is RAM, 0 if not. | ||
| 497 | * If not found, or if the region is not fully contained, return -1. | ||
| 498 | * | ||
| 499 | * Used by the ioremap functions to ensure the user is not remapping RAM; it | ||
| 500 | * is a vast speed-up over walking through the resource table page by page. | ||
| 501 | */ | ||
| 502 | int region_is_ram(resource_size_t start, unsigned long size) | ||
| 503 | { | ||
| 504 | struct resource *p; | ||
| 505 | resource_size_t end = start + size - 1; | ||
| 506 | int flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
| 507 | const char *name = "System RAM"; | ||
| 508 | int ret = -1; | ||
| 509 | |||
| 510 | read_lock(&resource_lock); | ||
| 511 | for (p = iomem_resource.child; p ; p = p->sibling) { | ||
| 512 | if (end < p->start) | ||
| 513 | continue; | ||
| 514 | |||
| 515 | if (p->start <= start && end <= p->end) { | ||
| 516 | /* resource fully contains region */ | ||
| 517 | if ((p->flags != flags) || strcmp(p->name, name)) | ||
| 518 | ret = 0; | ||
| 519 | else | ||
| 520 | ret = 1; | ||
| 521 | break; | ||
| 522 | } | ||
| 523 | if (p->end < start) | ||
| 524 | break; /* not found */ | ||
| 525 | } | ||
| 526 | read_unlock(&resource_lock); | ||
| 527 | return ret; | ||
| 528 | } | ||
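Usage sketch for region_is_ram() on an ioremap path (illustrative; only the handling of the tri-state return value is the point here):

	/* Refuse to remap anything the resource tree knows to be RAM. */
	if (region_is_ram(phys_addr, size) == 1) {
		WARN_ONCE(1, "ioremap on RAM at %pa\n", &phys_addr);
		return NULL;
	}
	/* 0 or -1 means "not known to be RAM": fall through to the slow path. */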
| 529 | |||
| 494 | void __weak arch_remove_reservations(struct resource *avail) | 530 | void __weak arch_remove_reservations(struct resource *avail) |
| 495 | { | 531 | { |
| 496 | } | 532 | } |
| @@ -1245,6 +1281,76 @@ int release_mem_region_adjustable(struct resource *parent, | |||
| 1245 | /* | 1281 | /* |
| 1246 | * Managed region resource | 1282 | * Managed region resource |
| 1247 | */ | 1283 | */ |
| 1284 | static void devm_resource_release(struct device *dev, void *ptr) | ||
| 1285 | { | ||
| 1286 | struct resource **r = ptr; | ||
| 1287 | |||
| 1288 | release_resource(*r); | ||
| 1289 | } | ||
| 1290 | |||
| 1291 | /** | ||
| 1292 | * devm_request_resource() - request and reserve an I/O or memory resource | ||
| 1293 | * @dev: device for which to request the resource | ||
| 1294 | * @root: root of the resource tree from which to request the resource | ||
| 1295 | * @new: descriptor of the resource to request | ||
| 1296 | * | ||
| 1297 | * This is a device-managed version of request_resource(). There is usually | ||
| 1298 | * no need to release resources requested by this function explicitly since | ||
| 1299 | * that will be taken care of when the device is unbound from its driver. | ||
| 1300 | * If for some reason the resource needs to be released explicitly, because | ||
| 1301 | * of ordering issues for example, drivers must call devm_release_resource() | ||
| 1302 | * rather than the regular release_resource(). | ||
| 1303 | * | ||
| 1304 | * When a conflict is detected between any existing resources and the newly | ||
| 1305 | * requested resource, an error message will be printed. | ||
| 1306 | * | ||
| 1307 | * Returns 0 on success or a negative error code on failure. | ||
| 1308 | */ | ||
| 1309 | int devm_request_resource(struct device *dev, struct resource *root, | ||
| 1310 | struct resource *new) | ||
| 1311 | { | ||
| 1312 | struct resource *conflict, **ptr; | ||
| 1313 | |||
| 1314 | ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL); | ||
| 1315 | if (!ptr) | ||
| 1316 | return -ENOMEM; | ||
| 1317 | |||
| 1318 | *ptr = new; | ||
| 1319 | |||
| 1320 | conflict = request_resource_conflict(root, new); | ||
| 1321 | if (conflict) { | ||
| 1322 | dev_err(dev, "resource collision: %pR conflicts with %s %pR\n", | ||
| 1323 | new, conflict->name, conflict); | ||
| 1324 | devres_free(ptr); | ||
| 1325 | return -EBUSY; | ||
| 1326 | } | ||
| 1327 | |||
| 1328 | devres_add(dev, ptr); | ||
| 1329 | return 0; | ||
| 1330 | } | ||
| 1331 | EXPORT_SYMBOL(devm_request_resource); | ||
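Usage sketch for devm_request_resource() in a driver probe (device, region, and name are hypothetical):

	static int my_probe(struct platform_device *pdev)
	{
		/* Hypothetical fixed legacy I/O region this device needs reserved. */
		static struct resource my_region = {
			.start	= 0x3f8,
			.end	= 0x3ff,
			.flags	= IORESOURCE_IO,
			.name	= "my-legacy-port",
		};
		int ret;

		ret = devm_request_resource(&pdev->dev, &ioport_resource, &my_region);
		if (ret)
			return ret;	/* conflict was already logged by the core */

		/* Released automatically when the device is unbound. */
		return 0;
	}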
| 1332 | |||
| 1333 | static int devm_resource_match(struct device *dev, void *res, void *data) | ||
| 1334 | { | ||
| 1335 | struct resource **ptr = res; | ||
| 1336 | |||
| 1337 | return *ptr == data; | ||
| 1338 | } | ||
| 1339 | |||
| 1340 | /** | ||
| 1341 | * devm_release_resource() - release a previously requested resource | ||
| 1342 | * @dev: device for which to release the resource | ||
| 1343 | * @new: descriptor of the resource to release | ||
| 1344 | * | ||
| 1345 | * Releases a resource previously requested using devm_request_resource(). | ||
| 1346 | */ | ||
| 1347 | void devm_release_resource(struct device *dev, struct resource *new) | ||
| 1348 | { | ||
| 1349 | WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match, | ||
| 1350 | new)); | ||
| 1351 | } | ||
| 1352 | EXPORT_SYMBOL(devm_release_resource); | ||
| 1353 | |||
| 1248 | struct region_devres { | 1354 | struct region_devres { |
| 1249 | struct resource *parent; | 1355 | struct resource *parent; |
| 1250 | resource_size_t start; | 1356 | resource_size_t start; |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e73efba98301..8a2e230fb86a 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c | |||
| @@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
| 148 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) | 148 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) |
| 149 | goto out; | 149 | goto out; |
| 150 | 150 | ||
| 151 | t = p; | 151 | for_each_thread(p, t) |
| 152 | do { | ||
| 153 | sched_move_task(t); | 152 | sched_move_task(t); |
| 154 | } while_each_thread(p, t); | ||
| 155 | |||
| 156 | out: | 153 | out: |
| 157 | unlock_task_sighand(p, &flags); | 154 | unlock_task_sighand(p, &flags); |
| 158 | autogroup_kref_put(prev); | 155 | autogroup_kref_put(prev); |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 3ef6451e972e..c27e4f8f4879 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
| @@ -134,7 +134,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); | |||
| 134 | 134 | ||
| 135 | static inline struct sched_clock_data *this_scd(void) | 135 | static inline struct sched_clock_data *this_scd(void) |
| 136 | { | 136 | { |
| 137 | return &__get_cpu_var(sched_clock_data); | 137 | return this_cpu_ptr(&sched_clock_data); |
| 138 | } | 138 | } |
| 139 | 139 | ||
| 140 | static inline struct sched_clock_data *cpu_sdc(int cpu) | 140 | static inline struct sched_clock_data *cpu_sdc(int cpu) |
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a63f4dc27909..607f852b4d04 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c | |||
| @@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout); | |||
| 148 | * | 148 | * |
| 149 | * This waits to be signaled for completion of a specific task. It is NOT | 149 | * This waits to be signaled for completion of a specific task. It is NOT |
| 150 | * interruptible and there is no timeout. The caller is accounted as waiting | 150 | * interruptible and there is no timeout. The caller is accounted as waiting |
| 151 | * for IO. | 151 | * for IO (which traditionally means blkio only). |
| 152 | */ | 152 | */ |
| 153 | void __sched wait_for_completion_io(struct completion *x) | 153 | void __sched wait_for_completion_io(struct completion *x) |
| 154 | { | 154 | { |
| @@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io); | |||
| 163 | * | 163 | * |
| 164 | * This waits for either a completion of a specific task to be signaled or for a | 164 | * This waits for either a completion of a specific task to be signaled or for a |
| 165 | * specified timeout to expire. The timeout is in jiffies. It is not | 165 | * specified timeout to expire. The timeout is in jiffies. It is not |
| 166 | * interruptible. The caller is accounted as waiting for IO. | 166 | * interruptible. The caller is accounted as waiting for IO (which traditionally |
| 167 | * means blkio only). | ||
| 167 | * | 168 | * |
| 168 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | 169 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left |
| 169 | * till timeout) if completed. | 170 | * till timeout) if completed. |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec1a286684a5..c0accc00566e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -90,22 +90,6 @@ | |||
| 90 | #define CREATE_TRACE_POINTS | 90 | #define CREATE_TRACE_POINTS |
| 91 | #include <trace/events/sched.h> | 91 | #include <trace/events/sched.h> |
| 92 | 92 | ||
| 93 | #ifdef smp_mb__before_atomic | ||
| 94 | void __smp_mb__before_atomic(void) | ||
| 95 | { | ||
| 96 | smp_mb__before_atomic(); | ||
| 97 | } | ||
| 98 | EXPORT_SYMBOL(__smp_mb__before_atomic); | ||
| 99 | #endif | ||
| 100 | |||
| 101 | #ifdef smp_mb__after_atomic | ||
| 102 | void __smp_mb__after_atomic(void) | ||
| 103 | { | ||
| 104 | smp_mb__after_atomic(); | ||
| 105 | } | ||
| 106 | EXPORT_SYMBOL(__smp_mb__after_atomic); | ||
| 107 | #endif | ||
| 108 | |||
| 109 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) | 93 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
| 110 | { | 94 | { |
| 111 | unsigned long delta; | 95 | unsigned long delta; |
| @@ -333,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
| 333 | for (;;) { | 317 | for (;;) { |
| 334 | rq = task_rq(p); | 318 | rq = task_rq(p); |
| 335 | raw_spin_lock(&rq->lock); | 319 | raw_spin_lock(&rq->lock); |
| 336 | if (likely(rq == task_rq(p))) | 320 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) |
| 337 | return rq; | 321 | return rq; |
| 338 | raw_spin_unlock(&rq->lock); | 322 | raw_spin_unlock(&rq->lock); |
| 323 | |||
| 324 | while (unlikely(task_on_rq_migrating(p))) | ||
| 325 | cpu_relax(); | ||
| 339 | } | 326 | } |
| 340 | } | 327 | } |
| 341 | 328 | ||
| @@ -352,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
| 352 | raw_spin_lock_irqsave(&p->pi_lock, *flags); | 339 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
| 353 | rq = task_rq(p); | 340 | rq = task_rq(p); |
| 354 | raw_spin_lock(&rq->lock); | 341 | raw_spin_lock(&rq->lock); |
| 355 | if (likely(rq == task_rq(p))) | 342 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) |
| 356 | return rq; | 343 | return rq; |
| 357 | raw_spin_unlock(&rq->lock); | 344 | raw_spin_unlock(&rq->lock); |
| 358 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | 345 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); |
| 346 | |||
| 347 | while (unlikely(task_on_rq_migrating(p))) | ||
| 348 | cpu_relax(); | ||
| 359 | } | 349 | } |
| 360 | } | 350 | } |
| 361 | 351 | ||
| @@ -449,7 +439,15 @@ static void __hrtick_start(void *arg) | |||
| 449 | void hrtick_start(struct rq *rq, u64 delay) | 439 | void hrtick_start(struct rq *rq, u64 delay) |
| 450 | { | 440 | { |
| 451 | struct hrtimer *timer = &rq->hrtick_timer; | 441 | struct hrtimer *timer = &rq->hrtick_timer; |
| 452 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); | 442 | ktime_t time; |
| 443 | s64 delta; | ||
| 444 | |||
| 445 | /* | ||
| 446 | * Don't schedule slices shorter than 10000ns, that just | ||
| 447 | * doesn't make sense and can cause timer DoS. | ||
| 448 | */ | ||
| 449 | delta = max_t(s64, delay, 10000LL); | ||
| 450 | time = ktime_add_ns(timer->base->get_time(), delta); | ||
| 453 | 451 | ||
| 454 | hrtimer_set_expires(timer, time); | 452 | hrtimer_set_expires(timer, time); |
| 455 | 453 | ||
| @@ -1010,6 +1008,9 @@ inline int task_curr(const struct task_struct *p) | |||
| 1010 | return cpu_curr(task_cpu(p)) == p; | 1008 | return cpu_curr(task_cpu(p)) == p; |
| 1011 | } | 1009 | } |
| 1012 | 1010 | ||
| 1011 | /* | ||
| 1012 | * Can drop rq->lock because sched_class::switched_from() methods may drop it. | ||
| 1013 | */ | ||
| 1013 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 1014 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
| 1014 | const struct sched_class *prev_class, | 1015 | const struct sched_class *prev_class, |
| 1015 | int oldprio) | 1016 | int oldprio) |
| @@ -1017,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
| 1017 | if (prev_class != p->sched_class) { | 1018 | if (prev_class != p->sched_class) { |
| 1018 | if (prev_class->switched_from) | 1019 | if (prev_class->switched_from) |
| 1019 | prev_class->switched_from(rq, p); | 1020 | prev_class->switched_from(rq, p); |
| 1021 | /* Possible rq->lock 'hole'. */ | ||
| 1020 | p->sched_class->switched_to(rq, p); | 1022 | p->sched_class->switched_to(rq, p); |
| 1021 | } else if (oldprio != p->prio || dl_task(p)) | 1023 | } else if (oldprio != p->prio || dl_task(p)) |
| 1022 | p->sched_class->prio_changed(rq, p, oldprio); | 1024 | p->sched_class->prio_changed(rq, p, oldprio); |
| @@ -1043,7 +1045,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 1043 | * A queue event has occurred, and we're going to schedule. In | 1045 | * A queue event has occurred, and we're going to schedule. In |
| 1044 | * this case, we can save a useless back to back clock update. | 1046 | * this case, we can save a useless back to back clock update. |
| 1045 | */ | 1047 | */ |
| 1046 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) | 1048 | if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) |
| 1047 | rq->skip_clock_update = 1; | 1049 | rq->skip_clock_update = 1; |
| 1048 | } | 1050 | } |
| 1049 | 1051 | ||
| @@ -1056,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1056 | * ttwu() will sort out the placement. | 1058 | * ttwu() will sort out the placement. |
| 1057 | */ | 1059 | */ |
| 1058 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 1060 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
| 1059 | !(task_preempt_count(p) & PREEMPT_ACTIVE)); | 1061 | !p->on_rq); |
| 1060 | 1062 | ||
| 1061 | #ifdef CONFIG_LOCKDEP | 1063 | #ifdef CONFIG_LOCKDEP |
| 1062 | /* | 1064 | /* |
| @@ -1088,7 +1090,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1088 | 1090 | ||
| 1089 | static void __migrate_swap_task(struct task_struct *p, int cpu) | 1091 | static void __migrate_swap_task(struct task_struct *p, int cpu) |
| 1090 | { | 1092 | { |
| 1091 | if (p->on_rq) { | 1093 | if (task_on_rq_queued(p)) { |
| 1092 | struct rq *src_rq, *dst_rq; | 1094 | struct rq *src_rq, *dst_rq; |
| 1093 | 1095 | ||
| 1094 | src_rq = task_rq(p); | 1096 | src_rq = task_rq(p); |
| @@ -1214,7 +1216,7 @@ static int migration_cpu_stop(void *data); | |||
| 1214 | unsigned long wait_task_inactive(struct task_struct *p, long match_state) | 1216 | unsigned long wait_task_inactive(struct task_struct *p, long match_state) |
| 1215 | { | 1217 | { |
| 1216 | unsigned long flags; | 1218 | unsigned long flags; |
| 1217 | int running, on_rq; | 1219 | int running, queued; |
| 1218 | unsigned long ncsw; | 1220 | unsigned long ncsw; |
| 1219 | struct rq *rq; | 1221 | struct rq *rq; |
| 1220 | 1222 | ||
| @@ -1252,7 +1254,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
| 1252 | rq = task_rq_lock(p, &flags); | 1254 | rq = task_rq_lock(p, &flags); |
| 1253 | trace_sched_wait_task(p); | 1255 | trace_sched_wait_task(p); |
| 1254 | running = task_running(rq, p); | 1256 | running = task_running(rq, p); |
| 1255 | on_rq = p->on_rq; | 1257 | queued = task_on_rq_queued(p); |
| 1256 | ncsw = 0; | 1258 | ncsw = 0; |
| 1257 | if (!match_state || p->state == match_state) | 1259 | if (!match_state || p->state == match_state) |
| 1258 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 1260 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
| @@ -1284,7 +1286,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
| 1284 | * running right now), it's preempted, and we should | 1286 | * running right now), it's preempted, and we should |
| 1285 | * yield - it could be a while. | 1287 | * yield - it could be a while. |
| 1286 | */ | 1288 | */ |
| 1287 | if (unlikely(on_rq)) { | 1289 | if (unlikely(queued)) { |
| 1288 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); | 1290 | ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); |
| 1289 | 1291 | ||
| 1290 | set_current_state(TASK_UNINTERRUPTIBLE); | 1292 | set_current_state(TASK_UNINTERRUPTIBLE); |
| @@ -1409,7 +1411,8 @@ out: | |||
| 1409 | static inline | 1411 | static inline |
| 1410 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) | 1412 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) |
| 1411 | { | 1413 | { |
| 1412 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); | 1414 | if (p->nr_cpus_allowed > 1) |
| 1415 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); | ||
| 1413 | 1416 | ||
| 1414 | /* | 1417 | /* |
| 1415 | * In order not to call set_task_cpu() on a blocking task we need | 1418 | * In order not to call set_task_cpu() on a blocking task we need |
| @@ -1478,7 +1481,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | |||
| 1478 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | 1481 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
| 1479 | { | 1482 | { |
| 1480 | activate_task(rq, p, en_flags); | 1483 | activate_task(rq, p, en_flags); |
| 1481 | p->on_rq = 1; | 1484 | p->on_rq = TASK_ON_RQ_QUEUED; |
| 1482 | 1485 | ||
| 1483 | /* if a worker is waking up, notify workqueue */ | 1486 | /* if a worker is waking up, notify workqueue */ |
| 1484 | if (p->flags & PF_WQ_WORKER) | 1487 | if (p->flags & PF_WQ_WORKER) |
| @@ -1537,7 +1540,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
| 1537 | int ret = 0; | 1540 | int ret = 0; |
| 1538 | 1541 | ||
| 1539 | rq = __task_rq_lock(p); | 1542 | rq = __task_rq_lock(p); |
| 1540 | if (p->on_rq) { | 1543 | if (task_on_rq_queued(p)) { |
| 1541 | /* check_preempt_curr() may use rq clock */ | 1544 | /* check_preempt_curr() may use rq clock */ |
| 1542 | update_rq_clock(rq); | 1545 | update_rq_clock(rq); |
| 1543 | ttwu_do_wakeup(rq, p, wake_flags); | 1546 | ttwu_do_wakeup(rq, p, wake_flags); |
| @@ -1620,6 +1623,30 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) | |||
| 1620 | } | 1623 | } |
| 1621 | } | 1624 | } |
| 1622 | 1625 | ||
| 1626 | void wake_up_if_idle(int cpu) | ||
| 1627 | { | ||
| 1628 | struct rq *rq = cpu_rq(cpu); | ||
| 1629 | unsigned long flags; | ||
| 1630 | |||
| 1631 | rcu_read_lock(); | ||
| 1632 | |||
| 1633 | if (!is_idle_task(rcu_dereference(rq->curr))) | ||
| 1634 | goto out; | ||
| 1635 | |||
| 1636 | if (set_nr_if_polling(rq->idle)) { | ||
| 1637 | trace_sched_wake_idle_without_ipi(cpu); | ||
| 1638 | } else { | ||
| 1639 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 1640 | if (is_idle_task(rq->curr)) | ||
| 1641 | smp_send_reschedule(cpu); | ||
| 1642 | /* Else cpu is not in idle, do nothing here */ | ||
| 1643 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | out: | ||
| 1647 | rcu_read_unlock(); | ||
| 1648 | } | ||
| 1649 | |||
| 1623 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1650 | bool cpus_share_cache(int this_cpu, int that_cpu) |
| 1624 | { | 1651 | { |
| 1625 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | 1652 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); |
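wake_up_if_idle() avoids the IPI when it can: if the idle task advertises that it is polling on need_resched, set_nr_if_polling() flags it and the reschedule interrupt is skipped; only otherwise does it recheck, under rq->lock, that the CPU is still idle before calling smp_send_reschedule(). A hedged sketch of a caller sweeping all idle CPUs; the loop helper below is written for illustration and is not claimed to be the in-tree consumer of this export:

/* Illustrative only: nudge every other online CPU that is sitting idle. */
static void kick_all_idle_cpus(void)
{
        int cpu;

        preempt_disable();
        for_each_online_cpu(cpu) {
                if (cpu == smp_processor_id())
                        continue;
                wake_up_if_idle(cpu);
        }
        preempt_enable();
}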
| @@ -1742,7 +1769,7 @@ static void try_to_wake_up_local(struct task_struct *p) | |||
| 1742 | if (!(p->state & TASK_NORMAL)) | 1769 | if (!(p->state & TASK_NORMAL)) |
| 1743 | goto out; | 1770 | goto out; |
| 1744 | 1771 | ||
| 1745 | if (!p->on_rq) | 1772 | if (!task_on_rq_queued(p)) |
| 1746 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 1773 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
| 1747 | 1774 | ||
| 1748 | ttwu_do_wakeup(rq, p, 0); | 1775 | ttwu_do_wakeup(rq, p, 0); |
| @@ -1776,6 +1803,20 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
| 1776 | } | 1803 | } |
| 1777 | 1804 | ||
| 1778 | /* | 1805 | /* |
| 1806 | * This function clears the sched_dl_entity static params. | ||
| 1807 | */ | ||
| 1808 | void __dl_clear_params(struct task_struct *p) | ||
| 1809 | { | ||
| 1810 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 1811 | |||
| 1812 | dl_se->dl_runtime = 0; | ||
| 1813 | dl_se->dl_deadline = 0; | ||
| 1814 | dl_se->dl_period = 0; | ||
| 1815 | dl_se->flags = 0; | ||
| 1816 | dl_se->dl_bw = 0; | ||
| 1817 | } | ||
| 1818 | |||
| 1819 | /* | ||
| 1779 | * Perform scheduler related setup for a newly forked process p. | 1820 | * Perform scheduler related setup for a newly forked process p. |
| 1780 | * p is forked by current. | 1821 | * p is forked by current. |
| 1781 | * | 1822 | * |
| @@ -1799,10 +1840,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1799 | 1840 | ||
| 1800 | RB_CLEAR_NODE(&p->dl.rb_node); | 1841 | RB_CLEAR_NODE(&p->dl.rb_node); |
| 1801 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1842 | hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 1802 | p->dl.dl_runtime = p->dl.runtime = 0; | 1843 | __dl_clear_params(p); |
| 1803 | p->dl.dl_deadline = p->dl.deadline = 0; | ||
| 1804 | p->dl.dl_period = 0; | ||
| 1805 | p->dl.flags = 0; | ||
| 1806 | 1844 | ||
| 1807 | INIT_LIST_HEAD(&p->rt.run_list); | 1845 | INIT_LIST_HEAD(&p->rt.run_list); |
| 1808 | 1846 | ||
| @@ -1825,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1825 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | 1863 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; |
| 1826 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | 1864 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; |
| 1827 | p->numa_work.next = &p->numa_work; | 1865 | p->numa_work.next = &p->numa_work; |
| 1828 | p->numa_faults_memory = NULL; | 1866 | p->numa_faults = NULL; |
| 1829 | p->numa_faults_buffer_memory = NULL; | ||
| 1830 | p->last_task_numa_placement = 0; | 1867 | p->last_task_numa_placement = 0; |
| 1831 | p->last_sum_exec_runtime = 0; | 1868 | p->last_sum_exec_runtime = 0; |
| 1832 | 1869 | ||
| 1833 | INIT_LIST_HEAD(&p->numa_entry); | ||
| 1834 | p->numa_group = NULL; | 1870 | p->numa_group = NULL; |
| 1835 | #endif /* CONFIG_NUMA_BALANCING */ | 1871 | #endif /* CONFIG_NUMA_BALANCING */ |
| 1836 | } | 1872 | } |
| @@ -1977,6 +2013,8 @@ unsigned long to_ratio(u64 period, u64 runtime) | |||
| 1977 | #ifdef CONFIG_SMP | 2013 | #ifdef CONFIG_SMP |
| 1978 | inline struct dl_bw *dl_bw_of(int i) | 2014 | inline struct dl_bw *dl_bw_of(int i) |
| 1979 | { | 2015 | { |
| 2016 | rcu_lockdep_assert(rcu_read_lock_sched_held(), | ||
| 2017 | "sched RCU must be held"); | ||
| 1980 | return &cpu_rq(i)->rd->dl_bw; | 2018 | return &cpu_rq(i)->rd->dl_bw; |
| 1981 | } | 2019 | } |
| 1982 | 2020 | ||
| @@ -1985,6 +2023,8 @@ static inline int dl_bw_cpus(int i) | |||
| 1985 | struct root_domain *rd = cpu_rq(i)->rd; | 2023 | struct root_domain *rd = cpu_rq(i)->rd; |
| 1986 | int cpus = 0; | 2024 | int cpus = 0; |
| 1987 | 2025 | ||
| 2026 | rcu_lockdep_assert(rcu_read_lock_sched_held(), | ||
| 2027 | "sched RCU must be held"); | ||
| 1988 | for_each_cpu_and(i, rd->span, cpu_active_mask) | 2028 | for_each_cpu_and(i, rd->span, cpu_active_mask) |
| 1989 | cpus++; | 2029 | cpus++; |
| 1990 | 2030 | ||
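The rcu_lockdep_assert() calls added here document that dl_bw_of() and dl_bw_cpus() dereference the root_domain, which is protected by sched-RCU. The calling convention they enforce, condensed from the task_can_attach() and sched_cpu_inactive() hunks later in this diff (a fragment, not a standalone compilable unit):

rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);

raw_spin_lock_irqsave(&dl_b->lock, flags);
cpus = dl_bw_cpus(cpu);
/* ... read or update dl_b->total_bw ... */
raw_spin_unlock_irqrestore(&dl_b->lock, flags);

rcu_read_unlock_sched();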
| @@ -2002,25 +2042,6 @@ static inline int dl_bw_cpus(int i) | |||
| 2002 | } | 2042 | } |
| 2003 | #endif | 2043 | #endif |
| 2004 | 2044 | ||
| 2005 | static inline | ||
| 2006 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
| 2007 | { | ||
| 2008 | dl_b->total_bw -= tsk_bw; | ||
| 2009 | } | ||
| 2010 | |||
| 2011 | static inline | ||
| 2012 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
| 2013 | { | ||
| 2014 | dl_b->total_bw += tsk_bw; | ||
| 2015 | } | ||
| 2016 | |||
| 2017 | static inline | ||
| 2018 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
| 2019 | { | ||
| 2020 | return dl_b->bw != -1 && | ||
| 2021 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
| 2022 | } | ||
| 2023 | |||
| 2024 | /* | 2045 | /* |
| 2025 | * We must be sure that accepting a new task (or allowing changing the | 2046 | * We must be sure that accepting a new task (or allowing changing the |
| 2026 | * parameters of an existing one) is consistent with the bandwidth | 2047 | * parameters of an existing one) is consistent with the bandwidth |
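These three helpers leave core.c, presumably for a shared scheduler header, since task_can_attach() further down in this diff still calls __dl_overflow() and __dl_add(). The admission test itself is unchanged: a reservation is refused when bw * cpus would be smaller than total_bw - old_bw + new_bw. A runnable user-space model with illustrative numbers (the kernel stores bandwidth in a fixed-point format; plain units are used here only to keep the arithmetic readable):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct dl_bw { int64_t bw; uint64_t total_bw; };

static bool dl_overflow(const struct dl_bw *dl_b, int cpus,
                        uint64_t old_bw, uint64_t new_bw)
{
        return dl_b->bw != -1 &&
               (uint64_t)dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
        /* 95 units usable per CPU, 2 CPUs, 150 units already reserved. */
        struct dl_bw b = { .bw = 95, .total_bw = 150 };

        printf("request 30: overflow=%d\n", dl_overflow(&b, 2, 0, 30)); /* 180 <= 190: admitted */
        printf("request 60: overflow=%d\n", dl_overflow(&b, 2, 0, 60)); /* 210 >  190: rejected */
        return 0;
}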
| @@ -2095,7 +2116,7 @@ void wake_up_new_task(struct task_struct *p) | |||
| 2095 | init_task_runnable_average(p); | 2116 | init_task_runnable_average(p); |
| 2096 | rq = __task_rq_lock(p); | 2117 | rq = __task_rq_lock(p); |
| 2097 | activate_task(rq, p, 0); | 2118 | activate_task(rq, p, 0); |
| 2098 | p->on_rq = 1; | 2119 | p->on_rq = TASK_ON_RQ_QUEUED; |
| 2099 | trace_sched_wakeup_new(p, true); | 2120 | trace_sched_wakeup_new(p, true); |
| 2100 | check_preempt_curr(rq, p, WF_FORK); | 2121 | check_preempt_curr(rq, p, WF_FORK); |
| 2101 | #ifdef CONFIG_SMP | 2122 | #ifdef CONFIG_SMP |
| @@ -2188,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
| 2188 | 2209 | ||
| 2189 | /** | 2210 | /** |
| 2190 | * finish_task_switch - clean up after a task-switch | 2211 | * finish_task_switch - clean up after a task-switch |
| 2191 | * @rq: runqueue associated with task-switch | ||
| 2192 | * @prev: the thread we just switched away from. | 2212 | * @prev: the thread we just switched away from. |
| 2193 | * | 2213 | * |
| 2194 | * finish_task_switch must be called after the context switch, paired | 2214 | * finish_task_switch must be called after the context switch, paired |
| @@ -2200,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
| 2200 | * so, we finish that here outside of the runqueue lock. (Doing it | 2220 | * so, we finish that here outside of the runqueue lock. (Doing it |
| 2201 | * with the lock held can cause deadlocks; see schedule() for | 2221 | * with the lock held can cause deadlocks; see schedule() for |
| 2202 | * details.) | 2222 | * details.) |
| 2223 | * | ||
| 2224 | * The context switch has flipped the stack from under us and restored the | ||
| 2225 | * local variables which were saved when this task called schedule() in the | ||
| 2226 | * past. prev == current is still correct but we need to recalculate this_rq | ||
| 2227 | * because prev may have moved to another CPU. | ||
| 2203 | */ | 2228 | */ |
| 2204 | static void finish_task_switch(struct rq *rq, struct task_struct *prev) | 2229 | static struct rq *finish_task_switch(struct task_struct *prev) |
| 2205 | __releases(rq->lock) | 2230 | __releases(rq->lock) |
| 2206 | { | 2231 | { |
| 2232 | struct rq *rq = this_rq(); | ||
| 2207 | struct mm_struct *mm = rq->prev_mm; | 2233 | struct mm_struct *mm = rq->prev_mm; |
| 2208 | long prev_state; | 2234 | long prev_state; |
| 2209 | 2235 | ||
| @@ -2243,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 2243 | } | 2269 | } |
| 2244 | 2270 | ||
| 2245 | tick_nohz_task_switch(current); | 2271 | tick_nohz_task_switch(current); |
| 2272 | return rq; | ||
| 2246 | } | 2273 | } |
| 2247 | 2274 | ||
| 2248 | #ifdef CONFIG_SMP | 2275 | #ifdef CONFIG_SMP |
| @@ -2277,29 +2304,22 @@ static inline void post_schedule(struct rq *rq) | |||
| 2277 | asmlinkage __visible void schedule_tail(struct task_struct *prev) | 2304 | asmlinkage __visible void schedule_tail(struct task_struct *prev) |
| 2278 | __releases(rq->lock) | 2305 | __releases(rq->lock) |
| 2279 | { | 2306 | { |
| 2280 | struct rq *rq = this_rq(); | 2307 | struct rq *rq; |
| 2281 | |||
| 2282 | finish_task_switch(rq, prev); | ||
| 2283 | 2308 | ||
| 2284 | /* | 2309 | /* finish_task_switch() drops rq->lock and enables preemption */ |
| 2285 | * FIXME: do we need to worry about rq being invalidated by the | 2310 | preempt_disable(); |
| 2286 | * task_switch? | 2311 | rq = finish_task_switch(prev); |
| 2287 | */ | ||
| 2288 | post_schedule(rq); | 2312 | post_schedule(rq); |
| 2289 | |||
| 2290 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 2291 | /* In this case, finish_task_switch does not reenable preemption */ | ||
| 2292 | preempt_enable(); | 2313 | preempt_enable(); |
| 2293 | #endif | 2314 | |
| 2294 | if (current->set_child_tid) | 2315 | if (current->set_child_tid) |
| 2295 | put_user(task_pid_vnr(current), current->set_child_tid); | 2316 | put_user(task_pid_vnr(current), current->set_child_tid); |
| 2296 | } | 2317 | } |
| 2297 | 2318 | ||
| 2298 | /* | 2319 | /* |
| 2299 | * context_switch - switch to the new MM and the new | 2320 | * context_switch - switch to the new MM and the new thread's register state. |
| 2300 | * thread's register state. | ||
| 2301 | */ | 2321 | */ |
| 2302 | static inline void | 2322 | static inline struct rq * |
| 2303 | context_switch(struct rq *rq, struct task_struct *prev, | 2323 | context_switch(struct rq *rq, struct task_struct *prev, |
| 2304 | struct task_struct *next) | 2324 | struct task_struct *next) |
| 2305 | { | 2325 | { |
| @@ -2333,21 +2353,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2333 | * of the scheduler it's an obvious special-case), so we | 2353 | * of the scheduler it's an obvious special-case), so we |
| 2334 | * do an early lockdep release here: | 2354 | * do an early lockdep release here: |
| 2335 | */ | 2355 | */ |
| 2336 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 2337 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 2356 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
| 2338 | #endif | ||
| 2339 | 2357 | ||
| 2340 | context_tracking_task_switch(prev, next); | 2358 | context_tracking_task_switch(prev, next); |
| 2341 | /* Here we just switch the register state and the stack. */ | 2359 | /* Here we just switch the register state and the stack. */ |
| 2342 | switch_to(prev, next, prev); | 2360 | switch_to(prev, next, prev); |
| 2343 | |||
| 2344 | barrier(); | 2361 | barrier(); |
| 2345 | /* | 2362 | |
| 2346 | * this_rq must be evaluated again because prev may have moved | 2363 | return finish_task_switch(prev); |
| 2347 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
| 2348 | * frame will be invalid. | ||
| 2349 | */ | ||
| 2350 | finish_task_switch(this_rq(), prev); | ||
| 2351 | } | 2364 | } |
| 2352 | 2365 | ||
| 2353 | /* | 2366 | /* |
| @@ -2366,6 +2379,18 @@ unsigned long nr_running(void) | |||
| 2366 | return sum; | 2379 | return sum; |
| 2367 | } | 2380 | } |
| 2368 | 2381 | ||
| 2382 | /* | ||
| 2383 | * Check if only the current task is running on the cpu. | ||
| 2384 | */ | ||
| 2385 | bool single_task_running(void) | ||
| 2386 | { | ||
| 2387 | if (cpu_rq(smp_processor_id())->nr_running == 1) | ||
| 2388 | return true; | ||
| 2389 | else | ||
| 2390 | return false; | ||
| 2391 | } | ||
| 2392 | EXPORT_SYMBOL(single_task_running); | ||
| 2393 | |||
| 2369 | unsigned long long nr_context_switches(void) | 2394 | unsigned long long nr_context_switches(void) |
| 2370 | { | 2395 | { |
| 2371 | int i; | 2396 | int i; |
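single_task_running() reads the current CPU's nr_running, so the answer is only meaningful while the caller cannot migrate, and smp_processor_id() will complain in preemptible context on debug kernels anyway. A hedged usage sketch:

bool others_runnable;

preempt_disable();
others_runnable = !single_task_running();   /* more runnable work on this CPU? */
preempt_enable();

/* Advisory only: the answer can be stale as soon as preemption is re-enabled. */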
| @@ -2437,44 +2462,6 @@ EXPORT_PER_CPU_SYMBOL(kstat); | |||
| 2437 | EXPORT_PER_CPU_SYMBOL(kernel_cpustat); | 2462 | EXPORT_PER_CPU_SYMBOL(kernel_cpustat); |
| 2438 | 2463 | ||
| 2439 | /* | 2464 | /* |
| 2440 | * Return any ns on the sched_clock that have not yet been accounted in | ||
| 2441 | * @p in case that task is currently running. | ||
| 2442 | * | ||
| 2443 | * Called with task_rq_lock() held on @rq. | ||
| 2444 | */ | ||
| 2445 | static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | ||
| 2446 | { | ||
| 2447 | u64 ns = 0; | ||
| 2448 | |||
| 2449 | /* | ||
| 2450 | * Must be ->curr _and_ ->on_rq. If dequeued, we would | ||
| 2451 | * project cycles that may never be accounted to this | ||
| 2452 | * thread, breaking clock_gettime(). | ||
| 2453 | */ | ||
| 2454 | if (task_current(rq, p) && p->on_rq) { | ||
| 2455 | update_rq_clock(rq); | ||
| 2456 | ns = rq_clock_task(rq) - p->se.exec_start; | ||
| 2457 | if ((s64)ns < 0) | ||
| 2458 | ns = 0; | ||
| 2459 | } | ||
| 2460 | |||
| 2461 | return ns; | ||
| 2462 | } | ||
| 2463 | |||
| 2464 | unsigned long long task_delta_exec(struct task_struct *p) | ||
| 2465 | { | ||
| 2466 | unsigned long flags; | ||
| 2467 | struct rq *rq; | ||
| 2468 | u64 ns = 0; | ||
| 2469 | |||
| 2470 | rq = task_rq_lock(p, &flags); | ||
| 2471 | ns = do_task_delta_exec(p, rq); | ||
| 2472 | task_rq_unlock(rq, p, &flags); | ||
| 2473 | |||
| 2474 | return ns; | ||
| 2475 | } | ||
| 2476 | |||
| 2477 | /* | ||
| 2478 | * Return accounted runtime for the task. | 2465 | * Return accounted runtime for the task. |
| 2479 | * In case the task is currently running, return the runtime plus current's | 2466 | * In case the task is currently running, return the runtime plus current's |
| 2480 | * pending runtime that have not been accounted yet. | 2467 | * pending runtime that have not been accounted yet. |
| @@ -2483,7 +2470,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
| 2483 | { | 2470 | { |
| 2484 | unsigned long flags; | 2471 | unsigned long flags; |
| 2485 | struct rq *rq; | 2472 | struct rq *rq; |
| 2486 | u64 ns = 0; | 2473 | u64 ns; |
| 2487 | 2474 | ||
| 2488 | #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) | 2475 | #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) |
| 2489 | /* | 2476 | /* |
| @@ -2497,12 +2484,21 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
| 2497 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has | 2484 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has |
| 2498 | * been accounted, so we're correct here as well. | 2485 | * been accounted, so we're correct here as well. |
| 2499 | */ | 2486 | */ |
| 2500 | if (!p->on_cpu || !p->on_rq) | 2487 | if (!p->on_cpu || !task_on_rq_queued(p)) |
| 2501 | return p->se.sum_exec_runtime; | 2488 | return p->se.sum_exec_runtime; |
| 2502 | #endif | 2489 | #endif |
| 2503 | 2490 | ||
| 2504 | rq = task_rq_lock(p, &flags); | 2491 | rq = task_rq_lock(p, &flags); |
| 2505 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); | 2492 | /* |
| 2493 | * Must be ->curr _and_ ->on_rq. If dequeued, we would | ||
| 2494 | * project cycles that may never be accounted to this | ||
| 2495 | * thread, breaking clock_gettime(). | ||
| 2496 | */ | ||
| 2497 | if (task_current(rq, p) && task_on_rq_queued(p)) { | ||
| 2498 | update_rq_clock(rq); | ||
| 2499 | p->sched_class->update_curr(rq); | ||
| 2500 | } | ||
| 2501 | ns = p->se.sum_exec_runtime; | ||
| 2506 | task_rq_unlock(rq, p, &flags); | 2502 | task_rq_unlock(rq, p, &flags); |
| 2507 | 2503 | ||
| 2508 | return ns; | 2504 | return ns; |
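Instead of open-coding the pending-runtime delta, task_sched_runtime() now asks the task's scheduling class to fold any not-yet-accounted time into p->se.sum_exec_runtime through the new ->update_curr() hook. Roughly what such a hook has to do, loosely modelled on the CFS accounting path; the signature and the omission of vruntime/cgroup bookkeeping make this an assumption-laden sketch, not the in-tree function:

static void update_curr_sketch(struct rq *rq, struct sched_entity *curr)
{
        u64 now = rq_clock_task(rq);
        s64 delta_exec = now - curr->exec_start;

        if (delta_exec <= 0)
                return;

        curr->exec_start = now;
        curr->sum_exec_runtime += delta_exec;
}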
| @@ -2660,6 +2656,9 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
| 2660 | */ | 2656 | */ |
| 2661 | static inline void schedule_debug(struct task_struct *prev) | 2657 | static inline void schedule_debug(struct task_struct *prev) |
| 2662 | { | 2658 | { |
| 2659 | #ifdef CONFIG_SCHED_STACK_END_CHECK | ||
| 2660 | BUG_ON(unlikely(task_stack_end_corrupted(prev))); | ||
| 2661 | #endif | ||
| 2663 | /* | 2662 | /* |
| 2664 | * Test if we are atomic. Since do_exit() needs to call into | 2663 | * Test if we are atomic. Since do_exit() needs to call into |
| 2665 | * schedule() atomically, we ignore that path. Otherwise whine | 2664 | * schedule() atomically, we ignore that path. Otherwise whine |
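CONFIG_SCHED_STACK_END_CHECK makes every pass through schedule() verify a canary word at the far end of the task's kernel stack and BUG() if it has been overwritten. A small user-space model of the idea; the magic value and helper naming are assumptions taken from my reading of the kernel headers:

#include <stdio.h>

#define STACK_END_MAGIC 0x57AC6E9DUL   /* assumed canary value */

struct fake_stack {
        unsigned long end_magic;       /* lowest word of the stack area */
        unsigned long data[64];
};

static int stack_end_corrupted(const struct fake_stack *s)
{
        return s->end_magic != STACK_END_MAGIC;
}

int main(void)
{
        struct fake_stack s = { .end_magic = STACK_END_MAGIC };

        printf("before overflow: corrupted=%d\n", stack_end_corrupted(&s));
        s.end_magic = 0;               /* simulate a stack overrun clobbering the canary */
        printf("after overflow:  corrupted=%d\n", stack_end_corrupted(&s));
        return 0;
}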
| @@ -2761,7 +2760,7 @@ need_resched: | |||
| 2761 | preempt_disable(); | 2760 | preempt_disable(); |
| 2762 | cpu = smp_processor_id(); | 2761 | cpu = smp_processor_id(); |
| 2763 | rq = cpu_rq(cpu); | 2762 | rq = cpu_rq(cpu); |
| 2764 | rcu_note_context_switch(cpu); | 2763 | rcu_note_context_switch(); |
| 2765 | prev = rq->curr; | 2764 | prev = rq->curr; |
| 2766 | 2765 | ||
| 2767 | schedule_debug(prev); | 2766 | schedule_debug(prev); |
| @@ -2801,7 +2800,7 @@ need_resched: | |||
| 2801 | switch_count = &prev->nvcsw; | 2800 | switch_count = &prev->nvcsw; |
| 2802 | } | 2801 | } |
| 2803 | 2802 | ||
| 2804 | if (prev->on_rq || rq->skip_clock_update < 0) | 2803 | if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) |
| 2805 | update_rq_clock(rq); | 2804 | update_rq_clock(rq); |
| 2806 | 2805 | ||
| 2807 | next = pick_next_task(rq, prev); | 2806 | next = pick_next_task(rq, prev); |
| @@ -2814,15 +2813,8 @@ need_resched: | |||
| 2814 | rq->curr = next; | 2813 | rq->curr = next; |
| 2815 | ++*switch_count; | 2814 | ++*switch_count; |
| 2816 | 2815 | ||
| 2817 | context_switch(rq, prev, next); /* unlocks the rq */ | 2816 | rq = context_switch(rq, prev, next); /* unlocks the rq */ |
| 2818 | /* | 2817 | cpu = cpu_of(rq); |
| 2819 | * The context switch have flipped the stack from under us | ||
| 2820 | * and restored the local variables which were saved when | ||
| 2821 | * this task called schedule() in the past. prev == current | ||
| 2822 | * is still correct, but it can be moved to another cpu/rq. | ||
| 2823 | */ | ||
| 2824 | cpu = smp_processor_id(); | ||
| 2825 | rq = cpu_rq(cpu); | ||
| 2826 | } else | 2818 | } else |
| 2827 | raw_spin_unlock_irq(&rq->lock); | 2819 | raw_spin_unlock_irq(&rq->lock); |
| 2828 | 2820 | ||
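With finish_task_switch() computing this_rq() itself and returning it, neither caller has to guess which runqueue it is on after the stack switch. The resulting call pattern, condensed from the schedule_tail() and __schedule() hunks above (a fragment, not compilable on its own):

/* schedule_tail() */
preempt_disable();                   /* finish_task_switch() drops rq->lock */
rq = finish_task_switch(prev);
post_schedule(rq);
preempt_enable();

/* __schedule() */
rq = context_switch(rq, prev, next); /* unlocks the rq, returns this_rq() */
cpu = cpu_of(rq);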
| @@ -2862,10 +2854,14 @@ asmlinkage __visible void __sched schedule_user(void) | |||
| 2862 | * or we have been woken up remotely but the IPI has not yet arrived, | 2854 | * or we have been woken up remotely but the IPI has not yet arrived, |
| 2863 | * we haven't yet exited the RCU idle mode. Do it here manually until | 2855 | * we haven't yet exited the RCU idle mode. Do it here manually until |
| 2864 | * we find a better solution. | 2856 | * we find a better solution. |
| 2857 | * | ||
| 2858 | * NB: There are buggy callers of this function. Ideally we | ||
| 2859 | * should warn if prev_state != IN_USER, but that will trigger | ||
| 2860 | * too frequently to make sense yet. | ||
| 2865 | */ | 2861 | */ |
| 2866 | user_exit(); | 2862 | enum ctx_state prev_state = exception_enter(); |
| 2867 | schedule(); | 2863 | schedule(); |
| 2868 | user_enter(); | 2864 | exception_exit(prev_state); |
| 2869 | } | 2865 | } |
| 2870 | #endif | 2866 | #endif |
| 2871 | 2867 | ||
| @@ -2910,6 +2906,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) | |||
| 2910 | } | 2906 | } |
| 2911 | NOKPROBE_SYMBOL(preempt_schedule); | 2907 | NOKPROBE_SYMBOL(preempt_schedule); |
| 2912 | EXPORT_SYMBOL(preempt_schedule); | 2908 | EXPORT_SYMBOL(preempt_schedule); |
| 2909 | |||
| 2910 | #ifdef CONFIG_CONTEXT_TRACKING | ||
| 2911 | /** | ||
| 2912 | * preempt_schedule_context - preempt_schedule called by tracing | ||
| 2913 | * | ||
| 2914 | * The tracing infrastructure uses preempt_enable_notrace to prevent | ||
| 2915 | * recursion and tracing preempt enabling caused by the tracing | ||
| 2916 | * infrastructure itself. But as tracing can happen in areas coming | ||
| 2917 | * from userspace or just about to enter userspace, a preempt enable | ||
| 2918 | * can occur before user_exit() is called. This will cause the scheduler | ||
| 2919 | * to be called when the system is still in usermode. | ||
| 2920 | * | ||
| 2921 | * To prevent this, the preempt_enable_notrace will use this function | ||
| 2922 | * instead of preempt_schedule() to exit user context if needed before | ||
| 2923 | * calling the scheduler. | ||
| 2924 | */ | ||
| 2925 | asmlinkage __visible void __sched notrace preempt_schedule_context(void) | ||
| 2926 | { | ||
| 2927 | enum ctx_state prev_ctx; | ||
| 2928 | |||
| 2929 | if (likely(!preemptible())) | ||
| 2930 | return; | ||
| 2931 | |||
| 2932 | do { | ||
| 2933 | __preempt_count_add(PREEMPT_ACTIVE); | ||
| 2934 | /* | ||
| 2935 | * Needs preempt disabled in case user_exit() is traced | ||
| 2936 | * and the tracer calls preempt_enable_notrace() causing | ||
| 2937 | * an infinite recursion. | ||
| 2938 | */ | ||
| 2939 | prev_ctx = exception_enter(); | ||
| 2940 | __schedule(); | ||
| 2941 | exception_exit(prev_ctx); | ||
| 2942 | |||
| 2943 | __preempt_count_sub(PREEMPT_ACTIVE); | ||
| 2944 | barrier(); | ||
| 2945 | } while (need_resched()); | ||
| 2946 | } | ||
| 2947 | EXPORT_SYMBOL_GPL(preempt_schedule_context); | ||
| 2948 | #endif /* CONFIG_CONTEXT_TRACKING */ | ||
| 2949 | |||
| 2913 | #endif /* CONFIG_PREEMPT */ | 2950 | #endif /* CONFIG_PREEMPT */ |
| 2914 | 2951 | ||
| 2915 | /* | 2952 | /* |
| @@ -2966,7 +3003,7 @@ EXPORT_SYMBOL(default_wake_function); | |||
| 2966 | */ | 3003 | */ |
| 2967 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3004 | void rt_mutex_setprio(struct task_struct *p, int prio) |
| 2968 | { | 3005 | { |
| 2969 | int oldprio, on_rq, running, enqueue_flag = 0; | 3006 | int oldprio, queued, running, enqueue_flag = 0; |
| 2970 | struct rq *rq; | 3007 | struct rq *rq; |
| 2971 | const struct sched_class *prev_class; | 3008 | const struct sched_class *prev_class; |
| 2972 | 3009 | ||
| @@ -2995,12 +3032,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 2995 | trace_sched_pi_setprio(p, prio); | 3032 | trace_sched_pi_setprio(p, prio); |
| 2996 | oldprio = p->prio; | 3033 | oldprio = p->prio; |
| 2997 | prev_class = p->sched_class; | 3034 | prev_class = p->sched_class; |
| 2998 | on_rq = p->on_rq; | 3035 | queued = task_on_rq_queued(p); |
| 2999 | running = task_current(rq, p); | 3036 | running = task_current(rq, p); |
| 3000 | if (on_rq) | 3037 | if (queued) |
| 3001 | dequeue_task(rq, p, 0); | 3038 | dequeue_task(rq, p, 0); |
| 3002 | if (running) | 3039 | if (running) |
| 3003 | p->sched_class->put_prev_task(rq, p); | 3040 | put_prev_task(rq, p); |
| 3004 | 3041 | ||
| 3005 | /* | 3042 | /* |
| 3006 | * Boosting conditions are: | 3043 |
| @@ -3037,7 +3074,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3037 | 3074 | ||
| 3038 | if (running) | 3075 | if (running) |
| 3039 | p->sched_class->set_curr_task(rq); | 3076 | p->sched_class->set_curr_task(rq); |
| 3040 | if (on_rq) | 3077 | if (queued) |
| 3041 | enqueue_task(rq, p, enqueue_flag); | 3078 | enqueue_task(rq, p, enqueue_flag); |
| 3042 | 3079 | ||
| 3043 | check_class_changed(rq, p, prev_class, oldprio); | 3080 | check_class_changed(rq, p, prev_class, oldprio); |
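This hunk (and the later __sched_setscheduler(), sched_setnuma() and sched_move_task() hunks) replaces the direct p->sched_class->put_prev_task(rq, p) call with the put_prev_task() wrapper. As far as I can tell the wrapper is just that indirection spelled once in the scheduler's private header, roughly:

static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
        prev->sched_class->put_prev_task(rq, prev);
}

so the conversion is about using a single spelling, not about changing behaviour.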
| @@ -3048,7 +3085,7 @@ out_unlock: | |||
| 3048 | 3085 | ||
| 3049 | void set_user_nice(struct task_struct *p, long nice) | 3086 | void set_user_nice(struct task_struct *p, long nice) |
| 3050 | { | 3087 | { |
| 3051 | int old_prio, delta, on_rq; | 3088 | int old_prio, delta, queued; |
| 3052 | unsigned long flags; | 3089 | unsigned long flags; |
| 3053 | struct rq *rq; | 3090 | struct rq *rq; |
| 3054 | 3091 | ||
| @@ -3069,8 +3106,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 3069 | p->static_prio = NICE_TO_PRIO(nice); | 3106 | p->static_prio = NICE_TO_PRIO(nice); |
| 3070 | goto out_unlock; | 3107 | goto out_unlock; |
| 3071 | } | 3108 | } |
| 3072 | on_rq = p->on_rq; | 3109 | queued = task_on_rq_queued(p); |
| 3073 | if (on_rq) | 3110 | if (queued) |
| 3074 | dequeue_task(rq, p, 0); | 3111 | dequeue_task(rq, p, 0); |
| 3075 | 3112 | ||
| 3076 | p->static_prio = NICE_TO_PRIO(nice); | 3113 | p->static_prio = NICE_TO_PRIO(nice); |
| @@ -3079,7 +3116,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 3079 | p->prio = effective_prio(p); | 3116 | p->prio = effective_prio(p); |
| 3080 | delta = p->prio - old_prio; | 3117 | delta = p->prio - old_prio; |
| 3081 | 3118 | ||
| 3082 | if (on_rq) { | 3119 | if (queued) { |
| 3083 | enqueue_task(rq, p, 0); | 3120 | enqueue_task(rq, p, 0); |
| 3084 | /* | 3121 | /* |
| 3085 | * If the task increased its priority or is running and | 3122 | * If the task increased its priority or is running and |
| @@ -3351,7 +3388,7 @@ static int __sched_setscheduler(struct task_struct *p, | |||
| 3351 | { | 3388 | { |
| 3352 | int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : | 3389 | int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : |
| 3353 | MAX_RT_PRIO - 1 - attr->sched_priority; | 3390 | MAX_RT_PRIO - 1 - attr->sched_priority; |
| 3354 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 3391 | int retval, oldprio, oldpolicy = -1, queued, running; |
| 3355 | int policy = attr->sched_policy; | 3392 | int policy = attr->sched_policy; |
| 3356 | unsigned long flags; | 3393 | unsigned long flags; |
| 3357 | const struct sched_class *prev_class; | 3394 | const struct sched_class *prev_class; |
| @@ -3548,19 +3585,19 @@ change: | |||
| 3548 | return 0; | 3585 | return 0; |
| 3549 | } | 3586 | } |
| 3550 | 3587 | ||
| 3551 | on_rq = p->on_rq; | 3588 | queued = task_on_rq_queued(p); |
| 3552 | running = task_current(rq, p); | 3589 | running = task_current(rq, p); |
| 3553 | if (on_rq) | 3590 | if (queued) |
| 3554 | dequeue_task(rq, p, 0); | 3591 | dequeue_task(rq, p, 0); |
| 3555 | if (running) | 3592 | if (running) |
| 3556 | p->sched_class->put_prev_task(rq, p); | 3593 | put_prev_task(rq, p); |
| 3557 | 3594 | ||
| 3558 | prev_class = p->sched_class; | 3595 | prev_class = p->sched_class; |
| 3559 | __setscheduler(rq, p, attr); | 3596 | __setscheduler(rq, p, attr); |
| 3560 | 3597 | ||
| 3561 | if (running) | 3598 | if (running) |
| 3562 | p->sched_class->set_curr_task(rq); | 3599 | p->sched_class->set_curr_task(rq); |
| 3563 | if (on_rq) { | 3600 | if (queued) { |
| 3564 | /* | 3601 | /* |
| 3565 | * We enqueue to tail when the priority of a task is | 3602 | * We enqueue to tail when the priority of a task is |
| 3566 | * increased (user space view). | 3603 | * increased (user space view). |
| @@ -3984,14 +4021,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 3984 | rcu_read_lock(); | 4021 | rcu_read_lock(); |
| 3985 | if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { | 4022 | if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { |
| 3986 | rcu_read_unlock(); | 4023 | rcu_read_unlock(); |
| 3987 | goto out_unlock; | 4024 | goto out_free_new_mask; |
| 3988 | } | 4025 | } |
| 3989 | rcu_read_unlock(); | 4026 | rcu_read_unlock(); |
| 3990 | } | 4027 | } |
| 3991 | 4028 | ||
| 3992 | retval = security_task_setscheduler(p); | 4029 | retval = security_task_setscheduler(p); |
| 3993 | if (retval) | 4030 | if (retval) |
| 3994 | goto out_unlock; | 4031 | goto out_free_new_mask; |
| 3995 | 4032 | ||
| 3996 | 4033 | ||
| 3997 | cpuset_cpus_allowed(p, cpus_allowed); | 4034 | cpuset_cpus_allowed(p, cpus_allowed); |
| @@ -4004,13 +4041,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 4004 | * root_domain. | 4041 | * root_domain. |
| 4005 | */ | 4042 | */ |
| 4006 | #ifdef CONFIG_SMP | 4043 | #ifdef CONFIG_SMP |
| 4007 | if (task_has_dl_policy(p)) { | 4044 | if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { |
| 4008 | const struct cpumask *span = task_rq(p)->rd->span; | 4045 | rcu_read_lock(); |
| 4009 | 4046 | if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { | |
| 4010 | if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { | ||
| 4011 | retval = -EBUSY; | 4047 | retval = -EBUSY; |
| 4012 | goto out_unlock; | 4048 | rcu_read_unlock(); |
| 4049 | goto out_free_new_mask; | ||
| 4013 | } | 4050 | } |
| 4051 | rcu_read_unlock(); | ||
| 4014 | } | 4052 | } |
| 4015 | #endif | 4053 | #endif |
| 4016 | again: | 4054 | again: |
| @@ -4028,7 +4066,7 @@ again: | |||
| 4028 | goto again; | 4066 | goto again; |
| 4029 | } | 4067 | } |
| 4030 | } | 4068 | } |
| 4031 | out_unlock: | 4069 | out_free_new_mask: |
| 4032 | free_cpumask_var(new_mask); | 4070 | free_cpumask_var(new_mask); |
| 4033 | out_free_cpus_allowed: | 4071 | out_free_cpus_allowed: |
| 4034 | free_cpumask_var(cpus_allowed); | 4072 | free_cpumask_var(cpus_allowed); |
| @@ -4489,8 +4527,10 @@ void sched_show_task(struct task_struct *p) | |||
| 4489 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4527 | #ifdef CONFIG_DEBUG_STACK_USAGE |
| 4490 | free = stack_not_used(p); | 4528 | free = stack_not_used(p); |
| 4491 | #endif | 4529 | #endif |
| 4530 | ppid = 0; | ||
| 4492 | rcu_read_lock(); | 4531 | rcu_read_lock(); |
| 4493 | ppid = task_pid_nr(rcu_dereference(p->real_parent)); | 4532 | if (pid_alive(p)) |
| 4533 | ppid = task_pid_nr(rcu_dereference(p->real_parent)); | ||
| 4494 | rcu_read_unlock(); | 4534 | rcu_read_unlock(); |
| 4495 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, | 4535 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
| 4496 | task_pid_nr(p), ppid, | 4536 | task_pid_nr(p), ppid, |
| @@ -4512,7 +4552,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 4512 | " task PC stack pid father\n"); | 4552 | " task PC stack pid father\n"); |
| 4513 | #endif | 4553 | #endif |
| 4514 | rcu_read_lock(); | 4554 | rcu_read_lock(); |
| 4515 | do_each_thread(g, p) { | 4555 | for_each_process_thread(g, p) { |
| 4516 | /* | 4556 | /* |
| 4517 | * reset the NMI-timeout, listing all files on a slow | 4557 | * reset the NMI-timeout, listing all files on a slow |
| 4518 | * console might take a lot of time: | 4558 | * console might take a lot of time: |
| @@ -4520,7 +4560,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 4520 | touch_nmi_watchdog(); | 4560 | touch_nmi_watchdog(); |
| 4521 | if (!state_filter || (p->state & state_filter)) | 4561 | if (!state_filter || (p->state & state_filter)) |
| 4522 | sched_show_task(p); | 4562 | sched_show_task(p); |
| 4523 | } while_each_thread(g, p); | 4563 | } |
| 4524 | 4564 | ||
| 4525 | touch_all_softlockup_watchdogs(); | 4565 | touch_all_softlockup_watchdogs(); |
| 4526 | 4566 | ||
| @@ -4575,7 +4615,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 4575 | rcu_read_unlock(); | 4615 | rcu_read_unlock(); |
| 4576 | 4616 | ||
| 4577 | rq->curr = rq->idle = idle; | 4617 | rq->curr = rq->idle = idle; |
| 4578 | idle->on_rq = 1; | 4618 | idle->on_rq = TASK_ON_RQ_QUEUED; |
| 4579 | #if defined(CONFIG_SMP) | 4619 | #if defined(CONFIG_SMP) |
| 4580 | idle->on_cpu = 1; | 4620 | idle->on_cpu = 1; |
| 4581 | #endif | 4621 | #endif |
| @@ -4595,7 +4635,109 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 4595 | #endif | 4635 | #endif |
| 4596 | } | 4636 | } |
| 4597 | 4637 | ||
| 4638 | int cpuset_cpumask_can_shrink(const struct cpumask *cur, | ||
| 4639 | const struct cpumask *trial) | ||
| 4640 | { | ||
| 4641 | int ret = 1, trial_cpus; | ||
| 4642 | struct dl_bw *cur_dl_b; | ||
| 4643 | unsigned long flags; | ||
| 4644 | |||
| 4645 | rcu_read_lock_sched(); | ||
| 4646 | cur_dl_b = dl_bw_of(cpumask_any(cur)); | ||
| 4647 | trial_cpus = cpumask_weight(trial); | ||
| 4648 | |||
| 4649 | raw_spin_lock_irqsave(&cur_dl_b->lock, flags); | ||
| 4650 | if (cur_dl_b->bw != -1 && | ||
| 4651 | cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) | ||
| 4652 | ret = 0; | ||
| 4653 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); | ||
| 4654 | rcu_read_unlock_sched(); | ||
| 4655 | |||
| 4656 | return ret; | ||
| 4657 | } | ||
| 4658 | |||
| 4659 | int task_can_attach(struct task_struct *p, | ||
| 4660 | const struct cpumask *cs_cpus_allowed) | ||
| 4661 | { | ||
| 4662 | int ret = 0; | ||
| 4663 | |||
| 4664 | /* | ||
| 4665 | * Kthreads which disallow setaffinity shouldn't be moved | ||
| 4666 | * to a new cpuset; we don't want to change their cpu | ||
| 4667 | * affinity and isolating such threads by their set of | ||
| 4668 | * allowed nodes is unnecessary. Thus, cpusets are not | ||
| 4669 | * applicable for such threads. This prevents checking for | ||
| 4670 | * success of set_cpus_allowed_ptr() on all attached tasks | ||
| 4671 | * before cpus_allowed may be changed. | ||
| 4672 | */ | ||
| 4673 | if (p->flags & PF_NO_SETAFFINITY) { | ||
| 4674 | ret = -EINVAL; | ||
| 4675 | goto out; | ||
| 4676 | } | ||
| 4677 | |||
| 4678 | #ifdef CONFIG_SMP | ||
| 4679 | if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, | ||
| 4680 | cs_cpus_allowed)) { | ||
| 4681 | unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, | ||
| 4682 | cs_cpus_allowed); | ||
| 4683 | struct dl_bw *dl_b; | ||
| 4684 | bool overflow; | ||
| 4685 | int cpus; | ||
| 4686 | unsigned long flags; | ||
| 4687 | |||
| 4688 | rcu_read_lock_sched(); | ||
| 4689 | dl_b = dl_bw_of(dest_cpu); | ||
| 4690 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 4691 | cpus = dl_bw_cpus(dest_cpu); | ||
| 4692 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); | ||
| 4693 | if (overflow) | ||
| 4694 | ret = -EBUSY; | ||
| 4695 | else { | ||
| 4696 | /* | ||
| 4697 | * We reserve space for this task in the destination | ||
| 4698 | * root_domain, as we can't fail after this point. | ||
| 4699 | * We will free resources in the source root_domain | ||
| 4700 | * later on (see set_cpus_allowed_dl()). | ||
| 4701 | */ | ||
| 4702 | __dl_add(dl_b, p->dl.dl_bw); | ||
| 4703 | } | ||
| 4704 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 4705 | rcu_read_unlock_sched(); | ||
| 4706 | |||
| 4707 | } | ||
| 4708 | #endif | ||
| 4709 | out: | ||
| 4710 | return ret; | ||
| 4711 | } | ||
| 4712 | |||
| 4598 | #ifdef CONFIG_SMP | 4713 | #ifdef CONFIG_SMP |
| 4714 | /* | ||
| 4715 | * move_queued_task - move a queued task to new rq. | ||
| 4716 | * | ||
| 4717 | * Returns (locked) new rq. Old rq's lock is released. | ||
| 4718 | */ | ||
| 4719 | static struct rq *move_queued_task(struct task_struct *p, int new_cpu) | ||
| 4720 | { | ||
| 4721 | struct rq *rq = task_rq(p); | ||
| 4722 | |||
| 4723 | lockdep_assert_held(&rq->lock); | ||
| 4724 | |||
| 4725 | dequeue_task(rq, p, 0); | ||
| 4726 | p->on_rq = TASK_ON_RQ_MIGRATING; | ||
| 4727 | set_task_cpu(p, new_cpu); | ||
| 4728 | raw_spin_unlock(&rq->lock); | ||
| 4729 | |||
| 4730 | rq = cpu_rq(new_cpu); | ||
| 4731 | |||
| 4732 | raw_spin_lock(&rq->lock); | ||
| 4733 | BUG_ON(task_cpu(p) != new_cpu); | ||
| 4734 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
| 4735 | enqueue_task(rq, p, 0); | ||
| 4736 | check_preempt_curr(rq, p, 0); | ||
| 4737 | |||
| 4738 | return rq; | ||
| 4739 | } | ||
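move_queued_task() is the one place in this hunk that publishes the intermediate state: the task is dequeued and the old rq->lock dropped, but p->on_rq stays non-zero so that observers see it as in transit rather than blocked. The companion predicate, assumed to sit next to task_on_rq_queued() in the scheduler's private header, would look like:

static inline int task_on_rq_migrating(struct task_struct *p)
{
        return p->on_rq == TASK_ON_RQ_MIGRATING;
}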
| 4740 | |||
| 4599 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | 4741 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
| 4600 | { | 4742 | { |
| 4601 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 4743 | if (p->sched_class && p->sched_class->set_cpus_allowed) |
| @@ -4652,14 +4794,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
| 4652 | goto out; | 4794 | goto out; |
| 4653 | 4795 | ||
| 4654 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 4796 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
| 4655 | if (p->on_rq) { | 4797 | if (task_running(rq, p) || p->state == TASK_WAKING) { |
| 4656 | struct migration_arg arg = { p, dest_cpu }; | 4798 | struct migration_arg arg = { p, dest_cpu }; |
| 4657 | /* Need help from migration thread: drop lock and wait. */ | 4799 | /* Need help from migration thread: drop lock and wait. */ |
| 4658 | task_rq_unlock(rq, p, &flags); | 4800 | task_rq_unlock(rq, p, &flags); |
| 4659 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 4801 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
| 4660 | tlb_migrate_finish(p->mm); | 4802 | tlb_migrate_finish(p->mm); |
| 4661 | return 0; | 4803 | return 0; |
| 4662 | } | 4804 | } else if (task_on_rq_queued(p)) |
| 4805 | rq = move_queued_task(p, dest_cpu); | ||
| 4663 | out: | 4806 | out: |
| 4664 | task_rq_unlock(rq, p, &flags); | 4807 | task_rq_unlock(rq, p, &flags); |
| 4665 | 4808 | ||
| @@ -4680,20 +4823,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); | |||
| 4680 | */ | 4823 | */ |
| 4681 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4824 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
| 4682 | { | 4825 | { |
| 4683 | struct rq *rq_dest, *rq_src; | 4826 | struct rq *rq; |
| 4684 | int ret = 0; | 4827 | int ret = 0; |
| 4685 | 4828 | ||
| 4686 | if (unlikely(!cpu_active(dest_cpu))) | 4829 | if (unlikely(!cpu_active(dest_cpu))) |
| 4687 | return ret; | 4830 | return ret; |
| 4688 | 4831 | ||
| 4689 | rq_src = cpu_rq(src_cpu); | 4832 | rq = cpu_rq(src_cpu); |
| 4690 | rq_dest = cpu_rq(dest_cpu); | ||
| 4691 | 4833 | ||
| 4692 | raw_spin_lock(&p->pi_lock); | 4834 | raw_spin_lock(&p->pi_lock); |
| 4693 | double_rq_lock(rq_src, rq_dest); | 4835 | raw_spin_lock(&rq->lock); |
| 4694 | /* Already moved. */ | 4836 | /* Already moved. */ |
| 4695 | if (task_cpu(p) != src_cpu) | 4837 | if (task_cpu(p) != src_cpu) |
| 4696 | goto done; | 4838 | goto done; |
| 4839 | |||
| 4697 | /* Affinity changed (again). */ | 4840 | /* Affinity changed (again). */ |
| 4698 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | 4841 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
| 4699 | goto fail; | 4842 | goto fail; |
| @@ -4702,16 +4845,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 4702 | * If we're not on a rq, the next wake-up will ensure we're | 4845 | * If we're not on a rq, the next wake-up will ensure we're |
| 4703 | * placed properly. | 4846 | * placed properly. |
| 4704 | */ | 4847 | */ |
| 4705 | if (p->on_rq) { | 4848 | if (task_on_rq_queued(p)) |
| 4706 | dequeue_task(rq_src, p, 0); | 4849 | rq = move_queued_task(p, dest_cpu); |
| 4707 | set_task_cpu(p, dest_cpu); | ||
| 4708 | enqueue_task(rq_dest, p, 0); | ||
| 4709 | check_preempt_curr(rq_dest, p, 0); | ||
| 4710 | } | ||
| 4711 | done: | 4850 | done: |
| 4712 | ret = 1; | 4851 | ret = 1; |
| 4713 | fail: | 4852 | fail: |
| 4714 | double_rq_unlock(rq_src, rq_dest); | 4853 | raw_spin_unlock(&rq->lock); |
| 4715 | raw_spin_unlock(&p->pi_lock); | 4854 | raw_spin_unlock(&p->pi_lock); |
| 4716 | return ret; | 4855 | return ret; |
| 4717 | } | 4856 | } |
| @@ -4743,22 +4882,22 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
| 4743 | { | 4882 | { |
| 4744 | struct rq *rq; | 4883 | struct rq *rq; |
| 4745 | unsigned long flags; | 4884 | unsigned long flags; |
| 4746 | bool on_rq, running; | 4885 | bool queued, running; |
| 4747 | 4886 | ||
| 4748 | rq = task_rq_lock(p, &flags); | 4887 | rq = task_rq_lock(p, &flags); |
| 4749 | on_rq = p->on_rq; | 4888 | queued = task_on_rq_queued(p); |
| 4750 | running = task_current(rq, p); | 4889 | running = task_current(rq, p); |
| 4751 | 4890 | ||
| 4752 | if (on_rq) | 4891 | if (queued) |
| 4753 | dequeue_task(rq, p, 0); | 4892 | dequeue_task(rq, p, 0); |
| 4754 | if (running) | 4893 | if (running) |
| 4755 | p->sched_class->put_prev_task(rq, p); | 4894 | put_prev_task(rq, p); |
| 4756 | 4895 | ||
| 4757 | p->numa_preferred_nid = nid; | 4896 | p->numa_preferred_nid = nid; |
| 4758 | 4897 | ||
| 4759 | if (running) | 4898 | if (running) |
| 4760 | p->sched_class->set_curr_task(rq); | 4899 | p->sched_class->set_curr_task(rq); |
| 4761 | if (on_rq) | 4900 | if (queued) |
| 4762 | enqueue_task(rq, p, 0); | 4901 | enqueue_task(rq, p, 0); |
| 4763 | task_rq_unlock(rq, p, &flags); | 4902 | task_rq_unlock(rq, p, &flags); |
| 4764 | } | 4903 | } |
| @@ -4778,6 +4917,12 @@ static int migration_cpu_stop(void *data) | |||
| 4778 | * be on another cpu but it doesn't matter. | 4917 | * be on another cpu but it doesn't matter. |
| 4779 | */ | 4918 | */ |
| 4780 | local_irq_disable(); | 4919 | local_irq_disable(); |
| 4920 | /* | ||
| 4921 | * We need to explicitly wake pending tasks before running | ||
| 4922 | * __migrate_task() such that we will not miss enforcing cpus_allowed | ||
| 4923 | * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. | ||
| 4924 | */ | ||
| 4925 | sched_ttwu_pending(); | ||
| 4781 | __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); | 4926 | __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); |
| 4782 | local_irq_enable(); | 4927 | local_irq_enable(); |
| 4783 | return 0; | 4928 | return 0; |
| @@ -5188,6 +5333,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb, | |||
| 5188 | { | 5333 | { |
| 5189 | unsigned long flags; | 5334 | unsigned long flags; |
| 5190 | long cpu = (long)hcpu; | 5335 | long cpu = (long)hcpu; |
| 5336 | struct dl_bw *dl_b; | ||
| 5191 | 5337 | ||
| 5192 | switch (action & ~CPU_TASKS_FROZEN) { | 5338 | switch (action & ~CPU_TASKS_FROZEN) { |
| 5193 | case CPU_DOWN_PREPARE: | 5339 | case CPU_DOWN_PREPARE: |
| @@ -5195,15 +5341,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb, | |||
| 5195 | 5341 | ||
| 5196 | /* explicitly allow suspend */ | 5342 | /* explicitly allow suspend */ |
| 5197 | if (!(action & CPU_TASKS_FROZEN)) { | 5343 | if (!(action & CPU_TASKS_FROZEN)) { |
| 5198 | struct dl_bw *dl_b = dl_bw_of(cpu); | ||
| 5199 | bool overflow; | 5344 | bool overflow; |
| 5200 | int cpus; | 5345 | int cpus; |
| 5201 | 5346 | ||
| 5347 | rcu_read_lock_sched(); | ||
| 5348 | dl_b = dl_bw_of(cpu); | ||
| 5349 | |||
| 5202 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 5350 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
| 5203 | cpus = dl_bw_cpus(cpu); | 5351 | cpus = dl_bw_cpus(cpu); |
| 5204 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | 5352 | overflow = __dl_overflow(dl_b, cpus, 0, 0); |
| 5205 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 5353 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
| 5206 | 5354 | ||
| 5355 | rcu_read_unlock_sched(); | ||
| 5356 | |||
| 5207 | if (overflow) | 5357 | if (overflow) |
| 5208 | return notifier_from_errno(-EBUSY); | 5358 | return notifier_from_errno(-EBUSY); |
| 5209 | } | 5359 | } |
| @@ -5746,7 +5896,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5746 | const struct cpumask *span = sched_domain_span(sd); | 5896 | const struct cpumask *span = sched_domain_span(sd); |
| 5747 | struct cpumask *covered = sched_domains_tmpmask; | 5897 | struct cpumask *covered = sched_domains_tmpmask; |
| 5748 | struct sd_data *sdd = sd->private; | 5898 | struct sd_data *sdd = sd->private; |
| 5749 | struct sched_domain *child; | 5899 | struct sched_domain *sibling; |
| 5750 | int i; | 5900 | int i; |
| 5751 | 5901 | ||
| 5752 | cpumask_clear(covered); | 5902 | cpumask_clear(covered); |
| @@ -5757,10 +5907,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5757 | if (cpumask_test_cpu(i, covered)) | 5907 | if (cpumask_test_cpu(i, covered)) |
| 5758 | continue; | 5908 | continue; |
| 5759 | 5909 | ||
| 5760 | child = *per_cpu_ptr(sdd->sd, i); | 5910 | sibling = *per_cpu_ptr(sdd->sd, i); |
| 5761 | 5911 | ||
| 5762 | /* See the comment near build_group_mask(). */ | 5912 | /* See the comment near build_group_mask(). */ |
| 5763 | if (!cpumask_test_cpu(i, sched_domain_span(child))) | 5913 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) |
| 5764 | continue; | 5914 | continue; |
| 5765 | 5915 | ||
| 5766 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 5916 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
| @@ -5770,10 +5920,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5770 | goto fail; | 5920 | goto fail; |
| 5771 | 5921 | ||
| 5772 | sg_span = sched_group_cpus(sg); | 5922 | sg_span = sched_group_cpus(sg); |
| 5773 | if (child->child) { | 5923 | if (sibling->child) |
| 5774 | child = child->child; | 5924 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); |
| 5775 | cpumask_copy(sg_span, sched_domain_span(child)); | 5925 | else |
| 5776 | } else | ||
| 5777 | cpumask_set_cpu(i, sg_span); | 5926 | cpumask_set_cpu(i, sg_span); |
| 5778 | 5927 | ||
| 5779 | cpumask_or(covered, covered, sg_span); | 5928 | cpumask_or(covered, covered, sg_span); |
| @@ -6011,7 +6160,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
| 6011 | 6160 | ||
| 6012 | #ifdef CONFIG_NUMA | 6161 | #ifdef CONFIG_NUMA |
| 6013 | static int sched_domains_numa_levels; | 6162 | static int sched_domains_numa_levels; |
| 6163 | enum numa_topology_type sched_numa_topology_type; | ||
| 6014 | static int *sched_domains_numa_distance; | 6164 | static int *sched_domains_numa_distance; |
| 6165 | int sched_max_numa_distance; | ||
| 6015 | static struct cpumask ***sched_domains_numa_masks; | 6166 | static struct cpumask ***sched_domains_numa_masks; |
| 6016 | static int sched_domains_curr_level; | 6167 | static int sched_domains_curr_level; |
| 6017 | #endif | 6168 | #endif |
| @@ -6183,7 +6334,7 @@ static void sched_numa_warn(const char *str) | |||
| 6183 | printk(KERN_WARNING "\n"); | 6334 | printk(KERN_WARNING "\n"); |
| 6184 | } | 6335 | } |
| 6185 | 6336 | ||
| 6186 | static bool find_numa_distance(int distance) | 6337 | bool find_numa_distance(int distance) |
| 6187 | { | 6338 | { |
| 6188 | int i; | 6339 | int i; |
| 6189 | 6340 | ||
| @@ -6198,6 +6349,56 @@ static bool find_numa_distance(int distance) | |||
| 6198 | return false; | 6349 | return false; |
| 6199 | } | 6350 | } |
| 6200 | 6351 | ||
| 6352 | /* | ||
| 6353 | * A system can have three types of NUMA topology: | ||
| 6354 | * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system | ||
| 6355 | * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes | ||
| 6356 | * NUMA_BACKPLANE: nodes can reach other nodes through a backplane | ||
| 6357 | * | ||
| 6358 | * The difference between a glueless mesh topology and a backplane | ||
| 6359 | * topology lies in whether communication between not directly | ||
| 6360 | * connected nodes goes through intermediary nodes (where programs | ||
| 6361 | * could run), or through backplane controllers. This affects | ||
| 6362 | * placement of programs. | ||
| 6363 | * | ||
| 6364 | * The type of topology can be discerned with the following tests: | ||
| 6365 | * - If the maximum distance between any nodes is 1 hop, the system | ||
| 6366 | * is directly connected. | ||
| 6367 | * - If for two nodes A and B, located N > 1 hops away from each other, | ||
| 6368 | * there is an intermediary node C, which is < N hops away from both | ||
| 6369 | * nodes A and B, the system is a glueless mesh. | ||
| 6370 | */ | ||
| 6371 | static void init_numa_topology_type(void) | ||
| 6372 | { | ||
| 6373 | int a, b, c, n; | ||
| 6374 | |||
| 6375 | n = sched_max_numa_distance; | ||
| 6376 | |||
| 6377 | if (n <= 1) | ||
| 6378 | sched_numa_topology_type = NUMA_DIRECT; | ||
| 6379 | |||
| 6380 | for_each_online_node(a) { | ||
| 6381 | for_each_online_node(b) { | ||
| 6382 | /* Find two nodes furthest removed from each other. */ | ||
| 6383 | if (node_distance(a, b) < n) | ||
| 6384 | continue; | ||
| 6385 | |||
| 6386 | /* Is there an intermediary node between a and b? */ | ||
| 6387 | for_each_online_node(c) { | ||
| 6388 | if (node_distance(a, c) < n && | ||
| 6389 | node_distance(b, c) < n) { | ||
| 6390 | sched_numa_topology_type = | ||
| 6391 | NUMA_GLUELESS_MESH; | ||
| 6392 | return; | ||
| 6393 | } | ||
| 6394 | } | ||
| 6395 | |||
| 6396 | sched_numa_topology_type = NUMA_BACKPLANE; | ||
| 6397 | return; | ||
| 6398 | } | ||
| 6399 | } | ||
| 6400 | } | ||
| 6401 | |||
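init_numa_topology_type() needs nothing but the node distance table and the maximum distance. A runnable user-space model on a made-up 4-node ring; in the kernel the table comes from the firmware SLIT via node_distance(), and the direct-topology shortcut is handled before the loops:

#include <stdio.h>

enum topo { DIRECT, GLUELESS_MESH, BACKPLANE };

#define N 4
static const int dist[N][N] = {        /* ring: neighbours 20 apart, opposite nodes 30 */
        { 10, 20, 30, 20 },
        { 20, 10, 20, 30 },
        { 30, 20, 10, 20 },
        { 20, 30, 20, 10 },
};

static enum topo classify(int max_distance)
{
        for (int a = 0; a < N; a++) {
                for (int b = 0; b < N; b++) {
                        if (dist[a][b] < max_distance)
                                continue;
                        /* a and b are a furthest pair; is some c close to both? */
                        for (int c = 0; c < N; c++)
                                if (dist[a][c] < max_distance &&
                                    dist[b][c] < max_distance)
                                        return GLUELESS_MESH;
                        return BACKPLANE;
                }
        }
        return DIRECT;   /* no pair ever reaches max_distance */
}

int main(void)
{
        /* Node 1 is within 20 of both node 0 and node 2 (which are 30 apart). */
        printf("type=%d (1 == glueless mesh)\n", classify(30));
        return 0;
}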
| 6201 | static void sched_init_numa(void) | 6402 | static void sched_init_numa(void) |
| 6202 | { | 6403 | { |
| 6203 | int next_distance, curr_distance = node_distance(0, 0); | 6404 | int next_distance, curr_distance = node_distance(0, 0); |
| @@ -6251,6 +6452,10 @@ static void sched_init_numa(void) | |||
| 6251 | if (!sched_debug()) | 6452 | if (!sched_debug()) |
| 6252 | break; | 6453 | break; |
| 6253 | } | 6454 | } |
| 6455 | |||
| 6456 | if (!level) | ||
| 6457 | return; | ||
| 6458 | |||
| 6254 | /* | 6459 | /* |
| 6255 | * 'level' contains the number of unique distances, excluding the | 6460 | * 'level' contains the number of unique distances, excluding the |
| 6256 | * identity distance node_distance(i,i). | 6461 | * identity distance node_distance(i,i). |
| @@ -6330,6 +6535,9 @@ static void sched_init_numa(void) | |||
| 6330 | sched_domain_topology = tl; | 6535 | sched_domain_topology = tl; |
| 6331 | 6536 | ||
| 6332 | sched_domains_numa_levels = level; | 6537 | sched_domains_numa_levels = level; |
| 6538 | sched_max_numa_distance = sched_domains_numa_distance[level - 1]; | ||
| 6539 | |||
| 6540 | init_numa_topology_type(); | ||
| 6333 | } | 6541 | } |
| 6334 | 6542 | ||
| 6335 | static void sched_domains_numa_masks_set(int cpu) | 6543 | static void sched_domains_numa_masks_set(int cpu) |
| @@ -6905,9 +7113,6 @@ void __init sched_init(void) | |||
| 6905 | #ifdef CONFIG_RT_GROUP_SCHED | 7113 | #ifdef CONFIG_RT_GROUP_SCHED |
| 6906 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 7114 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
| 6907 | #endif | 7115 | #endif |
| 6908 | #ifdef CONFIG_CPUMASK_OFFSTACK | ||
| 6909 | alloc_size += num_possible_cpus() * cpumask_size(); | ||
| 6910 | #endif | ||
| 6911 | if (alloc_size) { | 7116 | if (alloc_size) { |
| 6912 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 7117 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
| 6913 | 7118 | ||
| @@ -6927,13 +7132,13 @@ void __init sched_init(void) | |||
| 6927 | ptr += nr_cpu_ids * sizeof(void **); | 7132 | ptr += nr_cpu_ids * sizeof(void **); |
| 6928 | 7133 | ||
| 6929 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7134 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 7135 | } | ||
| 6930 | #ifdef CONFIG_CPUMASK_OFFSTACK | 7136 | #ifdef CONFIG_CPUMASK_OFFSTACK |
| 6931 | for_each_possible_cpu(i) { | 7137 | for_each_possible_cpu(i) { |
| 6932 | per_cpu(load_balance_mask, i) = (void *)ptr; | 7138 | per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( |
| 6933 | ptr += cpumask_size(); | 7139 | cpumask_size(), GFP_KERNEL, cpu_to_node(i)); |
| 6934 | } | ||
| 6935 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | ||
| 6936 | } | 7140 | } |
| 7141 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | ||
| 6937 | 7142 | ||
| 6938 | init_rt_bandwidth(&def_rt_bandwidth, | 7143 | init_rt_bandwidth(&def_rt_bandwidth, |
| 6939 | global_rt_period(), global_rt_runtime()); | 7144 | global_rt_period(), global_rt_runtime()); |
| @@ -7082,6 +7287,25 @@ static inline int preempt_count_equals(int preempt_offset) | |||
| 7082 | 7287 | ||
| 7083 | void __might_sleep(const char *file, int line, int preempt_offset) | 7288 | void __might_sleep(const char *file, int line, int preempt_offset) |
| 7084 | { | 7289 | { |
| 7290 | /* | ||
| 7291 | * Blocking primitives will set (and therefore destroy) current->state, | ||
| 7292 | * since we will exit with TASK_RUNNING, make sure we enter with it, | ||
| 7293 | * otherwise we will destroy state. | ||
| 7294 | */ | ||
| 7295 | if (WARN_ONCE(current->state != TASK_RUNNING, | ||
| 7296 | "do not call blocking ops when !TASK_RUNNING; " | ||
| 7297 | "state=%lx set at [<%p>] %pS\n", | ||
| 7298 | current->state, | ||
| 7299 | (void *)current->task_state_change, | ||
| 7300 | (void *)current->task_state_change)) | ||
| 7301 | __set_current_state(TASK_RUNNING); | ||
| 7302 | |||
| 7303 | ___might_sleep(file, line, preempt_offset); | ||
| 7304 | } | ||
| 7305 | EXPORT_SYMBOL(__might_sleep); | ||
| 7306 | |||
| 7307 | void ___might_sleep(const char *file, int line, int preempt_offset) | ||
| 7308 | { | ||
| 7085 | static unsigned long prev_jiffy; /* ratelimiting */ | 7309 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 7086 | 7310 | ||
| 7087 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | 7311 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ |
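The new WARN_ONCE() in __might_sleep() targets a specific bug pattern: calling something that can sleep after set_current_state() has already moved the task out of TASK_RUNNING, which silently resets the state and can turn the later schedule() into a no-op. An illustrative, deliberately buggy fragment; my_cond and my_lock are hypothetical names:

set_current_state(TASK_UNINTERRUPTIBLE);

if (!my_cond)                   /* hypothetical wait condition */
        mutex_lock(&my_lock);   /* may sleep: __might_sleep() now warns here */

schedule();                     /* may return immediately, the state was clobbered */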
| @@ -7113,7 +7337,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) | |||
| 7113 | #endif | 7337 | #endif |
| 7114 | dump_stack(); | 7338 | dump_stack(); |
| 7115 | } | 7339 | } |
| 7116 | EXPORT_SYMBOL(__might_sleep); | 7340 | EXPORT_SYMBOL(___might_sleep); |
| 7117 | #endif | 7341 | #endif |
| 7118 | 7342 | ||
| 7119 | #ifdef CONFIG_MAGIC_SYSRQ | 7343 | #ifdef CONFIG_MAGIC_SYSRQ |
| @@ -7124,13 +7348,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
| 7124 | .sched_policy = SCHED_NORMAL, | 7348 | .sched_policy = SCHED_NORMAL, |
| 7125 | }; | 7349 | }; |
| 7126 | int old_prio = p->prio; | 7350 | int old_prio = p->prio; |
| 7127 | int on_rq; | 7351 | int queued; |
| 7128 | 7352 | ||
| 7129 | on_rq = p->on_rq; | 7353 | queued = task_on_rq_queued(p); |
| 7130 | if (on_rq) | 7354 | if (queued) |
| 7131 | dequeue_task(rq, p, 0); | 7355 | dequeue_task(rq, p, 0); |
| 7132 | __setscheduler(rq, p, &attr); | 7356 | __setscheduler(rq, p, &attr); |
| 7133 | if (on_rq) { | 7357 | if (queued) { |
| 7134 | enqueue_task(rq, p, 0); | 7358 | enqueue_task(rq, p, 0); |
| 7135 | resched_curr(rq); | 7359 | resched_curr(rq); |
| 7136 | } | 7360 | } |
| @@ -7144,12 +7368,12 @@ void normalize_rt_tasks(void) | |||
| 7144 | unsigned long flags; | 7368 | unsigned long flags; |
| 7145 | struct rq *rq; | 7369 | struct rq *rq; |
| 7146 | 7370 | ||
| 7147 | read_lock_irqsave(&tasklist_lock, flags); | 7371 | read_lock(&tasklist_lock); |
| 7148 | do_each_thread(g, p) { | 7372 | for_each_process_thread(g, p) { |
| 7149 | /* | 7373 | /* |
| 7150 | * Only normalize user tasks: | 7374 | * Only normalize user tasks: |
| 7151 | */ | 7375 | */ |
| 7152 | if (!p->mm) | 7376 | if (p->flags & PF_KTHREAD) |
| 7153 | continue; | 7377 | continue; |
| 7154 | 7378 | ||
| 7155 | p->se.exec_start = 0; | 7379 | p->se.exec_start = 0; |
| @@ -7164,21 +7388,16 @@ void normalize_rt_tasks(void) | |||
| 7164 | * Renice negative nice level userspace | 7388 | * Renice negative nice level userspace |
| 7165 | * tasks back to 0: | 7389 | * tasks back to 0: |
| 7166 | */ | 7390 | */ |
| 7167 | if (task_nice(p) < 0 && p->mm) | 7391 | if (task_nice(p) < 0) |
| 7168 | set_user_nice(p, 0); | 7392 | set_user_nice(p, 0); |
| 7169 | continue; | 7393 | continue; |
| 7170 | } | 7394 | } |
| 7171 | 7395 | ||
| 7172 | raw_spin_lock(&p->pi_lock); | 7396 | rq = task_rq_lock(p, &flags); |
| 7173 | rq = __task_rq_lock(p); | ||
| 7174 | |||
| 7175 | normalize_task(rq, p); | 7397 | normalize_task(rq, p); |
| 7176 | 7398 | task_rq_unlock(rq, p, &flags); | |
| 7177 | __task_rq_unlock(rq); | 7399 | } |
| 7178 | raw_spin_unlock(&p->pi_lock); | 7400 | read_unlock(&tasklist_lock); |
| 7179 | } while_each_thread(g, p); | ||
| 7180 | |||
| 7181 | read_unlock_irqrestore(&tasklist_lock, flags); | ||
| 7182 | } | 7401 | } |
| 7183 | 7402 | ||
| 7184 | #endif /* CONFIG_MAGIC_SYSRQ */ | 7403 | #endif /* CONFIG_MAGIC_SYSRQ */ |
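normalize_rt_tasks() above drops the irqsave variant of tasklist_lock (a plain read_lock() is enough here), switches from do_each_thread()/while_each_thread() to for_each_process_thread(), detects kernel threads via PF_KTHREAD instead of !p->mm, and replaces the open-coded pi_lock + __task_rq_lock() pair with task_rq_lock(). A minimal sketch of the resulting traversal pattern; the wrapper name is hypothetical and task_rq_lock()/task_rq_unlock() are local to kernel/sched/core.c:

```c
/*
 * Generic sketch of the locking pattern used above; the action callback
 * is a placeholder for the per-task work.
 */
static void for_each_user_task_locked(void (*action)(struct rq *rq,
						     struct task_struct *p))
{
	struct task_struct *g, *p;
	unsigned long flags;
	struct rq *rq;

	read_lock(&tasklist_lock);
	for_each_process_thread(g, p) {
		if (p->flags & PF_KTHREAD)	/* skip kernel threads */
			continue;

		rq = task_rq_lock(p, &flags);	/* pi_lock + rq->lock */
		action(rq, p);
		task_rq_unlock(rq, p, &flags);
	}
	read_unlock(&tasklist_lock);
}
```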
| @@ -7318,36 +7537,40 @@ void sched_offline_group(struct task_group *tg) | |||
| 7318 | void sched_move_task(struct task_struct *tsk) | 7537 | void sched_move_task(struct task_struct *tsk) |
| 7319 | { | 7538 | { |
| 7320 | struct task_group *tg; | 7539 | struct task_group *tg; |
| 7321 | int on_rq, running; | 7540 | int queued, running; |
| 7322 | unsigned long flags; | 7541 | unsigned long flags; |
| 7323 | struct rq *rq; | 7542 | struct rq *rq; |
| 7324 | 7543 | ||
| 7325 | rq = task_rq_lock(tsk, &flags); | 7544 | rq = task_rq_lock(tsk, &flags); |
| 7326 | 7545 | ||
| 7327 | running = task_current(rq, tsk); | 7546 | running = task_current(rq, tsk); |
| 7328 | on_rq = tsk->on_rq; | 7547 | queued = task_on_rq_queued(tsk); |
| 7329 | 7548 | ||
| 7330 | if (on_rq) | 7549 | if (queued) |
| 7331 | dequeue_task(rq, tsk, 0); | 7550 | dequeue_task(rq, tsk, 0); |
| 7332 | if (unlikely(running)) | 7551 | if (unlikely(running)) |
| 7333 | tsk->sched_class->put_prev_task(rq, tsk); | 7552 | put_prev_task(rq, tsk); |
| 7334 | 7553 | ||
| 7335 | tg = container_of(task_css_check(tsk, cpu_cgrp_id, | 7554 | /* |
| 7336 | lockdep_is_held(&tsk->sighand->siglock)), | 7555 | * All callers are synchronized by task_rq_lock(); we do not use RCU |
| 7556 | * which is pointless here. Thus, we pass "true" to task_css_check() | ||
| 7557 | * to prevent lockdep warnings. | ||
| 7558 | */ | ||
| 7559 | tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), | ||
| 7337 | struct task_group, css); | 7560 | struct task_group, css); |
| 7338 | tg = autogroup_task_group(tsk, tg); | 7561 | tg = autogroup_task_group(tsk, tg); |
| 7339 | tsk->sched_task_group = tg; | 7562 | tsk->sched_task_group = tg; |
| 7340 | 7563 | ||
| 7341 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7564 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7342 | if (tsk->sched_class->task_move_group) | 7565 | if (tsk->sched_class->task_move_group) |
| 7343 | tsk->sched_class->task_move_group(tsk, on_rq); | 7566 | tsk->sched_class->task_move_group(tsk, queued); |
| 7344 | else | 7567 | else |
| 7345 | #endif | 7568 | #endif |
| 7346 | set_task_rq(tsk, task_cpu(tsk)); | 7569 | set_task_rq(tsk, task_cpu(tsk)); |
| 7347 | 7570 | ||
| 7348 | if (unlikely(running)) | 7571 | if (unlikely(running)) |
| 7349 | tsk->sched_class->set_curr_task(rq); | 7572 | tsk->sched_class->set_curr_task(rq); |
| 7350 | if (on_rq) | 7573 | if (queued) |
| 7351 | enqueue_task(rq, tsk, 0); | 7574 | enqueue_task(rq, tsk, 0); |
| 7352 | 7575 | ||
| 7353 | task_rq_unlock(rq, tsk, &flags); | 7576 | task_rq_unlock(rq, tsk, &flags); |
| @@ -7365,10 +7588,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
| 7365 | { | 7588 | { |
| 7366 | struct task_struct *g, *p; | 7589 | struct task_struct *g, *p; |
| 7367 | 7590 | ||
| 7368 | do_each_thread(g, p) { | 7591 | for_each_process_thread(g, p) { |
| 7369 | if (rt_task(p) && task_rq(p)->rt.tg == tg) | 7592 | if (rt_task(p) && task_group(p) == tg) |
| 7370 | return 1; | 7593 | return 1; |
| 7371 | } while_each_thread(g, p); | 7594 | } |
| 7372 | 7595 | ||
| 7373 | return 0; | 7596 | return 0; |
| 7374 | } | 7597 | } |
| @@ -7577,6 +7800,7 @@ static int sched_dl_global_constraints(void) | |||
| 7577 | u64 runtime = global_rt_runtime(); | 7800 | u64 runtime = global_rt_runtime(); |
| 7578 | u64 period = global_rt_period(); | 7801 | u64 period = global_rt_period(); |
| 7579 | u64 new_bw = to_ratio(period, runtime); | 7802 | u64 new_bw = to_ratio(period, runtime); |
| 7803 | struct dl_bw *dl_b; | ||
| 7580 | int cpu, ret = 0; | 7804 | int cpu, ret = 0; |
| 7581 | unsigned long flags; | 7805 | unsigned long flags; |
| 7582 | 7806 | ||
| @@ -7590,13 +7814,16 @@ static int sched_dl_global_constraints(void) | |||
| 7590 | * solutions is welcome! | 7814 | * solutions is welcome! |
| 7591 | */ | 7815 | */ |
| 7592 | for_each_possible_cpu(cpu) { | 7816 | for_each_possible_cpu(cpu) { |
| 7593 | struct dl_bw *dl_b = dl_bw_of(cpu); | 7817 | rcu_read_lock_sched(); |
| 7818 | dl_b = dl_bw_of(cpu); | ||
| 7594 | 7819 | ||
| 7595 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 7820 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
| 7596 | if (new_bw < dl_b->total_bw) | 7821 | if (new_bw < dl_b->total_bw) |
| 7597 | ret = -EBUSY; | 7822 | ret = -EBUSY; |
| 7598 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 7823 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
| 7599 | 7824 | ||
| 7825 | rcu_read_unlock_sched(); | ||
| 7826 | |||
| 7600 | if (ret) | 7827 | if (ret) |
| 7601 | break; | 7828 | break; |
| 7602 | } | 7829 | } |
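The rcu_read_lock_sched()/rcu_read_unlock_sched() pair added above (and in sched_dl_do_global() below) keeps the root_domain behind dl_bw_of() from being freed while its bandwidth counters are inspected; the counters themselves remain serialized by dl_b->lock. A condensed sketch of the pattern, using a hypothetical helper name:

```c
/*
 * Sketch of the access pattern above (hypothetical helper): returns true if
 * lowering the global limit to new_bw would fall below the bandwidth already
 * admitted on this CPU's root domain.
 */
static bool dl_bw_check_overflow(int cpu, u64 new_bw)
{
	struct dl_bw *dl_b;
	unsigned long flags;
	bool overflow;

	rcu_read_lock_sched();			/* pin the root_domain */
	dl_b = dl_bw_of(cpu);

	raw_spin_lock_irqsave(&dl_b->lock, flags);
	overflow = new_bw < dl_b->total_bw;
	raw_spin_unlock_irqrestore(&dl_b->lock, flags);

	rcu_read_unlock_sched();
	return overflow;
}
```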
| @@ -7607,6 +7834,7 @@ static int sched_dl_global_constraints(void) | |||
| 7607 | static void sched_dl_do_global(void) | 7834 | static void sched_dl_do_global(void) |
| 7608 | { | 7835 | { |
| 7609 | u64 new_bw = -1; | 7836 | u64 new_bw = -1; |
| 7837 | struct dl_bw *dl_b; | ||
| 7610 | int cpu; | 7838 | int cpu; |
| 7611 | unsigned long flags; | 7839 | unsigned long flags; |
| 7612 | 7840 | ||
| @@ -7620,11 +7848,14 @@ static void sched_dl_do_global(void) | |||
| 7620 | * FIXME: As above... | 7848 | * FIXME: As above... |
| 7621 | */ | 7849 | */ |
| 7622 | for_each_possible_cpu(cpu) { | 7850 | for_each_possible_cpu(cpu) { |
| 7623 | struct dl_bw *dl_b = dl_bw_of(cpu); | 7851 | rcu_read_lock_sched(); |
| 7852 | dl_b = dl_bw_of(cpu); | ||
| 7624 | 7853 | ||
| 7625 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 7854 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
| 7626 | dl_b->bw = new_bw; | 7855 | dl_b->bw = new_bw; |
| 7627 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 7856 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
| 7857 | |||
| 7858 | rcu_read_unlock_sched(); | ||
| 7628 | } | 7859 | } |
| 7629 | } | 7860 | } |
| 7630 | 7861 | ||
| @@ -7754,6 +7985,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
| 7754 | sched_offline_group(tg); | 7985 | sched_offline_group(tg); |
| 7755 | } | 7986 | } |
| 7756 | 7987 | ||
| 7988 | static void cpu_cgroup_fork(struct task_struct *task) | ||
| 7989 | { | ||
| 7990 | sched_move_task(task); | ||
| 7991 | } | ||
| 7992 | |||
| 7757 | static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, | 7993 | static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, |
| 7758 | struct cgroup_taskset *tset) | 7994 | struct cgroup_taskset *tset) |
| 7759 | { | 7995 | { |
| @@ -8005,7 +8241,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | |||
| 8005 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; | 8241 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; |
| 8006 | 8242 | ||
| 8007 | quota = normalize_cfs_quota(tg, d); | 8243 | quota = normalize_cfs_quota(tg, d); |
| 8008 | parent_quota = parent_b->hierarchal_quota; | 8244 | parent_quota = parent_b->hierarchical_quota; |
| 8009 | 8245 | ||
| 8010 | /* | 8246 | /* |
| 8011 | * ensure max(child_quota) <= parent_quota, inherit when no | 8247 | * ensure max(child_quota) <= parent_quota, inherit when no |
| @@ -8016,7 +8252,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | |||
| 8016 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) | 8252 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) |
| 8017 | return -EINVAL; | 8253 | return -EINVAL; |
| 8018 | } | 8254 | } |
| 8019 | cfs_b->hierarchal_quota = quota; | 8255 | cfs_b->hierarchical_quota = quota; |
| 8020 | 8256 | ||
| 8021 | return 0; | 8257 | return 0; |
| 8022 | } | 8258 | } |
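Aside from the `hierarchal_quota` to `hierarchical_quota` spelling fix, the hunk above is the tree walk that enforces the CFS bandwidth invariant: every child group's quota must fit inside its parent's, and a child with no limit (RUNTIME_INF) inherits the parent's quota. A stand-alone toy model of that check for one parent/child pair (plain C, all names hypothetical):

```c
#include <stdint.h>
#include <stdbool.h>

#define RUNTIME_INF ((uint64_t)~0ULL)

/* Toy model of the invariant enforced above: a child's normalized quota
 * may not exceed its parent's; "no limit" inherits the parent's quota. */
static bool child_quota_ok(uint64_t parent_quota, uint64_t *child_quota)
{
	if (*child_quota == RUNTIME_INF) {
		*child_quota = parent_quota;	/* inherit when unlimited */
		return true;
	}
	return parent_quota == RUNTIME_INF || *child_quota <= parent_quota;
}
```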
| @@ -8126,6 +8362,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { | |||
| 8126 | .css_free = cpu_cgroup_css_free, | 8362 | .css_free = cpu_cgroup_css_free, |
| 8127 | .css_online = cpu_cgroup_css_online, | 8363 | .css_online = cpu_cgroup_css_online, |
| 8128 | .css_offline = cpu_cgroup_css_offline, | 8364 | .css_offline = cpu_cgroup_css_offline, |
| 8365 | .fork = cpu_cgroup_fork, | ||
| 8129 | .can_attach = cpu_cgroup_can_attach, | 8366 | .can_attach = cpu_cgroup_can_attach, |
| 8130 | .attach = cpu_cgroup_attach, | 8367 | .attach = cpu_cgroup_attach, |
| 8131 | .exit = cpu_cgroup_exit, | 8368 | .exit = cpu_cgroup_exit, |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index bd95963dae80..539ca3ce071b 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
| @@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 107 | int best_cpu = -1; | 107 | int best_cpu = -1; |
| 108 | const struct sched_dl_entity *dl_se = &p->dl; | 108 | const struct sched_dl_entity *dl_se = &p->dl; |
| 109 | 109 | ||
| 110 | if (later_mask && cpumask_and(later_mask, cp->free_cpus, | 110 | if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { |
| 111 | &p->cpus_allowed) && cpumask_and(later_mask, | ||
| 112 | later_mask, cpu_active_mask)) { | ||
| 113 | best_cpu = cpumask_any(later_mask); | 111 | best_cpu = cpumask_any(later_mask); |
| 114 | goto out; | 112 | goto out; |
| 115 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && | 113 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && |
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 538c9796ad4a..020039bd1326 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
| @@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | 25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); |
| 26 | int cpudl_init(struct cpudl *cp); | 26 | int cpudl_init(struct cpudl *cp); |
| 27 | void cpudl_cleanup(struct cpudl *cp); | 27 | void cpudl_cleanup(struct cpudl *cp); |
| 28 | #else | ||
| 29 | #define cpudl_set(cp, cpu, dl) do { } while (0) | ||
| 30 | #define cpudl_init() do { } while (0) | ||
| 31 | #endif /* CONFIG_SMP */ | 28 | #endif /* CONFIG_SMP */ |
| 32 | 29 | ||
| 33 | #endif /* _LINUX_CPUDL_H */ | 30 | #endif /* _LINUX_CPUDL_H */ |
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 6b033347fdfd..63cbb9ca0496 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h | |||
| @@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp, | |||
| 26 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | 26 | void cpupri_set(struct cpupri *cp, int cpu, int pri); |
| 27 | int cpupri_init(struct cpupri *cp); | 27 | int cpupri_init(struct cpupri *cp); |
| 28 | void cpupri_cleanup(struct cpupri *cp); | 28 | void cpupri_cleanup(struct cpupri *cp); |
| 29 | #else | ||
| 30 | #define cpupri_set(cp, cpu, pri) do { } while (0) | ||
| 31 | #define cpupri_init() do { } while (0) | ||
| 32 | #endif | 29 | #endif |
| 33 | 30 | ||
| 34 | #endif /* _LINUX_CPUPRI_H */ | 31 | #endif /* _LINUX_CPUPRI_H */ |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 72fdf06ef865..8394b1ee600c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
| 288 | struct signal_struct *sig = tsk->signal; | 288 | struct signal_struct *sig = tsk->signal; |
| 289 | cputime_t utime, stime; | 289 | cputime_t utime, stime; |
| 290 | struct task_struct *t; | 290 | struct task_struct *t; |
| 291 | 291 | unsigned int seq, nextseq; | |
| 292 | times->utime = sig->utime; | 292 | unsigned long flags; |
| 293 | times->stime = sig->stime; | ||
| 294 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
| 295 | 293 | ||
| 296 | rcu_read_lock(); | 294 | rcu_read_lock(); |
| 297 | /* make sure we can trust tsk->thread_group list */ | 295 | /* Attempt a lockless read on the first round. */ |
| 298 | if (!likely(pid_alive(tsk))) | 296 | nextseq = 0; |
| 299 | goto out; | ||
| 300 | |||
| 301 | t = tsk; | ||
| 302 | do { | 297 | do { |
| 303 | task_cputime(t, &utime, &stime); | 298 | seq = nextseq; |
| 304 | times->utime += utime; | 299 | flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); |
| 305 | times->stime += stime; | 300 | times->utime = sig->utime; |
| 306 | times->sum_exec_runtime += task_sched_runtime(t); | 301 | times->stime = sig->stime; |
| 307 | } while_each_thread(tsk, t); | 302 | times->sum_exec_runtime = sig->sum_sched_runtime; |
| 308 | out: | 303 | |
| 304 | for_each_thread(tsk, t) { | ||
| 305 | task_cputime(t, &utime, &stime); | ||
| 306 | times->utime += utime; | ||
| 307 | times->stime += stime; | ||
| 308 | times->sum_exec_runtime += task_sched_runtime(t); | ||
| 309 | } | ||
| 310 | /* If lockless access failed, take the lock. */ | ||
| 311 | nextseq = 1; | ||
| 312 | } while (need_seqretry(&sig->stats_lock, seq)); | ||
| 313 | done_seqretry_irqrestore(&sig->stats_lock, seq, flags); | ||
| 309 | rcu_read_unlock(); | 314 | rcu_read_unlock(); |
| 310 | } | 315 | } |
| 311 | 316 | ||
| @@ -550,6 +555,23 @@ drop_precision: | |||
| 550 | } | 555 | } |
| 551 | 556 | ||
| 552 | /* | 557 | /* |
| 558 | * Atomically advance counter to the new value. Interrupts, vcpu | ||
| 559 | * scheduling, and scaling inaccuracies can cause cputime_advance | ||
| 560 | * to be occasionally called with a new value smaller than counter. | ||
| 561 | * Let's enforce atomicity. | ||
| 562 | * | ||
| 563 | * Normally a caller will only go through this loop once, or not | ||
| 564 | * at all in case a previous caller updated counter the same jiffy. | ||
| 565 | */ | ||
| 566 | static void cputime_advance(cputime_t *counter, cputime_t new) | ||
| 567 | { | ||
| 568 | cputime_t old; | ||
| 569 | |||
| 570 | while (new > (old = ACCESS_ONCE(*counter))) | ||
| 571 | cmpxchg_cputime(counter, old, new); | ||
| 572 | } | ||
| 573 | |||
| 574 | /* | ||
| 553 | * Adjust tick based cputime random precision against scheduler | 575 | * Adjust tick based cputime random precision against scheduler |
| 554 | * runtime accounting. | 576 | * runtime accounting. |
| 555 | */ | 577 | */ |
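cputime_advance() above replaces the old max() clamping with a cmpxchg loop, so two CPUs updating prev->utime/prev->stime concurrently can never move a counter backwards. The same monotonic-advance idea expressed as a small runnable C11 model (names are made up; the kernel uses its cmpxchg_cputime() wrapper rather than <stdatomic.h>):

```c
#include <stdatomic.h>
#include <stdint.h>

/* Advance *counter to new_val only if it grows; safe under concurrency. */
static void monotonic_advance(_Atomic uint64_t *counter, uint64_t new_val)
{
	uint64_t old = atomic_load(counter);

	while (new_val > old) {
		/* On failure, 'old' is reloaded and the check repeats. */
		if (atomic_compare_exchange_weak(counter, &old, new_val))
			break;
	}
}
```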
| @@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 594 | utime = rtime - stime; | 616 | utime = rtime - stime; |
| 595 | } | 617 | } |
| 596 | 618 | ||
| 597 | /* | 619 | cputime_advance(&prev->stime, stime); |
| 598 | * If the tick based count grows faster than the scheduler one, | 620 | cputime_advance(&prev->utime, utime); |
| 599 | * the result of the scaling may go backward. | ||
| 600 | * Let's enforce monotonicity. | ||
| 601 | */ | ||
| 602 | prev->stime = max(prev->stime, stime); | ||
| 603 | prev->utime = max(prev->utime, utime); | ||
| 604 | 621 | ||
| 605 | out: | 622 | out: |
| 606 | *ut = prev->utime; | 623 | *ut = prev->utime; |
| @@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
| 617 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); | 634 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); |
| 618 | } | 635 | } |
| 619 | 636 | ||
| 620 | /* | ||
| 621 | * Must be called with siglock held. | ||
| 622 | */ | ||
| 623 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 637 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
| 624 | { | 638 | { |
| 625 | struct task_cputime cputime; | 639 | struct task_cputime cputime; |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 255ce138b652..b52092f2636d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -518,21 +518,29 @@ again: | |||
| 518 | } | 518 | } |
| 519 | 519 | ||
| 520 | /* | 520 | /* |
| 521 | * We need to take care of a possible races here. In fact, the | 521 | * We need to take care of several possible races here: |
| 522 | * task might have changed its scheduling policy to something | 522 | * |
| 523 | * different from SCHED_DEADLINE or changed its reservation | 523 | * - the task might have changed its scheduling policy |
| 524 | * parameters (through sched_setattr()). | 524 | * to something different than SCHED_DEADLINE |
| 525 | * - the task might have changed its reservation parameters | ||
| 526 | * (through sched_setattr()) | ||
| 527 | * - the task might have been boosted by someone else and | ||
| 528 | * might be in the boosting/deboosting path | ||
| 529 | * | ||
| 530 | * In all these cases we bail out, as the task is already | ||
| 531 | * in the runqueue or is going to be enqueued back anyway. | ||
| 525 | */ | 532 | */ |
| 526 | if (!dl_task(p) || dl_se->dl_new) | 533 | if (!dl_task(p) || dl_se->dl_new || |
| 534 | dl_se->dl_boosted || !dl_se->dl_throttled) | ||
| 527 | goto unlock; | 535 | goto unlock; |
| 528 | 536 | ||
| 529 | sched_clock_tick(); | 537 | sched_clock_tick(); |
| 530 | update_rq_clock(rq); | 538 | update_rq_clock(rq); |
| 531 | dl_se->dl_throttled = 0; | 539 | dl_se->dl_throttled = 0; |
| 532 | dl_se->dl_yielded = 0; | 540 | dl_se->dl_yielded = 0; |
| 533 | if (p->on_rq) { | 541 | if (task_on_rq_queued(p)) { |
| 534 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | 542 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); |
| 535 | if (task_has_dl_policy(rq->curr)) | 543 | if (dl_task(rq->curr)) |
| 536 | check_preempt_curr_dl(rq, p, 0); | 544 | check_preempt_curr_dl(rq, p, 0); |
| 537 | else | 545 | else |
| 538 | resched_curr(rq); | 546 | resched_curr(rq); |
| @@ -555,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) | |||
| 555 | { | 563 | { |
| 556 | struct hrtimer *timer = &dl_se->dl_timer; | 564 | struct hrtimer *timer = &dl_se->dl_timer; |
| 557 | 565 | ||
| 558 | if (hrtimer_active(timer)) { | ||
| 559 | hrtimer_try_to_cancel(timer); | ||
| 560 | return; | ||
| 561 | } | ||
| 562 | |||
| 563 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 566 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 564 | timer->function = dl_task_timer; | 567 | timer->function = dl_task_timer; |
| 565 | } | 568 | } |
| @@ -567,24 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) | |||
| 567 | static | 570 | static |
| 568 | int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) | 571 | int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) |
| 569 | { | 572 | { |
| 570 | int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); | 573 | return (dl_se->runtime <= 0); |
| 571 | int rorun = dl_se->runtime <= 0; | ||
| 572 | |||
| 573 | if (!rorun && !dmiss) | ||
| 574 | return 0; | ||
| 575 | |||
| 576 | /* | ||
| 577 | * If we are beyond our current deadline and we are still | ||
| 578 | * executing, then we have already used some of the runtime of | ||
| 579 | * the next instance. Thus, if we do not account that, we are | ||
| 580 | * stealing bandwidth from the system at each deadline miss! | ||
| 581 | */ | ||
| 582 | if (dmiss) { | ||
| 583 | dl_se->runtime = rorun ? dl_se->runtime : 0; | ||
| 584 | dl_se->runtime -= rq_clock(rq) - dl_se->deadline; | ||
| 585 | } | ||
| 586 | |||
| 587 | return 1; | ||
| 588 | } | 574 | } |
| 589 | 575 | ||
| 590 | extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); | 576 | extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); |
| @@ -625,7 +611,7 @@ static void update_curr_dl(struct rq *rq) | |||
| 625 | 611 | ||
| 626 | sched_rt_avg_update(rq, delta_exec); | 612 | sched_rt_avg_update(rq, delta_exec); |
| 627 | 613 | ||
| 628 | dl_se->runtime -= delta_exec; | 614 | dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; |
| 629 | if (dl_runtime_exceeded(rq, dl_se)) { | 615 | if (dl_runtime_exceeded(rq, dl_se)) { |
| 630 | __dequeue_task_dl(rq, curr, 0); | 616 | __dequeue_task_dl(rq, curr, 0); |
| 631 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) | 617 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) |
| @@ -823,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, | |||
| 823 | * parameters of the task might need updating. Otherwise, | 809 | * parameters of the task might need updating. Otherwise, |
| 824 | * we want a replenishment of its runtime. | 810 | * we want a replenishment of its runtime. |
| 825 | */ | 811 | */ |
| 826 | if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) | 812 | if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) |
| 827 | replenish_dl_entity(dl_se, pi_se); | ||
| 828 | else | ||
| 829 | update_dl_entity(dl_se, pi_se); | 813 | update_dl_entity(dl_se, pi_se); |
| 814 | else if (flags & ENQUEUE_REPLENISH) | ||
| 815 | replenish_dl_entity(dl_se, pi_se); | ||
| 830 | 816 | ||
| 831 | __enqueue_dl_entity(dl_se); | 817 | __enqueue_dl_entity(dl_se); |
| 832 | } | 818 | } |
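The reordered condition above routes wakeups (and brand-new entities) through update_dl_entity(), which applies the CBS wakeup rule, while an explicit ENQUEUE_REPLENISH now gets a plain replenishment. A deliberately simplified model of that wakeup rule follows; the real code uses dl_entity_overflow() with scaled integer arithmetic, so treat this only as a sketch of the idea:

```c
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical, simplified types: p holds the reserved parameters,
 * se the entity's current runtime/deadline, all in nanoseconds. */
struct dl_params { uint64_t dl_runtime, dl_deadline; };
struct dl_state  { int64_t runtime; uint64_t deadline; };

static void cbs_wakeup(struct dl_state *se, const struct dl_params *p,
		       uint64_t now)
{
	bool overflow;

	if (se->deadline <= now || se->runtime <= 0)
		overflow = true;
	else
		/* runtime / (deadline - now) > dl_runtime / dl_deadline ? */
		overflow = (uint64_t)se->runtime * p->dl_deadline >
			   (se->deadline - now) * p->dl_runtime;

	if (overflow) {			/* start a fresh period */
		se->deadline = now + p->dl_deadline;
		se->runtime  = p->dl_runtime;
	}
	/* otherwise keep the current runtime and deadline */
}
```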
| @@ -847,8 +833,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | |||
| 847 | * smaller than our one... OTW we keep our runtime and | 833 | * smaller than our one... OTW we keep our runtime and |
| 848 | * deadline. | 834 | * deadline. |
| 849 | */ | 835 | */ |
| 850 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) | 836 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { |
| 851 | pi_se = &pi_task->dl; | 837 | pi_se = &pi_task->dl; |
| 838 | } else if (!dl_prio(p->normal_prio)) { | ||
| 839 | /* | ||
| 840 | * Special case in which we have a !SCHED_DEADLINE task | ||
| 841 | * that is going to be deboosted, but exceeds its | ||
| 842 | * runtime while doing so. No point in replenishing | ||
| 843 | * it, as it's going to return back to its original | ||
| 844 | * scheduling class after this. | ||
| 845 | */ | ||
| 846 | BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); | ||
| 847 | return; | ||
| 848 | } | ||
| 852 | 849 | ||
| 853 | /* | 850 | /* |
| 854 | * If p is throttled, we do nothing. In fact, if it exhausted | 851 | * If p is throttled, we do nothing. In fact, if it exhausted |
| @@ -914,7 +911,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
| 914 | struct task_struct *curr; | 911 | struct task_struct *curr; |
| 915 | struct rq *rq; | 912 | struct rq *rq; |
| 916 | 913 | ||
| 917 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | 914 | if (sd_flag != SD_BALANCE_WAKE) |
| 918 | goto out; | 915 | goto out; |
| 919 | 916 | ||
| 920 | rq = cpu_rq(cpu); | 917 | rq = cpu_rq(cpu); |
| @@ -997,10 +994,11 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | |||
| 997 | #ifdef CONFIG_SCHED_HRTICK | 994 | #ifdef CONFIG_SCHED_HRTICK |
| 998 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) | 995 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) |
| 999 | { | 996 | { |
| 1000 | s64 delta = p->dl.dl_runtime - p->dl.runtime; | 997 | hrtick_start(rq, p->dl.runtime); |
| 1001 | 998 | } | |
| 1002 | if (delta > 10000) | 999 | #else /* !CONFIG_SCHED_HRTICK */ |
| 1003 | hrtick_start(rq, p->dl.runtime); | 1000 | static void start_hrtick_dl(struct rq *rq, struct task_struct *p) |
| 1001 | { | ||
| 1004 | } | 1002 | } |
| 1005 | #endif | 1003 | #endif |
| 1006 | 1004 | ||
| @@ -1030,7 +1028,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) | |||
| 1030 | * means a stop task can slip in, in which case we need to | 1028 | * means a stop task can slip in, in which case we need to |
| 1031 | * re-start task selection. | 1029 | * re-start task selection. |
| 1032 | */ | 1030 | */ |
| 1033 | if (rq->stop && rq->stop->on_rq) | 1031 | if (rq->stop && task_on_rq_queued(rq->stop)) |
| 1034 | return RETRY_TASK; | 1032 | return RETRY_TASK; |
| 1035 | } | 1033 | } |
| 1036 | 1034 | ||
| @@ -1055,10 +1053,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) | |||
| 1055 | /* Running task will never be pushed. */ | 1053 | /* Running task will never be pushed. */ |
| 1056 | dequeue_pushable_dl_task(rq, p); | 1054 | dequeue_pushable_dl_task(rq, p); |
| 1057 | 1055 | ||
| 1058 | #ifdef CONFIG_SCHED_HRTICK | ||
| 1059 | if (hrtick_enabled(rq)) | 1056 | if (hrtick_enabled(rq)) |
| 1060 | start_hrtick_dl(rq, p); | 1057 | start_hrtick_dl(rq, p); |
| 1061 | #endif | ||
| 1062 | 1058 | ||
| 1063 | set_post_schedule(rq); | 1059 | set_post_schedule(rq); |
| 1064 | 1060 | ||
| @@ -1077,10 +1073,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | |||
| 1077 | { | 1073 | { |
| 1078 | update_curr_dl(rq); | 1074 | update_curr_dl(rq); |
| 1079 | 1075 | ||
| 1080 | #ifdef CONFIG_SCHED_HRTICK | ||
| 1081 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) | 1076 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) |
| 1082 | start_hrtick_dl(rq, p); | 1077 | start_hrtick_dl(rq, p); |
| 1083 | #endif | ||
| 1084 | } | 1078 | } |
| 1085 | 1079 | ||
| 1086 | static void task_fork_dl(struct task_struct *p) | 1080 | static void task_fork_dl(struct task_struct *p) |
| @@ -1124,10 +1118,8 @@ static void set_curr_task_dl(struct rq *rq) | |||
| 1124 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | 1118 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) |
| 1125 | { | 1119 | { |
| 1126 | if (!task_running(rq, p) && | 1120 | if (!task_running(rq, p) && |
| 1127 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | 1121 | cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
| 1128 | (p->nr_cpus_allowed > 1)) | ||
| 1129 | return 1; | 1122 | return 1; |
| 1130 | |||
| 1131 | return 0; | 1123 | return 0; |
| 1132 | } | 1124 | } |
| 1133 | 1125 | ||
| @@ -1158,7 +1150,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); | |||
| 1158 | static int find_later_rq(struct task_struct *task) | 1150 | static int find_later_rq(struct task_struct *task) |
| 1159 | { | 1151 | { |
| 1160 | struct sched_domain *sd; | 1152 | struct sched_domain *sd; |
| 1161 | struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); | 1153 | struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); |
| 1162 | int this_cpu = smp_processor_id(); | 1154 | int this_cpu = smp_processor_id(); |
| 1163 | int best_cpu, cpu = task_cpu(task); | 1155 | int best_cpu, cpu = task_cpu(task); |
| 1164 | 1156 | ||
| @@ -1169,6 +1161,13 @@ static int find_later_rq(struct task_struct *task) | |||
| 1169 | if (task->nr_cpus_allowed == 1) | 1161 | if (task->nr_cpus_allowed == 1) |
| 1170 | return -1; | 1162 | return -1; |
| 1171 | 1163 | ||
| 1164 | /* | ||
| 1165 | * We have to consider system topology and task affinity | ||
| 1166 | * first, then we can look for a suitable cpu. | ||
| 1167 | */ | ||
| 1168 | cpumask_copy(later_mask, task_rq(task)->rd->span); | ||
| 1169 | cpumask_and(later_mask, later_mask, cpu_active_mask); | ||
| 1170 | cpumask_and(later_mask, later_mask, &task->cpus_allowed); | ||
| 1172 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, | 1171 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, |
| 1173 | task, later_mask); | 1172 | task, later_mask); |
| 1174 | if (best_cpu == -1) | 1173 | if (best_cpu == -1) |
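Together with the cpudl_find() change in cpudeadline.c earlier in this diff, find_later_rq() now pre-builds later_mask as root-domain span ∧ active CPUs ∧ the task's affinity, leaving cpudl_find() to intersect only with the free-CPU mask. The mask pipeline in isolation (a sketch; the wrapper name is hypothetical and the helpers are sched-internal):

```c
/*
 * Sketch only: build the candidate mask a -deadline task may be pushed to.
 * 'later_mask' is assumed to be a preallocated per-cpu scratch cpumask.
 */
static bool dl_build_candidate_mask(struct task_struct *p,
				    struct cpumask *later_mask)
{
	struct root_domain *rd = task_rq(p)->rd;

	cpumask_copy(later_mask, rd->span);		      /* this root domain */
	cpumask_and(later_mask, later_mask, cpu_active_mask); /* active CPUs */
	cpumask_and(later_mask, later_mask, &p->cpus_allowed); /* task affinity */

	return !cpumask_empty(later_mask);
}
```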
| @@ -1257,7 +1256,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | |||
| 1257 | if (unlikely(task_rq(task) != rq || | 1256 | if (unlikely(task_rq(task) != rq || |
| 1258 | !cpumask_test_cpu(later_rq->cpu, | 1257 | !cpumask_test_cpu(later_rq->cpu, |
| 1259 | &task->cpus_allowed) || | 1258 | &task->cpus_allowed) || |
| 1260 | task_running(rq, task) || !task->on_rq)) { | 1259 | task_running(rq, task) || |
| 1260 | !task_on_rq_queued(task))) { | ||
| 1261 | double_unlock_balance(rq, later_rq); | 1261 | double_unlock_balance(rq, later_rq); |
| 1262 | later_rq = NULL; | 1262 | later_rq = NULL; |
| 1263 | break; | 1263 | break; |
| @@ -1296,7 +1296,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) | |||
| 1296 | BUG_ON(task_current(rq, p)); | 1296 | BUG_ON(task_current(rq, p)); |
| 1297 | BUG_ON(p->nr_cpus_allowed <= 1); | 1297 | BUG_ON(p->nr_cpus_allowed <= 1); |
| 1298 | 1298 | ||
| 1299 | BUG_ON(!p->on_rq); | 1299 | BUG_ON(!task_on_rq_queued(p)); |
| 1300 | BUG_ON(!dl_task(p)); | 1300 | BUG_ON(!dl_task(p)); |
| 1301 | 1301 | ||
| 1302 | return p; | 1302 | return p; |
| @@ -1311,6 +1311,7 @@ static int push_dl_task(struct rq *rq) | |||
| 1311 | { | 1311 | { |
| 1312 | struct task_struct *next_task; | 1312 | struct task_struct *next_task; |
| 1313 | struct rq *later_rq; | 1313 | struct rq *later_rq; |
| 1314 | int ret = 0; | ||
| 1314 | 1315 | ||
| 1315 | if (!rq->dl.overloaded) | 1316 | if (!rq->dl.overloaded) |
| 1316 | return 0; | 1317 | return 0; |
| @@ -1356,7 +1357,6 @@ retry: | |||
| 1356 | * The task is still there. We don't try | 1357 | * The task is still there. We don't try |
| 1357 | * again, some other cpu will pull it when ready. | 1358 | * again, some other cpu will pull it when ready. |
| 1358 | */ | 1359 | */ |
| 1359 | dequeue_pushable_dl_task(rq, next_task); | ||
| 1360 | goto out; | 1360 | goto out; |
| 1361 | } | 1361 | } |
| 1362 | 1362 | ||
| @@ -1372,6 +1372,7 @@ retry: | |||
| 1372 | deactivate_task(rq, next_task, 0); | 1372 | deactivate_task(rq, next_task, 0); |
| 1373 | set_task_cpu(next_task, later_rq->cpu); | 1373 | set_task_cpu(next_task, later_rq->cpu); |
| 1374 | activate_task(later_rq, next_task, 0); | 1374 | activate_task(later_rq, next_task, 0); |
| 1375 | ret = 1; | ||
| 1375 | 1376 | ||
| 1376 | resched_curr(later_rq); | 1377 | resched_curr(later_rq); |
| 1377 | 1378 | ||
| @@ -1380,7 +1381,7 @@ retry: | |||
| 1380 | out: | 1381 | out: |
| 1381 | put_task_struct(next_task); | 1382 | put_task_struct(next_task); |
| 1382 | 1383 | ||
| 1383 | return 1; | 1384 | return ret; |
| 1384 | } | 1385 | } |
| 1385 | 1386 | ||
| 1386 | static void push_dl_tasks(struct rq *rq) | 1387 | static void push_dl_tasks(struct rq *rq) |
| @@ -1443,7 +1444,7 @@ static int pull_dl_task(struct rq *this_rq) | |||
| 1443 | dl_time_before(p->dl.deadline, | 1444 | dl_time_before(p->dl.deadline, |
| 1444 | this_rq->dl.earliest_dl.curr))) { | 1445 | this_rq->dl.earliest_dl.curr))) { |
| 1445 | WARN_ON(p == src_rq->curr); | 1446 | WARN_ON(p == src_rq->curr); |
| 1446 | WARN_ON(!p->on_rq); | 1447 | WARN_ON(!task_on_rq_queued(p)); |
| 1447 | 1448 | ||
| 1448 | /* | 1449 | /* |
| 1449 | * Then we pull iff p has actually an earlier | 1450 | * Then we pull iff p has actually an earlier |
| @@ -1486,7 +1487,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) | |||
| 1486 | p->nr_cpus_allowed > 1 && | 1487 | p->nr_cpus_allowed > 1 && |
| 1487 | dl_task(rq->curr) && | 1488 | dl_task(rq->curr) && |
| 1488 | (rq->curr->nr_cpus_allowed < 2 || | 1489 | (rq->curr->nr_cpus_allowed < 2 || |
| 1489 | dl_entity_preempt(&rq->curr->dl, &p->dl))) { | 1490 | !dl_entity_preempt(&p->dl, &rq->curr->dl))) { |
| 1490 | push_dl_tasks(rq); | 1491 | push_dl_tasks(rq); |
| 1491 | } | 1492 | } |
| 1492 | } | 1493 | } |
| @@ -1495,10 +1496,33 @@ static void set_cpus_allowed_dl(struct task_struct *p, | |||
| 1495 | const struct cpumask *new_mask) | 1496 | const struct cpumask *new_mask) |
| 1496 | { | 1497 | { |
| 1497 | struct rq *rq; | 1498 | struct rq *rq; |
| 1499 | struct root_domain *src_rd; | ||
| 1498 | int weight; | 1500 | int weight; |
| 1499 | 1501 | ||
| 1500 | BUG_ON(!dl_task(p)); | 1502 | BUG_ON(!dl_task(p)); |
| 1501 | 1503 | ||
| 1504 | rq = task_rq(p); | ||
| 1505 | src_rd = rq->rd; | ||
| 1506 | /* | ||
| 1507 | * Migrating a SCHED_DEADLINE task between exclusive | ||
| 1508 | * cpusets (different root_domains) entails a bandwidth | ||
| 1509 | * update. We already made space for us in the destination | ||
| 1510 | * domain (see cpuset_can_attach()). | ||
| 1511 | */ | ||
| 1512 | if (!cpumask_intersects(src_rd->span, new_mask)) { | ||
| 1513 | struct dl_bw *src_dl_b; | ||
| 1514 | |||
| 1515 | src_dl_b = dl_bw_of(cpu_of(rq)); | ||
| 1516 | /* | ||
| 1517 | * We now free resources of the root_domain we are migrating | ||
| 1518 | * off. In the worst case, sched_setattr() may temporarily fail | ||
| 1519 | * until we complete the update. | ||
| 1520 | */ | ||
| 1521 | raw_spin_lock(&src_dl_b->lock); | ||
| 1522 | __dl_clear(src_dl_b, p->dl.dl_bw); | ||
| 1523 | raw_spin_unlock(&src_dl_b->lock); | ||
| 1524 | } | ||
| 1525 | |||
| 1502 | /* | 1526 | /* |
| 1503 | * Update only if the task is actually running (i.e., | 1527 | * Update only if the task is actually running (i.e., |
| 1504 | * it is on the rq AND it is not throttled). | 1528 | * it is on the rq AND it is not throttled). |
| @@ -1515,8 +1539,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, | |||
| 1515 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) | 1539 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) |
| 1516 | return; | 1540 | return; |
| 1517 | 1541 | ||
| 1518 | rq = task_rq(p); | ||
| 1519 | |||
| 1520 | /* | 1542 | /* |
| 1521 | * The process used to be able to migrate OR it can now migrate | 1543 | * The process used to be able to migrate OR it can now migrate |
| 1522 | */ | 1544 | */ |
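The new block in set_cpus_allowed_dl() above releases the task's reserved bandwidth from the root_domain it leaves when an affinity change crosses exclusive cpusets; the destination side was already charged in cpuset_can_attach(). A toy model of that admission accounting, with __dl_add()/__dl_clear() reduced to plain arithmetic on hypothetical structures:

```c
#include <stdbool.h>
#include <stdint.h>

/* Toy per-root-domain bandwidth pool: bw is the per-CPU cap (-1 = no limit),
 * total_bw the sum of admitted -deadline reservations. */
struct dl_pool { int64_t bw, total_bw; int cpus; };

static bool dl_pool_admit(struct dl_pool *dst, int64_t task_bw)
{
	if (dst->bw != -1 && dst->total_bw + task_bw > dst->bw * dst->cpus)
		return false;			/* would overcommit */
	dst->total_bw += task_bw;		/* models __dl_add() */
	return true;
}

static void dl_pool_release(struct dl_pool *src, int64_t task_bw)
{
	src->total_bw -= task_bw;		/* models __dl_clear() */
}

/* Migration across exclusive cpusets: admit into dst first, then release src. */
static bool dl_pool_migrate(struct dl_pool *src, struct dl_pool *dst,
			    int64_t task_bw)
{
	if (!dl_pool_admit(dst, task_bw))
		return false;
	dl_pool_release(src, task_bw);
	return true;
}
```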
| @@ -1564,20 +1586,48 @@ void init_sched_dl_class(void) | |||
| 1564 | 1586 | ||
| 1565 | #endif /* CONFIG_SMP */ | 1587 | #endif /* CONFIG_SMP */ |
| 1566 | 1588 | ||
| 1589 | /* | ||
| 1590 | * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. | ||
| 1591 | */ | ||
| 1592 | static void cancel_dl_timer(struct rq *rq, struct task_struct *p) | ||
| 1593 | { | ||
| 1594 | struct hrtimer *dl_timer = &p->dl.dl_timer; | ||
| 1595 | |||
| 1596 | /* Nobody will change task's class if pi_lock is held */ | ||
| 1597 | lockdep_assert_held(&p->pi_lock); | ||
| 1598 | |||
| 1599 | if (hrtimer_active(dl_timer)) { | ||
| 1600 | int ret = hrtimer_try_to_cancel(dl_timer); | ||
| 1601 | |||
| 1602 | if (unlikely(ret == -1)) { | ||
| 1603 | /* | ||
| 1604 | * Note, p may migrate OR new deadline tasks | ||
| 1605 | * may appear in rq when we are unlocking it. | ||
| 1606 | * A caller of us must be fine with that. | ||
| 1607 | */ | ||
| 1608 | raw_spin_unlock(&rq->lock); | ||
| 1609 | hrtimer_cancel(dl_timer); | ||
| 1610 | raw_spin_lock(&rq->lock); | ||
| 1611 | } | ||
| 1612 | } | ||
| 1613 | } | ||
| 1614 | |||
| 1567 | static void switched_from_dl(struct rq *rq, struct task_struct *p) | 1615 | static void switched_from_dl(struct rq *rq, struct task_struct *p) |
| 1568 | { | 1616 | { |
| 1569 | if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) | 1617 | cancel_dl_timer(rq, p); |
| 1570 | hrtimer_try_to_cancel(&p->dl.dl_timer); | 1618 | |
| 1619 | __dl_clear_params(p); | ||
| 1571 | 1620 | ||
| 1572 | #ifdef CONFIG_SMP | ||
| 1573 | /* | 1621 | /* |
| 1574 | * Since this might be the only -deadline task on the rq, | 1622 | * Since this might be the only -deadline task on the rq, |
| 1575 | * this is the right place to try to pull some other one | 1623 | * this is the right place to try to pull some other one |
| 1576 | * from an overloaded cpu, if any. | 1624 | * from an overloaded cpu, if any. |
| 1577 | */ | 1625 | */ |
| 1578 | if (!rq->dl.dl_nr_running) | 1626 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) |
| 1579 | pull_dl_task(rq); | 1627 | return; |
| 1580 | #endif | 1628 | |
| 1629 | if (pull_dl_task(rq)) | ||
| 1630 | resched_curr(rq); | ||
| 1581 | } | 1631 | } |
| 1582 | 1632 | ||
| 1583 | /* | 1633 | /* |
| @@ -1596,14 +1646,19 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 1596 | if (unlikely(p->dl.dl_throttled)) | 1646 | if (unlikely(p->dl.dl_throttled)) |
| 1597 | return; | 1647 | return; |
| 1598 | 1648 | ||
| 1599 | if (p->on_rq && rq->curr != p) { | 1649 | if (task_on_rq_queued(p) && rq->curr != p) { |
| 1600 | #ifdef CONFIG_SMP | 1650 | #ifdef CONFIG_SMP |
| 1601 | if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) | 1651 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && |
| 1652 | push_dl_task(rq) && rq != task_rq(p)) | ||
| 1602 | /* Only reschedule if pushing failed */ | 1653 | /* Only reschedule if pushing failed */ |
| 1603 | check_resched = 0; | 1654 | check_resched = 0; |
| 1604 | #endif /* CONFIG_SMP */ | 1655 | #endif /* CONFIG_SMP */ |
| 1605 | if (check_resched && task_has_dl_policy(rq->curr)) | 1656 | if (check_resched) { |
| 1606 | check_preempt_curr_dl(rq, p, 0); | 1657 | if (dl_task(rq->curr)) |
| 1658 | check_preempt_curr_dl(rq, p, 0); | ||
| 1659 | else | ||
| 1660 | resched_curr(rq); | ||
| 1661 | } | ||
| 1607 | } | 1662 | } |
| 1608 | } | 1663 | } |
| 1609 | 1664 | ||
| @@ -1614,7 +1669,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 1614 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, | 1669 | static void prio_changed_dl(struct rq *rq, struct task_struct *p, |
| 1615 | int oldprio) | 1670 | int oldprio) |
| 1616 | { | 1671 | { |
| 1617 | if (p->on_rq || rq->curr == p) { | 1672 | if (task_on_rq_queued(p) || rq->curr == p) { |
| 1618 | #ifdef CONFIG_SMP | 1673 | #ifdef CONFIG_SMP |
| 1619 | /* | 1674 | /* |
| 1620 | * This might be too much, but unfortunately | 1675 | * This might be too much, but unfortunately |
| @@ -1673,4 +1728,15 @@ const struct sched_class dl_sched_class = { | |||
| 1673 | .prio_changed = prio_changed_dl, | 1728 | .prio_changed = prio_changed_dl, |
| 1674 | .switched_from = switched_from_dl, | 1729 | .switched_from = switched_from_dl, |
| 1675 | .switched_to = switched_to_dl, | 1730 | .switched_to = switched_to_dl, |
| 1731 | |||
| 1732 | .update_curr = update_curr_dl, | ||
| 1676 | }; | 1733 | }; |
| 1734 | |||
| 1735 | #ifdef CONFIG_SCHED_DEBUG | ||
| 1736 | extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); | ||
| 1737 | |||
| 1738 | void print_dl_stats(struct seq_file *m, int cpu) | ||
| 1739 | { | ||
| 1740 | print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); | ||
| 1741 | } | ||
| 1742 | #endif /* CONFIG_SCHED_DEBUG */ | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 627b3c34b821..92cc52001e74 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 150 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | 150 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) |
| 151 | { | 151 | { |
| 152 | struct task_struct *g, *p; | 152 | struct task_struct *g, *p; |
| 153 | unsigned long flags; | ||
| 154 | 153 | ||
| 155 | SEQ_printf(m, | 154 | SEQ_printf(m, |
| 156 | "\nrunnable tasks:\n" | 155 | "\nrunnable tasks:\n" |
| @@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
| 159 | "------------------------------------------------------" | 158 | "------------------------------------------------------" |
| 160 | "----------------------------------------------------\n"); | 159 | "----------------------------------------------------\n"); |
| 161 | 160 | ||
| 162 | read_lock_irqsave(&tasklist_lock, flags); | 161 | rcu_read_lock(); |
| 163 | 162 | for_each_process_thread(g, p) { | |
| 164 | do_each_thread(g, p) { | ||
| 165 | if (task_cpu(p) != rq_cpu) | 163 | if (task_cpu(p) != rq_cpu) |
| 166 | continue; | 164 | continue; |
| 167 | 165 | ||
| 168 | print_task(m, rq, p); | 166 | print_task(m, rq, p); |
| 169 | } while_each_thread(g, p); | 167 | } |
| 170 | 168 | rcu_read_unlock(); | |
| 171 | read_unlock_irqrestore(&tasklist_lock, flags); | ||
| 172 | } | 169 | } |
| 173 | 170 | ||
| 174 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | 171 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
| @@ -264,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | |||
| 264 | #undef P | 261 | #undef P |
| 265 | } | 262 | } |
| 266 | 263 | ||
| 264 | void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) | ||
| 265 | { | ||
| 266 | SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); | ||
| 267 | SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); | ||
| 268 | } | ||
| 269 | |||
| 267 | extern __read_mostly int sched_clock_running; | 270 | extern __read_mostly int sched_clock_running; |
| 268 | 271 | ||
| 269 | static void print_cpu(struct seq_file *m, int cpu) | 272 | static void print_cpu(struct seq_file *m, int cpu) |
| @@ -332,10 +335,9 @@ do { \ | |||
| 332 | spin_lock_irqsave(&sched_debug_lock, flags); | 335 | spin_lock_irqsave(&sched_debug_lock, flags); |
| 333 | print_cfs_stats(m, cpu); | 336 | print_cfs_stats(m, cpu); |
| 334 | print_rt_stats(m, cpu); | 337 | print_rt_stats(m, cpu); |
| 338 | print_dl_stats(m, cpu); | ||
| 335 | 339 | ||
| 336 | rcu_read_lock(); | ||
| 337 | print_rq(m, rq, cpu); | 340 | print_rq(m, rq, cpu); |
| 338 | rcu_read_unlock(); | ||
| 339 | spin_unlock_irqrestore(&sched_debug_lock, flags); | 341 | spin_unlock_irqrestore(&sched_debug_lock, flags); |
| 340 | SEQ_printf(m, "\n"); | 342 | SEQ_printf(m, "\n"); |
| 341 | } | 343 | } |
| @@ -533,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) | |||
| 533 | unsigned long nr_faults = -1; | 535 | unsigned long nr_faults = -1; |
| 534 | int cpu_current, home_node; | 536 | int cpu_current, home_node; |
| 535 | 537 | ||
| 536 | if (p->numa_faults_memory) | 538 | if (p->numa_faults) |
| 537 | nr_faults = p->numa_faults_memory[2*node + i]; | 539 | nr_faults = p->numa_faults[2*node + i]; |
| 538 | 540 | ||
| 539 | cpu_current = !i ? (task_node(p) == node) : | 541 | cpu_current = !i ? (task_node(p) == node) : |
| 540 | (pol && node_isset(node, pol->v.nodes)); | 542 | (pol && node_isset(node, pol->v.nodes)); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bfa3c86d0d68..40667cbf371b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
| 24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
| 25 | #include <linux/cpumask.h> | 25 | #include <linux/cpumask.h> |
| 26 | #include <linux/cpuidle.h> | ||
| 26 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
| 27 | #include <linux/profile.h> | 28 | #include <linux/profile.h> |
| 28 | #include <linux/interrupt.h> | 29 | #include <linux/interrupt.h> |
| @@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 665 | } | 666 | } |
| 666 | 667 | ||
| 667 | #ifdef CONFIG_SMP | 668 | #ifdef CONFIG_SMP |
| 669 | static int select_idle_sibling(struct task_struct *p, int cpu); | ||
| 668 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
| 669 | 671 | ||
| 670 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
| @@ -724,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
| 724 | account_cfs_rq_runtime(cfs_rq, delta_exec); | 726 | account_cfs_rq_runtime(cfs_rq, delta_exec); |
| 725 | } | 727 | } |
| 726 | 728 | ||
| 729 | static void update_curr_fair(struct rq *rq) | ||
| 730 | { | ||
| 731 | update_curr(cfs_rq_of(&rq->curr->se)); | ||
| 732 | } | ||
| 733 | |||
| 727 | static inline void | 734 | static inline void |
| 728 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 735 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 729 | { | 736 | { |
| @@ -826,11 +833,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) | |||
| 826 | 833 | ||
| 827 | static unsigned int task_scan_min(struct task_struct *p) | 834 | static unsigned int task_scan_min(struct task_struct *p) |
| 828 | { | 835 | { |
| 836 | unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); | ||
| 829 | unsigned int scan, floor; | 837 | unsigned int scan, floor; |
| 830 | unsigned int windows = 1; | 838 | unsigned int windows = 1; |
| 831 | 839 | ||
| 832 | if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) | 840 | if (scan_size < MAX_SCAN_WINDOW) |
| 833 | windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; | 841 | windows = MAX_SCAN_WINDOW / scan_size; |
| 834 | floor = 1000 / windows; | 842 | floor = 1000 / windows; |
| 835 | 843 | ||
| 836 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); | 844 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); |
| @@ -865,7 +873,6 @@ struct numa_group { | |||
| 865 | spinlock_t lock; /* nr_tasks, tasks */ | 873 | spinlock_t lock; /* nr_tasks, tasks */ |
| 866 | int nr_tasks; | 874 | int nr_tasks; |
| 867 | pid_t gid; | 875 | pid_t gid; |
| 868 | struct list_head task_list; | ||
| 869 | 876 | ||
| 870 | struct rcu_head rcu; | 877 | struct rcu_head rcu; |
| 871 | nodemask_t active_nodes; | 878 | nodemask_t active_nodes; |
| @@ -893,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p) | |||
| 893 | return p->numa_group ? p->numa_group->gid : 0; | 900 | return p->numa_group ? p->numa_group->gid : 0; |
| 894 | } | 901 | } |
| 895 | 902 | ||
| 896 | static inline int task_faults_idx(int nid, int priv) | 903 | /* |
| 904 | * The averaged statistics, shared & private, memory & cpu, | ||
| 905 | * occupy the first half of the array. The second half of the | ||
| 906 | * array is for current counters, which are averaged into the | ||
| 907 | * first set by task_numa_placement. | ||
| 908 | */ | ||
| 909 | static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) | ||
| 897 | { | 910 | { |
| 898 | return NR_NUMA_HINT_FAULT_TYPES * nid + priv; | 911 | return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; |
| 899 | } | 912 | } |
| 900 | 913 | ||
| 901 | static inline unsigned long task_faults(struct task_struct *p, int nid) | 914 | static inline unsigned long task_faults(struct task_struct *p, int nid) |
| 902 | { | 915 | { |
| 903 | if (!p->numa_faults_memory) | 916 | if (!p->numa_faults) |
| 904 | return 0; | 917 | return 0; |
| 905 | 918 | ||
| 906 | return p->numa_faults_memory[task_faults_idx(nid, 0)] + | 919 | return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + |
| 907 | p->numa_faults_memory[task_faults_idx(nid, 1)]; | 920 | p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; |
| 908 | } | 921 | } |
| 909 | 922 | ||
| 910 | static inline unsigned long group_faults(struct task_struct *p, int nid) | 923 | static inline unsigned long group_faults(struct task_struct *p, int nid) |
| @@ -912,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) | |||
| 912 | if (!p->numa_group) | 925 | if (!p->numa_group) |
| 913 | return 0; | 926 | return 0; |
| 914 | 927 | ||
| 915 | return p->numa_group->faults[task_faults_idx(nid, 0)] + | 928 | return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + |
| 916 | p->numa_group->faults[task_faults_idx(nid, 1)]; | 929 | p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; |
| 917 | } | 930 | } |
| 918 | 931 | ||
| 919 | static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) | 932 | static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) |
| 920 | { | 933 | { |
| 921 | return group->faults_cpu[task_faults_idx(nid, 0)] + | 934 | return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + |
| 922 | group->faults_cpu[task_faults_idx(nid, 1)]; | 935 | group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; |
| 936 | } | ||
| 937 | |||
| 938 | /* Handle placement on systems where not all nodes are directly connected. */ | ||
| 939 | static unsigned long score_nearby_nodes(struct task_struct *p, int nid, | ||
| 940 | int maxdist, bool task) | ||
| 941 | { | ||
| 942 | unsigned long score = 0; | ||
| 943 | int node; | ||
| 944 | |||
| 945 | /* | ||
| 946 | * All nodes are directly connected, and the same distance | ||
| 947 | * from each other. No need for fancy placement algorithms. | ||
| 948 | */ | ||
| 949 | if (sched_numa_topology_type == NUMA_DIRECT) | ||
| 950 | return 0; | ||
| 951 | |||
| 952 | /* | ||
| 953 | * This code is called for each node, introducing N^2 complexity, | ||
| 954 | * which should be ok given the number of nodes rarely exceeds 8. | ||
| 955 | */ | ||
| 956 | for_each_online_node(node) { | ||
| 957 | unsigned long faults; | ||
| 958 | int dist = node_distance(nid, node); | ||
| 959 | |||
| 960 | /* | ||
| 961 | * The furthest away nodes in the system are not interesting | ||
| 962 | * for placement; nid was already counted. | ||
| 963 | */ | ||
| 964 | if (dist == sched_max_numa_distance || node == nid) | ||
| 965 | continue; | ||
| 966 | |||
| 967 | /* | ||
| 968 | * On systems with a backplane NUMA topology, compare groups | ||
| 969 | * of nodes, and move tasks towards the group with the most | ||
| 970 | * memory accesses. When comparing two nodes at distance | ||
| 971 | * "hoplimit", only nodes closer by than "hoplimit" are part | ||
| 972 | * of each group. Skip other nodes. | ||
| 973 | */ | ||
| 974 | if (sched_numa_topology_type == NUMA_BACKPLANE && | ||
| 975 | dist > maxdist) | ||
| 976 | continue; | ||
| 977 | |||
| 978 | /* Add up the faults from nearby nodes. */ | ||
| 979 | if (task) | ||
| 980 | faults = task_faults(p, node); | ||
| 981 | else | ||
| 982 | faults = group_faults(p, node); | ||
| 983 | |||
| 984 | /* | ||
| 985 | * On systems with a glueless mesh NUMA topology, there are | ||
| 986 | * no fixed "groups of nodes". Instead, nodes that are not | ||
| 987 | * directly connected bounce traffic through intermediate | ||
| 988 | * nodes; a numa_group can occupy any set of nodes. | ||
| 989 | * The further away a node is, the less the faults count. | ||
| 990 | * This seems to result in good task placement. | ||
| 991 | */ | ||
| 992 | if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { | ||
| 993 | faults *= (sched_max_numa_distance - dist); | ||
| 994 | faults /= (sched_max_numa_distance - LOCAL_DISTANCE); | ||
| 995 | } | ||
| 996 | |||
| 997 | score += faults; | ||
| 998 | } | ||
| 999 | |||
| 1000 | return score; | ||
| 923 | } | 1001 | } |
| 924 | 1002 | ||
| 925 | /* | 1003 | /* |
| @@ -928,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) | |||
| 928 | * larger multiplier, in order to group tasks together that are almost | 1006 | * larger multiplier, in order to group tasks together that are almost |
| 929 | * evenly spread out between numa nodes. | 1007 | * evenly spread out between numa nodes. |
| 930 | */ | 1008 | */ |
| 931 | static inline unsigned long task_weight(struct task_struct *p, int nid) | 1009 | static inline unsigned long task_weight(struct task_struct *p, int nid, |
| 1010 | int dist) | ||
| 932 | { | 1011 | { |
| 933 | unsigned long total_faults; | 1012 | unsigned long faults, total_faults; |
| 934 | 1013 | ||
| 935 | if (!p->numa_faults_memory) | 1014 | if (!p->numa_faults) |
| 936 | return 0; | 1015 | return 0; |
| 937 | 1016 | ||
| 938 | total_faults = p->total_numa_faults; | 1017 | total_faults = p->total_numa_faults; |
| @@ -940,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) | |||
| 940 | if (!total_faults) | 1019 | if (!total_faults) |
| 941 | return 0; | 1020 | return 0; |
| 942 | 1021 | ||
| 943 | return 1000 * task_faults(p, nid) / total_faults; | 1022 | faults = task_faults(p, nid); |
| 1023 | faults += score_nearby_nodes(p, nid, dist, true); | ||
| 1024 | |||
| 1025 | return 1000 * faults / total_faults; | ||
| 944 | } | 1026 | } |
| 945 | 1027 | ||
| 946 | static inline unsigned long group_weight(struct task_struct *p, int nid) | 1028 | static inline unsigned long group_weight(struct task_struct *p, int nid, |
| 1029 | int dist) | ||
| 947 | { | 1030 | { |
| 948 | if (!p->numa_group || !p->numa_group->total_faults) | 1031 | unsigned long faults, total_faults; |
| 1032 | |||
| 1033 | if (!p->numa_group) | ||
| 949 | return 0; | 1034 | return 0; |
| 950 | 1035 | ||
| 951 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; | 1036 | total_faults = p->numa_group->total_faults; |
| 1037 | |||
| 1038 | if (!total_faults) | ||
| 1039 | return 0; | ||
| 1040 | |||
| 1041 | faults = group_faults(p, nid); | ||
| 1042 | faults += score_nearby_nodes(p, nid, dist, false); | ||
| 1043 | |||
| 1044 | return 1000 * faults / total_faults; | ||
| 952 | } | 1045 | } |
| 953 | 1046 | ||
| 954 | bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | 1047 | bool should_numa_migrate_memory(struct task_struct *p, struct page * page, |
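task_weight() and group_weight() above now take topology into account: besides the faults on the candidate node, score_nearby_nodes() adds faults from nearby nodes, and on a glueless-mesh topology each contribution is scaled by (max_distance − dist) / (max_distance − LOCAL_DISTANCE) before the usual 1000 × faults / total_faults normalization. A small runnable arithmetic example with made-up fault counts and distances:

```c
#include <stdio.h>

#define LOCAL_DISTANCE 10	/* same value the kernel uses */

/* Glueless-mesh scaling from score_nearby_nodes(): closer nodes count more. */
static unsigned long scale_by_dist(unsigned long faults, int dist,
				   int max_dist)
{
	faults *= (max_dist - dist);
	faults /= (max_dist - LOCAL_DISTANCE);
	return faults;
}

int main(void)
{
	/* Hypothetical numbers: the candidate node has 400 faults, a neighbour
	 * at distance 20 has 300, total_faults is 1000, max distance is 40. */
	unsigned long faults = 400, total_faults = 1000;

	faults += scale_by_dist(300, 20, 40);	/* adds 300 * 20/30 = 200 */
	printf("weight = %lu\n", 1000 * faults / total_faults);	/* 600 */
	return 0;
}
```

With these numbers the neighbour at distance 20 contributes two thirds of its faults, giving a node weight of 600 out of 1000.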
| @@ -1038,7 +1131,8 @@ struct numa_stats { | |||
| 1038 | */ | 1131 | */ |
| 1039 | static void update_numa_stats(struct numa_stats *ns, int nid) | 1132 | static void update_numa_stats(struct numa_stats *ns, int nid) |
| 1040 | { | 1133 | { |
| 1041 | int cpu, cpus = 0; | 1134 | int smt, cpu, cpus = 0; |
| 1135 | unsigned long capacity; | ||
| 1042 | 1136 | ||
| 1043 | memset(ns, 0, sizeof(*ns)); | 1137 | memset(ns, 0, sizeof(*ns)); |
| 1044 | for_each_cpu(cpu, cpumask_of_node(nid)) { | 1138 | for_each_cpu(cpu, cpumask_of_node(nid)) { |
| @@ -1062,8 +1156,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
| 1062 | if (!cpus) | 1156 | if (!cpus) |
| 1063 | return; | 1157 | return; |
| 1064 | 1158 | ||
| 1065 | ns->task_capacity = | 1159 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ |
| 1066 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); | 1160 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); |
| 1161 | capacity = cpus / smt; /* cores */ | ||
| 1162 | |||
| 1163 | ns->task_capacity = min_t(unsigned, capacity, | ||
| 1164 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); | ||
| 1067 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); | 1165 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); |
| 1068 | } | 1166 | } |
| 1069 | 1167 | ||
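The new task_capacity computation above estimates the SMT factor so that a node's capacity is counted in cores rather than hardware threads: smt = ceil(1024 × nr_cpus / compute_capacity), cores = nr_cpus / smt, and task_capacity takes the smaller of that and the old compute_capacity / 1024 rounding. A worked example with plausible, made-up numbers:

```c
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024
#define DIV_ROUND_UP(n, d)       (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)  (((n) + (d) / 2) / (d))

int main(void)
{
	/* Hypothetical node: 8 SMT siblings, each contributing ~589 capacity. */
	unsigned long cpus = 8, compute_capacity = 8 * 589;	/* 4712 */

	unsigned long smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus,
					 compute_capacity);	/* ceil(8192/4712) = 2 */
	unsigned long cores = cpus / smt;			/* 4 */
	unsigned long old = DIV_ROUND_CLOSEST(compute_capacity,
					      SCHED_CAPACITY_SCALE);	/* 5 */
	unsigned long task_capacity = cores < old ? cores : old;

	printf("smt=%lu cores=%lu task_capacity=%lu\n", smt, cores, task_capacity);
	return 0;	/* prints: smt=2 cores=4 task_capacity=4 */
}
```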
| @@ -1076,6 +1174,7 @@ struct task_numa_env { | |||
| 1076 | struct numa_stats src_stats, dst_stats; | 1174 | struct numa_stats src_stats, dst_stats; |
| 1077 | 1175 | ||
| 1078 | int imbalance_pct; | 1176 | int imbalance_pct; |
| 1177 | int dist; | ||
| 1079 | 1178 | ||
| 1080 | struct task_struct *best_task; | 1179 | struct task_struct *best_task; |
| 1081 | long best_imp; | 1180 | long best_imp; |
| @@ -1155,11 +1254,29 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1155 | long load; | 1254 | long load; |
| 1156 | long imp = env->p->numa_group ? groupimp : taskimp; | 1255 | long imp = env->p->numa_group ? groupimp : taskimp; |
| 1157 | long moveimp = imp; | 1256 | long moveimp = imp; |
| 1257 | int dist = env->dist; | ||
| 1158 | 1258 | ||
| 1159 | rcu_read_lock(); | 1259 | rcu_read_lock(); |
| 1160 | cur = ACCESS_ONCE(dst_rq->curr); | 1260 | |
| 1161 | if (cur->pid == 0) /* idle */ | 1261 | raw_spin_lock_irq(&dst_rq->lock); |
| 1262 | cur = dst_rq->curr; | ||
| 1263 | /* | ||
| 1264 | * No need to move the exiting task, and this ensures that ->curr | ||
| 1265 | * wasn't reaped and thus get_task_struct() in task_numa_assign() | ||
| 1266 | * is safe under RCU read lock. | ||
| 1267 | * Note that rcu_read_lock() itself can't protect from the final | ||
| 1268 | * put_task_struct() after the last schedule(). | ||
| 1269 | */ | ||
| 1270 | if ((cur->flags & PF_EXITING) || is_idle_task(cur)) | ||
| 1162 | cur = NULL; | 1271 | cur = NULL; |
| 1272 | raw_spin_unlock_irq(&dst_rq->lock); | ||
| 1273 | |||
| 1274 | /* | ||
| 1275 | * Because we have preemption enabled we can get migrated around and | ||
| 1276 | * end try selecting ourselves (current == env->p) as a swap candidate. | ||
| 1277 | */ | ||
| 1278 | if (cur == env->p) | ||
| 1279 | goto unlock; | ||
| 1163 | 1280 | ||
| 1164 | /* | 1281 | /* |
| 1165 | * "imp" is the fault differential for the source task between the | 1282 | * "imp" is the fault differential for the source task between the |
| @@ -1178,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1178 | * in any group then look only at task weights. | 1295 | * in any group then look only at task weights. |
| 1179 | */ | 1296 | */ |
| 1180 | if (cur->numa_group == env->p->numa_group) { | 1297 | if (cur->numa_group == env->p->numa_group) { |
| 1181 | imp = taskimp + task_weight(cur, env->src_nid) - | 1298 | imp = taskimp + task_weight(cur, env->src_nid, dist) - |
| 1182 | task_weight(cur, env->dst_nid); | 1299 | task_weight(cur, env->dst_nid, dist); |
| 1183 | /* | 1300 | /* |
| 1184 | * Add some hysteresis to prevent swapping the | 1301 | * Add some hysteresis to prevent swapping the |
| 1185 | * tasks within a group over tiny differences. | 1302 | * tasks within a group over tiny differences. |
| @@ -1193,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1193 | * instead. | 1310 | * instead. |
| 1194 | */ | 1311 | */ |
| 1195 | if (cur->numa_group) | 1312 | if (cur->numa_group) |
| 1196 | imp += group_weight(cur, env->src_nid) - | 1313 | imp += group_weight(cur, env->src_nid, dist) - |
| 1197 | group_weight(cur, env->dst_nid); | 1314 | group_weight(cur, env->dst_nid, dist); |
| 1198 | else | 1315 | else |
| 1199 | imp += task_weight(cur, env->src_nid) - | 1316 | imp += task_weight(cur, env->src_nid, dist) - |
| 1200 | task_weight(cur, env->dst_nid); | 1317 | task_weight(cur, env->dst_nid, dist); |
| 1201 | } | 1318 | } |
| 1202 | } | 1319 | } |
| 1203 | 1320 | ||
| @@ -1206,7 +1323,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1206 | 1323 | ||
| 1207 | if (!cur) { | 1324 | if (!cur) { |
| 1208 | /* Is there capacity at our destination? */ | 1325 | /* Is there capacity at our destination? */ |
| 1209 | if (env->src_stats.has_free_capacity && | 1326 | if (env->src_stats.nr_running <= env->src_stats.task_capacity && |
| 1210 | !env->dst_stats.has_free_capacity) | 1327 | !env->dst_stats.has_free_capacity) |
| 1211 | goto unlock; | 1328 | goto unlock; |
| 1212 | 1329 | ||
| @@ -1252,6 +1369,13 @@ balance: | |||
| 1252 | if (load_too_imbalanced(src_load, dst_load, env)) | 1369 | if (load_too_imbalanced(src_load, dst_load, env)) |
| 1253 | goto unlock; | 1370 | goto unlock; |
| 1254 | 1371 | ||
| 1372 | /* | ||
| 1373 | * One idle CPU per node is evaluated for a task numa move. | ||
| 1374 | * Call select_idle_sibling to maybe find a better one. | ||
| 1375 | */ | ||
| 1376 | if (!cur) | ||
| 1377 | env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); | ||
| 1378 | |||
| 1255 | assign: | 1379 | assign: |
| 1256 | task_numa_assign(env, cur, imp); | 1380 | task_numa_assign(env, cur, imp); |
| 1257 | unlock: | 1381 | unlock: |
| @@ -1289,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1289 | }; | 1413 | }; |
| 1290 | struct sched_domain *sd; | 1414 | struct sched_domain *sd; |
| 1291 | unsigned long taskweight, groupweight; | 1415 | unsigned long taskweight, groupweight; |
| 1292 | int nid, ret; | 1416 | int nid, ret, dist; |
| 1293 | long taskimp, groupimp; | 1417 | long taskimp, groupimp; |
| 1294 | 1418 | ||
| 1295 | /* | 1419 | /* |
| @@ -1317,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1317 | return -EINVAL; | 1441 | return -EINVAL; |
| 1318 | } | 1442 | } |
| 1319 | 1443 | ||
| 1320 | taskweight = task_weight(p, env.src_nid); | ||
| 1321 | groupweight = group_weight(p, env.src_nid); | ||
| 1322 | update_numa_stats(&env.src_stats, env.src_nid); | ||
| 1323 | env.dst_nid = p->numa_preferred_nid; | 1444 | env.dst_nid = p->numa_preferred_nid; |
| 1324 | taskimp = task_weight(p, env.dst_nid) - taskweight; | 1445 | dist = env.dist = node_distance(env.src_nid, env.dst_nid); |
| 1325 | groupimp = group_weight(p, env.dst_nid) - groupweight; | 1446 | taskweight = task_weight(p, env.src_nid, dist); |
| 1447 | groupweight = group_weight(p, env.src_nid, dist); | ||
| 1448 | update_numa_stats(&env.src_stats, env.src_nid); | ||
| 1449 | taskimp = task_weight(p, env.dst_nid, dist) - taskweight; | ||
| 1450 | groupimp = group_weight(p, env.dst_nid, dist) - groupweight; | ||
| 1326 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1451 | update_numa_stats(&env.dst_stats, env.dst_nid); |
| 1327 | 1452 | ||
| 1328 | /* Try to find a spot on the preferred nid. */ | 1453 | /* Try to find a spot on the preferred nid. */ |
| 1329 | task_numa_find_cpu(&env, taskimp, groupimp); | 1454 | task_numa_find_cpu(&env, taskimp, groupimp); |
| 1330 | 1455 | ||
| 1331 | /* No space available on the preferred nid. Look elsewhere. */ | 1456 | /* |
| 1332 | if (env.best_cpu == -1) { | 1457 | * Look at other nodes in these cases: |
| 1458 | * - there is no space available on the preferred_nid | ||
| 1459 | * - the task is part of a numa_group that is interleaved across | ||
| 1460 | * multiple NUMA nodes; in order to better consolidate the group, | ||
| 1461 | * we need to check other locations. | ||
| 1462 | */ | ||
| 1463 | if (env.best_cpu == -1 || (p->numa_group && | ||
| 1464 | nodes_weight(p->numa_group->active_nodes) > 1)) { | ||
| 1333 | for_each_online_node(nid) { | 1465 | for_each_online_node(nid) { |
| 1334 | if (nid == env.src_nid || nid == p->numa_preferred_nid) | 1466 | if (nid == env.src_nid || nid == p->numa_preferred_nid) |
| 1335 | continue; | 1467 | continue; |
| 1336 | 1468 | ||
| 1469 | dist = node_distance(env.src_nid, env.dst_nid); | ||
| 1470 | if (sched_numa_topology_type == NUMA_BACKPLANE && | ||
| 1471 | dist != env.dist) { | ||
| 1472 | taskweight = task_weight(p, env.src_nid, dist); | ||
| 1473 | groupweight = group_weight(p, env.src_nid, dist); | ||
| 1474 | } | ||
| 1475 | |||
| 1337 | /* Only consider nodes where both task and groups benefit */ | 1476 | /* Only consider nodes where both task and groups benefit */ |
| 1338 | taskimp = task_weight(p, nid) - taskweight; | 1477 | taskimp = task_weight(p, nid, dist) - taskweight; |
| 1339 | groupimp = group_weight(p, nid) - groupweight; | 1478 | groupimp = group_weight(p, nid, dist) - groupweight; |
| 1340 | if (taskimp < 0 && groupimp < 0) | 1479 | if (taskimp < 0 && groupimp < 0) |
| 1341 | continue; | 1480 | continue; |
| 1342 | 1481 | ||
| 1482 | env.dist = dist; | ||
| 1343 | env.dst_nid = nid; | 1483 | env.dst_nid = nid; |
| 1344 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1484 | update_numa_stats(&env.dst_stats, env.dst_nid); |
| 1345 | task_numa_find_cpu(&env, taskimp, groupimp); | 1485 | task_numa_find_cpu(&env, taskimp, groupimp); |
| @@ -1394,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
| 1394 | unsigned long interval = HZ; | 1534 | unsigned long interval = HZ; |
| 1395 | 1535 | ||
| 1396 | /* This task has no NUMA fault statistics yet */ | 1536 | /* This task has no NUMA fault statistics yet */ |
| 1397 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) | 1537 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) |
| 1398 | return; | 1538 | return; |
| 1399 | 1539 | ||
| 1400 | /* Periodically retry migrating the task to the preferred node */ | 1540 | /* Periodically retry migrating the task to the preferred node */ |
| @@ -1506,7 +1646,7 @@ static void update_task_scan_period(struct task_struct *p, | |||
| 1506 | * scanning faster if shared accesses dominate as it may | 1646 | * scanning faster if shared accesses dominate as it may |
| 1507 | * simply bounce migrations uselessly | 1647 | * simply bounce migrations uselessly |
| 1508 | */ | 1648 | */ |
| 1509 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | 1649 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); |
| 1510 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | 1650 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; |
| 1511 | } | 1651 | } |
| 1512 | 1652 | ||
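The extra "+ 1" in the denominator reads as a guard for the case where both fault counters are zero, which would otherwise divide by zero. A minimal sketch of that boundary, with an assumed NUMA_PERIOD_SLOTS value and invented counters:

```c
/* Why the "+ 1" matters: with private == shared == 0 the old denominator
 * is zero and the integer division faults.  Values are invented and the
 * NUMA_PERIOD_SLOTS value is assumed. */
#include <stdio.h>

#define NUMA_PERIOD_SLOTS 10
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned long ratio(unsigned long private, unsigned long shared)
{
	/* old form: DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, private + shared) */
	return DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS,
			    private + shared + 1);
}

int main(void)
{
	printf("%lu\n", ratio(0, 0));	/* 0, instead of a divide-by-zero */
	printf("%lu\n", ratio(30, 10));	/* 8 of 10 slots -> mostly private */
	return 0;
}
```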
| @@ -1543,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
| 1543 | return delta; | 1683 | return delta; |
| 1544 | } | 1684 | } |
| 1545 | 1685 | ||
| 1686 | /* | ||
| 1687 | * Determine the preferred nid for a task in a numa_group. This needs to | ||
| 1688 | * be done in a way that produces consistent results with group_weight, | ||
| 1689 | * otherwise workloads might not converge. | ||
| 1690 | */ | ||
| 1691 | static int preferred_group_nid(struct task_struct *p, int nid) | ||
| 1692 | { | ||
| 1693 | nodemask_t nodes; | ||
| 1694 | int dist; | ||
| 1695 | |||
| 1696 | /* Direct connections between all NUMA nodes. */ | ||
| 1697 | if (sched_numa_topology_type == NUMA_DIRECT) | ||
| 1698 | return nid; | ||
| 1699 | |||
| 1700 | /* | ||
| 1701 | * On a system with glueless mesh NUMA topology, group_weight | ||
| 1702 | * scores nodes according to the number of NUMA hinting faults on | ||
| 1703 | * both the node itself, and on nearby nodes. | ||
| 1704 | */ | ||
| 1705 | if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { | ||
| 1706 | unsigned long score, max_score = 0; | ||
| 1707 | int node, max_node = nid; | ||
| 1708 | |||
| 1709 | dist = sched_max_numa_distance; | ||
| 1710 | |||
| 1711 | for_each_online_node(node) { | ||
| 1712 | score = group_weight(p, node, dist); | ||
| 1713 | if (score > max_score) { | ||
| 1714 | max_score = score; | ||
| 1715 | max_node = node; | ||
| 1716 | } | ||
| 1717 | } | ||
| 1718 | return max_node; | ||
| 1719 | } | ||
| 1720 | |||
| 1721 | /* | ||
| 1722 | * Finding the preferred nid in a system with NUMA backplane | ||
| 1723 | * interconnect topology is more involved. The goal is to locate | ||
| 1724 | * tasks from numa_groups near each other in the system, and | ||
| 1725 | * untangle workloads from different sides of the system. This requires | ||
| 1726 | * searching down the hierarchy of node groups, recursively searching | ||
| 1727 | * inside the highest scoring group of nodes. The nodemask tricks | ||
| 1728 | * keep the complexity of the search down. | ||
| 1729 | */ | ||
| 1730 | nodes = node_online_map; | ||
| 1731 | for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { | ||
| 1732 | unsigned long max_faults = 0; | ||
| 1733 | nodemask_t max_group; | ||
| 1734 | int a, b; | ||
| 1735 | |||
| 1736 | /* Are there nodes at this distance from each other? */ | ||
| 1737 | if (!find_numa_distance(dist)) | ||
| 1738 | continue; | ||
| 1739 | |||
| 1740 | for_each_node_mask(a, nodes) { | ||
| 1741 | unsigned long faults = 0; | ||
| 1742 | nodemask_t this_group; | ||
| 1743 | nodes_clear(this_group); | ||
| 1744 | |||
| 1745 | /* Sum group's NUMA faults; includes a==b case. */ | ||
| 1746 | for_each_node_mask(b, nodes) { | ||
| 1747 | if (node_distance(a, b) < dist) { | ||
| 1748 | faults += group_faults(p, b); | ||
| 1749 | node_set(b, this_group); | ||
| 1750 | node_clear(b, nodes); | ||
| 1751 | } | ||
| 1752 | } | ||
| 1753 | |||
| 1754 | /* Remember the top group. */ | ||
| 1755 | if (faults > max_faults) { | ||
| 1756 | max_faults = faults; | ||
| 1757 | max_group = this_group; | ||
| 1758 | /* | ||
| 1759 | * subtle: at the smallest distance there is | ||
| 1760 | * just one node left in each "group", the | ||
| 1761 | * winner is the preferred nid. | ||
| 1762 | */ | ||
| 1763 | nid = a; | ||
| 1764 | } | ||
| 1765 | } | ||
| 1766 | /* Next round, evaluate the nodes within max_group. */ | ||
| 1767 | nodes = max_group; | ||
| 1768 | } | ||
| 1769 | return nid; | ||
| 1770 | } | ||
| 1771 | |||
| 1546 | static void task_numa_placement(struct task_struct *p) | 1772 | static void task_numa_placement(struct task_struct *p) |
| 1547 | { | 1773 | { |
| 1548 | int seq, nid, max_nid = -1, max_group_nid = -1; | 1774 | int seq, nid, max_nid = -1, max_group_nid = -1; |
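The backplane search in preferred_group_nid() is the densest part of this hunk. The sketch below re-implements the same narrowing loop in plain userspace C, with bitmasks standing in for nodemask_t; the 4-node topology, distance table and fault counts are invented, and the real code additionally skips distances that do not occur in the machine's distance table (find_numa_distance()).

```c
/* Userspace sketch of the backplane search: at each distance level, keep
 * only the subset of nodes with the most group faults, until one node is
 * left.  Topology and fault counts are invented. */
#include <stdio.h>

#define NR_NODES	4
#define LOCAL_DISTANCE	10

static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};
static const unsigned long group_faults[NR_NODES] = { 5, 7, 30, 20 };

int main(void)
{
	unsigned int nodes = (1u << NR_NODES) - 1;	/* all nodes "online" */
	int nid = 0, dist, a, b, max_numa_distance = 40;

	for (dist = max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
		unsigned long max_faults = 0;
		unsigned int max_group = 0, remaining = nodes;

		for (a = 0; a < NR_NODES; a++) {
			unsigned long faults = 0;
			unsigned int this_group = 0;

			if (!(remaining & (1u << a)))
				continue;
			/* Group every still-remaining node closer than 'dist'. */
			for (b = 0; b < NR_NODES; b++) {
				if ((remaining & (1u << b)) &&
				    node_distance[a][b] < dist) {
					faults += group_faults[b];
					this_group |= 1u << b;
					remaining &= ~(1u << b);
				}
			}
			if (faults > max_faults) {
				max_faults = faults;
				max_group = this_group;
				nid = a;  /* one node per group at the end */
			}
		}
		nodes = max_group;	/* recurse into the winning group */
	}
	printf("preferred nid: %d\n", nid);	/* 2 for this fault pattern */
	return 0;
}
```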
| @@ -1570,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1570 | 1796 | ||
| 1571 | /* Find the node with the highest number of faults */ | 1797 | /* Find the node with the highest number of faults */ |
| 1572 | for_each_online_node(nid) { | 1798 | for_each_online_node(nid) { |
| 1799 | /* Keep track of the offsets in numa_faults array */ | ||
| 1800 | int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; | ||
| 1573 | unsigned long faults = 0, group_faults = 0; | 1801 | unsigned long faults = 0, group_faults = 0; |
| 1574 | int priv, i; | 1802 | int priv; |
| 1575 | 1803 | ||
| 1576 | for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { | 1804 | for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { |
| 1577 | long diff, f_diff, f_weight; | 1805 | long diff, f_diff, f_weight; |
| 1578 | 1806 | ||
| 1579 | i = task_faults_idx(nid, priv); | 1807 | mem_idx = task_faults_idx(NUMA_MEM, nid, priv); |
| 1808 | membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); | ||
| 1809 | cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); | ||
| 1810 | cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); | ||
| 1580 | 1811 | ||
| 1581 | /* Decay existing window, copy faults since last scan */ | 1812 | /* Decay existing window, copy faults since last scan */ |
| 1582 | diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; | 1813 | diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; |
| 1583 | fault_types[priv] += p->numa_faults_buffer_memory[i]; | 1814 | fault_types[priv] += p->numa_faults[membuf_idx]; |
| 1584 | p->numa_faults_buffer_memory[i] = 0; | 1815 | p->numa_faults[membuf_idx] = 0; |
| 1585 | 1816 | ||
| 1586 | /* | 1817 | /* |
| 1587 | * Normalize the faults_from, so all tasks in a group | 1818 | * Normalize the faults_from, so all tasks in a group |
| @@ -1591,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1591 | * faults are less important. | 1822 | * faults are less important. |
| 1592 | */ | 1823 | */ |
| 1593 | f_weight = div64_u64(runtime << 16, period + 1); | 1824 | f_weight = div64_u64(runtime << 16, period + 1); |
| 1594 | f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / | 1825 | f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / |
| 1595 | (total_faults + 1); | 1826 | (total_faults + 1); |
| 1596 | f_diff = f_weight - p->numa_faults_cpu[i] / 2; | 1827 | f_diff = f_weight - p->numa_faults[cpu_idx] / 2; |
| 1597 | p->numa_faults_buffer_cpu[i] = 0; | 1828 | p->numa_faults[cpubuf_idx] = 0; |
| 1598 | 1829 | ||
| 1599 | p->numa_faults_memory[i] += diff; | 1830 | p->numa_faults[mem_idx] += diff; |
| 1600 | p->numa_faults_cpu[i] += f_diff; | 1831 | p->numa_faults[cpu_idx] += f_diff; |
| 1601 | faults += p->numa_faults_memory[i]; | 1832 | faults += p->numa_faults[mem_idx]; |
| 1602 | p->total_numa_faults += diff; | 1833 | p->total_numa_faults += diff; |
| 1603 | if (p->numa_group) { | 1834 | if (p->numa_group) { |
| 1604 | /* safe because we can only change our own group */ | 1835 | /* |
| 1605 | p->numa_group->faults[i] += diff; | 1836 | * safe because we can only change our own group |
| 1606 | p->numa_group->faults_cpu[i] += f_diff; | 1837 | * |
| 1838 | * mem_idx represents the offset for a given | ||
| 1839 | * nid and priv in a specific region because it | ||
| 1840 | * is at the beginning of the numa_faults array. | ||
| 1841 | */ | ||
| 1842 | p->numa_group->faults[mem_idx] += diff; | ||
| 1843 | p->numa_group->faults_cpu[mem_idx] += f_diff; | ||
| 1607 | p->numa_group->total_faults += diff; | 1844 | p->numa_group->total_faults += diff; |
| 1608 | group_faults += p->numa_group->faults[i]; | 1845 | group_faults += p->numa_group->faults[mem_idx]; |
| 1609 | } | 1846 | } |
| 1610 | } | 1847 | } |
| 1611 | 1848 | ||
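The mem_idx/membuf_idx/cpu_idx/cpubuf_idx values above all index a single flattened numa_faults array. The indexing helper and the NUMA_MEM/NUMA_MEMBUF/NUMA_CPU/NUMA_CPUBUF enum are defined outside this hunk, so the layout below is an assumption consistent with the comment about mem_idx sitting at the start of the array, not a copy of the kernel code:

```c
/* Hypothetical reconstruction of the flattened numa_faults indexing:
 * four stat regions x nr_node_ids nodes x 2 fault types (priv/shared). */
#include <stdio.h>

enum numa_faults_stats { NUMA_MEM, NUMA_MEMBUF, NUMA_CPU, NUMA_CPUBUF };

#define NR_NUMA_HINT_FAULT_TYPES 2
static int nr_node_ids = 4;	/* invented node count */

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
	/* One allocation covers all four regions, as in the hunks above. */
	int size = NR_NUMA_HINT_FAULT_TYPES * 4 * nr_node_ids;

	printf("array size: %d\n", size);
	printf("mem    nid=1 priv=1 -> %d\n", task_faults_idx(NUMA_MEM, 1, 1));
	printf("membuf nid=1 priv=1 -> %d\n", task_faults_idx(NUMA_MEMBUF, 1, 1));
	printf("cpu    nid=1 priv=1 -> %d\n", task_faults_idx(NUMA_CPU, 1, 1));
	printf("cpubuf nid=1 priv=1 -> %d\n", task_faults_idx(NUMA_CPUBUF, 1, 1));
	return 0;
}
```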
| @@ -1625,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1625 | if (p->numa_group) { | 1862 | if (p->numa_group) { |
| 1626 | update_numa_active_node_mask(p->numa_group); | 1863 | update_numa_active_node_mask(p->numa_group); |
| 1627 | spin_unlock_irq(group_lock); | 1864 | spin_unlock_irq(group_lock); |
| 1628 | max_nid = max_group_nid; | 1865 | max_nid = preferred_group_nid(p, max_group_nid); |
| 1629 | } | 1866 | } |
| 1630 | 1867 | ||
| 1631 | if (max_faults) { | 1868 | if (max_faults) { |
| @@ -1668,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1668 | 1905 | ||
| 1669 | atomic_set(&grp->refcount, 1); | 1906 | atomic_set(&grp->refcount, 1); |
| 1670 | spin_lock_init(&grp->lock); | 1907 | spin_lock_init(&grp->lock); |
| 1671 | INIT_LIST_HEAD(&grp->task_list); | ||
| 1672 | grp->gid = p->pid; | 1908 | grp->gid = p->pid; |
| 1673 | /* Second half of the array tracks nids where faults happen */ | 1909 | /* Second half of the array tracks nids where faults happen */ |
| 1674 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * | 1910 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * |
| @@ -1677,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1677 | node_set(task_node(current), grp->active_nodes); | 1913 | node_set(task_node(current), grp->active_nodes); |
| 1678 | 1914 | ||
| 1679 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) | 1915 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
| 1680 | grp->faults[i] = p->numa_faults_memory[i]; | 1916 | grp->faults[i] = p->numa_faults[i]; |
| 1681 | 1917 | ||
| 1682 | grp->total_faults = p->total_numa_faults; | 1918 | grp->total_faults = p->total_numa_faults; |
| 1683 | 1919 | ||
| 1684 | list_add(&p->numa_entry, &grp->task_list); | ||
| 1685 | grp->nr_tasks++; | 1920 | grp->nr_tasks++; |
| 1686 | rcu_assign_pointer(p->numa_group, grp); | 1921 | rcu_assign_pointer(p->numa_group, grp); |
| 1687 | } | 1922 | } |
| @@ -1736,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1736 | double_lock_irq(&my_grp->lock, &grp->lock); | 1971 | double_lock_irq(&my_grp->lock, &grp->lock); |
| 1737 | 1972 | ||
| 1738 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { | 1973 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { |
| 1739 | my_grp->faults[i] -= p->numa_faults_memory[i]; | 1974 | my_grp->faults[i] -= p->numa_faults[i]; |
| 1740 | grp->faults[i] += p->numa_faults_memory[i]; | 1975 | grp->faults[i] += p->numa_faults[i]; |
| 1741 | } | 1976 | } |
| 1742 | my_grp->total_faults -= p->total_numa_faults; | 1977 | my_grp->total_faults -= p->total_numa_faults; |
| 1743 | grp->total_faults += p->total_numa_faults; | 1978 | grp->total_faults += p->total_numa_faults; |
| 1744 | 1979 | ||
| 1745 | list_move(&p->numa_entry, &grp->task_list); | ||
| 1746 | my_grp->nr_tasks--; | 1980 | my_grp->nr_tasks--; |
| 1747 | grp->nr_tasks++; | 1981 | grp->nr_tasks++; |
| 1748 | 1982 | ||
| @@ -1762,27 +1996,23 @@ no_join: | |||
| 1762 | void task_numa_free(struct task_struct *p) | 1996 | void task_numa_free(struct task_struct *p) |
| 1763 | { | 1997 | { |
| 1764 | struct numa_group *grp = p->numa_group; | 1998 | struct numa_group *grp = p->numa_group; |
| 1765 | void *numa_faults = p->numa_faults_memory; | 1999 | void *numa_faults = p->numa_faults; |
| 1766 | unsigned long flags; | 2000 | unsigned long flags; |
| 1767 | int i; | 2001 | int i; |
| 1768 | 2002 | ||
| 1769 | if (grp) { | 2003 | if (grp) { |
| 1770 | spin_lock_irqsave(&grp->lock, flags); | 2004 | spin_lock_irqsave(&grp->lock, flags); |
| 1771 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) | 2005 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
| 1772 | grp->faults[i] -= p->numa_faults_memory[i]; | 2006 | grp->faults[i] -= p->numa_faults[i]; |
| 1773 | grp->total_faults -= p->total_numa_faults; | 2007 | grp->total_faults -= p->total_numa_faults; |
| 1774 | 2008 | ||
| 1775 | list_del(&p->numa_entry); | ||
| 1776 | grp->nr_tasks--; | 2009 | grp->nr_tasks--; |
| 1777 | spin_unlock_irqrestore(&grp->lock, flags); | 2010 | spin_unlock_irqrestore(&grp->lock, flags); |
| 1778 | rcu_assign_pointer(p->numa_group, NULL); | 2011 | RCU_INIT_POINTER(p->numa_group, NULL); |
| 1779 | put_numa_group(grp); | 2012 | put_numa_group(grp); |
| 1780 | } | 2013 | } |
| 1781 | 2014 | ||
| 1782 | p->numa_faults_memory = NULL; | 2015 | p->numa_faults = NULL; |
| 1783 | p->numa_faults_buffer_memory = NULL; | ||
| 1784 | p->numa_faults_cpu= NULL; | ||
| 1785 | p->numa_faults_buffer_cpu = NULL; | ||
| 1786 | kfree(numa_faults); | 2016 | kfree(numa_faults); |
| 1787 | } | 2017 | } |
| 1788 | 2018 | ||
| @@ -1804,29 +2034,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 1804 | if (!p->mm) | 2034 | if (!p->mm) |
| 1805 | return; | 2035 | return; |
| 1806 | 2036 | ||
| 1807 | /* Do not worry about placement if exiting */ | ||
| 1808 | if (p->state == TASK_DEAD) | ||
| 1809 | return; | ||
| 1810 | |||
| 1811 | /* Allocate buffer to track faults on a per-node basis */ | 2037 | /* Allocate buffer to track faults on a per-node basis */ |
| 1812 | if (unlikely(!p->numa_faults_memory)) { | 2038 | if (unlikely(!p->numa_faults)) { |
| 1813 | int size = sizeof(*p->numa_faults_memory) * | 2039 | int size = sizeof(*p->numa_faults) * |
| 1814 | NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; | 2040 | NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; |
| 1815 | 2041 | ||
| 1816 | p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); | 2042 | p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); |
| 1817 | if (!p->numa_faults_memory) | 2043 | if (!p->numa_faults) |
| 1818 | return; | 2044 | return; |
| 1819 | 2045 | ||
| 1820 | BUG_ON(p->numa_faults_buffer_memory); | ||
| 1821 | /* | ||
| 1822 | * The averaged statistics, shared & private, memory & cpu, | ||
| 1823 | * occupy the first half of the array. The second half of the | ||
| 1824 | * array is for current counters, which are averaged into the | ||
| 1825 | * first set by task_numa_placement. | ||
| 1826 | */ | ||
| 1827 | p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); | ||
| 1828 | p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); | ||
| 1829 | p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); | ||
| 1830 | p->total_numa_faults = 0; | 2046 | p->total_numa_faults = 0; |
| 1831 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | 2047 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); |
| 1832 | } | 2048 | } |
| @@ -1866,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 1866 | if (migrated) | 2082 | if (migrated) |
| 1867 | p->numa_pages_migrated += pages; | 2083 | p->numa_pages_migrated += pages; |
| 1868 | 2084 | ||
| 1869 | p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; | 2085 | p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; |
| 1870 | p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; | 2086 | p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; |
| 1871 | p->numa_faults_locality[local] += pages; | 2087 | p->numa_faults_locality[local] += pages; |
| 1872 | } | 2088 | } |
| 1873 | 2089 | ||
| @@ -1946,7 +2162,7 @@ void task_numa_work(struct callback_head *work) | |||
| 1946 | vma = mm->mmap; | 2162 | vma = mm->mmap; |
| 1947 | } | 2163 | } |
| 1948 | for (; vma; vma = vma->vm_next) { | 2164 | for (; vma; vma = vma->vm_next) { |
| 1949 | if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) | 2165 | if (!vma_migratable(vma) || !vma_policy_mof(vma)) |
| 1950 | continue; | 2166 | continue; |
| 1951 | 2167 | ||
| 1952 | /* | 2168 | /* |
| @@ -2211,8 +2427,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) | |||
| 2211 | 2427 | ||
| 2212 | /* | 2428 | /* |
| 2213 | * As y^PERIOD = 1/2, we can combine | 2429 | * As y^PERIOD = 1/2, we can combine |
| 2214 | * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) | 2430 | * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) |
| 2215 | * With a look-up table which covers k^n (n<PERIOD) | 2431 | * With a look-up table which covers y^n (n<PERIOD) |
| 2216 | * | 2432 | * |
| 2217 | * To achieve constant time decay_load. | 2433 | * To achieve constant time decay_load. |
| 2218 | */ | 2434 | */ |
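The corrected comment states a plain identity about the decay factor; a quick numeric self-check in userspace C (PERIOD assumed to be 32, doubles used only for the demonstration; link with -lm):

```c
/* Check of the corrected comment: when y^PERIOD == 1/2,
 * y^n == (1/2)^(n / PERIOD) * y^(n % PERIOD). */
#include <math.h>
#include <stdio.h>

#define PERIOD 32

int main(void)
{
	double y = pow(0.5, 1.0 / PERIOD);	/* so y^PERIOD == 1/2 */
	int n = 100;
	double direct = pow(y, n);
	double split  = pow(0.5, n / PERIOD) * pow(y, n % PERIOD);

	printf("%.10f %.10f\n", direct, split);	/* identical */
	return 0;
}
```

This is exactly why a lookup table of y^0..y^(PERIOD-1) plus a right shift by n/PERIOD gives constant-time decay.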
| @@ -2377,6 +2593,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | |||
| 2377 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; | 2593 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; |
| 2378 | tg_contrib -= cfs_rq->tg_load_contrib; | 2594 | tg_contrib -= cfs_rq->tg_load_contrib; |
| 2379 | 2595 | ||
| 2596 | if (!tg_contrib) | ||
| 2597 | return; | ||
| 2598 | |||
| 2380 | if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { | 2599 | if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { |
| 2381 | atomic_long_add(tg_contrib, &tg->load_avg); | 2600 | atomic_long_add(tg_contrib, &tg->load_avg); |
| 2382 | cfs_rq->tg_load_contrib += tg_contrib; | 2601 | cfs_rq->tg_load_contrib += tg_contrib; |
| @@ -3786,6 +4005,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) | |||
| 3786 | 4005 | ||
| 3787 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | 4006 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) |
| 3788 | { | 4007 | { |
| 4008 | /* init_cfs_bandwidth() was not called */ | ||
| 4009 | if (!cfs_b->throttled_cfs_rq.next) | ||
| 4010 | return; | ||
| 4011 | |||
| 3789 | hrtimer_cancel(&cfs_b->period_timer); | 4012 | hrtimer_cancel(&cfs_b->period_timer); |
| 3790 | hrtimer_cancel(&cfs_b->slack_timer); | 4013 | hrtimer_cancel(&cfs_b->slack_timer); |
| 3791 | } | 4014 | } |
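The new early return keys off the fact that a zeroed, never-initialized list head still has a NULL ->next pointer. A minimal illustration of that idiom, with the structs shrunk to the one relevant field and everything else invented:

```c
/* A zero-initialized list head has next == NULL until INIT_LIST_HEAD()
 * runs, which is what the new early return keys off. */
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define INIT_LIST_HEAD(h) do { (h)->next = (h); (h)->prev = (h); } while (0)

struct bandwidth { struct list_head throttled_cfs_rq; };

int main(void)
{
	struct bandwidth a = { { 0 } };	/* as if init was never called */
	struct bandwidth b = { { 0 } };

	INIT_LIST_HEAD(&b.throttled_cfs_rq);

	printf("a initialized? %s\n", a.throttled_cfs_rq.next ? "yes" : "no");
	printf("b initialized? %s\n", b.throttled_cfs_rq.next ? "yes" : "no");
	return 0;
}
```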
| @@ -3892,14 +4115,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
| 3892 | resched_curr(rq); | 4115 | resched_curr(rq); |
| 3893 | return; | 4116 | return; |
| 3894 | } | 4117 | } |
| 3895 | |||
| 3896 | /* | ||
| 3897 | * Don't schedule slices shorter than 10000ns, that just | ||
| 3898 | * doesn't make sense. Rely on vruntime for fairness. | ||
| 3899 | */ | ||
| 3900 | if (rq->curr != p) | ||
| 3901 | delta = max_t(s64, 10000LL, delta); | ||
| 3902 | |||
| 3903 | hrtick_start(rq, delta); | 4118 | hrtick_start(rq, delta); |
| 3904 | } | 4119 | } |
| 3905 | } | 4120 | } |
| @@ -4087,7 +4302,7 @@ static unsigned long capacity_of(int cpu) | |||
| 4087 | static unsigned long cpu_avg_load_per_task(int cpu) | 4302 | static unsigned long cpu_avg_load_per_task(int cpu) |
| 4088 | { | 4303 | { |
| 4089 | struct rq *rq = cpu_rq(cpu); | 4304 | struct rq *rq = cpu_rq(cpu); |
| 4090 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 4305 | unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); |
| 4091 | unsigned long load_avg = rq->cfs.runnable_load_avg; | 4306 | unsigned long load_avg = rq->cfs.runnable_load_avg; |
| 4092 | 4307 | ||
| 4093 | if (nr_running) | 4308 | if (nr_running) |
| @@ -4213,7 +4428,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 4213 | * wl = S * s'_i; see (2) | 4428 | * wl = S * s'_i; see (2) |
| 4214 | */ | 4429 | */ |
| 4215 | if (W > 0 && w < W) | 4430 | if (W > 0 && w < W) |
| 4216 | wl = (w * tg->shares) / W; | 4431 | wl = (w * (long)tg->shares) / W; |
| 4217 | else | 4432 | else |
| 4218 | wl = tg->shares; | 4433 | wl = tg->shares; |
| 4219 | 4434 | ||
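The added (long) cast looks small but matters: multiplying a possibly negative signed long by an unsigned long silently switches the whole expression to unsigned arithmetic. A standalone demonstration with invented values (64-bit longs assumed):

```c
/* Why the (long) cast matters: a negative long times an unsigned long is
 * computed as unsigned, so the division produces a huge bogus value.
 * The numbers are invented; 64-bit longs assumed. */
#include <stdio.h>

int main(void)
{
	long w = -512;			/* the delta can be negative here */
	unsigned long shares = 1024;	/* tg->shares is unsigned long */
	long W = 4096;

	long bad  = (w * shares) / W;		/* unsigned arithmetic */
	long good = (w * (long)shares) / W;	/* signed arithmetic */

	printf("bad=%ld good=%ld\n", bad, good);	/* bad is huge, good is -128 */
	return 0;
}
```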
| @@ -4276,8 +4491,8 @@ static int wake_wide(struct task_struct *p) | |||
| 4276 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 4491 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
| 4277 | { | 4492 | { |
| 4278 | s64 this_load, load; | 4493 | s64 this_load, load; |
| 4494 | s64 this_eff_load, prev_eff_load; | ||
| 4279 | int idx, this_cpu, prev_cpu; | 4495 | int idx, this_cpu, prev_cpu; |
| 4280 | unsigned long tl_per_task; | ||
| 4281 | struct task_group *tg; | 4496 | struct task_group *tg; |
| 4282 | unsigned long weight; | 4497 | unsigned long weight; |
| 4283 | int balanced; | 4498 | int balanced; |
| @@ -4320,47 +4535,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
| 4320 | * Otherwise check if either cpus are near enough in load to allow this | 4535 | * Otherwise check if either cpus are near enough in load to allow this |
| 4321 | * task to be woken on this_cpu. | 4536 | * task to be woken on this_cpu. |
| 4322 | */ | 4537 | */ |
| 4323 | if (this_load > 0) { | 4538 | this_eff_load = 100; |
| 4324 | s64 this_eff_load, prev_eff_load; | 4539 | this_eff_load *= capacity_of(prev_cpu); |
| 4540 | |||
| 4541 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | ||
| 4542 | prev_eff_load *= capacity_of(this_cpu); | ||
| 4325 | 4543 | ||
| 4326 | this_eff_load = 100; | 4544 | if (this_load > 0) { |
| 4327 | this_eff_load *= capacity_of(prev_cpu); | ||
| 4328 | this_eff_load *= this_load + | 4545 | this_eff_load *= this_load + |
| 4329 | effective_load(tg, this_cpu, weight, weight); | 4546 | effective_load(tg, this_cpu, weight, weight); |
| 4330 | 4547 | ||
| 4331 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | ||
| 4332 | prev_eff_load *= capacity_of(this_cpu); | ||
| 4333 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); | 4548 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); |
| 4549 | } | ||
| 4334 | 4550 | ||
| 4335 | balanced = this_eff_load <= prev_eff_load; | 4551 | balanced = this_eff_load <= prev_eff_load; |
| 4336 | } else | ||
| 4337 | balanced = true; | ||
| 4338 | |||
| 4339 | /* | ||
| 4340 | * If the currently running task will sleep within | ||
| 4341 | * a reasonable amount of time then attract this newly | ||
| 4342 | * woken task: | ||
| 4343 | */ | ||
| 4344 | if (sync && balanced) | ||
| 4345 | return 1; | ||
| 4346 | 4552 | ||
| 4347 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); | 4553 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); |
| 4348 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 4349 | 4554 | ||
| 4350 | if (balanced || | 4555 | if (!balanced) |
| 4351 | (this_load <= load && | 4556 | return 0; |
| 4352 | this_load + target_load(prev_cpu, idx) <= tl_per_task)) { | ||
| 4353 | /* | ||
| 4354 | * This domain has SD_WAKE_AFFINE and | ||
| 4355 | * p is cache cold in this domain, and | ||
| 4356 | * there is no bad imbalance. | ||
| 4357 | */ | ||
| 4358 | schedstat_inc(sd, ttwu_move_affine); | ||
| 4359 | schedstat_inc(p, se.statistics.nr_wakeups_affine); | ||
| 4360 | 4557 | ||
| 4361 | return 1; | 4558 | schedstat_inc(sd, ttwu_move_affine); |
| 4362 | } | 4559 | schedstat_inc(p, se.statistics.nr_wakeups_affine); |
| 4363 | return 0; | 4560 | |
| 4561 | return 1; | ||
| 4364 | } | 4562 | } |
| 4365 | 4563 | ||
| 4366 | /* | 4564 | /* |
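The rewritten wake_affine() computes both effective loads up front and only folds the load terms in when this_load is positive. The sketch below mimics the shape of that comparison with invented loads and stubbed effective_load() results; it is not the kernel code:

```c
/* Sketch of the restructured wake_affine() balance test.  All values are
 * invented; eff_this/eff_prev stand in for effective_load() results. */
#include <stdio.h>

int main(void)
{
	long this_load = 2048, load = 1536;	/* pretend rq loads */
	long eff_this = 512, eff_prev = -512;
	unsigned long cap_this = 1024, cap_prev = 1024;
	int imbalance_pct = 125;

	long this_eff_load = 100 * (long)cap_prev;
	long prev_eff_load = (100 + (imbalance_pct - 100) / 2) * (long)cap_this;

	if (this_load > 0) {
		this_eff_load *= this_load + eff_this;
		prev_eff_load *= load + eff_prev;
	}

	/* With this_load <= 0 only the capacity-scaled constants are
	 * compared, which favours the affine wakeup on symmetric CPUs. */
	printf("balanced = %d\n", this_eff_load <= prev_eff_load);
	return 0;
}
```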
| @@ -4428,20 +4626,46 @@ static int | |||
| 4428 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 4626 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
| 4429 | { | 4627 | { |
| 4430 | unsigned long load, min_load = ULONG_MAX; | 4628 | unsigned long load, min_load = ULONG_MAX; |
| 4431 | int idlest = -1; | 4629 | unsigned int min_exit_latency = UINT_MAX; |
| 4630 | u64 latest_idle_timestamp = 0; | ||
| 4631 | int least_loaded_cpu = this_cpu; | ||
| 4632 | int shallowest_idle_cpu = -1; | ||
| 4432 | int i; | 4633 | int i; |
| 4433 | 4634 | ||
| 4434 | /* Traverse only the allowed CPUs */ | 4635 | /* Traverse only the allowed CPUs */ |
| 4435 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { | 4636 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
| 4436 | load = weighted_cpuload(i); | 4637 | if (idle_cpu(i)) { |
| 4437 | 4638 | struct rq *rq = cpu_rq(i); | |
| 4438 | if (load < min_load || (load == min_load && i == this_cpu)) { | 4639 | struct cpuidle_state *idle = idle_get_state(rq); |
| 4439 | min_load = load; | 4640 | if (idle && idle->exit_latency < min_exit_latency) { |
| 4440 | idlest = i; | 4641 | /* |
| 4642 | * We give priority to a CPU whose idle state | ||
| 4643 | * has the smallest exit latency irrespective | ||
| 4644 | * of any idle timestamp. | ||
| 4645 | */ | ||
| 4646 | min_exit_latency = idle->exit_latency; | ||
| 4647 | latest_idle_timestamp = rq->idle_stamp; | ||
| 4648 | shallowest_idle_cpu = i; | ||
| 4649 | } else if ((!idle || idle->exit_latency == min_exit_latency) && | ||
| 4650 | rq->idle_stamp > latest_idle_timestamp) { | ||
| 4651 | /* | ||
| 4652 | * If equal or no active idle state, then | ||
| 4653 | * the most recently idled CPU might have | ||
| 4654 | * a warmer cache. | ||
| 4655 | */ | ||
| 4656 | latest_idle_timestamp = rq->idle_stamp; | ||
| 4657 | shallowest_idle_cpu = i; | ||
| 4658 | } | ||
| 4659 | } else if (shallowest_idle_cpu == -1) { | ||
| 4660 | load = weighted_cpuload(i); | ||
| 4661 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
| 4662 | min_load = load; | ||
| 4663 | least_loaded_cpu = i; | ||
| 4664 | } | ||
| 4441 | } | 4665 | } |
| 4442 | } | 4666 | } |
| 4443 | 4667 | ||
| 4444 | return idlest; | 4668 | return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; |
| 4445 | } | 4669 | } |
| 4446 | 4670 | ||
| 4447 | /* | 4671 | /* |
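The new find_idlest_cpu() ranks idle CPUs by C-state exit latency and, among equals, by how recently they went idle, falling back to the least-loaded busy CPU only when nothing is idle. A self-contained sketch of that ordering with invented per-CPU data (the real code also treats a missing cpuidle state specially):

```c
/* Sketch of the shallowest-idle-CPU selection order introduced above. */
#include <stdio.h>
#include <limits.h>

struct cpu_info {
	int idle;			/* is the CPU idle? */
	unsigned int exit_latency;	/* idle-state exit latency */
	unsigned long long idle_stamp;	/* when it went idle */
	unsigned long load;
};

int main(void)
{
	struct cpu_info cpu[4] = {
		{ 0,  0,   0, 800 },	/* busy			*/
		{ 1, 50, 100,   0 },	/* deep idle state	*/
		{ 1, 10, 200,   0 },	/* shallow, older	*/
		{ 1, 10, 300,   0 },	/* shallow, newest	*/
	};
	unsigned int min_exit_latency = UINT_MAX;
	unsigned long long latest_idle_timestamp = 0;
	unsigned long min_load = ULONG_MAX;
	int shallowest_idle_cpu = -1, least_loaded_cpu = 0, i;

	for (i = 0; i < 4; i++) {
		if (cpu[i].idle) {
			if (cpu[i].exit_latency < min_exit_latency) {
				min_exit_latency = cpu[i].exit_latency;
				latest_idle_timestamp = cpu[i].idle_stamp;
				shallowest_idle_cpu = i;
			} else if (cpu[i].exit_latency == min_exit_latency &&
				   cpu[i].idle_stamp > latest_idle_timestamp) {
				latest_idle_timestamp = cpu[i].idle_stamp;
				shallowest_idle_cpu = i;
			}
		} else if (shallowest_idle_cpu == -1 && cpu[i].load < min_load) {
			min_load = cpu[i].load;
			least_loaded_cpu = i;
		}
	}

	printf("picked cpu %d\n",	/* 3: shallow state, most recently idle */
	       shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu);
	return 0;
}
```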
| @@ -4510,14 +4734,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 4510 | int want_affine = 0; | 4734 | int want_affine = 0; |
| 4511 | int sync = wake_flags & WF_SYNC; | 4735 | int sync = wake_flags & WF_SYNC; |
| 4512 | 4736 | ||
| 4513 | if (p->nr_cpus_allowed == 1) | 4737 | if (sd_flag & SD_BALANCE_WAKE) |
| 4514 | return prev_cpu; | 4738 | want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); |
| 4515 | |||
| 4516 | if (sd_flag & SD_BALANCE_WAKE) { | ||
| 4517 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | ||
| 4518 | want_affine = 1; | ||
| 4519 | new_cpu = prev_cpu; | ||
| 4520 | } | ||
| 4521 | 4739 | ||
| 4522 | rcu_read_lock(); | 4740 | rcu_read_lock(); |
| 4523 | for_each_domain(cpu, tmp) { | 4741 | for_each_domain(cpu, tmp) { |
| @@ -4704,7 +4922,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
| 4704 | return; | 4922 | return; |
| 4705 | 4923 | ||
| 4706 | /* | 4924 | /* |
| 4707 | * This is possible from callers such as move_task(), in which we | 4925 | * This is possible from callers such as attach_tasks(), in which we |
| 4708 | * unconditionally check_prempt_curr() after an enqueue (which may have | 4926 | * unconditionally check_prempt_curr() after an enqueue (which may have |
| 4709 | * lead to a throttle). This both saves work and prevents false | 4927 | * lead to a throttle). This both saves work and prevents false |
| 4710 | * next-buddy nomination below. | 4928 | * next-buddy nomination below. |
| @@ -5112,27 +5330,18 @@ struct lb_env { | |||
| 5112 | unsigned int loop_max; | 5330 | unsigned int loop_max; |
| 5113 | 5331 | ||
| 5114 | enum fbq_type fbq_type; | 5332 | enum fbq_type fbq_type; |
| 5333 | struct list_head tasks; | ||
| 5115 | }; | 5334 | }; |
| 5116 | 5335 | ||
| 5117 | /* | 5336 | /* |
| 5118 | * move_task - move a task from one runqueue to another runqueue. | ||
| 5119 | * Both runqueues must be locked. | ||
| 5120 | */ | ||
| 5121 | static void move_task(struct task_struct *p, struct lb_env *env) | ||
| 5122 | { | ||
| 5123 | deactivate_task(env->src_rq, p, 0); | ||
| 5124 | set_task_cpu(p, env->dst_cpu); | ||
| 5125 | activate_task(env->dst_rq, p, 0); | ||
| 5126 | check_preempt_curr(env->dst_rq, p, 0); | ||
| 5127 | } | ||
| 5128 | |||
| 5129 | /* | ||
| 5130 | * Is this task likely cache-hot: | 5337 | * Is this task likely cache-hot: |
| 5131 | */ | 5338 | */ |
| 5132 | static int task_hot(struct task_struct *p, struct lb_env *env) | 5339 | static int task_hot(struct task_struct *p, struct lb_env *env) |
| 5133 | { | 5340 | { |
| 5134 | s64 delta; | 5341 | s64 delta; |
| 5135 | 5342 | ||
| 5343 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5344 | |||
| 5136 | if (p->sched_class != &fair_sched_class) | 5345 | if (p->sched_class != &fair_sched_class) |
| 5137 | return 0; | 5346 | return 0; |
| 5138 | 5347 | ||
| @@ -5164,7 +5373,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | |||
| 5164 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | 5373 | struct numa_group *numa_group = rcu_dereference(p->numa_group); |
| 5165 | int src_nid, dst_nid; | 5374 | int src_nid, dst_nid; |
| 5166 | 5375 | ||
| 5167 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || | 5376 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || |
| 5168 | !(env->sd->flags & SD_NUMA)) { | 5377 | !(env->sd->flags & SD_NUMA)) { |
| 5169 | return false; | 5378 | return false; |
| 5170 | } | 5379 | } |
| @@ -5203,7 +5412,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
| 5203 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | 5412 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) |
| 5204 | return false; | 5413 | return false; |
| 5205 | 5414 | ||
| 5206 | if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) | 5415 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) |
| 5207 | return false; | 5416 | return false; |
| 5208 | 5417 | ||
| 5209 | src_nid = cpu_to_node(env->src_cpu); | 5418 | src_nid = cpu_to_node(env->src_cpu); |
| @@ -5252,6 +5461,9 @@ static | |||
| 5252 | int can_migrate_task(struct task_struct *p, struct lb_env *env) | 5461 | int can_migrate_task(struct task_struct *p, struct lb_env *env) |
| 5253 | { | 5462 | { |
| 5254 | int tsk_cache_hot = 0; | 5463 | int tsk_cache_hot = 0; |
| 5464 | |||
| 5465 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5466 | |||
| 5255 | /* | 5467 | /* |
| 5256 | * We do not migrate tasks that are: | 5468 | * We do not migrate tasks that are: |
| 5257 | * 1) throttled_lb_pair, or | 5469 | * 1) throttled_lb_pair, or |
| @@ -5310,24 +5522,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 5310 | if (!tsk_cache_hot) | 5522 | if (!tsk_cache_hot) |
| 5311 | tsk_cache_hot = migrate_degrades_locality(p, env); | 5523 | tsk_cache_hot = migrate_degrades_locality(p, env); |
| 5312 | 5524 | ||
| 5313 | if (migrate_improves_locality(p, env)) { | 5525 | if (migrate_improves_locality(p, env) || !tsk_cache_hot || |
| 5314 | #ifdef CONFIG_SCHEDSTATS | 5526 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
| 5315 | if (tsk_cache_hot) { | 5527 | if (tsk_cache_hot) { |
| 5316 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 5528 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); |
| 5317 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 5529 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
| 5318 | } | 5530 | } |
| 5319 | #endif | ||
| 5320 | return 1; | ||
| 5321 | } | ||
| 5322 | |||
| 5323 | if (!tsk_cache_hot || | ||
| 5324 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | ||
| 5325 | |||
| 5326 | if (tsk_cache_hot) { | ||
| 5327 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | ||
| 5328 | schedstat_inc(p, se.statistics.nr_forced_migrations); | ||
| 5329 | } | ||
| 5330 | |||
| 5331 | return 1; | 5531 | return 1; |
| 5332 | } | 5532 | } |
| 5333 | 5533 | ||
| @@ -5336,47 +5536,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 5336 | } | 5536 | } |
| 5337 | 5537 | ||
| 5338 | /* | 5538 | /* |
| 5339 | * move_one_task tries to move exactly one task from busiest to this_rq, as | 5539 | * detach_task() -- detach the task for the migration specified in env |
| 5540 | */ | ||
| 5541 | static void detach_task(struct task_struct *p, struct lb_env *env) | ||
| 5542 | { | ||
| 5543 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5544 | |||
| 5545 | deactivate_task(env->src_rq, p, 0); | ||
| 5546 | p->on_rq = TASK_ON_RQ_MIGRATING; | ||
| 5547 | set_task_cpu(p, env->dst_cpu); | ||
| 5548 | } | ||
| 5549 | |||
| 5550 | /* | ||
| 5551 | * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as | ||
| 5340 | * part of active balancing operations within "domain". | 5552 | * part of active balancing operations within "domain". |
| 5341 | * Returns 1 if successful and 0 otherwise. | ||
| 5342 | * | 5553 | * |
| 5343 | * Called with both runqueues locked. | 5554 | * Returns a task if successful and NULL otherwise. |
| 5344 | */ | 5555 | */ |
| 5345 | static int move_one_task(struct lb_env *env) | 5556 | static struct task_struct *detach_one_task(struct lb_env *env) |
| 5346 | { | 5557 | { |
| 5347 | struct task_struct *p, *n; | 5558 | struct task_struct *p, *n; |
| 5348 | 5559 | ||
| 5560 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5561 | |||
| 5349 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { | 5562 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { |
| 5350 | if (!can_migrate_task(p, env)) | 5563 | if (!can_migrate_task(p, env)) |
| 5351 | continue; | 5564 | continue; |
| 5352 | 5565 | ||
| 5353 | move_task(p, env); | 5566 | detach_task(p, env); |
| 5567 | |||
| 5354 | /* | 5568 | /* |
| 5355 | * Right now, this is only the second place move_task() | 5569 | * Right now, this is only the second place where |
| 5356 | * is called, so we can safely collect move_task() | 5570 | * lb_gained[env->idle] is updated (other is detach_tasks) |
| 5357 | * stats here rather than inside move_task(). | 5571 | * so we can safely collect stats here rather than |
| 5572 | * inside detach_tasks(). | ||
| 5358 | */ | 5573 | */ |
| 5359 | schedstat_inc(env->sd, lb_gained[env->idle]); | 5574 | schedstat_inc(env->sd, lb_gained[env->idle]); |
| 5360 | return 1; | 5575 | return p; |
| 5361 | } | 5576 | } |
| 5362 | return 0; | 5577 | return NULL; |
| 5363 | } | 5578 | } |
| 5364 | 5579 | ||
| 5365 | static const unsigned int sched_nr_migrate_break = 32; | 5580 | static const unsigned int sched_nr_migrate_break = 32; |
| 5366 | 5581 | ||
| 5367 | /* | 5582 | /* |
| 5368 | * move_tasks tries to move up to imbalance weighted load from busiest to | 5583 | * detach_tasks() -- tries to detach up to imbalance weighted load from |
| 5369 | * this_rq, as part of a balancing operation within domain "sd". | 5584 | * busiest_rq, as part of a balancing operation within domain "sd". |
| 5370 | * Returns 1 if successful and 0 otherwise. | ||
| 5371 | * | 5585 | * |
| 5372 | * Called with both runqueues locked. | 5586 | * Returns number of detached tasks if successful and 0 otherwise. |
| 5373 | */ | 5587 | */ |
| 5374 | static int move_tasks(struct lb_env *env) | 5588 | static int detach_tasks(struct lb_env *env) |
| 5375 | { | 5589 | { |
| 5376 | struct list_head *tasks = &env->src_rq->cfs_tasks; | 5590 | struct list_head *tasks = &env->src_rq->cfs_tasks; |
| 5377 | struct task_struct *p; | 5591 | struct task_struct *p; |
| 5378 | unsigned long load; | 5592 | unsigned long load; |
| 5379 | int pulled = 0; | 5593 | int detached = 0; |
| 5594 | |||
| 5595 | lockdep_assert_held(&env->src_rq->lock); | ||
| 5380 | 5596 | ||
| 5381 | if (env->imbalance <= 0) | 5597 | if (env->imbalance <= 0) |
| 5382 | return 0; | 5598 | return 0; |
| @@ -5407,14 +5623,16 @@ static int move_tasks(struct lb_env *env) | |||
| 5407 | if ((load / 2) > env->imbalance) | 5623 | if ((load / 2) > env->imbalance) |
| 5408 | goto next; | 5624 | goto next; |
| 5409 | 5625 | ||
| 5410 | move_task(p, env); | 5626 | detach_task(p, env); |
| 5411 | pulled++; | 5627 | list_add(&p->se.group_node, &env->tasks); |
| 5628 | |||
| 5629 | detached++; | ||
| 5412 | env->imbalance -= load; | 5630 | env->imbalance -= load; |
| 5413 | 5631 | ||
| 5414 | #ifdef CONFIG_PREEMPT | 5632 | #ifdef CONFIG_PREEMPT |
| 5415 | /* | 5633 | /* |
| 5416 | * NEWIDLE balancing is a source of latency, so preemptible | 5634 | * NEWIDLE balancing is a source of latency, so preemptible |
| 5417 | * kernels will stop after the first task is pulled to minimize | 5635 | * kernels will stop after the first task is detached to minimize |
| 5418 | * the critical section. | 5636 | * the critical section. |
| 5419 | */ | 5637 | */ |
| 5420 | if (env->idle == CPU_NEWLY_IDLE) | 5638 | if (env->idle == CPU_NEWLY_IDLE) |
| @@ -5434,13 +5652,58 @@ next: | |||
| 5434 | } | 5652 | } |
| 5435 | 5653 | ||
| 5436 | /* | 5654 | /* |
| 5437 | * Right now, this is one of only two places move_task() is called, | 5655 | * Right now, this is one of only two places we collect this stat |
| 5438 | * so we can safely collect move_task() stats here rather than | 5656 | * so we can safely collect detach_one_task() stats here rather |
| 5439 | * inside move_task(). | 5657 | * than inside detach_one_task(). |
| 5440 | */ | 5658 | */ |
| 5441 | schedstat_add(env->sd, lb_gained[env->idle], pulled); | 5659 | schedstat_add(env->sd, lb_gained[env->idle], detached); |
| 5660 | |||
| 5661 | return detached; | ||
| 5662 | } | ||
| 5663 | |||
| 5664 | /* | ||
| 5665 | * attach_task() -- attach the task detached by detach_task() to its new rq. | ||
| 5666 | */ | ||
| 5667 | static void attach_task(struct rq *rq, struct task_struct *p) | ||
| 5668 | { | ||
| 5669 | lockdep_assert_held(&rq->lock); | ||
| 5442 | 5670 | ||
| 5443 | return pulled; | 5671 | BUG_ON(task_rq(p) != rq); |
| 5672 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
| 5673 | activate_task(rq, p, 0); | ||
| 5674 | check_preempt_curr(rq, p, 0); | ||
| 5675 | } | ||
| 5676 | |||
| 5677 | /* | ||
| 5678 | * attach_one_task() -- attaches the task returned from detach_one_task() to | ||
| 5679 | * its new rq. | ||
| 5680 | */ | ||
| 5681 | static void attach_one_task(struct rq *rq, struct task_struct *p) | ||
| 5682 | { | ||
| 5683 | raw_spin_lock(&rq->lock); | ||
| 5684 | attach_task(rq, p); | ||
| 5685 | raw_spin_unlock(&rq->lock); | ||
| 5686 | } | ||
| 5687 | |||
| 5688 | /* | ||
| 5689 | * attach_tasks() -- attaches all tasks detached by detach_tasks() to their | ||
| 5690 | * new rq. | ||
| 5691 | */ | ||
| 5692 | static void attach_tasks(struct lb_env *env) | ||
| 5693 | { | ||
| 5694 | struct list_head *tasks = &env->tasks; | ||
| 5695 | struct task_struct *p; | ||
| 5696 | |||
| 5697 | raw_spin_lock(&env->dst_rq->lock); | ||
| 5698 | |||
| 5699 | while (!list_empty(tasks)) { | ||
| 5700 | p = list_first_entry(tasks, struct task_struct, se.group_node); | ||
| 5701 | list_del_init(&p->se.group_node); | ||
| 5702 | |||
| 5703 | attach_task(env->dst_rq, p); | ||
| 5704 | } | ||
| 5705 | |||
| 5706 | raw_spin_unlock(&env->dst_rq->lock); | ||
| 5444 | } | 5707 | } |
| 5445 | 5708 | ||
| 5446 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5709 | #ifdef CONFIG_FAIR_GROUP_SCHED |
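The move_task() split above follows a detach-then-attach pattern: dequeue under the source runqueue lock onto a private list, then enqueue under the destination lock, so the two runqueue locks are never held at once. A toy userspace sketch of that pattern (stand-in structs and pthread mutexes, not the kernel types); compile with -pthread:

```c
/* Toy detach/attach pattern: unhook under the source lock, park on a
 * private list, re-hook under the destination lock. */
#include <stdio.h>
#include <pthread.h>

struct task { int id; struct task *next; };
struct runqueue { pthread_mutex_t lock; struct task *head; };

static struct task *detach_tasks(struct runqueue *src, int nr)
{
	struct task *detached = NULL;

	pthread_mutex_lock(&src->lock);
	while (nr-- && src->head) {
		struct task *p = src->head;	/* "deactivate" from src */
		src->head = p->next;
		p->next = detached;		/* park on the private list */
		detached = p;
	}
	pthread_mutex_unlock(&src->lock);
	return detached;
}

static void attach_tasks(struct runqueue *dst, struct task *detached)
{
	pthread_mutex_lock(&dst->lock);
	while (detached) {			/* "activate" on dst */
		struct task *p = detached;
		detached = p->next;
		p->next = dst->head;
		dst->head = p;
	}
	pthread_mutex_unlock(&dst->lock);
}

int main(void)
{
	struct task t[3] = { {1, &t[1]}, {2, &t[2]}, {3, NULL} };
	struct runqueue src = { PTHREAD_MUTEX_INITIALIZER, &t[0] };
	struct runqueue dst = { PTHREAD_MUTEX_INITIALIZER, NULL };

	attach_tasks(&dst, detach_tasks(&src, 2));

	for (struct task *p = dst.head; p; p = p->next)
		printf("moved task %d\n", p->id);
	return 0;
}
```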
| @@ -5559,6 +5822,13 @@ static unsigned long task_h_load(struct task_struct *p) | |||
| 5559 | #endif | 5822 | #endif |
| 5560 | 5823 | ||
| 5561 | /********** Helpers for find_busiest_group ************************/ | 5824 | /********** Helpers for find_busiest_group ************************/ |
| 5825 | |||
| 5826 | enum group_type { | ||
| 5827 | group_other = 0, | ||
| 5828 | group_imbalanced, | ||
| 5829 | group_overloaded, | ||
| 5830 | }; | ||
| 5831 | |||
| 5562 | /* | 5832 | /* |
| 5563 | * sg_lb_stats - stats of a sched_group required for load_balancing | 5833 | * sg_lb_stats - stats of a sched_group required for load_balancing |
| 5564 | */ | 5834 | */ |
| @@ -5572,7 +5842,7 @@ struct sg_lb_stats { | |||
| 5572 | unsigned int group_capacity_factor; | 5842 | unsigned int group_capacity_factor; |
| 5573 | unsigned int idle_cpus; | 5843 | unsigned int idle_cpus; |
| 5574 | unsigned int group_weight; | 5844 | unsigned int group_weight; |
| 5575 | int group_imb; /* Is there an imbalance in the group ? */ | 5845 | enum group_type group_type; |
| 5576 | int group_has_free_capacity; | 5846 | int group_has_free_capacity; |
| 5577 | #ifdef CONFIG_NUMA_BALANCING | 5847 | #ifdef CONFIG_NUMA_BALANCING |
| 5578 | unsigned int nr_numa_running; | 5848 | unsigned int nr_numa_running; |
| @@ -5610,6 +5880,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | |||
| 5610 | .total_capacity = 0UL, | 5880 | .total_capacity = 0UL, |
| 5611 | .busiest_stat = { | 5881 | .busiest_stat = { |
| 5612 | .avg_load = 0UL, | 5882 | .avg_load = 0UL, |
| 5883 | .sum_nr_running = 0, | ||
| 5884 | .group_type = group_other, | ||
| 5613 | }, | 5885 | }, |
| 5614 | }; | 5886 | }; |
| 5615 | } | 5887 | } |
| @@ -5652,19 +5924,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | |||
| 5652 | return default_scale_capacity(sd, cpu); | 5924 | return default_scale_capacity(sd, cpu); |
| 5653 | } | 5925 | } |
| 5654 | 5926 | ||
| 5655 | static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) | 5927 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
| 5656 | { | 5928 | { |
| 5657 | unsigned long weight = sd->span_weight; | 5929 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) |
| 5658 | unsigned long smt_gain = sd->smt_gain; | 5930 | return sd->smt_gain / sd->span_weight; |
| 5659 | |||
| 5660 | smt_gain /= weight; | ||
| 5661 | 5931 | ||
| 5662 | return smt_gain; | 5932 | return SCHED_CAPACITY_SCALE; |
| 5663 | } | 5933 | } |
| 5664 | 5934 | ||
| 5665 | unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) | 5935 | unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
| 5666 | { | 5936 | { |
| 5667 | return default_scale_smt_capacity(sd, cpu); | 5937 | return default_scale_cpu_capacity(sd, cpu); |
| 5668 | } | 5938 | } |
| 5669 | 5939 | ||
| 5670 | static unsigned long scale_rt_capacity(int cpu) | 5940 | static unsigned long scale_rt_capacity(int cpu) |
| @@ -5703,18 +5973,15 @@ static unsigned long scale_rt_capacity(int cpu) | |||
| 5703 | 5973 | ||
| 5704 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 5974 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
| 5705 | { | 5975 | { |
| 5706 | unsigned long weight = sd->span_weight; | ||
| 5707 | unsigned long capacity = SCHED_CAPACITY_SCALE; | 5976 | unsigned long capacity = SCHED_CAPACITY_SCALE; |
| 5708 | struct sched_group *sdg = sd->groups; | 5977 | struct sched_group *sdg = sd->groups; |
| 5709 | 5978 | ||
| 5710 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { | 5979 | if (sched_feat(ARCH_CAPACITY)) |
| 5711 | if (sched_feat(ARCH_CAPACITY)) | 5980 | capacity *= arch_scale_cpu_capacity(sd, cpu); |
| 5712 | capacity *= arch_scale_smt_capacity(sd, cpu); | 5981 | else |
| 5713 | else | 5982 | capacity *= default_scale_cpu_capacity(sd, cpu); |
| 5714 | capacity *= default_scale_smt_capacity(sd, cpu); | ||
| 5715 | 5983 | ||
| 5716 | capacity >>= SCHED_CAPACITY_SHIFT; | 5984 | capacity >>= SCHED_CAPACITY_SHIFT; |
| 5717 | } | ||
| 5718 | 5985 | ||
| 5719 | sdg->sgc->capacity_orig = capacity; | 5986 | sdg->sgc->capacity_orig = capacity; |
| 5720 | 5987 | ||
| @@ -5891,6 +6158,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro | |||
| 5891 | return capacity_factor; | 6158 | return capacity_factor; |
| 5892 | } | 6159 | } |
| 5893 | 6160 | ||
| 6161 | static enum group_type | ||
| 6162 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | ||
| 6163 | { | ||
| 6164 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | ||
| 6165 | return group_overloaded; | ||
| 6166 | |||
| 6167 | if (sg_imbalanced(group)) | ||
| 6168 | return group_imbalanced; | ||
| 6169 | |||
| 6170 | return group_other; | ||
| 6171 | } | ||
| 6172 | |||
| 5894 | /** | 6173 | /** |
| 5895 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 6174 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 5896 | * @env: The load balancing environment. | 6175 | * @env: The load balancing environment. |
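group_classify() only works because group_type is an ordered enum: overloaded outranks imbalanced, which outranks everything else, and update_sd_pick_busiest() compares the classification before it ever looks at average load. A small sketch of that ordering with invented group statistics:

```c
/* Sketch of the ordered group_type comparison used to pick the busiest
 * group.  Statistics are invented. */
#include <stdio.h>

enum group_type { group_other = 0, group_imbalanced, group_overloaded };

struct sg_stats {
	unsigned long avg_load;
	unsigned int sum_nr_running, capacity_factor, imbalanced;
	enum group_type type;
};

static enum group_type classify(const struct sg_stats *s)
{
	if (s->sum_nr_running > s->capacity_factor)
		return group_overloaded;
	if (s->imbalanced)
		return group_imbalanced;
	return group_other;
}

static int pick_busiest(const struct sg_stats *busiest, const struct sg_stats *sg)
{
	if (sg->type > busiest->type)
		return 1;
	if (sg->type < busiest->type)
		return 0;
	return sg->avg_load > busiest->avg_load;
}

int main(void)
{
	struct sg_stats a = { .avg_load = 900, .sum_nr_running = 4,
			      .capacity_factor = 4, .imbalanced = 1 };
	struct sg_stats b = { .avg_load = 700, .sum_nr_running = 5,
			      .capacity_factor = 4, .imbalanced = 0 };

	a.type = classify(&a);	/* imbalanced */
	b.type = classify(&b);	/* overloaded */

	/* b wins despite its lower average load. */
	printf("pick b over a? %d\n", pick_busiest(&a, &b));
	return 0;
}
```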
| @@ -5920,7 +6199,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 5920 | load = source_load(i, load_idx); | 6199 | load = source_load(i, load_idx); |
| 5921 | 6200 | ||
| 5922 | sgs->group_load += load; | 6201 | sgs->group_load += load; |
| 5923 | sgs->sum_nr_running += rq->nr_running; | 6202 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
| 5924 | 6203 | ||
| 5925 | if (rq->nr_running > 1) | 6204 | if (rq->nr_running > 1) |
| 5926 | *overload = true; | 6205 | *overload = true; |
| @@ -5942,9 +6221,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 5942 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6221 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 5943 | 6222 | ||
| 5944 | sgs->group_weight = group->group_weight; | 6223 | sgs->group_weight = group->group_weight; |
| 5945 | |||
| 5946 | sgs->group_imb = sg_imbalanced(group); | ||
| 5947 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | 6224 | sgs->group_capacity_factor = sg_capacity_factor(env, group); |
| 6225 | sgs->group_type = group_classify(group, sgs); | ||
| 5948 | 6226 | ||
| 5949 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6227 | if (sgs->group_capacity_factor > sgs->sum_nr_running) |
| 5950 | sgs->group_has_free_capacity = 1; | 6228 | sgs->group_has_free_capacity = 1; |
| @@ -5968,13 +6246,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 5968 | struct sched_group *sg, | 6246 | struct sched_group *sg, |
| 5969 | struct sg_lb_stats *sgs) | 6247 | struct sg_lb_stats *sgs) |
| 5970 | { | 6248 | { |
| 5971 | if (sgs->avg_load <= sds->busiest_stat.avg_load) | 6249 | struct sg_lb_stats *busiest = &sds->busiest_stat; |
| 5972 | return false; | ||
| 5973 | 6250 | ||
| 5974 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6251 | if (sgs->group_type > busiest->group_type) |
| 5975 | return true; | 6252 | return true; |
| 5976 | 6253 | ||
| 5977 | if (sgs->group_imb) | 6254 | if (sgs->group_type < busiest->group_type) |
| 6255 | return false; | ||
| 6256 | |||
| 6257 | if (sgs->avg_load <= busiest->avg_load) | ||
| 6258 | return false; | ||
| 6259 | |||
| 6260 | /* This is the busiest node in its class. */ | ||
| 6261 | if (!(env->sd->flags & SD_ASYM_PACKING)) | ||
| 5978 | return true; | 6262 | return true; |
| 5979 | 6263 | ||
| 5980 | /* | 6264 | /* |
| @@ -5982,8 +6266,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 5982 | * numbered CPUs in the group, therefore mark all groups | 6266 | * numbered CPUs in the group, therefore mark all groups |
| 5983 | * higher than ourself as busy. | 6267 | * higher than ourself as busy. |
| 5984 | */ | 6268 | */ |
| 5985 | if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | 6269 | if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { |
| 5986 | env->dst_cpu < group_first_cpu(sg)) { | ||
| 5987 | if (!sds->busiest) | 6270 | if (!sds->busiest) |
| 5988 | return true; | 6271 | return true; |
| 5989 | 6272 | ||
| @@ -6073,8 +6356,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
| 6073 | * with a large weight task outweighs the tasks on the system). | 6356 | * with a large weight task outweighs the tasks on the system). |
| 6074 | */ | 6357 | */ |
| 6075 | if (prefer_sibling && sds->local && | 6358 | if (prefer_sibling && sds->local && |
| 6076 | sds->local_stat.group_has_free_capacity) | 6359 | sds->local_stat.group_has_free_capacity) { |
| 6077 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); | 6360 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); |
| 6361 | sgs->group_type = group_classify(sg, sgs); | ||
| 6362 | } | ||
| 6078 | 6363 | ||
| 6079 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6364 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
| 6080 | sds->busiest = sg; | 6365 | sds->busiest = sg; |
| @@ -6228,7 +6513,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 6228 | local = &sds->local_stat; | 6513 | local = &sds->local_stat; |
| 6229 | busiest = &sds->busiest_stat; | 6514 | busiest = &sds->busiest_stat; |
| 6230 | 6515 | ||
| 6231 | if (busiest->group_imb) { | 6516 | if (busiest->group_type == group_imbalanced) { |
| 6232 | /* | 6517 | /* |
| 6233 | * In the group_imb case we cannot rely on group-wide averages | 6518 | * In the group_imb case we cannot rely on group-wide averages |
| 6234 | * to ensure cpu-load equilibrium, look at wider averages. XXX | 6519 | * to ensure cpu-load equilibrium, look at wider averages. XXX |
| @@ -6248,12 +6533,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 6248 | return fix_small_imbalance(env, sds); | 6533 | return fix_small_imbalance(env, sds); |
| 6249 | } | 6534 | } |
| 6250 | 6535 | ||
| 6251 | if (!busiest->group_imb) { | 6536 | /* |
| 6252 | /* | 6537 | * If there aren't any idle cpus, avoid creating some. |
| 6253 | * Don't want to pull so many tasks that a group would go idle. | 6538 | */ |
| 6254 | * Except of course for the group_imb case, since then we might | 6539 | if (busiest->group_type == group_overloaded && |
| 6255 | * have to drop below capacity to reach cpu-load equilibrium. | 6540 | local->group_type == group_overloaded) { |
| 6256 | */ | ||
| 6257 | load_above_capacity = | 6541 | load_above_capacity = |
| 6258 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6542 | (busiest->sum_nr_running - busiest->group_capacity_factor); |
| 6259 | 6543 | ||
| @@ -6337,7 +6621,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6337 | * work because they assume all things are equal, which typically | 6621 | * work because they assume all things are equal, which typically |
| 6338 | * isn't true due to cpus_allowed constraints and the like. | 6622 | * isn't true due to cpus_allowed constraints and the like. |
| 6339 | */ | 6623 | */ |
| 6340 | if (busiest->group_imb) | 6624 | if (busiest->group_type == group_imbalanced) |
| 6341 | goto force_balance; | 6625 | goto force_balance; |
| 6342 | 6626 | ||
| 6343 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6627 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
| @@ -6346,7 +6630,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6346 | goto force_balance; | 6630 | goto force_balance; |
| 6347 | 6631 | ||
| 6348 | /* | 6632 | /* |
| 6349 | * If the local group is more busy than the selected busiest group | 6633 | * If the local group is busier than the selected busiest group |
| 6350 | * don't try and pull any tasks. | 6634 | * don't try and pull any tasks. |
| 6351 | */ | 6635 | */ |
| 6352 | if (local->avg_load >= busiest->avg_load) | 6636 | if (local->avg_load >= busiest->avg_load) |
| @@ -6361,13 +6645,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6361 | 6645 | ||
| 6362 | if (env->idle == CPU_IDLE) { | 6646 | if (env->idle == CPU_IDLE) { |
| 6363 | /* | 6647 | /* |
| 6364 | * This cpu is idle. If the busiest group load doesn't | 6648 | * This cpu is idle. If the busiest group is not overloaded |
| 6365 | * have more tasks than the number of available cpu's and | 6649 | * and there is no imbalance between this and busiest group |
| 6366 | * there is no imbalance between this and busiest group | 6650 | * wrt idle cpus, it is balanced. The imbalance becomes |
| 6367 | * wrt to idle cpu's, it is balanced. | 6651 | * significant if the diff is greater than 1, otherwise we |
| 6652 | * might end up just moving the imbalance to another group | ||
| 6368 | */ | 6653 | */ |
| 6369 | if ((local->idle_cpus < busiest->idle_cpus) && | 6654 | if ((busiest->group_type != group_overloaded) && |
| 6370 | busiest->sum_nr_running <= busiest->group_weight) | 6655 | (local->idle_cpus <= (busiest->idle_cpus + 1))) |
| 6371 | goto out_balanced; | 6656 | goto out_balanced; |
| 6372 | } else { | 6657 | } else { |
| 6373 | /* | 6658 | /* |
| @@ -6539,7 +6824,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 6539 | struct sched_group *group; | 6824 | struct sched_group *group; |
| 6540 | struct rq *busiest; | 6825 | struct rq *busiest; |
| 6541 | unsigned long flags; | 6826 | unsigned long flags; |
| 6542 | struct cpumask *cpus = __get_cpu_var(load_balance_mask); | 6827 | struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); |
| 6543 | 6828 | ||
| 6544 | struct lb_env env = { | 6829 | struct lb_env env = { |
| 6545 | .sd = sd, | 6830 | .sd = sd, |
| @@ -6550,6 +6835,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 6550 | .loop_break = sched_nr_migrate_break, | 6835 | .loop_break = sched_nr_migrate_break, |
| 6551 | .cpus = cpus, | 6836 | .cpus = cpus, |
| 6552 | .fbq_type = all, | 6837 | .fbq_type = all, |
| 6838 | .tasks = LIST_HEAD_INIT(env.tasks), | ||
| 6553 | }; | 6839 | }; |
| 6554 | 6840 | ||
| 6555 | /* | 6841 | /* |
| @@ -6599,23 +6885,30 @@ redo: | |||
| 6599 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6885 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
| 6600 | 6886 | ||
| 6601 | more_balance: | 6887 | more_balance: |
| 6602 | local_irq_save(flags); | 6888 | raw_spin_lock_irqsave(&busiest->lock, flags); |
| 6603 | double_rq_lock(env.dst_rq, busiest); | ||
| 6604 | 6889 | ||
| 6605 | /* | 6890 | /* |
| 6606 | * cur_ld_moved - load moved in current iteration | 6891 | * cur_ld_moved - load moved in current iteration |
| 6607 | * ld_moved - cumulative load moved across iterations | 6892 | * ld_moved - cumulative load moved across iterations |
| 6608 | */ | 6893 | */ |
| 6609 | cur_ld_moved = move_tasks(&env); | 6894 | cur_ld_moved = detach_tasks(&env); |
| 6610 | ld_moved += cur_ld_moved; | ||
| 6611 | double_rq_unlock(env.dst_rq, busiest); | ||
| 6612 | local_irq_restore(flags); | ||
| 6613 | 6895 | ||
| 6614 | /* | 6896 | /* |
| 6615 | * some other cpu did the load balance for us. | 6897 | * We've detached some tasks from busiest_rq. Every |
| 6898 | * task is marked TASK_ON_RQ_MIGRATING, so we can safely | ||
| 6899 | * unlock busiest->lock and be sure that nobody can | ||
| 6900 | * manipulate the tasks in parallel. | ||
| 6901 | * See task_rq_lock() family for the details. | ||
| 6616 | */ | 6902 | */ |
| 6617 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) | 6903 | |
| 6618 | resched_cpu(env.dst_cpu); | 6904 | raw_spin_unlock(&busiest->lock); |
| 6905 | |||
| 6906 | if (cur_ld_moved) { | ||
| 6907 | attach_tasks(&env); | ||
| 6908 | ld_moved += cur_ld_moved; | ||
| 6909 | } | ||
| 6910 | |||
| 6911 | local_irq_restore(flags); | ||
| 6619 | 6912 | ||
| 6620 | if (env.flags & LBF_NEED_BREAK) { | 6913 | if (env.flags & LBF_NEED_BREAK) { |
| 6621 | env.flags &= ~LBF_NEED_BREAK; | 6914 | env.flags &= ~LBF_NEED_BREAK; |
| @@ -6665,10 +6958,8 @@ more_balance: | |||
| 6665 | if (sd_parent) { | 6958 | if (sd_parent) { |
| 6666 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; | 6959 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; |
| 6667 | 6960 | ||
| 6668 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | 6961 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) |
| 6669 | *group_imbalance = 1; | 6962 | *group_imbalance = 1; |
| 6670 | } else if (*group_imbalance) | ||
| 6671 | *group_imbalance = 0; | ||
| 6672 | } | 6963 | } |
| 6673 | 6964 | ||
| 6674 | /* All tasks on this runqueue were pinned by CPU affinity */ | 6965 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| @@ -6679,7 +6970,7 @@ more_balance: | |||
| 6679 | env.loop_break = sched_nr_migrate_break; | 6970 | env.loop_break = sched_nr_migrate_break; |
| 6680 | goto redo; | 6971 | goto redo; |
| 6681 | } | 6972 | } |
| 6682 | goto out_balanced; | 6973 | goto out_all_pinned; |
| 6683 | } | 6974 | } |
| 6684 | } | 6975 | } |
| 6685 | 6976 | ||
| @@ -6744,7 +7035,7 @@ more_balance: | |||
| 6744 | * If we've begun active balancing, start to back off. This | 7035 | * If we've begun active balancing, start to back off. This |
| 6745 | * case may not be covered by the all_pinned logic if there | 7036 | * case may not be covered by the all_pinned logic if there |
| 6746 | * is only 1 task on the busy runqueue (because we don't call | 7037 | * is only 1 task on the busy runqueue (because we don't call |
| 6747 | * move_tasks). | 7038 | * detach_tasks). |
| 6748 | */ | 7039 | */ |
| 6749 | if (sd->balance_interval < sd->max_interval) | 7040 | if (sd->balance_interval < sd->max_interval) |
| 6750 | sd->balance_interval *= 2; | 7041 | sd->balance_interval *= 2; |
| @@ -6753,6 +7044,23 @@ more_balance: | |||
| 6753 | goto out; | 7044 | goto out; |
| 6754 | 7045 | ||
| 6755 | out_balanced: | 7046 | out_balanced: |
| 7047 | /* | ||
| 7048 | * We reach balance although we may have faced some affinity | ||
| 7049 | * constraints. Clear the imbalance flag if it was set. | ||
| 7050 | */ | ||
| 7051 | if (sd_parent) { | ||
| 7052 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; | ||
| 7053 | |||
| 7054 | if (*group_imbalance) | ||
| 7055 | *group_imbalance = 0; | ||
| 7056 | } | ||
| 7057 | |||
| 7058 | out_all_pinned: | ||
| 7059 | /* | ||
| 7060 | * We reach balance because all tasks are pinned at this level, so | ||
| 7061 | * we can't migrate them. Leave the imbalance flag set so the parent level | ||
| 7062 | * can try to migrate them. | ||
| 7063 | */ | ||
| 6756 | schedstat_inc(sd, lb_balanced[idle]); | 7064 | schedstat_inc(sd, lb_balanced[idle]); |
| 6757 | 7065 | ||
| 6758 | sd->nr_balance_failed = 0; | 7066 | sd->nr_balance_failed = 0; |
| @@ -6914,6 +7222,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 6914 | int target_cpu = busiest_rq->push_cpu; | 7222 | int target_cpu = busiest_rq->push_cpu; |
| 6915 | struct rq *target_rq = cpu_rq(target_cpu); | 7223 | struct rq *target_rq = cpu_rq(target_cpu); |
| 6916 | struct sched_domain *sd; | 7224 | struct sched_domain *sd; |
| 7225 | struct task_struct *p = NULL; | ||
| 6917 | 7226 | ||
| 6918 | raw_spin_lock_irq(&busiest_rq->lock); | 7227 | raw_spin_lock_irq(&busiest_rq->lock); |
| 6919 | 7228 | ||
| @@ -6933,9 +7242,6 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 6933 | */ | 7242 | */ |
| 6934 | BUG_ON(busiest_rq == target_rq); | 7243 | BUG_ON(busiest_rq == target_rq); |
| 6935 | 7244 | ||
| 6936 | /* move a task from busiest_rq to target_rq */ | ||
| 6937 | double_lock_balance(busiest_rq, target_rq); | ||
| 6938 | |||
| 6939 | /* Search for an sd spanning us and the target CPU. */ | 7245 | /* Search for an sd spanning us and the target CPU. */ |
| 6940 | rcu_read_lock(); | 7246 | rcu_read_lock(); |
| 6941 | for_each_domain(target_cpu, sd) { | 7247 | for_each_domain(target_cpu, sd) { |
| @@ -6956,16 +7262,22 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 6956 | 7262 | ||
| 6957 | schedstat_inc(sd, alb_count); | 7263 | schedstat_inc(sd, alb_count); |
| 6958 | 7264 | ||
| 6959 | if (move_one_task(&env)) | 7265 | p = detach_one_task(&env); |
| 7266 | if (p) | ||
| 6960 | schedstat_inc(sd, alb_pushed); | 7267 | schedstat_inc(sd, alb_pushed); |
| 6961 | else | 7268 | else |
| 6962 | schedstat_inc(sd, alb_failed); | 7269 | schedstat_inc(sd, alb_failed); |
| 6963 | } | 7270 | } |
| 6964 | rcu_read_unlock(); | 7271 | rcu_read_unlock(); |
| 6965 | double_unlock_balance(busiest_rq, target_rq); | ||
| 6966 | out_unlock: | 7272 | out_unlock: |
| 6967 | busiest_rq->active_balance = 0; | 7273 | busiest_rq->active_balance = 0; |
| 6968 | raw_spin_unlock_irq(&busiest_rq->lock); | 7274 | raw_spin_unlock(&busiest_rq->lock); |
| 7275 | |||
| 7276 | if (p) | ||
| 7277 | attach_one_task(target_rq, p); | ||
| 7278 | |||
| 7279 | local_irq_enable(); | ||
| 7280 | |||
| 6969 | return 0; | 7281 | return 0; |
| 6970 | } | 7282 | } |
| 6971 | 7283 | ||
| @@ -7465,7 +7777,7 @@ static void task_fork_fair(struct task_struct *p) | |||
| 7465 | static void | 7777 | static void |
| 7466 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) | 7778 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) |
| 7467 | { | 7779 | { |
| 7468 | if (!p->se.on_rq) | 7780 | if (!task_on_rq_queued(p)) |
| 7469 | return; | 7781 | return; |
| 7470 | 7782 | ||
| 7471 | /* | 7783 | /* |
| @@ -7490,11 +7802,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
| 7490 | * switched back to the fair class the enqueue_entity(.flags=0) will | 7802 | * switched back to the fair class the enqueue_entity(.flags=0) will |
| 7491 | * do the right thing. | 7803 | * do the right thing. |
| 7492 | * | 7804 | * |
| 7493 | * If it's on_rq, then the dequeue_entity(.flags=0) will already | 7805 | * If it's queued, then the dequeue_entity(.flags=0) will already |
| 7494 | * have normalized the vruntime, if it's !on_rq, then only when | 7806 | * have normalized the vruntime, if it's !queued, then only when |
| 7495 | * the task is sleeping will it still have non-normalized vruntime. | 7807 | * the task is sleeping will it still have non-normalized vruntime. |
| 7496 | */ | 7808 | */ |
| 7497 | if (!p->on_rq && p->state != TASK_RUNNING) { | 7809 | if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { |
| 7498 | /* | 7810 | /* |
| 7499 | * Fix up our vruntime so that the current sleep doesn't | 7811 | * Fix up our vruntime so that the current sleep doesn't |
| 7500 | * cause 'unlimited' sleep bonus. | 7812 | * cause 'unlimited' sleep bonus. |
| @@ -7521,15 +7833,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
| 7521 | */ | 7833 | */ |
| 7522 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | 7834 | static void switched_to_fair(struct rq *rq, struct task_struct *p) |
| 7523 | { | 7835 | { |
| 7524 | struct sched_entity *se = &p->se; | ||
| 7525 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7836 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7837 | struct sched_entity *se = &p->se; | ||
| 7526 | /* | 7838 | /* |
| 7527 | * Since the real-depth could have been changed (only FAIR | 7839 | * Since the real-depth could have been changed (only FAIR |
| 7528 | * class maintains the depth value), reset depth properly. | 7840 | * class maintains the depth value), reset depth properly. |
| 7529 | */ | 7841 | */ |
| 7530 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 7842 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
| 7531 | #endif | 7843 | #endif |
| 7532 | if (!se->on_rq) | 7844 | if (!task_on_rq_queued(p)) |
| 7533 | return; | 7845 | return; |
| 7534 | 7846 | ||
| 7535 | /* | 7847 | /* |
| @@ -7575,7 +7887,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 7575 | } | 7887 | } |
| 7576 | 7888 | ||
| 7577 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7889 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7578 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 7890 | static void task_move_group_fair(struct task_struct *p, int queued) |
| 7579 | { | 7891 | { |
| 7580 | struct sched_entity *se = &p->se; | 7892 | struct sched_entity *se = &p->se; |
| 7581 | struct cfs_rq *cfs_rq; | 7893 | struct cfs_rq *cfs_rq; |
| @@ -7594,7 +7906,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
| 7594 | * fair sleeper stuff for the first placement, but who cares. | 7906 | * fair sleeper stuff for the first placement, but who cares. |
| 7595 | */ | 7907 | */ |
| 7596 | /* | 7908 | /* |
| 7597 | * When !on_rq, vruntime of the task has usually NOT been normalized. | 7909 | * When !queued, vruntime of the task has usually NOT been normalized. |
| 7598 | * But there are some cases where it has already been normalized: | 7910 | * But there are some cases where it has already been normalized: |
| 7599 | * | 7911 | * |
| 7600 | * - Moving a forked child which is waiting for being woken up by | 7912 | * - Moving a forked child which is waiting for being woken up by |
| @@ -7605,14 +7917,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
| 7605 | * To prevent boost or penalty in the new cfs_rq caused by delta | 7917 | * To prevent boost or penalty in the new cfs_rq caused by delta |
| 7606 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | 7918 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. |
| 7607 | */ | 7919 | */ |
| 7608 | if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) | 7920 | if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) |
| 7609 | on_rq = 1; | 7921 | queued = 1; |
| 7610 | 7922 | ||
| 7611 | if (!on_rq) | 7923 | if (!queued) |
| 7612 | se->vruntime -= cfs_rq_of(se)->min_vruntime; | 7924 | se->vruntime -= cfs_rq_of(se)->min_vruntime; |
| 7613 | set_task_rq(p, task_cpu(p)); | 7925 | set_task_rq(p, task_cpu(p)); |
| 7614 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 7926 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
| 7615 | if (!on_rq) { | 7927 | if (!queued) { |
| 7616 | cfs_rq = cfs_rq_of(se); | 7928 | cfs_rq = cfs_rq_of(se); |
| 7617 | se->vruntime += cfs_rq->min_vruntime; | 7929 | se->vruntime += cfs_rq->min_vruntime; |
| 7618 | #ifdef CONFIG_SMP | 7930 | #ifdef CONFIG_SMP |
| @@ -7835,6 +8147,8 @@ const struct sched_class fair_sched_class = { | |||
| 7835 | 8147 | ||
| 7836 | .get_rr_interval = get_rr_interval_fair, | 8148 | .get_rr_interval = get_rr_interval_fair, |
| 7837 | 8149 | ||
| 8150 | .update_curr = update_curr_fair, | ||
| 8151 | |||
| 7838 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8152 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7839 | .task_move_group = task_move_group_fair, | 8153 | .task_move_group = task_move_group_fair, |
| 7840 | #endif | 8154 | #endif |
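The load_balance() hunks above replace move_tasks() under double_rq_lock() with a detach_tasks()/attach_tasks() pair: tasks are taken off the busiest runqueue under that runqueue's lock alone, flagged as migrating, and only enqueued on the destination after the source lock has been dropped. A minimal userspace sketch of that two-phase pattern, with pthread mutexes standing in for rq->lock and a hand-rolled list instead of the kernel's helpers (all names below are illustrative, not the kernel API):

```c
#include <pthread.h>
#include <stdio.h>

enum { ON_RQ_QUEUED = 1, ON_RQ_MIGRATING = 2 };	/* mirrors TASK_ON_RQ_* */

struct task { int id; int on_rq; struct task *next; };

struct runqueue {
	pthread_mutex_t lock;
	struct task *head;
};

/* Pop up to max_nr tasks from src while holding only src->lock. */
static struct task *detach_tasks(struct runqueue *src, int max_nr)
{
	struct task *detached = NULL;

	pthread_mutex_lock(&src->lock);
	while (src->head && max_nr--) {
		struct task *p = src->head;

		src->head = p->next;
		p->on_rq = ON_RQ_MIGRATING;	/* off the list, visibly in transit */
		p->next = detached;
		detached = p;
	}
	pthread_mutex_unlock(&src->lock);	/* safe: detached tasks are private now */
	return detached;
}

/* Attach the detached tasks while holding only dst->lock. */
static void attach_tasks(struct runqueue *dst, struct task *detached)
{
	pthread_mutex_lock(&dst->lock);
	while (detached) {
		struct task *p = detached;

		detached = p->next;
		p->next = dst->head;
		dst->head = p;
		p->on_rq = ON_RQ_QUEUED;
	}
	pthread_mutex_unlock(&dst->lock);
}

int main(void)
{
	struct task t[3] = { { 1, ON_RQ_QUEUED, &t[1] },
			     { 2, ON_RQ_QUEUED, &t[2] },
			     { 3, ON_RQ_QUEUED, NULL } };
	struct runqueue busiest = { PTHREAD_MUTEX_INITIALIZER, &t[0] };
	struct runqueue dst = { PTHREAD_MUTEX_INITIALIZER, NULL };

	attach_tasks(&dst, detach_tasks(&busiest, 2));
	for (struct task *p = dst.head; p; p = p->next)
		printf("moved task %d\n", p->id);
	return 0;
}
```

The point of the split, as the comment in the hunk notes, is that neither lock is held across the whole move, so the two runqueue locks never need to be taken together.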
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 11e7bc434f43..c47fce75e666 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -147,6 +147,9 @@ use_default: | |||
| 147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | 147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) |
| 148 | goto use_default; | 148 | goto use_default; |
| 149 | 149 | ||
| 150 | /* Take note of the planned idle state. */ | ||
| 151 | idle_set_state(this_rq(), &drv->states[next_state]); | ||
| 152 | |||
| 150 | /* | 153 | /* |
| 151 | * Enter the idle state previously returned by the governor decision. | 154 | * Enter the idle state previously returned by the governor decision. |
| 152 | * This function will block until an interrupt occurs and will take | 155 | * This function will block until an interrupt occurs and will take |
| @@ -154,6 +157,9 @@ use_default: | |||
| 154 | */ | 157 | */ |
| 155 | entered_state = cpuidle_enter(drv, dev, next_state); | 158 | entered_state = cpuidle_enter(drv, dev, next_state); |
| 156 | 159 | ||
| 160 | /* The cpu is no longer idle or about to enter idle. */ | ||
| 161 | idle_set_state(this_rq(), NULL); | ||
| 162 | |||
| 157 | if (broadcast) | 163 | if (broadcast) |
| 158 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 164 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); |
| 159 | 165 | ||
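The idle.c change records, on the runqueue, which cpuidle state the CPU is about to enter and clears it again once cpuidle_enter() returns; the sched.h hunk further down adds the idle_set_state()/idle_get_state() accessors for it. A toy model of that publish/clear bracketing (the names and types below are made up for illustration, not the cpuidle API):

```c
#include <stdio.h>

struct idle_state { int exit_latency_us; };

/* Stands in for rq->idle_state in the hunks above. */
static struct idle_state *planned_state;

static void fake_cpuidle_enter(struct idle_state *s)
{
	/* Would block until an interrupt in the real code. */
	printf("idling, exit latency %d us\n", s->exit_latency_us);
}

static void do_idle(struct idle_state *chosen)
{
	planned_state = chosen;		/* take note of the planned idle state */
	fake_cpuidle_enter(chosen);
	planned_state = NULL;		/* no longer idle or about to enter idle */
}

int main(void)
{
	struct idle_state deep = { .exit_latency_us = 100 };

	do_idle(&deep);
	return 0;
}
```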
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 67ad4e7f506a..c65dac8c97cd 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
| @@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task | |||
| 75 | return 0; | 75 | return 0; |
| 76 | } | 76 | } |
| 77 | 77 | ||
| 78 | static void update_curr_idle(struct rq *rq) | ||
| 79 | { | ||
| 80 | } | ||
| 81 | |||
| 78 | /* | 82 | /* |
| 79 | * Simple, special scheduling class for the per-CPU idle tasks: | 83 | * Simple, special scheduling class for the per-CPU idle tasks: |
| 80 | */ | 84 | */ |
| @@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = { | |||
| 101 | 105 | ||
| 102 | .prio_changed = prio_changed_idle, | 106 | .prio_changed = prio_changed_idle, |
| 103 | .switched_to = switched_to_idle, | 107 | .switched_to = switched_to_idle, |
| 108 | .update_curr = update_curr_idle, | ||
| 104 | }; | 109 | }; |
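This stub pairs with the identical one added for the stop class below and the real implementations wired up in fair.c and rt.c: together they fill in the new ->update_curr() method that the sched.h hunk adds to struct sched_class. A stand-alone sketch of that shape, a per-class hook table with empty stubs where there is nothing to account (the struct and names are a simplified model, not the kernel's):

```c
#include <stdio.h>

struct rq;	/* opaque in this sketch */

struct sched_class_model {
	const char *name;
	void (*update_curr)(struct rq *rq);
};

static void update_curr_fair_model(struct rq *rq)
{
	(void)rq;
	puts("charge runtime to the current CFS task");
}

static void update_curr_idle_model(struct rq *rq)
{
	(void)rq;	/* nothing to account, like update_curr_idle() above */
}

int main(void)
{
	struct sched_class_model classes[] = {
		{ "fair", update_curr_fair_model },
		{ "idle", update_curr_idle_model },
	};

	for (unsigned int i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
		classes[i].update_curr(NULL);	/* rq unused in this model */
	return 0;
}
```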
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5f6edca4fafd..ee15f5a0d1c1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
| 1301 | struct task_struct *curr; | 1301 | struct task_struct *curr; |
| 1302 | struct rq *rq; | 1302 | struct rq *rq; |
| 1303 | 1303 | ||
| 1304 | if (p->nr_cpus_allowed == 1) | ||
| 1305 | goto out; | ||
| 1306 | |||
| 1307 | /* For anything but wake ups, just return the task_cpu */ | 1304 | /* For anything but wake ups, just return the task_cpu */ |
| 1308 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | 1305 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) |
| 1309 | goto out; | 1306 | goto out; |
| @@ -1351,16 +1348,22 @@ out: | |||
| 1351 | 1348 | ||
| 1352 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1349 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
| 1353 | { | 1350 | { |
| 1354 | if (rq->curr->nr_cpus_allowed == 1) | 1351 | /* |
| 1352 | * Current can't be migrated, useless to reschedule, | ||
| 1353 | * let's hope p can move out. | ||
| 1354 | */ | ||
| 1355 | if (rq->curr->nr_cpus_allowed == 1 || | ||
| 1356 | !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) | ||
| 1355 | return; | 1357 | return; |
| 1356 | 1358 | ||
| 1359 | /* | ||
| 1360 | * p is migratable, so let's not schedule it and | ||
| 1361 | * see if it is pushed or pulled somewhere else. | ||
| 1362 | */ | ||
| 1357 | if (p->nr_cpus_allowed != 1 | 1363 | if (p->nr_cpus_allowed != 1 |
| 1358 | && cpupri_find(&rq->rd->cpupri, p, NULL)) | 1364 | && cpupri_find(&rq->rd->cpupri, p, NULL)) |
| 1359 | return; | 1365 | return; |
| 1360 | 1366 | ||
| 1361 | if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) | ||
| 1362 | return; | ||
| 1363 | |||
| 1364 | /* | 1367 | /* |
| 1365 | * There appears to be other cpus that can accept | 1368 | * There appears to be other cpus that can accept |
| 1366 | * current and none to run 'p', so lets reschedule | 1369 | * current and none to run 'p', so lets reschedule |
| @@ -1448,7 +1451,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) | |||
| 1448 | * means a dl or stop task can slip in, in which case we need | 1451 | * means a dl or stop task can slip in, in which case we need |
| 1449 | * to re-start task selection. | 1452 | * to re-start task selection. |
| 1450 | */ | 1453 | */ |
| 1451 | if (unlikely((rq->stop && rq->stop->on_rq) || | 1454 | if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || |
| 1452 | rq->dl.dl_nr_running)) | 1455 | rq->dl.dl_nr_running)) |
| 1453 | return RETRY_TASK; | 1456 | return RETRY_TASK; |
| 1454 | } | 1457 | } |
| @@ -1468,8 +1471,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) | |||
| 1468 | p = _pick_next_task_rt(rq); | 1471 | p = _pick_next_task_rt(rq); |
| 1469 | 1472 | ||
| 1470 | /* The running task is never eligible for pushing */ | 1473 | /* The running task is never eligible for pushing */ |
| 1471 | if (p) | 1474 | dequeue_pushable_task(rq, p); |
| 1472 | dequeue_pushable_task(rq, p); | ||
| 1473 | 1475 | ||
| 1474 | set_post_schedule(rq); | 1476 | set_post_schedule(rq); |
| 1475 | 1477 | ||
| @@ -1526,7 +1528,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); | |||
| 1526 | static int find_lowest_rq(struct task_struct *task) | 1528 | static int find_lowest_rq(struct task_struct *task) |
| 1527 | { | 1529 | { |
| 1528 | struct sched_domain *sd; | 1530 | struct sched_domain *sd; |
| 1529 | struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); | 1531 | struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); |
| 1530 | int this_cpu = smp_processor_id(); | 1532 | int this_cpu = smp_processor_id(); |
| 1531 | int cpu = task_cpu(task); | 1533 | int cpu = task_cpu(task); |
| 1532 | 1534 | ||
| @@ -1624,7 +1626,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1624 | !cpumask_test_cpu(lowest_rq->cpu, | 1626 | !cpumask_test_cpu(lowest_rq->cpu, |
| 1625 | tsk_cpus_allowed(task)) || | 1627 | tsk_cpus_allowed(task)) || |
| 1626 | task_running(rq, task) || | 1628 | task_running(rq, task) || |
| 1627 | !task->on_rq)) { | 1629 | !task_on_rq_queued(task))) { |
| 1628 | 1630 | ||
| 1629 | double_unlock_balance(rq, lowest_rq); | 1631 | double_unlock_balance(rq, lowest_rq); |
| 1630 | lowest_rq = NULL; | 1632 | lowest_rq = NULL; |
| @@ -1658,7 +1660,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
| 1658 | BUG_ON(task_current(rq, p)); | 1660 | BUG_ON(task_current(rq, p)); |
| 1659 | BUG_ON(p->nr_cpus_allowed <= 1); | 1661 | BUG_ON(p->nr_cpus_allowed <= 1); |
| 1660 | 1662 | ||
| 1661 | BUG_ON(!p->on_rq); | 1663 | BUG_ON(!task_on_rq_queued(p)); |
| 1662 | BUG_ON(!rt_task(p)); | 1664 | BUG_ON(!rt_task(p)); |
| 1663 | 1665 | ||
| 1664 | return p; | 1666 | return p; |
| @@ -1809,7 +1811,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1809 | */ | 1811 | */ |
| 1810 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { | 1812 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { |
| 1811 | WARN_ON(p == src_rq->curr); | 1813 | WARN_ON(p == src_rq->curr); |
| 1812 | WARN_ON(!p->on_rq); | 1814 | WARN_ON(!task_on_rq_queued(p)); |
| 1813 | 1815 | ||
| 1814 | /* | 1816 | /* |
| 1815 | * There's a chance that p is higher in priority | 1817 | * There's a chance that p is higher in priority |
| @@ -1870,7 +1872,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
| 1870 | 1872 | ||
| 1871 | BUG_ON(!rt_task(p)); | 1873 | BUG_ON(!rt_task(p)); |
| 1872 | 1874 | ||
| 1873 | if (!p->on_rq) | 1875 | if (!task_on_rq_queued(p)) |
| 1874 | return; | 1876 | return; |
| 1875 | 1877 | ||
| 1876 | weight = cpumask_weight(new_mask); | 1878 | weight = cpumask_weight(new_mask); |
| @@ -1936,7 +1938,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
| 1936 | * we may need to handle the pulling of RT tasks | 1938 | * we may need to handle the pulling of RT tasks |
| 1937 | * now. | 1939 | * now. |
| 1938 | */ | 1940 | */ |
| 1939 | if (!p->on_rq || rq->rt.rt_nr_running) | 1941 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) |
| 1940 | return; | 1942 | return; |
| 1941 | 1943 | ||
| 1942 | if (pull_rt_task(rq)) | 1944 | if (pull_rt_task(rq)) |
| @@ -1970,7 +1972,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
| 1970 | * If that current running task is also an RT task | 1972 | * If that current running task is also an RT task |
| 1971 | * then see if we can move to another run queue. | 1973 | * then see if we can move to another run queue. |
| 1972 | */ | 1974 | */ |
| 1973 | if (p->on_rq && rq->curr != p) { | 1975 | if (task_on_rq_queued(p) && rq->curr != p) { |
| 1974 | #ifdef CONFIG_SMP | 1976 | #ifdef CONFIG_SMP |
| 1975 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && | 1977 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && |
| 1976 | /* Don't resched if we changed runqueues */ | 1978 | /* Don't resched if we changed runqueues */ |
| @@ -1989,7 +1991,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
| 1989 | static void | 1991 | static void |
| 1990 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | 1992 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
| 1991 | { | 1993 | { |
| 1992 | if (!p->on_rq) | 1994 | if (!task_on_rq_queued(p)) |
| 1993 | return; | 1995 | return; |
| 1994 | 1996 | ||
| 1995 | if (rq->curr == p) { | 1997 | if (rq->curr == p) { |
| @@ -2073,7 +2075,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
| 2073 | for_each_sched_rt_entity(rt_se) { | 2075 | for_each_sched_rt_entity(rt_se) { |
| 2074 | if (rt_se->run_list.prev != rt_se->run_list.next) { | 2076 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
| 2075 | requeue_task_rt(rq, p, 0); | 2077 | requeue_task_rt(rq, p, 0); |
| 2076 | set_tsk_need_resched(p); | 2078 | resched_curr(rq); |
| 2077 | return; | 2079 | return; |
| 2078 | } | 2080 | } |
| 2079 | } | 2081 | } |
| @@ -2129,6 +2131,8 @@ const struct sched_class rt_sched_class = { | |||
| 2129 | 2131 | ||
| 2130 | .prio_changed = prio_changed_rt, | 2132 | .prio_changed = prio_changed_rt, |
| 2131 | .switched_to = switched_to_rt, | 2133 | .switched_to = switched_to_rt, |
| 2134 | |||
| 2135 | .update_curr = update_curr_rt, | ||
| 2132 | }; | 2136 | }; |
| 2133 | 2137 | ||
| 2134 | #ifdef CONFIG_SCHED_DEBUG | 2138 | #ifdef CONFIG_SCHED_DEBUG |
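The reworked check_preempt_equal_prio() above first bails out when current cannot be moved anywhere, and then declines to reschedule when p itself could be pushed or pulled elsewhere. Restated as a stand-alone predicate, with the two cpupri_find() lookups replaced by boolean parameters (an illustration of the control flow only):

```c
#include <stdbool.h>
#include <stdio.h>

static bool should_resched_equal_prio(int curr_cpus_allowed, bool curr_has_lower_cpu,
				      int p_cpus_allowed, bool p_has_lower_cpu)
{
	/* Current can't be migrated: rescheduling is useless. */
	if (curr_cpus_allowed == 1 || !curr_has_lower_cpu)
		return false;

	/* p can run elsewhere: let push/pull move it instead. */
	if (p_cpus_allowed != 1 && p_has_lower_cpu)
		return false;

	/* Other CPUs can take current but none can take p: reschedule. */
	return true;
}

int main(void)
{
	printf("%d\n", should_resched_equal_prio(4, true, 1, false));	/* 1 */
	printf("%d\n", should_resched_equal_prio(1, false, 4, true));	/* 0 */
	return 0;
}
```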
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 579712f4e9d5..9a2a45c970e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -14,6 +14,11 @@ | |||
| 14 | #include "cpuacct.h" | 14 | #include "cpuacct.h" |
| 15 | 15 | ||
| 16 | struct rq; | 16 | struct rq; |
| 17 | struct cpuidle_state; | ||
| 18 | |||
| 19 | /* task_struct::on_rq states: */ | ||
| 20 | #define TASK_ON_RQ_QUEUED 1 | ||
| 21 | #define TASK_ON_RQ_MIGRATING 2 | ||
| 17 | 22 | ||
| 18 | extern __read_mostly int scheduler_running; | 23 | extern __read_mostly int scheduler_running; |
| 19 | 24 | ||
| @@ -126,6 +131,9 @@ struct rt_bandwidth { | |||
| 126 | u64 rt_runtime; | 131 | u64 rt_runtime; |
| 127 | struct hrtimer rt_period_timer; | 132 | struct hrtimer rt_period_timer; |
| 128 | }; | 133 | }; |
| 134 | |||
| 135 | void __dl_clear_params(struct task_struct *p); | ||
| 136 | |||
| 129 | /* | 137 | /* |
| 130 | * To keep the bandwidth of -deadline tasks and groups under control | 138 | * To keep the bandwidth of -deadline tasks and groups under control |
| 131 | * we need some place where: | 139 | * we need some place where: |
| @@ -168,6 +176,25 @@ struct dl_bw { | |||
| 168 | u64 bw, total_bw; | 176 | u64 bw, total_bw; |
| 169 | }; | 177 | }; |
| 170 | 178 | ||
| 179 | static inline | ||
| 180 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | ||
| 181 | { | ||
| 182 | dl_b->total_bw -= tsk_bw; | ||
| 183 | } | ||
| 184 | |||
| 185 | static inline | ||
| 186 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | ||
| 187 | { | ||
| 188 | dl_b->total_bw += tsk_bw; | ||
| 189 | } | ||
| 190 | |||
| 191 | static inline | ||
| 192 | bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | ||
| 193 | { | ||
| 194 | return dl_b->bw != -1 && | ||
| 195 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | ||
| 196 | } | ||
| 197 | |||
| 171 | extern struct mutex sched_domains_mutex; | 198 | extern struct mutex sched_domains_mutex; |
| 172 | 199 | ||
| 173 | #ifdef CONFIG_CGROUP_SCHED | 200 | #ifdef CONFIG_CGROUP_SCHED |
| @@ -184,7 +211,7 @@ struct cfs_bandwidth { | |||
| 184 | raw_spinlock_t lock; | 211 | raw_spinlock_t lock; |
| 185 | ktime_t period; | 212 | ktime_t period; |
| 186 | u64 quota, runtime; | 213 | u64 quota, runtime; |
| 187 | s64 hierarchal_quota; | 214 | s64 hierarchical_quota; |
| 188 | u64 runtime_expires; | 215 | u64 runtime_expires; |
| 189 | 216 | ||
| 190 | int idle, timer_active; | 217 | int idle, timer_active; |
| @@ -636,6 +663,11 @@ struct rq { | |||
| 636 | #ifdef CONFIG_SMP | 663 | #ifdef CONFIG_SMP |
| 637 | struct llist_head wake_list; | 664 | struct llist_head wake_list; |
| 638 | #endif | 665 | #endif |
| 666 | |||
| 667 | #ifdef CONFIG_CPU_IDLE | ||
| 668 | /* Must be inspected within an RCU lock section */ | ||
| 669 | struct cpuidle_state *idle_state; | ||
| 670 | #endif | ||
| 639 | }; | 671 | }; |
| 640 | 672 | ||
| 641 | static inline int cpu_of(struct rq *rq) | 673 | static inline int cpu_of(struct rq *rq) |
| @@ -647,13 +679,13 @@ static inline int cpu_of(struct rq *rq) | |||
| 647 | #endif | 679 | #endif |
| 648 | } | 680 | } |
| 649 | 681 | ||
| 650 | DECLARE_PER_CPU(struct rq, runqueues); | 682 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 651 | 683 | ||
| 652 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 684 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
| 653 | #define this_rq() (&__get_cpu_var(runqueues)) | 685 | #define this_rq() this_cpu_ptr(&runqueues) |
| 654 | #define task_rq(p) cpu_rq(task_cpu(p)) | 686 | #define task_rq(p) cpu_rq(task_cpu(p)) |
| 655 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 687 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 656 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | 688 | #define raw_rq() raw_cpu_ptr(&runqueues) |
| 657 | 689 | ||
| 658 | static inline u64 rq_clock(struct rq *rq) | 690 | static inline u64 rq_clock(struct rq *rq) |
| 659 | { | 691 | { |
| @@ -665,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq) | |||
| 665 | return rq->clock_task; | 697 | return rq->clock_task; |
| 666 | } | 698 | } |
| 667 | 699 | ||
| 700 | #ifdef CONFIG_NUMA | ||
| 701 | enum numa_topology_type { | ||
| 702 | NUMA_DIRECT, | ||
| 703 | NUMA_GLUELESS_MESH, | ||
| 704 | NUMA_BACKPLANE, | ||
| 705 | }; | ||
| 706 | extern enum numa_topology_type sched_numa_topology_type; | ||
| 707 | extern int sched_max_numa_distance; | ||
| 708 | extern bool find_numa_distance(int distance); | ||
| 709 | #endif | ||
| 710 | |||
| 668 | #ifdef CONFIG_NUMA_BALANCING | 711 | #ifdef CONFIG_NUMA_BALANCING |
| 712 | /* The regions in numa_faults array from task_struct */ | ||
| 713 | enum numa_faults_stats { | ||
| 714 | NUMA_MEM = 0, | ||
| 715 | NUMA_CPU, | ||
| 716 | NUMA_MEMBUF, | ||
| 717 | NUMA_CPUBUF | ||
| 718 | }; | ||
| 669 | extern void sched_setnuma(struct task_struct *p, int node); | 719 | extern void sched_setnuma(struct task_struct *p, int node); |
| 670 | extern int migrate_task_to(struct task_struct *p, int cpu); | 720 | extern int migrate_task_to(struct task_struct *p, int cpu); |
| 671 | extern int migrate_swap(struct task_struct *, struct task_struct *); | 721 | extern int migrate_swap(struct task_struct *, struct task_struct *); |
| @@ -942,6 +992,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p) | |||
| 942 | #endif | 992 | #endif |
| 943 | } | 993 | } |
| 944 | 994 | ||
| 995 | static inline int task_on_rq_queued(struct task_struct *p) | ||
| 996 | { | ||
| 997 | return p->on_rq == TASK_ON_RQ_QUEUED; | ||
| 998 | } | ||
| 999 | |||
| 1000 | static inline int task_on_rq_migrating(struct task_struct *p) | ||
| 1001 | { | ||
| 1002 | return p->on_rq == TASK_ON_RQ_MIGRATING; | ||
| 1003 | } | ||
| 945 | 1004 | ||
| 946 | #ifndef prepare_arch_switch | 1005 | #ifndef prepare_arch_switch |
| 947 | # define prepare_arch_switch(next) do { } while (0) | 1006 | # define prepare_arch_switch(next) do { } while (0) |
| @@ -953,7 +1012,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p) | |||
| 953 | # define finish_arch_post_lock_switch() do { } while (0) | 1012 | # define finish_arch_post_lock_switch() do { } while (0) |
| 954 | #endif | 1013 | #endif |
| 955 | 1014 | ||
| 956 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 957 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 1015 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
| 958 | { | 1016 | { |
| 959 | #ifdef CONFIG_SMP | 1017 | #ifdef CONFIG_SMP |
| @@ -991,35 +1049,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 991 | raw_spin_unlock_irq(&rq->lock); | 1049 | raw_spin_unlock_irq(&rq->lock); |
| 992 | } | 1050 | } |
| 993 | 1051 | ||
| 994 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
| 995 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
| 996 | { | ||
| 997 | #ifdef CONFIG_SMP | ||
| 998 | /* | ||
| 999 | * We can optimise this out completely for !SMP, because the | ||
| 1000 | * SMP rebalancing from interrupt is the only thing that cares | ||
| 1001 | * here. | ||
| 1002 | */ | ||
| 1003 | next->on_cpu = 1; | ||
| 1004 | #endif | ||
| 1005 | raw_spin_unlock(&rq->lock); | ||
| 1006 | } | ||
| 1007 | |||
| 1008 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
| 1009 | { | ||
| 1010 | #ifdef CONFIG_SMP | ||
| 1011 | /* | ||
| 1012 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
| 1013 | * We must ensure this doesn't happen until the switch is completely | ||
| 1014 | * finished. | ||
| 1015 | */ | ||
| 1016 | smp_wmb(); | ||
| 1017 | prev->on_cpu = 0; | ||
| 1018 | #endif | ||
| 1019 | local_irq_enable(); | ||
| 1020 | } | ||
| 1021 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
| 1022 | |||
| 1023 | /* | 1052 | /* |
| 1024 | * wake flags | 1053 | * wake flags |
| 1025 | */ | 1054 | */ |
| @@ -1135,6 +1164,11 @@ struct sched_class { | |||
| 1135 | void (*task_fork) (struct task_struct *p); | 1164 | void (*task_fork) (struct task_struct *p); |
| 1136 | void (*task_dead) (struct task_struct *p); | 1165 | void (*task_dead) (struct task_struct *p); |
| 1137 | 1166 | ||
| 1167 | /* | ||
| 1168 | * The switched_from() call is allowed to drop rq->lock, therefore we | ||
| 1169 | * cannot assume the switched_from/switched_to pair is serialized by | ||
| 1170 | * rq->lock. They are however serialized by p->pi_lock. | ||
| 1171 | */ | ||
| 1138 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1172 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); |
| 1139 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1173 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
| 1140 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | 1174 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, |
| @@ -1143,6 +1177,8 @@ struct sched_class { | |||
| 1143 | unsigned int (*get_rr_interval) (struct rq *rq, | 1177 | unsigned int (*get_rr_interval) (struct rq *rq, |
| 1144 | struct task_struct *task); | 1178 | struct task_struct *task); |
| 1145 | 1179 | ||
| 1180 | void (*update_curr) (struct rq *rq); | ||
| 1181 | |||
| 1146 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1182 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1147 | void (*task_move_group) (struct task_struct *p, int on_rq); | 1183 | void (*task_move_group) (struct task_struct *p, int on_rq); |
| 1148 | #endif | 1184 | #endif |
| @@ -1180,6 +1216,30 @@ static inline void idle_exit_fair(struct rq *rq) { } | |||
| 1180 | 1216 | ||
| 1181 | #endif | 1217 | #endif |
| 1182 | 1218 | ||
| 1219 | #ifdef CONFIG_CPU_IDLE | ||
| 1220 | static inline void idle_set_state(struct rq *rq, | ||
| 1221 | struct cpuidle_state *idle_state) | ||
| 1222 | { | ||
| 1223 | rq->idle_state = idle_state; | ||
| 1224 | } | ||
| 1225 | |||
| 1226 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | ||
| 1227 | { | ||
| 1228 | WARN_ON(!rcu_read_lock_held()); | ||
| 1229 | return rq->idle_state; | ||
| 1230 | } | ||
| 1231 | #else | ||
| 1232 | static inline void idle_set_state(struct rq *rq, | ||
| 1233 | struct cpuidle_state *idle_state) | ||
| 1234 | { | ||
| 1235 | } | ||
| 1236 | |||
| 1237 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | ||
| 1238 | { | ||
| 1239 | return NULL; | ||
| 1240 | } | ||
| 1241 | #endif | ||
| 1242 | |||
| 1183 | extern void sysrq_sched_debug_show(void); | 1243 | extern void sysrq_sched_debug_show(void); |
| 1184 | extern void sched_init_granularity(void); | 1244 | extern void sched_init_granularity(void); |
| 1185 | extern void update_max_interval(void); | 1245 | extern void update_max_interval(void); |
| @@ -1486,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); | |||
| 1486 | extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); | 1546 | extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); |
| 1487 | extern void print_cfs_stats(struct seq_file *m, int cpu); | 1547 | extern void print_cfs_stats(struct seq_file *m, int cpu); |
| 1488 | extern void print_rt_stats(struct seq_file *m, int cpu); | 1548 | extern void print_rt_stats(struct seq_file *m, int cpu); |
| 1549 | extern void print_dl_stats(struct seq_file *m, int cpu); | ||
| 1489 | 1550 | ||
| 1490 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1551 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
| 1491 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1552 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
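The __dl_add()/__dl_clear()/__dl_overflow() helpers moved into sched.h above implement the SCHED_DEADLINE admission check: a new task fits if the admitted bandwidth, minus whatever the task already held, plus its new demand stays under the per-CPU cap times the number of CPUs. A worked example in plain C, with bandwidths expressed as simple percentages rather than the kernel's fixed-point units (the numbers are made up for illustration):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct dl_bw_model { int64_t bw; uint64_t total_bw; };

/* Same shape as __dl_overflow(): bw == -1 means "no limit". */
static bool dl_overflow(struct dl_bw_model *dl_b, int cpus,
			uint64_t old_bw, uint64_t new_bw)
{
	return dl_b->bw != -1 &&
	       (uint64_t)dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
	/* Cap of 95% per CPU on a 4-CPU root domain; 320% already admitted. */
	struct dl_bw_model dl_b = { .bw = 95, .total_bw = 320 };

	/* A task worth 0.5 CPU: 320 + 50 = 370 <= 380, so it fits.      */
	printf("0.5-CPU task overflows: %d\n", dl_overflow(&dl_b, 4, 0, 50));
	/* A task worth 0.7 CPU: 320 + 70 = 390 > 380, so it is rejected. */
	printf("0.7-CPU task overflows: %d\n", dl_overflow(&dl_b, 4, 0, 70));
	return 0;
}
```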
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index bfe0edadbfbb..79ffec45a6ac 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
| @@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) | |||
| 28 | { | 28 | { |
| 29 | struct task_struct *stop = rq->stop; | 29 | struct task_struct *stop = rq->stop; |
| 30 | 30 | ||
| 31 | if (!stop || !stop->on_rq) | 31 | if (!stop || !task_on_rq_queued(stop)) |
| 32 | return NULL; | 32 | return NULL; |
| 33 | 33 | ||
| 34 | put_prev_task(rq, prev); | 34 | put_prev_task(rq, prev); |
| @@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) | |||
| 102 | return 0; | 102 | return 0; |
| 103 | } | 103 | } |
| 104 | 104 | ||
| 105 | static void update_curr_stop(struct rq *rq) | ||
| 106 | { | ||
| 107 | } | ||
| 108 | |||
| 105 | /* | 109 | /* |
| 106 | * Simple, special scheduling class for the per-CPU stop tasks: | 110 | * Simple, special scheduling class for the per-CPU stop tasks: |
| 107 | */ | 111 | */ |
| @@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = { | |||
| 128 | 132 | ||
| 129 | .prio_changed = prio_changed_stop, | 133 | .prio_changed = prio_changed_stop, |
| 130 | .switched_to = switched_to_stop, | 134 | .switched_to = switched_to_stop, |
| 135 | .update_curr = update_curr_stop, | ||
| 131 | }; | 136 | }; |
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 15cab1a4f84e..852143a79f36 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
| 10 | #include <linux/wait.h> | 10 | #include <linux/wait.h> |
| 11 | #include <linux/hash.h> | 11 | #include <linux/hash.h> |
| 12 | #include <linux/kthread.h> | ||
| 12 | 13 | ||
| 13 | void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) | 14 | void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) |
| 14 | { | 15 | { |
| @@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * | |||
| 297 | } | 298 | } |
| 298 | EXPORT_SYMBOL(autoremove_wake_function); | 299 | EXPORT_SYMBOL(autoremove_wake_function); |
| 299 | 300 | ||
| 301 | static inline bool is_kthread_should_stop(void) | ||
| 302 | { | ||
| 303 | return (current->flags & PF_KTHREAD) && kthread_should_stop(); | ||
| 304 | } | ||
| 305 | |||
| 306 | /* | ||
| 307 | * DEFINE_WAIT_FUNC(wait, woken_wake_func); | ||
| 308 | * | ||
| 309 | * add_wait_queue(&wq, &wait); | ||
| 310 | * for (;;) { | ||
| 311 | * if (condition) | ||
| 312 | * break; | ||
| 313 | * | ||
| 314 | * p->state = mode; condition = true; | ||
| 315 | * smp_mb(); // A smp_wmb(); // C | ||
| 316 | * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; | ||
| 317 | * schedule() try_to_wake_up(); | ||
| 318 | * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ | ||
| 319 | * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; | ||
| 320 | * smp_mb() // B smp_wmb(); // C | ||
| 321 | * wait->flags |= WQ_FLAG_WOKEN; | ||
| 322 | * } | ||
| 323 | * remove_wait_queue(&wq, &wait); | ||
| 324 | * | ||
| 325 | */ | ||
| 326 | long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) | ||
| 327 | { | ||
| 328 | set_current_state(mode); /* A */ | ||
| 329 | /* | ||
| 330 | * The above implies an smp_mb(), which matches with the smp_wmb() from | ||
| 331 | * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must | ||
| 332 | * also observe all state before the wakeup. | ||
| 333 | */ | ||
| 334 | if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) | ||
| 335 | timeout = schedule_timeout(timeout); | ||
| 336 | __set_current_state(TASK_RUNNING); | ||
| 337 | |||
| 338 | /* | ||
| 339 | * The below implies an smp_mb(), it too pairs with the smp_wmb() from | ||
| 340 | * woken_wake_function() such that we must either observe the wait | ||
| 341 | * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss | ||
| 342 | * an event. | ||
| 343 | */ | ||
| 344 | set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ | ||
| 345 | |||
| 346 | return timeout; | ||
| 347 | } | ||
| 348 | EXPORT_SYMBOL(wait_woken); | ||
| 349 | |||
| 350 | int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) | ||
| 351 | { | ||
| 352 | /* | ||
| 353 | * Although this function is called under waitqueue lock, LOCK | ||
| 354 | * doesn't imply a write barrier and users expect write | ||
| 355 | * barrier semantics on wakeup functions. The following | ||
| 356 | * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() | ||
| 357 | * and is paired with set_mb() in wait_woken(). | ||
| 358 | */ | ||
| 359 | smp_wmb(); /* C */ | ||
| 360 | wait->flags |= WQ_FLAG_WOKEN; | ||
| 361 | |||
| 362 | return default_wake_function(wait, mode, sync, key); | ||
| 363 | } | ||
| 364 | EXPORT_SYMBOL(woken_wake_function); | ||
| 365 | |||
| 300 | int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) | 366 | int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) |
| 301 | { | 367 | { |
| 302 | struct wait_bit_key *key = arg; | 368 | struct wait_bit_key *key = arg; |
| @@ -343,6 +409,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, | |||
| 343 | } | 409 | } |
| 344 | EXPORT_SYMBOL(out_of_line_wait_on_bit); | 410 | EXPORT_SYMBOL(out_of_line_wait_on_bit); |
| 345 | 411 | ||
| 412 | int __sched out_of_line_wait_on_bit_timeout( | ||
| 413 | void *word, int bit, wait_bit_action_f *action, | ||
| 414 | unsigned mode, unsigned long timeout) | ||
| 415 | { | ||
| 416 | wait_queue_head_t *wq = bit_waitqueue(word, bit); | ||
| 417 | DEFINE_WAIT_BIT(wait, word, bit); | ||
| 418 | |||
| 419 | wait.key.timeout = jiffies + timeout; | ||
| 420 | return __wait_on_bit(wq, &wait, action, mode); | ||
| 421 | } | ||
| 422 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); | ||
| 423 | |||
| 346 | int __sched | 424 | int __sched |
| 347 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | 425 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, |
| 348 | wait_bit_action_f *action, unsigned mode) | 426 | wait_bit_action_f *action, unsigned mode) |
| @@ -520,3 +598,27 @@ __sched int bit_wait_io(struct wait_bit_key *word) | |||
| 520 | return 0; | 598 | return 0; |
| 521 | } | 599 | } |
| 522 | EXPORT_SYMBOL(bit_wait_io); | 600 | EXPORT_SYMBOL(bit_wait_io); |
| 601 | |||
| 602 | __sched int bit_wait_timeout(struct wait_bit_key *word) | ||
| 603 | { | ||
| 604 | unsigned long now = ACCESS_ONCE(jiffies); | ||
| 605 | if (signal_pending_state(current->state, current)) | ||
| 606 | return 1; | ||
| 607 | if (time_after_eq(now, word->timeout)) | ||
| 608 | return -EAGAIN; | ||
| 609 | schedule_timeout(word->timeout - now); | ||
| 610 | return 0; | ||
| 611 | } | ||
| 612 | EXPORT_SYMBOL_GPL(bit_wait_timeout); | ||
| 613 | |||
| 614 | __sched int bit_wait_io_timeout(struct wait_bit_key *word) | ||
| 615 | { | ||
| 616 | unsigned long now = ACCESS_ONCE(jiffies); | ||
| 617 | if (signal_pending_state(current->state, current)) | ||
| 618 | return 1; | ||
| 619 | if (time_after_eq(now, word->timeout)) | ||
| 620 | return -EAGAIN; | ||
| 621 | io_schedule_timeout(word->timeout - now); | ||
| 622 | return 0; | ||
| 623 | } | ||
| 624 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); | ||
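wait_woken() and woken_wake_function(), added above, close the race sketched in their leading comment with the WQ_FLAG_WOKEN flag: the waker publishes the condition and then sets the flag, while the waiter only sleeps if the flag is still clear at the moment it is about to block. A userspace model of that handshake using C11 atomics in place of the kernel's barrier pairing (a sketch of the idea, not the kernel implementation):

```c
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_bool condition;
static atomic_bool woken;		/* models WQ_FLAG_WOKEN */

static void wait_woken_model(void)
{
	while (!atomic_load(&condition)) {
		/* Clear the flag; sleep only if no wakeup had been recorded. */
		if (!atomic_exchange(&woken, 0))
			sched_yield();	/* stands in for schedule_timeout() */
	}
}

static void *waker(void *arg)
{
	(void)arg;
	atomic_store(&condition, 1);	/* publish the event ...              */
	atomic_store(&woken, 1);	/* ... then mark the waiter as woken. */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);
	wait_woken_model();
	pthread_join(t, NULL);
	puts("event observed, no lost wakeup");
	return 0;
}
```

If the waker runs between the waiter's condition check and its sleep, the flag is already set, the exchange returns true and the waiter loops instead of blocking, which is what the A/B/C barrier pairing in the comment guarantees in the kernel version.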
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 44eb005c6695..4ef9687ac115 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -21,10 +21,11 @@ | |||
| 21 | #include <linux/slab.h> | 21 | #include <linux/slab.h> |
| 22 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
| 23 | 23 | ||
| 24 | /* #define SECCOMP_DEBUG 1 */ | 24 | #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER |
| 25 | #include <asm/syscall.h> | ||
| 26 | #endif | ||
| 25 | 27 | ||
| 26 | #ifdef CONFIG_SECCOMP_FILTER | 28 | #ifdef CONFIG_SECCOMP_FILTER |
| 27 | #include <asm/syscall.h> | ||
| 28 | #include <linux/filter.h> | 29 | #include <linux/filter.h> |
| 29 | #include <linux/pid.h> | 30 | #include <linux/pid.h> |
| 30 | #include <linux/ptrace.h> | 31 | #include <linux/ptrace.h> |
| @@ -172,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | |||
| 172 | * | 173 | * |
| 173 | * Returns valid seccomp BPF response codes. | 174 | * Returns valid seccomp BPF response codes. |
| 174 | */ | 175 | */ |
| 175 | static u32 seccomp_run_filters(int syscall) | 176 | static u32 seccomp_run_filters(struct seccomp_data *sd) |
| 176 | { | 177 | { |
| 177 | struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); | 178 | struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); |
| 178 | struct seccomp_data sd; | 179 | struct seccomp_data sd_local; |
| 179 | u32 ret = SECCOMP_RET_ALLOW; | 180 | u32 ret = SECCOMP_RET_ALLOW; |
| 180 | 181 | ||
| 181 | /* Ensure unexpected behavior doesn't result in failing open. */ | 182 | /* Ensure unexpected behavior doesn't result in failing open. */ |
| @@ -185,14 +186,17 @@ static u32 seccomp_run_filters(int syscall) | |||
| 185 | /* Make sure cross-thread synced filter points somewhere sane. */ | 186 | /* Make sure cross-thread synced filter points somewhere sane. */ |
| 186 | smp_read_barrier_depends(); | 187 | smp_read_barrier_depends(); |
| 187 | 188 | ||
| 188 | populate_seccomp_data(&sd); | 189 | if (!sd) { |
| 190 | populate_seccomp_data(&sd_local); | ||
| 191 | sd = &sd_local; | ||
| 192 | } | ||
| 189 | 193 | ||
| 190 | /* | 194 | /* |
| 191 | * All filters in the list are evaluated and the lowest BPF return | 195 | * All filters in the list are evaluated and the lowest BPF return |
| 192 | * value always takes priority (ignoring the DATA). | 196 | * value always takes priority (ignoring the DATA). |
| 193 | */ | 197 | */ |
| 194 | for (; f; f = f->prev) { | 198 | for (; f; f = f->prev) { |
| 195 | u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); | 199 | u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd); |
| 196 | 200 | ||
| 197 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) | 201 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) |
| 198 | ret = cur_ret; | 202 | ret = cur_ret; |
| @@ -395,16 +399,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) | |||
| 395 | if (!filter) | 399 | if (!filter) |
| 396 | goto free_prog; | 400 | goto free_prog; |
| 397 | 401 | ||
| 398 | filter->prog = kzalloc(bpf_prog_size(new_len), | 402 | filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); |
| 399 | GFP_KERNEL|__GFP_NOWARN); | ||
| 400 | if (!filter->prog) | 403 | if (!filter->prog) |
| 401 | goto free_filter; | 404 | goto free_filter; |
| 402 | 405 | ||
| 403 | ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); | 406 | ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); |
| 404 | if (ret) | 407 | if (ret) |
| 405 | goto free_filter_prog; | 408 | goto free_filter_prog; |
| 406 | kfree(fp); | ||
| 407 | 409 | ||
| 410 | kfree(fp); | ||
| 408 | atomic_set(&filter->usage, 1); | 411 | atomic_set(&filter->usage, 1); |
| 409 | filter->prog->len = new_len; | 412 | filter->prog->len = new_len; |
| 410 | 413 | ||
| @@ -413,7 +416,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) | |||
| 413 | return filter; | 416 | return filter; |
| 414 | 417 | ||
| 415 | free_filter_prog: | 418 | free_filter_prog: |
| 416 | kfree(filter->prog); | 419 | __bpf_prog_free(filter->prog); |
| 417 | free_filter: | 420 | free_filter: |
| 418 | kfree(filter); | 421 | kfree(filter); |
| 419 | free_prog: | 422 | free_prog: |
| @@ -564,11 +567,55 @@ static int mode1_syscalls_32[] = { | |||
| 564 | }; | 567 | }; |
| 565 | #endif | 568 | #endif |
| 566 | 569 | ||
| 567 | int __secure_computing(int this_syscall) | 570 | static void __secure_computing_strict(int this_syscall) |
| 571 | { | ||
| 572 | int *syscall_whitelist = mode1_syscalls; | ||
| 573 | #ifdef CONFIG_COMPAT | ||
| 574 | if (is_compat_task()) | ||
| 575 | syscall_whitelist = mode1_syscalls_32; | ||
| 576 | #endif | ||
| 577 | do { | ||
| 578 | if (*syscall_whitelist == this_syscall) | ||
| 579 | return; | ||
| 580 | } while (*++syscall_whitelist); | ||
| 581 | |||
| 582 | #ifdef SECCOMP_DEBUG | ||
| 583 | dump_stack(); | ||
| 584 | #endif | ||
| 585 | audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); | ||
| 586 | do_exit(SIGKILL); | ||
| 587 | } | ||
| 588 | |||
| 589 | #ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER | ||
| 590 | void secure_computing_strict(int this_syscall) | ||
| 591 | { | ||
| 592 | int mode = current->seccomp.mode; | ||
| 593 | |||
| 594 | if (mode == 0) | ||
| 595 | return; | ||
| 596 | else if (mode == SECCOMP_MODE_STRICT) | ||
| 597 | __secure_computing_strict(this_syscall); | ||
| 598 | else | ||
| 599 | BUG(); | ||
| 600 | } | ||
| 601 | #else | ||
| 602 | int __secure_computing(void) | ||
| 603 | { | ||
| 604 | u32 phase1_result = seccomp_phase1(NULL); | ||
| 605 | |||
| 606 | if (likely(phase1_result == SECCOMP_PHASE1_OK)) | ||
| 607 | return 0; | ||
| 608 | else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) | ||
| 609 | return -1; | ||
| 610 | else | ||
| 611 | return seccomp_phase2(phase1_result); | ||
| 612 | } | ||
| 613 | |||
| 614 | #ifdef CONFIG_SECCOMP_FILTER | ||
| 615 | static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) | ||
| 568 | { | 616 | { |
| 569 | int exit_sig = 0; | 617 | u32 filter_ret, action; |
| 570 | int *syscall; | 618 | int data; |
| 571 | u32 ret; | ||
| 572 | 619 | ||
| 573 | /* | 620 | /* |
| 574 | * Make sure that any changes to mode from another thread have | 621 | * Make sure that any changes to mode from another thread have |
| @@ -576,85 +623,127 @@ int __secure_computing(int this_syscall) | |||
| 576 | */ | 623 | */ |
| 577 | rmb(); | 624 | rmb(); |
| 578 | 625 | ||
| 579 | switch (current->seccomp.mode) { | 626 | filter_ret = seccomp_run_filters(sd); |
| 580 | case SECCOMP_MODE_STRICT: | 627 | data = filter_ret & SECCOMP_RET_DATA; |
| 581 | syscall = mode1_syscalls; | 628 | action = filter_ret & SECCOMP_RET_ACTION; |
| 582 | #ifdef CONFIG_COMPAT | 629 | |
| 583 | if (is_compat_task()) | 630 | switch (action) { |
| 584 | syscall = mode1_syscalls_32; | 631 | case SECCOMP_RET_ERRNO: |
| 632 | /* Set the low-order 16 bits as an errno. */ | ||
| 633 | syscall_set_return_value(current, task_pt_regs(current), | ||
| 634 | -data, 0); | ||
| 635 | goto skip; | ||
| 636 | |||
| 637 | case SECCOMP_RET_TRAP: | ||
| 638 | /* Show the handler the original registers. */ | ||
| 639 | syscall_rollback(current, task_pt_regs(current)); | ||
| 640 | /* Let the filter pass back 16 bits of data. */ | ||
| 641 | seccomp_send_sigsys(this_syscall, data); | ||
| 642 | goto skip; | ||
| 643 | |||
| 644 | case SECCOMP_RET_TRACE: | ||
| 645 | return filter_ret; /* Save the rest for phase 2. */ | ||
| 646 | |||
| 647 | case SECCOMP_RET_ALLOW: | ||
| 648 | return SECCOMP_PHASE1_OK; | ||
| 649 | |||
| 650 | case SECCOMP_RET_KILL: | ||
| 651 | default: | ||
| 652 | audit_seccomp(this_syscall, SIGSYS, action); | ||
| 653 | do_exit(SIGSYS); | ||
| 654 | } | ||
| 655 | |||
| 656 | unreachable(); | ||
| 657 | |||
| 658 | skip: | ||
| 659 | audit_seccomp(this_syscall, 0, action); | ||
| 660 | return SECCOMP_PHASE1_SKIP; | ||
| 661 | } | ||
| 585 | #endif | 662 | #endif |
| 586 | do { | 663 | |
| 587 | if (*syscall == this_syscall) | 664 | /** |
| 588 | return 0; | 665 | * seccomp_phase1() - run fast path seccomp checks on the current syscall |
| 589 | } while (*++syscall); | 666 | * @arg sd: The seccomp_data or NULL |
| 590 | exit_sig = SIGKILL; | 667 | * |
| 591 | ret = SECCOMP_RET_KILL; | 668 | * This only reads pt_regs via the syscall_xyz helpers. The only change |
| 592 | break; | 669 | * it will make to pt_regs is via syscall_set_return_value, and it will |
| 670 | * only do that if it returns SECCOMP_PHASE1_SKIP. | ||
| 671 | * | ||
| 672 | * If sd is provided, it will not read pt_regs at all. | ||
| 673 | * | ||
| 674 | * It may also call do_exit or force a signal; these actions must be | ||
| 675 | * safe. | ||
| 676 | * | ||
| 677 | * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should | ||
| 678 | * be processed normally. | ||
| 679 | * | ||
| 680 | * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be | ||
| 681 | * invoked. In this case, seccomp_phase1 will have set the return value | ||
| 682 | * using syscall_set_return_value. | ||
| 683 | * | ||
| 684 | * If it returns anything else, then the return value should be passed | ||
| 685 | * to seccomp_phase2 from a context in which ptrace hooks are safe. | ||
| 686 | */ | ||
| 687 | u32 seccomp_phase1(struct seccomp_data *sd) | ||
| 688 | { | ||
| 689 | int mode = current->seccomp.mode; | ||
| 690 | int this_syscall = sd ? sd->nr : | ||
| 691 | syscall_get_nr(current, task_pt_regs(current)); | ||
| 692 | |||
| 693 | switch (mode) { | ||
| 694 | case SECCOMP_MODE_STRICT: | ||
| 695 | __secure_computing_strict(this_syscall); /* may call do_exit */ | ||
| 696 | return SECCOMP_PHASE1_OK; | ||
| 593 | #ifdef CONFIG_SECCOMP_FILTER | 697 | #ifdef CONFIG_SECCOMP_FILTER |
| 594 | case SECCOMP_MODE_FILTER: { | 698 | case SECCOMP_MODE_FILTER: |
| 595 | int data; | 699 | return __seccomp_phase1_filter(this_syscall, sd); |
| 596 | struct pt_regs *regs = task_pt_regs(current); | ||
| 597 | ret = seccomp_run_filters(this_syscall); | ||
| 598 | data = ret & SECCOMP_RET_DATA; | ||
| 599 | ret &= SECCOMP_RET_ACTION; | ||
| 600 | switch (ret) { | ||
| 601 | case SECCOMP_RET_ERRNO: | ||
| 602 | /* Set the low-order 16-bits as a errno. */ | ||
| 603 | syscall_set_return_value(current, regs, | ||
| 604 | -data, 0); | ||
| 605 | goto skip; | ||
| 606 | case SECCOMP_RET_TRAP: | ||
| 607 | /* Show the handler the original registers. */ | ||
| 608 | syscall_rollback(current, regs); | ||
| 609 | /* Let the filter pass back 16 bits of data. */ | ||
| 610 | seccomp_send_sigsys(this_syscall, data); | ||
| 611 | goto skip; | ||
| 612 | case SECCOMP_RET_TRACE: | ||
| 613 | /* Skip these calls if there is no tracer. */ | ||
| 614 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { | ||
| 615 | syscall_set_return_value(current, regs, | ||
| 616 | -ENOSYS, 0); | ||
| 617 | goto skip; | ||
| 618 | } | ||
| 619 | /* Allow the BPF to provide the event message */ | ||
| 620 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
| 621 | /* | ||
| 622 | * The delivery of a fatal signal during event | ||
| 623 | * notification may silently skip tracer notification. | ||
| 624 | * Terminating the task now avoids executing a system | ||
| 625 | * call that may not be intended. | ||
| 626 | */ | ||
| 627 | if (fatal_signal_pending(current)) | ||
| 628 | break; | ||
| 629 | if (syscall_get_nr(current, regs) < 0) | ||
| 630 | goto skip; /* Explicit request to skip. */ | ||
| 631 | |||
| 632 | return 0; | ||
| 633 | case SECCOMP_RET_ALLOW: | ||
| 634 | return 0; | ||
| 635 | case SECCOMP_RET_KILL: | ||
| 636 | default: | ||
| 637 | break; | ||
| 638 | } | ||
| 639 | exit_sig = SIGSYS; | ||
| 640 | break; | ||
| 641 | } | ||
| 642 | #endif | 700 | #endif |
| 643 | default: | 701 | default: |
| 644 | BUG(); | 702 | BUG(); |
| 645 | } | 703 | } |
| 704 | } | ||
| 646 | 705 | ||
| 647 | #ifdef SECCOMP_DEBUG | 706 | /** |
| 648 | dump_stack(); | 707 | * seccomp_phase2() - finish slow path seccomp work for the current syscall |
| 649 | #endif | 708 | * @phase1_result: The return value from seccomp_phase1() |
| 650 | audit_seccomp(this_syscall, exit_sig, ret); | 709 | * |
| 651 | do_exit(exit_sig); | 710 | * This must be called from a context in which ptrace hooks can be used. |
| 652 | #ifdef CONFIG_SECCOMP_FILTER | 711 | * |
| 653 | skip: | 712 | * Returns 0 if the syscall should be processed or -1 to skip the syscall. |
| 654 | audit_seccomp(this_syscall, exit_sig, ret); | 713 | */ |
| 655 | #endif | 714 | int seccomp_phase2(u32 phase1_result) |
| 656 | return -1; | 715 | { |
| 716 | struct pt_regs *regs = task_pt_regs(current); | ||
| 717 | u32 action = phase1_result & SECCOMP_RET_ACTION; | ||
| 718 | int data = phase1_result & SECCOMP_RET_DATA; | ||
| 719 | |||
| 720 | BUG_ON(action != SECCOMP_RET_TRACE); | ||
| 721 | |||
| 722 | audit_seccomp(syscall_get_nr(current, regs), 0, action); | ||
| 723 | |||
| 724 | /* Skip these calls if there is no tracer. */ | ||
| 725 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { | ||
| 726 | syscall_set_return_value(current, regs, | ||
| 727 | -ENOSYS, 0); | ||
| 728 | return -1; | ||
| 729 | } | ||
| 730 | |||
| 731 | /* Allow the BPF to provide the event message */ | ||
| 732 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
| 733 | /* | ||
| 734 | * The delivery of a fatal signal during event | ||
| 735 | * notification may silently skip tracer notification. | ||
| 736 | * Terminating the task now avoids executing a system | ||
| 737 | * call that may not be intended. | ||
| 738 | */ | ||
| 739 | if (fatal_signal_pending(current)) | ||
| 740 | do_exit(SIGSYS); | ||
| 741 | if (syscall_get_nr(current, regs) < 0) | ||
| 742 | return -1; /* Explicit request to skip. */ | ||
| 743 | |||
| 744 | return 0; | ||
| 657 | } | 745 | } |
| 746 | #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ | ||
| 658 | 747 | ||
| 659 | long prctl_get_seccomp(void) | 748 | long prctl_get_seccomp(void) |
| 660 | { | 749 | { |
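The kerneldoc above splits seccomp into a register-free fast path (seccomp_phase1) and a slow path that is only legal where ptrace hooks may run (seccomp_phase2). A minimal sketch of how an architecture's syscall-entry code might wire the two together is shown below; everything except seccomp_phase1(), seccomp_phase2() and the SECCOMP_PHASE1_* values is a hypothetical placeholder, not a symbol from this patch.

```c
/* Hypothetical arch glue for the two-phase API documented above. */
static long syscall_entry_seccomp_sketch(struct pt_regs *regs)
{
	struct seccomp_data sd;
	u32 phase1;

	/* hypothetical helper: fill nr, args and instruction_pointer */
	arch_fill_seccomp_data(&sd, regs);

	phase1 = seccomp_phase1(&sd);
	if (phase1 == SECCOMP_PHASE1_OK)
		return 0;		/* run the syscall normally */
	if (phase1 == SECCOMP_PHASE1_SKIP)
		return -1;		/* phase 1 already set the return value */

	/*
	 * Anything else encodes a SECCOMP_RET_TRACE result; finish it
	 * here, where ptrace hooks are known to be safe.
	 */
	return seccomp_phase2(phase1);
}
```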
diff --git a/kernel/signal.c b/kernel/signal.c index 8f0876f9f6dd..16a305295256 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -1275,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, | |||
| 1275 | local_irq_restore(*flags); | 1275 | local_irq_restore(*flags); |
| 1276 | break; | 1276 | break; |
| 1277 | } | 1277 | } |
| 1278 | 1278 | /* | |
| 1279 | * This sighand can be already freed and even reused, but | ||
| 1280 | * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which | ||
| 1281 | * initializes ->siglock: this slab can't go away, it has | ||
| 1282 | * the same object type, ->siglock can't be reinitialized. | ||
| 1283 | * | ||
| 1284 | * We need to ensure that tsk->sighand is still the same | ||
| 1285 | * after we take the lock, we can race with de_thread() or | ||
| 1286 | * __exit_signal(). In the latter case the next iteration | ||
| 1287 | * must see ->sighand == NULL. | ||
| 1288 | */ | ||
| 1279 | spin_lock(&sighand->siglock); | 1289 | spin_lock(&sighand->siglock); |
| 1280 | if (likely(sighand == tsk->sighand)) { | 1290 | if (likely(sighand == tsk->sighand)) { |
| 1281 | rcu_read_unlock(); | 1291 | rcu_read_unlock(); |
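The new comment documents the SLAB_DESTROY_BY_RCU contract this lock relies on: the object may be freed and even reused, but its slab keeps the same type, so the embedded spinlock stays valid and the pointer only has to be re-checked after locking. A generic sketch of that pattern, with purely illustrative type and field names, looks like this:

```c
/* Illustrative only: 'struct thing' and 'owner->thing' are made-up names. */
static struct thing *lock_thing(struct owner *owner)
{
	struct thing *t;

	rcu_read_lock();
	t = rcu_dereference(owner->thing);
	if (t) {
		/* the slab may reuse t, but ->lock remains a valid spinlock */
		spin_lock(&t->lock);
		if (unlikely(t != owner->thing)) {
			/* raced with release/reuse; caller should retry */
			spin_unlock(&t->lock);
			t = NULL;
		}
	}
	rcu_read_unlock();
	return t;
}
```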
| @@ -1331,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) | |||
| 1331 | int error = -ESRCH; | 1341 | int error = -ESRCH; |
| 1332 | struct task_struct *p; | 1342 | struct task_struct *p; |
| 1333 | 1343 | ||
| 1334 | rcu_read_lock(); | 1344 | for (;;) { |
| 1335 | retry: | 1345 | rcu_read_lock(); |
| 1336 | p = pid_task(pid, PIDTYPE_PID); | 1346 | p = pid_task(pid, PIDTYPE_PID); |
| 1337 | if (p) { | 1347 | if (p) |
| 1338 | error = group_send_sig_info(sig, info, p); | 1348 | error = group_send_sig_info(sig, info, p); |
| 1339 | if (unlikely(error == -ESRCH)) | 1349 | rcu_read_unlock(); |
| 1340 | /* | 1350 | if (likely(!p || error != -ESRCH)) |
| 1341 | * The task was unhashed in between, try again. | 1351 | return error; |
| 1342 | * If it is dead, pid_task() will return NULL, | ||
| 1343 | * if we race with de_thread() it will find the | ||
| 1344 | * new leader. | ||
| 1345 | */ | ||
| 1346 | goto retry; | ||
| 1347 | } | ||
| 1348 | rcu_read_unlock(); | ||
| 1349 | 1352 | ||
| 1350 | return error; | 1353 | /* |
| 1354 | * The task was unhashed in between, try again. If it | ||
| 1355 | * is dead, pid_task() will return NULL, if we race with | ||
| 1356 | * de_thread() it will find the new leader. | ||
| 1357 | */ | ||
| 1358 | } | ||
| 1351 | } | 1359 | } |
| 1352 | 1360 | ||
| 1353 | int kill_proc_info(int sig, struct siginfo *info, pid_t pid) | 1361 | int kill_proc_info(int sig, struct siginfo *info, pid_t pid) |
| @@ -2748,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) | |||
| 2748 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) | 2756 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) |
| 2749 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); | 2757 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); |
| 2750 | #endif | 2758 | #endif |
| 2759 | #ifdef SEGV_BNDERR | ||
| 2760 | err |= __put_user(from->si_lower, &to->si_lower); | ||
| 2761 | err |= __put_user(from->si_upper, &to->si_upper); | ||
| 2762 | #endif | ||
| 2751 | break; | 2763 | break; |
| 2752 | case __SI_CHLD: | 2764 | case __SI_CHLD: |
| 2753 | err |= __put_user(from->si_pid, &to->si_pid); | 2765 | err |= __put_user(from->si_pid, &to->si_pid); |
diff --git a/kernel/smp.c b/kernel/smp.c index aff8aa14f547..f38a1e692259 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/gfp.h> | 13 | #include <linux/gfp.h> |
| 14 | #include <linux/smp.h> | 14 | #include <linux/smp.h> |
| 15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
| 16 | #include <linux/sched.h> | ||
| 16 | 17 | ||
| 17 | #include "smpboot.h" | 18 | #include "smpboot.h" |
| 18 | 19 | ||
| @@ -164,7 +165,7 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, | |||
| 164 | if (!csd) { | 165 | if (!csd) { |
| 165 | csd = &csd_stack; | 166 | csd = &csd_stack; |
| 166 | if (!wait) | 167 | if (!wait) |
| 167 | csd = &__get_cpu_var(csd_data); | 168 | csd = this_cpu_ptr(&csd_data); |
| 168 | } | 169 | } |
| 169 | 170 | ||
| 170 | csd_lock(csd); | 171 | csd_lock(csd); |
| @@ -229,7 +230,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) | |||
| 229 | 230 | ||
| 230 | WARN_ON(!irqs_disabled()); | 231 | WARN_ON(!irqs_disabled()); |
| 231 | 232 | ||
| 232 | head = &__get_cpu_var(call_single_queue); | 233 | head = this_cpu_ptr(&call_single_queue); |
| 233 | entry = llist_del_all(head); | 234 | entry = llist_del_all(head); |
| 234 | entry = llist_reverse_order(entry); | 235 | entry = llist_reverse_order(entry); |
| 235 | 236 | ||
| @@ -419,7 +420,7 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 419 | return; | 420 | return; |
| 420 | } | 421 | } |
| 421 | 422 | ||
| 422 | cfd = &__get_cpu_var(cfd_data); | 423 | cfd = this_cpu_ptr(&cfd_data); |
| 423 | 424 | ||
| 424 | cpumask_and(cfd->cpumask, mask, cpu_online_mask); | 425 | cpumask_and(cfd->cpumask, mask, cpu_online_mask); |
| 425 | cpumask_clear_cpu(this_cpu, cfd->cpumask); | 426 | cpumask_clear_cpu(this_cpu, cfd->cpumask); |
| @@ -699,3 +700,24 @@ void kick_all_cpus_sync(void) | |||
| 699 | smp_call_function(do_nothing, NULL, 1); | 700 | smp_call_function(do_nothing, NULL, 1); |
| 700 | } | 701 | } |
| 701 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); | 702 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); |
| 703 | |||
| 704 | /** | ||
| 705 | * wake_up_all_idle_cpus - break all cpus out of idle | ||
| 706 | * wake_up_all_idle_cpus tries to break all CPUs out of idle, | ||
| 707 | * including CPUs that are polling while idle; CPUs that are | ||
| 708 | * not idle are left alone. | ||
| 709 | */ | ||
| 710 | void wake_up_all_idle_cpus(void) | ||
| 711 | { | ||
| 712 | int cpu; | ||
| 713 | |||
| 714 | preempt_disable(); | ||
| 715 | for_each_online_cpu(cpu) { | ||
| 716 | if (cpu == smp_processor_id()) | ||
| 717 | continue; | ||
| 718 | |||
| 719 | wake_up_if_idle(cpu); | ||
| 720 | } | ||
| 721 | preempt_enable(); | ||
| 722 | } | ||
| 723 | EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); | ||
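A hedged sketch of how a caller might use the newly exported helper: after updating a value that idle CPUs only re-read once they leave the idle loop, kick them so the change takes effect promptly. The variable and function names here are illustrative, not kernel symbols.

```c
static u64 pm_latency_limit;	/* illustrative, not a kernel symbol */

static void set_pm_latency_limit(u64 new_limit)
{
	WRITE_ONCE(pm_latency_limit, new_limit);
	/* make idle (including idle-polling) CPUs notice the new value */
	wake_up_all_idle_cpus();
}
```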
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index eb89e1807408..f032fb5284e3 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
| @@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data) | |||
| 110 | set_current_state(TASK_INTERRUPTIBLE); | 110 | set_current_state(TASK_INTERRUPTIBLE); |
| 111 | preempt_disable(); | 111 | preempt_disable(); |
| 112 | if (kthread_should_stop()) { | 112 | if (kthread_should_stop()) { |
| 113 | set_current_state(TASK_RUNNING); | 113 | __set_current_state(TASK_RUNNING); |
| 114 | preempt_enable(); | 114 | preempt_enable(); |
| 115 | if (ht->cleanup) | 115 | if (ht->cleanup) |
| 116 | ht->cleanup(td->cpu, cpu_online(td->cpu)); | 116 | ht->cleanup(td->cpu, cpu_online(td->cpu)); |
| @@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data) | |||
| 136 | /* Check for state change setup */ | 136 | /* Check for state change setup */ |
| 137 | switch (td->status) { | 137 | switch (td->status) { |
| 138 | case HP_THREAD_NONE: | 138 | case HP_THREAD_NONE: |
| 139 | __set_current_state(TASK_RUNNING); | ||
| 139 | preempt_enable(); | 140 | preempt_enable(); |
| 140 | if (ht->setup) | 141 | if (ht->setup) |
| 141 | ht->setup(td->cpu); | 142 | ht->setup(td->cpu); |
| 142 | td->status = HP_THREAD_ACTIVE; | 143 | td->status = HP_THREAD_ACTIVE; |
| 143 | preempt_disable(); | 144 | continue; |
| 144 | break; | 145 | |
| 145 | case HP_THREAD_PARKED: | 146 | case HP_THREAD_PARKED: |
| 147 | __set_current_state(TASK_RUNNING); | ||
| 146 | preempt_enable(); | 148 | preempt_enable(); |
| 147 | if (ht->unpark) | 149 | if (ht->unpark) |
| 148 | ht->unpark(td->cpu); | 150 | ht->unpark(td->cpu); |
| 149 | td->status = HP_THREAD_ACTIVE; | 151 | td->status = HP_THREAD_ACTIVE; |
| 150 | preempt_disable(); | 152 | continue; |
| 151 | break; | ||
| 152 | } | 153 | } |
| 153 | 154 | ||
| 154 | if (!ht->thread_should_run(td->cpu)) { | 155 | if (!ht->thread_should_run(td->cpu)) { |
| 155 | preempt_enable(); | 156 | preempt_enable_no_resched(); |
| 156 | schedule(); | 157 | schedule(); |
| 157 | } else { | 158 | } else { |
| 158 | set_current_state(TASK_RUNNING); | 159 | __set_current_state(TASK_RUNNING); |
| 159 | preempt_enable(); | 160 | preempt_enable(); |
| 160 | ht->thread_fn(td->cpu); | 161 | ht->thread_fn(td->cpu); |
| 161 | } | 162 | } |
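Reduced to its essentials, the loop shape this hunk produces follows the standard "set state, check condition, then either schedule or run" idiom: mark the task runnable again before re-enabling preemption, and skip the extra preemption point when the thread is about to sleep anyway. The sketch below uses placeholder callbacks instead of the real smp_hotplug_thread hooks.

```c
/* Sketch of the resulting loop; should_run()/do_work() are placeholders. */
static int hotplug_thread_sketch(void *unused)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		preempt_disable();
		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			return 0;
		}
		if (!should_run()) {
			/* about to sleep: no need for a preemption point */
			preempt_enable_no_resched();
			schedule();
		} else {
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			do_work();
		}
	}
}
```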
diff --git a/kernel/softirq.c b/kernel/softirq.c index 5918d227730f..501baa9ac1be 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -278,7 +278,7 @@ restart: | |||
| 278 | pending >>= softirq_bit; | 278 | pending >>= softirq_bit; |
| 279 | } | 279 | } |
| 280 | 280 | ||
| 281 | rcu_bh_qs(smp_processor_id()); | 281 | rcu_bh_qs(); |
| 282 | local_irq_disable(); | 282 | local_irq_disable(); |
| 283 | 283 | ||
| 284 | pending = local_softirq_pending(); | 284 | pending = local_softirq_pending(); |
| @@ -485,7 +485,7 @@ static void tasklet_action(struct softirq_action *a) | |||
| 485 | local_irq_disable(); | 485 | local_irq_disable(); |
| 486 | list = __this_cpu_read(tasklet_vec.head); | 486 | list = __this_cpu_read(tasklet_vec.head); |
| 487 | __this_cpu_write(tasklet_vec.head, NULL); | 487 | __this_cpu_write(tasklet_vec.head, NULL); |
| 488 | __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); | 488 | __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head)); |
| 489 | local_irq_enable(); | 489 | local_irq_enable(); |
| 490 | 490 | ||
| 491 | while (list) { | 491 | while (list) { |
| @@ -521,7 +521,7 @@ static void tasklet_hi_action(struct softirq_action *a) | |||
| 521 | local_irq_disable(); | 521 | local_irq_disable(); |
| 522 | list = __this_cpu_read(tasklet_hi_vec.head); | 522 | list = __this_cpu_read(tasklet_hi_vec.head); |
| 523 | __this_cpu_write(tasklet_hi_vec.head, NULL); | 523 | __this_cpu_write(tasklet_hi_vec.head, NULL); |
| 524 | __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); | 524 | __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head)); |
| 525 | local_irq_enable(); | 525 | local_irq_enable(); |
| 526 | 526 | ||
| 527 | while (list) { | 527 | while (list) { |
| @@ -656,7 +656,7 @@ static void run_ksoftirqd(unsigned int cpu) | |||
| 656 | * in the task stack here. | 656 | * in the task stack here. |
| 657 | */ | 657 | */ |
| 658 | __do_softirq(); | 658 | __do_softirq(); |
| 659 | rcu_note_context_switch(cpu); | 659 | rcu_note_context_switch(); |
| 660 | local_irq_enable(); | 660 | local_irq_enable(); |
| 661 | cond_resched(); | 661 | cond_resched(); |
| 662 | return; | 662 | return; |
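The conversions in this hunk all follow one pattern: take a pointer to this CPU's instance of a per-CPU variable with this_cpu_ptr() instead of the legacy __get_cpu_var(). A minimal sketch, with an illustrative per-CPU variable:

```c
static DEFINE_PER_CPU(struct llist_head, example_vec);	/* illustrative */

static struct llist_head *my_example_vec(void)
{
	/* was: return &__get_cpu_var(example_vec); */
	return this_cpu_ptr(&example_vec);
}
```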
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 00fe55cc5a82..b6e4c16377c7 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c | |||
| @@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces) | |||
| 25 | } | 25 | } |
| 26 | EXPORT_SYMBOL_GPL(print_stack_trace); | 26 | EXPORT_SYMBOL_GPL(print_stack_trace); |
| 27 | 27 | ||
| 28 | int snprint_stack_trace(char *buf, size_t size, | ||
| 29 | struct stack_trace *trace, int spaces) | ||
| 30 | { | ||
| 31 | int i; | ||
| 32 | unsigned long ip; | ||
| 33 | int generated; | ||
| 34 | int total = 0; | ||
| 35 | |||
| 36 | if (WARN_ON(!trace->entries)) | ||
| 37 | return 0; | ||
| 38 | |||
| 39 | for (i = 0; i < trace->nr_entries; i++) { | ||
| 40 | ip = trace->entries[i]; | ||
| 41 | generated = snprintf(buf, size, "%*c[<%p>] %pS\n", | ||
| 42 | 1 + spaces, ' ', (void *) ip, (void *) ip); | ||
| 43 | |||
| 44 | total += generated; | ||
| 45 | |||
| 46 | /* Assume that generated isn't a negative number */ | ||
| 47 | if (generated >= size) { | ||
| 48 | buf += size; | ||
| 49 | size = 0; | ||
| 50 | } else { | ||
| 51 | buf += generated; | ||
| 52 | size -= generated; | ||
| 53 | } | ||
| 54 | } | ||
| 55 | |||
| 56 | return total; | ||
| 57 | } | ||
| 58 | EXPORT_SYMBOL_GPL(snprint_stack_trace); | ||
| 59 | |||
| 28 | /* | 60 | /* |
| 29 | * Architectures that do not implement save_stack_trace_tsk or | 61 | * Architectures that do not implement save_stack_trace_tsk or |
| 30 | * save_stack_trace_regs get this weak alias and a once-per-bootup warning | 62 | * save_stack_trace_regs get this weak alias and a once-per-bootup warning |
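A sketch of how a caller might use the new snprint_stack_trace(): capture the current stack with save_stack_trace() and render it into a buffer instead of printing it. The buffer, depth and skip values are arbitrary for the example.

```c
static void dump_trace_to_buf(char *buf, size_t len)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.entries	= entries,
		.max_entries	= ARRAY_SIZE(entries),
		.skip		= 1,	/* skip this helper itself */
	};

	save_stack_trace(&trace);
	snprint_stack_trace(buf, len, &trace, 0 /* spaces */);
}
```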
diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a26..a8c9f5a7dda6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -62,28 +62,28 @@ | |||
| 62 | #include <asm/unistd.h> | 62 | #include <asm/unistd.h> |
| 63 | 63 | ||
| 64 | #ifndef SET_UNALIGN_CTL | 64 | #ifndef SET_UNALIGN_CTL |
| 65 | # define SET_UNALIGN_CTL(a,b) (-EINVAL) | 65 | # define SET_UNALIGN_CTL(a, b) (-EINVAL) |
| 66 | #endif | 66 | #endif |
| 67 | #ifndef GET_UNALIGN_CTL | 67 | #ifndef GET_UNALIGN_CTL |
| 68 | # define GET_UNALIGN_CTL(a,b) (-EINVAL) | 68 | # define GET_UNALIGN_CTL(a, b) (-EINVAL) |
| 69 | #endif | 69 | #endif |
| 70 | #ifndef SET_FPEMU_CTL | 70 | #ifndef SET_FPEMU_CTL |
| 71 | # define SET_FPEMU_CTL(a,b) (-EINVAL) | 71 | # define SET_FPEMU_CTL(a, b) (-EINVAL) |
| 72 | #endif | 72 | #endif |
| 73 | #ifndef GET_FPEMU_CTL | 73 | #ifndef GET_FPEMU_CTL |
| 74 | # define GET_FPEMU_CTL(a,b) (-EINVAL) | 74 | # define GET_FPEMU_CTL(a, b) (-EINVAL) |
| 75 | #endif | 75 | #endif |
| 76 | #ifndef SET_FPEXC_CTL | 76 | #ifndef SET_FPEXC_CTL |
| 77 | # define SET_FPEXC_CTL(a,b) (-EINVAL) | 77 | # define SET_FPEXC_CTL(a, b) (-EINVAL) |
| 78 | #endif | 78 | #endif |
| 79 | #ifndef GET_FPEXC_CTL | 79 | #ifndef GET_FPEXC_CTL |
| 80 | # define GET_FPEXC_CTL(a,b) (-EINVAL) | 80 | # define GET_FPEXC_CTL(a, b) (-EINVAL) |
| 81 | #endif | 81 | #endif |
| 82 | #ifndef GET_ENDIAN | 82 | #ifndef GET_ENDIAN |
| 83 | # define GET_ENDIAN(a,b) (-EINVAL) | 83 | # define GET_ENDIAN(a, b) (-EINVAL) |
| 84 | #endif | 84 | #endif |
| 85 | #ifndef SET_ENDIAN | 85 | #ifndef SET_ENDIAN |
| 86 | # define SET_ENDIAN(a,b) (-EINVAL) | 86 | # define SET_ENDIAN(a, b) (-EINVAL) |
| 87 | #endif | 87 | #endif |
| 88 | #ifndef GET_TSC_CTL | 88 | #ifndef GET_TSC_CTL |
| 89 | # define GET_TSC_CTL(a) (-EINVAL) | 89 | # define GET_TSC_CTL(a) (-EINVAL) |
| @@ -91,6 +91,12 @@ | |||
| 91 | #ifndef SET_TSC_CTL | 91 | #ifndef SET_TSC_CTL |
| 92 | # define SET_TSC_CTL(a) (-EINVAL) | 92 | # define SET_TSC_CTL(a) (-EINVAL) |
| 93 | #endif | 93 | #endif |
| 94 | #ifndef MPX_ENABLE_MANAGEMENT | ||
| 95 | # define MPX_ENABLE_MANAGEMENT(a) (-EINVAL) | ||
| 96 | #endif | ||
| 97 | #ifndef MPX_DISABLE_MANAGEMENT | ||
| 98 | # define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) | ||
| 99 | #endif | ||
| 94 | 100 | ||
| 95 | /* | 101 | /* |
| 96 | * this is where the system-wide overflow UID and GID are defined, for | 102 | * this is where the system-wide overflow UID and GID are defined, for |
| @@ -182,39 +188,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
| 182 | rcu_read_lock(); | 188 | rcu_read_lock(); |
| 183 | read_lock(&tasklist_lock); | 189 | read_lock(&tasklist_lock); |
| 184 | switch (which) { | 190 | switch (which) { |
| 185 | case PRIO_PROCESS: | 191 | case PRIO_PROCESS: |
| 186 | if (who) | 192 | if (who) |
| 187 | p = find_task_by_vpid(who); | 193 | p = find_task_by_vpid(who); |
| 188 | else | 194 | else |
| 189 | p = current; | 195 | p = current; |
| 190 | if (p) | 196 | if (p) |
| 191 | error = set_one_prio(p, niceval, error); | 197 | error = set_one_prio(p, niceval, error); |
| 192 | break; | 198 | break; |
| 193 | case PRIO_PGRP: | 199 | case PRIO_PGRP: |
| 194 | if (who) | 200 | if (who) |
| 195 | pgrp = find_vpid(who); | 201 | pgrp = find_vpid(who); |
| 196 | else | 202 | else |
| 197 | pgrp = task_pgrp(current); | 203 | pgrp = task_pgrp(current); |
| 198 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | 204 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { |
| 199 | error = set_one_prio(p, niceval, error); | 205 | error = set_one_prio(p, niceval, error); |
| 200 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 206 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
| 201 | break; | 207 | break; |
| 202 | case PRIO_USER: | 208 | case PRIO_USER: |
| 203 | uid = make_kuid(cred->user_ns, who); | 209 | uid = make_kuid(cred->user_ns, who); |
| 204 | user = cred->user; | 210 | user = cred->user; |
| 205 | if (!who) | 211 | if (!who) |
| 206 | uid = cred->uid; | 212 | uid = cred->uid; |
| 207 | else if (!uid_eq(uid, cred->uid) && | 213 | else if (!uid_eq(uid, cred->uid)) { |
| 208 | !(user = find_user(uid))) | 214 | user = find_user(uid); |
| 215 | if (!user) | ||
| 209 | goto out_unlock; /* No processes for this user */ | 216 | goto out_unlock; /* No processes for this user */ |
| 210 | 217 | } | |
| 211 | do_each_thread(g, p) { | 218 | do_each_thread(g, p) { |
| 212 | if (uid_eq(task_uid(p), uid)) | 219 | if (uid_eq(task_uid(p), uid)) |
| 213 | error = set_one_prio(p, niceval, error); | 220 | error = set_one_prio(p, niceval, error); |
| 214 | } while_each_thread(g, p); | 221 | } while_each_thread(g, p); |
| 215 | if (!uid_eq(uid, cred->uid)) | 222 | if (!uid_eq(uid, cred->uid)) |
| 216 | free_uid(user); /* For find_user() */ | 223 | free_uid(user); /* For find_user() */ |
| 217 | break; | 224 | break; |
| 218 | } | 225 | } |
| 219 | out_unlock: | 226 | out_unlock: |
| 220 | read_unlock(&tasklist_lock); | 227 | read_unlock(&tasklist_lock); |
| @@ -244,47 +251,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
| 244 | rcu_read_lock(); | 251 | rcu_read_lock(); |
| 245 | read_lock(&tasklist_lock); | 252 | read_lock(&tasklist_lock); |
| 246 | switch (which) { | 253 | switch (which) { |
| 247 | case PRIO_PROCESS: | 254 | case PRIO_PROCESS: |
| 248 | if (who) | 255 | if (who) |
| 249 | p = find_task_by_vpid(who); | 256 | p = find_task_by_vpid(who); |
| 250 | else | 257 | else |
| 251 | p = current; | 258 | p = current; |
| 252 | if (p) { | 259 | if (p) { |
| 260 | niceval = nice_to_rlimit(task_nice(p)); | ||
| 261 | if (niceval > retval) | ||
| 262 | retval = niceval; | ||
| 263 | } | ||
| 264 | break; | ||
| 265 | case PRIO_PGRP: | ||
| 266 | if (who) | ||
| 267 | pgrp = find_vpid(who); | ||
| 268 | else | ||
| 269 | pgrp = task_pgrp(current); | ||
| 270 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | ||
| 271 | niceval = nice_to_rlimit(task_nice(p)); | ||
| 272 | if (niceval > retval) | ||
| 273 | retval = niceval; | ||
| 274 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | ||
| 275 | break; | ||
| 276 | case PRIO_USER: | ||
| 277 | uid = make_kuid(cred->user_ns, who); | ||
| 278 | user = cred->user; | ||
| 279 | if (!who) | ||
| 280 | uid = cred->uid; | ||
| 281 | else if (!uid_eq(uid, cred->uid)) { | ||
| 282 | user = find_user(uid); | ||
| 283 | if (!user) | ||
| 284 | goto out_unlock; /* No processes for this user */ | ||
| 285 | } | ||
| 286 | do_each_thread(g, p) { | ||
| 287 | if (uid_eq(task_uid(p), uid)) { | ||
| 253 | niceval = nice_to_rlimit(task_nice(p)); | 288 | niceval = nice_to_rlimit(task_nice(p)); |
| 254 | if (niceval > retval) | 289 | if (niceval > retval) |
| 255 | retval = niceval; | 290 | retval = niceval; |
| 256 | } | 291 | } |
| 257 | break; | 292 | } while_each_thread(g, p); |
| 258 | case PRIO_PGRP: | 293 | if (!uid_eq(uid, cred->uid)) |
| 259 | if (who) | 294 | free_uid(user); /* for find_user() */ |
| 260 | pgrp = find_vpid(who); | 295 | break; |
| 261 | else | ||
| 262 | pgrp = task_pgrp(current); | ||
| 263 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | ||
| 264 | niceval = nice_to_rlimit(task_nice(p)); | ||
| 265 | if (niceval > retval) | ||
| 266 | retval = niceval; | ||
| 267 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | ||
| 268 | break; | ||
| 269 | case PRIO_USER: | ||
| 270 | uid = make_kuid(cred->user_ns, who); | ||
| 271 | user = cred->user; | ||
| 272 | if (!who) | ||
| 273 | uid = cred->uid; | ||
| 274 | else if (!uid_eq(uid, cred->uid) && | ||
| 275 | !(user = find_user(uid))) | ||
| 276 | goto out_unlock; /* No processes for this user */ | ||
| 277 | |||
| 278 | do_each_thread(g, p) { | ||
| 279 | if (uid_eq(task_uid(p), uid)) { | ||
| 280 | niceval = nice_to_rlimit(task_nice(p)); | ||
| 281 | if (niceval > retval) | ||
| 282 | retval = niceval; | ||
| 283 | } | ||
| 284 | } while_each_thread(g, p); | ||
| 285 | if (!uid_eq(uid, cred->uid)) | ||
| 286 | free_uid(user); /* for find_user() */ | ||
| 287 | break; | ||
| 288 | } | 296 | } |
| 289 | out_unlock: | 297 | out_unlock: |
| 290 | read_unlock(&tasklist_lock); | 298 | read_unlock(&tasklist_lock); |
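For reference, the nice_to_rlimit() calls in the reindented code above map the kernel nice range onto the value range getpriority() reports; assuming the usual MAX_NICE of 19, that is 20 - nice, so nice -20 becomes 40 and nice 19 becomes 1. A one-line sketch of that presumed mapping:

```c
/* Presumed mapping, assuming MAX_NICE == 19: -20 -> 40, 0 -> 20, 19 -> 1. */
static long nice_to_rlimit_sketch(long nice)
{
	return 19 /* MAX_NICE */ - nice + 1;
}
```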
| @@ -306,7 +314,7 @@ out_unlock: | |||
| 306 | * | 314 | * |
| 307 | * The general idea is that a program which uses just setregid() will be | 315 | * The general idea is that a program which uses just setregid() will be |
| 308 | * 100% compatible with BSD. A program which uses just setgid() will be | 316 | * 100% compatible with BSD. A program which uses just setgid() will be |
| 309 | * 100% compatible with POSIX with saved IDs. | 317 | * 100% compatible with POSIX with saved IDs. |
| 310 | * | 318 | * |
| 311 | * SMP: There are not races, the GIDs are checked only by filesystem | 319 | * SMP: There are not races, the GIDs are checked only by filesystem |
| 312 | * operations (as far as semantic preservation is concerned). | 320 | * operations (as far as semantic preservation is concerned). |
| @@ -364,7 +372,7 @@ error: | |||
| 364 | } | 372 | } |
| 365 | 373 | ||
| 366 | /* | 374 | /* |
| 367 | * setgid() is implemented like SysV w/ SAVED_IDS | 375 | * setgid() is implemented like SysV w/ SAVED_IDS |
| 368 | * | 376 | * |
| 369 | * SMP: Same implicit races as above. | 377 | * SMP: Same implicit races as above. |
| 370 | */ | 378 | */ |
| @@ -442,7 +450,7 @@ static int set_user(struct cred *new) | |||
| 442 | * | 450 | * |
| 443 | * The general idea is that a program which uses just setreuid() will be | 451 | * The general idea is that a program which uses just setreuid() will be |
| 444 | * 100% compatible with BSD. A program which uses just setuid() will be | 452 | * 100% compatible with BSD. A program which uses just setuid() will be |
| 445 | * 100% compatible with POSIX with saved IDs. | 453 | * 100% compatible with POSIX with saved IDs. |
| 446 | */ | 454 | */ |
| 447 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | 455 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) |
| 448 | { | 456 | { |
| @@ -503,17 +511,17 @@ error: | |||
| 503 | abort_creds(new); | 511 | abort_creds(new); |
| 504 | return retval; | 512 | return retval; |
| 505 | } | 513 | } |
| 506 | 514 | ||
| 507 | /* | 515 | /* |
| 508 | * setuid() is implemented like SysV with SAVED_IDS | 516 | * setuid() is implemented like SysV with SAVED_IDS |
| 509 | * | 517 | * |
| 510 | * Note that SAVED_ID's is deficient in that a setuid root program | 518 | * Note that SAVED_ID's is deficient in that a setuid root program |
| 511 | * like sendmail, for example, cannot set its uid to be a normal | 519 | * like sendmail, for example, cannot set its uid to be a normal |
| 512 | * user and then switch back, because if you're root, setuid() sets | 520 | * user and then switch back, because if you're root, setuid() sets |
| 513 | * the saved uid too. If you don't like this, blame the bright people | 521 | * the saved uid too. If you don't like this, blame the bright people |
| 514 | * in the POSIX committee and/or USG. Note that the BSD-style setreuid() | 522 | * in the POSIX committee and/or USG. Note that the BSD-style setreuid() |
| 515 | * will allow a root program to temporarily drop privileges and be able to | 523 | * will allow a root program to temporarily drop privileges and be able to |
| 516 | * regain them by swapping the real and effective uid. | 524 | * regain them by swapping the real and effective uid. |
| 517 | */ | 525 | */ |
| 518 | SYSCALL_DEFINE1(setuid, uid_t, uid) | 526 | SYSCALL_DEFINE1(setuid, uid_t, uid) |
| 519 | { | 527 | { |
| @@ -637,10 +645,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ | |||
| 637 | euid = from_kuid_munged(cred->user_ns, cred->euid); | 645 | euid = from_kuid_munged(cred->user_ns, cred->euid); |
| 638 | suid = from_kuid_munged(cred->user_ns, cred->suid); | 646 | suid = from_kuid_munged(cred->user_ns, cred->suid); |
| 639 | 647 | ||
| 640 | if (!(retval = put_user(ruid, ruidp)) && | 648 | retval = put_user(ruid, ruidp); |
| 641 | !(retval = put_user(euid, euidp))) | 649 | if (!retval) { |
| 642 | retval = put_user(suid, suidp); | 650 | retval = put_user(euid, euidp); |
| 643 | 651 | if (!retval) | |
| 652 | return put_user(suid, suidp); | ||
| 653 | } | ||
| 644 | return retval; | 654 | return retval; |
| 645 | } | 655 | } |
| 646 | 656 | ||
| @@ -709,9 +719,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ | |||
| 709 | egid = from_kgid_munged(cred->user_ns, cred->egid); | 719 | egid = from_kgid_munged(cred->user_ns, cred->egid); |
| 710 | sgid = from_kgid_munged(cred->user_ns, cred->sgid); | 720 | sgid = from_kgid_munged(cred->user_ns, cred->sgid); |
| 711 | 721 | ||
| 712 | if (!(retval = put_user(rgid, rgidp)) && | 722 | retval = put_user(rgid, rgidp); |
| 713 | !(retval = put_user(egid, egidp))) | 723 | if (!retval) { |
| 714 | retval = put_user(sgid, sgidp); | 724 | retval = put_user(egid, egidp); |
| 725 | if (!retval) | ||
| 726 | retval = put_user(sgid, sgidp); | ||
| 727 | } | ||
| 715 | 728 | ||
| 716 | return retval; | 729 | return retval; |
| 717 | } | 730 | } |
| @@ -862,11 +875,9 @@ void do_sys_times(struct tms *tms) | |||
| 862 | { | 875 | { |
| 863 | cputime_t tgutime, tgstime, cutime, cstime; | 876 | cputime_t tgutime, tgstime, cutime, cstime; |
| 864 | 877 | ||
| 865 | spin_lock_irq(¤t->sighand->siglock); | ||
| 866 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); | 878 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); |
| 867 | cutime = current->signal->cutime; | 879 | cutime = current->signal->cutime; |
| 868 | cstime = current->signal->cstime; | 880 | cstime = current->signal->cstime; |
| 869 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 870 | tms->tms_utime = cputime_to_clock_t(tgutime); | 881 | tms->tms_utime = cputime_to_clock_t(tgutime); |
| 871 | tms->tms_stime = cputime_to_clock_t(tgstime); | 882 | tms->tms_stime = cputime_to_clock_t(tgstime); |
| 872 | tms->tms_cutime = cputime_to_clock_t(cutime); | 883 | tms->tms_cutime = cputime_to_clock_t(cutime); |
| @@ -1284,7 +1295,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) | |||
| 1284 | /* | 1295 | /* |
| 1285 | * Back compatibility for getrlimit. Needed for some apps. | 1296 | * Back compatibility for getrlimit. Needed for some apps. |
| 1286 | */ | 1297 | */ |
| 1287 | |||
| 1288 | SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | 1298 | SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, |
| 1289 | struct rlimit __user *, rlim) | 1299 | struct rlimit __user *, rlim) |
| 1290 | { | 1300 | { |
| @@ -1299,7 +1309,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | |||
| 1299 | x.rlim_cur = 0x7FFFFFFF; | 1309 | x.rlim_cur = 0x7FFFFFFF; |
| 1300 | if (x.rlim_max > 0x7FFFFFFF) | 1310 | if (x.rlim_max > 0x7FFFFFFF) |
| 1301 | x.rlim_max = 0x7FFFFFFF; | 1311 | x.rlim_max = 0x7FFFFFFF; |
| 1302 | return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; | 1312 | return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; |
| 1303 | } | 1313 | } |
| 1304 | 1314 | ||
| 1305 | #endif | 1315 | #endif |
| @@ -1527,7 +1537,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1527 | cputime_t tgutime, tgstime, utime, stime; | 1537 | cputime_t tgutime, tgstime, utime, stime; |
| 1528 | unsigned long maxrss = 0; | 1538 | unsigned long maxrss = 0; |
| 1529 | 1539 | ||
| 1530 | memset((char *) r, 0, sizeof *r); | 1540 | memset((char *)r, 0, sizeof (*r)); |
| 1531 | utime = stime = 0; | 1541 | utime = stime = 0; |
| 1532 | 1542 | ||
| 1533 | if (who == RUSAGE_THREAD) { | 1543 | if (who == RUSAGE_THREAD) { |
| @@ -1541,41 +1551,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1541 | return; | 1551 | return; |
| 1542 | 1552 | ||
| 1543 | switch (who) { | 1553 | switch (who) { |
| 1544 | case RUSAGE_BOTH: | 1554 | case RUSAGE_BOTH: |
| 1545 | case RUSAGE_CHILDREN: | 1555 | case RUSAGE_CHILDREN: |
| 1546 | utime = p->signal->cutime; | 1556 | utime = p->signal->cutime; |
| 1547 | stime = p->signal->cstime; | 1557 | stime = p->signal->cstime; |
| 1548 | r->ru_nvcsw = p->signal->cnvcsw; | 1558 | r->ru_nvcsw = p->signal->cnvcsw; |
| 1549 | r->ru_nivcsw = p->signal->cnivcsw; | 1559 | r->ru_nivcsw = p->signal->cnivcsw; |
| 1550 | r->ru_minflt = p->signal->cmin_flt; | 1560 | r->ru_minflt = p->signal->cmin_flt; |
| 1551 | r->ru_majflt = p->signal->cmaj_flt; | 1561 | r->ru_majflt = p->signal->cmaj_flt; |
| 1552 | r->ru_inblock = p->signal->cinblock; | 1562 | r->ru_inblock = p->signal->cinblock; |
| 1553 | r->ru_oublock = p->signal->coublock; | 1563 | r->ru_oublock = p->signal->coublock; |
| 1554 | maxrss = p->signal->cmaxrss; | 1564 | maxrss = p->signal->cmaxrss; |
| 1555 | 1565 | ||
| 1556 | if (who == RUSAGE_CHILDREN) | 1566 | if (who == RUSAGE_CHILDREN) |
| 1557 | break; | ||
| 1558 | |||
| 1559 | case RUSAGE_SELF: | ||
| 1560 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); | ||
| 1561 | utime += tgutime; | ||
| 1562 | stime += tgstime; | ||
| 1563 | r->ru_nvcsw += p->signal->nvcsw; | ||
| 1564 | r->ru_nivcsw += p->signal->nivcsw; | ||
| 1565 | r->ru_minflt += p->signal->min_flt; | ||
| 1566 | r->ru_majflt += p->signal->maj_flt; | ||
| 1567 | r->ru_inblock += p->signal->inblock; | ||
| 1568 | r->ru_oublock += p->signal->oublock; | ||
| 1569 | if (maxrss < p->signal->maxrss) | ||
| 1570 | maxrss = p->signal->maxrss; | ||
| 1571 | t = p; | ||
| 1572 | do { | ||
| 1573 | accumulate_thread_rusage(t, r); | ||
| 1574 | } while_each_thread(p, t); | ||
| 1575 | break; | 1567 | break; |
| 1576 | 1568 | ||
| 1577 | default: | 1569 | case RUSAGE_SELF: |
| 1578 | BUG(); | 1570 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
| 1571 | utime += tgutime; | ||
| 1572 | stime += tgstime; | ||
| 1573 | r->ru_nvcsw += p->signal->nvcsw; | ||
| 1574 | r->ru_nivcsw += p->signal->nivcsw; | ||
| 1575 | r->ru_minflt += p->signal->min_flt; | ||
| 1576 | r->ru_majflt += p->signal->maj_flt; | ||
| 1577 | r->ru_inblock += p->signal->inblock; | ||
| 1578 | r->ru_oublock += p->signal->oublock; | ||
| 1579 | if (maxrss < p->signal->maxrss) | ||
| 1580 | maxrss = p->signal->maxrss; | ||
| 1581 | t = p; | ||
| 1582 | do { | ||
| 1583 | accumulate_thread_rusage(t, r); | ||
| 1584 | } while_each_thread(p, t); | ||
| 1585 | break; | ||
| 1586 | |||
| 1587 | default: | ||
| 1588 | BUG(); | ||
| 1579 | } | 1589 | } |
| 1580 | unlock_task_sighand(p, &flags); | 1590 | unlock_task_sighand(p, &flags); |
| 1581 | 1591 | ||
| @@ -1585,6 +1595,7 @@ out: | |||
| 1585 | 1595 | ||
| 1586 | if (who != RUSAGE_CHILDREN) { | 1596 | if (who != RUSAGE_CHILDREN) { |
| 1587 | struct mm_struct *mm = get_task_mm(p); | 1597 | struct mm_struct *mm = get_task_mm(p); |
| 1598 | |||
| 1588 | if (mm) { | 1599 | if (mm) { |
| 1589 | setmax_mm_hiwater_rss(&maxrss, mm); | 1600 | setmax_mm_hiwater_rss(&maxrss, mm); |
| 1590 | mmput(mm); | 1601 | mmput(mm); |
| @@ -1596,6 +1607,7 @@ out: | |||
| 1596 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | 1607 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) |
| 1597 | { | 1608 | { |
| 1598 | struct rusage r; | 1609 | struct rusage r; |
| 1610 | |||
| 1599 | k_getrusage(p, who, &r); | 1611 | k_getrusage(p, who, &r); |
| 1600 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; | 1612 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; |
| 1601 | } | 1613 | } |
| @@ -1628,12 +1640,14 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
| 1628 | return mask; | 1640 | return mask; |
| 1629 | } | 1641 | } |
| 1630 | 1642 | ||
| 1631 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1643 | static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) |
| 1632 | { | 1644 | { |
| 1633 | struct fd exe; | 1645 | struct fd exe; |
| 1634 | struct inode *inode; | 1646 | struct inode *inode; |
| 1635 | int err; | 1647 | int err; |
| 1636 | 1648 | ||
| 1649 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); | ||
| 1650 | |||
| 1637 | exe = fdget(fd); | 1651 | exe = fdget(fd); |
| 1638 | if (!exe.file) | 1652 | if (!exe.file) |
| 1639 | return -EBADF; | 1653 | return -EBADF; |
| @@ -1654,8 +1668,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1654 | if (err) | 1668 | if (err) |
| 1655 | goto exit; | 1669 | goto exit; |
| 1656 | 1670 | ||
| 1657 | down_write(&mm->mmap_sem); | ||
| 1658 | |||
| 1659 | /* | 1671 | /* |
| 1660 | * Forbid mm->exe_file change if old file still mapped. | 1672 | * Forbid mm->exe_file change if old file still mapped. |
| 1661 | */ | 1673 | */ |
| @@ -1667,7 +1679,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1667 | if (vma->vm_file && | 1679 | if (vma->vm_file && |
| 1668 | path_equal(&vma->vm_file->f_path, | 1680 | path_equal(&vma->vm_file->f_path, |
| 1669 | &mm->exe_file->f_path)) | 1681 | &mm->exe_file->f_path)) |
| 1670 | goto exit_unlock; | 1682 | goto exit; |
| 1671 | } | 1683 | } |
| 1672 | 1684 | ||
| 1673 | /* | 1685 | /* |
| @@ -1678,34 +1690,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1678 | */ | 1690 | */ |
| 1679 | err = -EPERM; | 1691 | err = -EPERM; |
| 1680 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) | 1692 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) |
| 1681 | goto exit_unlock; | 1693 | goto exit; |
| 1682 | 1694 | ||
| 1683 | err = 0; | 1695 | err = 0; |
| 1684 | set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ | 1696 | set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ |
| 1685 | exit_unlock: | ||
| 1686 | up_write(&mm->mmap_sem); | ||
| 1687 | |||
| 1688 | exit: | 1697 | exit: |
| 1689 | fdput(exe); | 1698 | fdput(exe); |
| 1690 | return err; | 1699 | return err; |
| 1691 | } | 1700 | } |
| 1692 | 1701 | ||
| 1702 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
| 1703 | /* | ||
| 1704 | * WARNING: we don't require any capability here so be very careful | ||
| 1705 | * in what is allowed for modification from userspace. | ||
| 1706 | */ | ||
| 1707 | static int validate_prctl_map(struct prctl_mm_map *prctl_map) | ||
| 1708 | { | ||
| 1709 | unsigned long mmap_max_addr = TASK_SIZE; | ||
| 1710 | struct mm_struct *mm = current->mm; | ||
| 1711 | int error = -EINVAL, i; | ||
| 1712 | |||
| 1713 | static const unsigned char offsets[] = { | ||
| 1714 | offsetof(struct prctl_mm_map, start_code), | ||
| 1715 | offsetof(struct prctl_mm_map, end_code), | ||
| 1716 | offsetof(struct prctl_mm_map, start_data), | ||
| 1717 | offsetof(struct prctl_mm_map, end_data), | ||
| 1718 | offsetof(struct prctl_mm_map, start_brk), | ||
| 1719 | offsetof(struct prctl_mm_map, brk), | ||
| 1720 | offsetof(struct prctl_mm_map, start_stack), | ||
| 1721 | offsetof(struct prctl_mm_map, arg_start), | ||
| 1722 | offsetof(struct prctl_mm_map, arg_end), | ||
| 1723 | offsetof(struct prctl_mm_map, env_start), | ||
| 1724 | offsetof(struct prctl_mm_map, env_end), | ||
| 1725 | }; | ||
| 1726 | |||
| 1727 | /* | ||
| 1728 | * Make sure the members are not somewhere outside | ||
| 1729 | * of allowed address space. | ||
| 1730 | */ | ||
| 1731 | for (i = 0; i < ARRAY_SIZE(offsets); i++) { | ||
| 1732 | u64 val = *(u64 *)((char *)prctl_map + offsets[i]); | ||
| 1733 | |||
| 1734 | if ((unsigned long)val >= mmap_max_addr || | ||
| 1735 | (unsigned long)val < mmap_min_addr) | ||
| 1736 | goto out; | ||
| 1737 | } | ||
| 1738 | |||
| 1739 | /* | ||
| 1740 | * Make sure the pairs are ordered. | ||
| 1741 | */ | ||
| 1742 | #define __prctl_check_order(__m1, __op, __m2) \ | ||
| 1743 | ((unsigned long)prctl_map->__m1 __op \ | ||
| 1744 | (unsigned long)prctl_map->__m2) ? 0 : -EINVAL | ||
| 1745 | error = __prctl_check_order(start_code, <, end_code); | ||
| 1746 | error |= __prctl_check_order(start_data, <, end_data); | ||
| 1747 | error |= __prctl_check_order(start_brk, <=, brk); | ||
| 1748 | error |= __prctl_check_order(arg_start, <=, arg_end); | ||
| 1749 | error |= __prctl_check_order(env_start, <=, env_end); | ||
| 1750 | if (error) | ||
| 1751 | goto out; | ||
| 1752 | #undef __prctl_check_order | ||
| 1753 | |||
| 1754 | error = -EINVAL; | ||
| 1755 | |||
| 1756 | /* | ||
| 1757 | * @brk should be after @end_data in traditional maps. | ||
| 1758 | */ | ||
| 1759 | if (prctl_map->start_brk <= prctl_map->end_data || | ||
| 1760 | prctl_map->brk <= prctl_map->end_data) | ||
| 1761 | goto out; | ||
| 1762 | |||
| 1763 | /* | ||
| 1764 | * Nor should we allow overriding the limits if they are set. | ||
| 1765 | */ | ||
| 1766 | if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, | ||
| 1767 | prctl_map->start_brk, prctl_map->end_data, | ||
| 1768 | prctl_map->start_data)) | ||
| 1769 | goto out; | ||
| 1770 | |||
| 1771 | /* | ||
| 1772 | * Someone is trying to cheat the auxv vector. | ||
| 1773 | */ | ||
| 1774 | if (prctl_map->auxv_size) { | ||
| 1775 | if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) | ||
| 1776 | goto out; | ||
| 1777 | } | ||
| 1778 | |||
| 1779 | /* | ||
| 1780 | * Finally, make sure the caller has the rights to | ||
| 1781 | * change /proc/pid/exe link: only local root should | ||
| 1782 | * be allowed to. | ||
| 1783 | */ | ||
| 1784 | if (prctl_map->exe_fd != (u32)-1) { | ||
| 1785 | struct user_namespace *ns = current_user_ns(); | ||
| 1786 | const struct cred *cred = current_cred(); | ||
| 1787 | |||
| 1788 | if (!uid_eq(cred->uid, make_kuid(ns, 0)) || | ||
| 1789 | !gid_eq(cred->gid, make_kgid(ns, 0))) | ||
| 1790 | goto out; | ||
| 1791 | } | ||
| 1792 | |||
| 1793 | error = 0; | ||
| 1794 | out: | ||
| 1795 | return error; | ||
| 1796 | } | ||
| 1797 | |||
| 1798 | static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) | ||
| 1799 | { | ||
| 1800 | struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; | ||
| 1801 | unsigned long user_auxv[AT_VECTOR_SIZE]; | ||
| 1802 | struct mm_struct *mm = current->mm; | ||
| 1803 | int error; | ||
| 1804 | |||
| 1805 | BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); | ||
| 1806 | BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); | ||
| 1807 | |||
| 1808 | if (opt == PR_SET_MM_MAP_SIZE) | ||
| 1809 | return put_user((unsigned int)sizeof(prctl_map), | ||
| 1810 | (unsigned int __user *)addr); | ||
| 1811 | |||
| 1812 | if (data_size != sizeof(prctl_map)) | ||
| 1813 | return -EINVAL; | ||
| 1814 | |||
| 1815 | if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) | ||
| 1816 | return -EFAULT; | ||
| 1817 | |||
| 1818 | error = validate_prctl_map(&prctl_map); | ||
| 1819 | if (error) | ||
| 1820 | return error; | ||
| 1821 | |||
| 1822 | if (prctl_map.auxv_size) { | ||
| 1823 | memset(user_auxv, 0, sizeof(user_auxv)); | ||
| 1824 | if (copy_from_user(user_auxv, | ||
| 1825 | (const void __user *)prctl_map.auxv, | ||
| 1826 | prctl_map.auxv_size)) | ||
| 1827 | return -EFAULT; | ||
| 1828 | |||
| 1829 | /* Last entry must be AT_NULL as specification requires */ | ||
| 1830 | user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; | ||
| 1831 | user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; | ||
| 1832 | } | ||
| 1833 | |||
| 1834 | down_write(&mm->mmap_sem); | ||
| 1835 | if (prctl_map.exe_fd != (u32)-1) | ||
| 1836 | error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); | ||
| 1837 | downgrade_write(&mm->mmap_sem); | ||
| 1838 | if (error) | ||
| 1839 | goto out; | ||
| 1840 | |||
| 1841 | /* | ||
| 1842 | * We don't validate whether these members point to real, | ||
| 1843 | * present VMAs, because the application may already have | ||
| 1844 | * unmapped the corresponding VMAs and the kernel mostly uses | ||
| 1845 | * these members for statistics output in procfs, except for | ||
| 1846 | * | ||
| 1847 | * - @start_brk/@brk, which are used in do_brk, but the kernel | ||
| 1848 | * does look up VMAs when updating these members, so anything | ||
| 1849 | * wrong written here makes the kernel swear at the userspace | ||
| 1850 | * program but won't lead to any problem in the kernel itself | ||
| 1851 | */ | ||
| 1852 | |||
| 1853 | mm->start_code = prctl_map.start_code; | ||
| 1854 | mm->end_code = prctl_map.end_code; | ||
| 1855 | mm->start_data = prctl_map.start_data; | ||
| 1856 | mm->end_data = prctl_map.end_data; | ||
| 1857 | mm->start_brk = prctl_map.start_brk; | ||
| 1858 | mm->brk = prctl_map.brk; | ||
| 1859 | mm->start_stack = prctl_map.start_stack; | ||
| 1860 | mm->arg_start = prctl_map.arg_start; | ||
| 1861 | mm->arg_end = prctl_map.arg_end; | ||
| 1862 | mm->env_start = prctl_map.env_start; | ||
| 1863 | mm->env_end = prctl_map.env_end; | ||
| 1864 | |||
| 1865 | /* | ||
| 1866 | * Note that this update of @saved_auxv is lockless, so if | ||
| 1867 | * someone reads this member in procfs while we're updating | ||
| 1868 | * it, they may get partly updated results. This is a known | ||
| 1869 | * and acceptable trade-off: we leave it as is rather than | ||
| 1870 | * introduce additional locks here, which would make the | ||
| 1871 | * kernel more complex. | ||
| 1872 | */ | ||
| 1873 | if (prctl_map.auxv_size) | ||
| 1874 | memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); | ||
| 1875 | |||
| 1876 | error = 0; | ||
| 1877 | out: | ||
| 1878 | up_read(&mm->mmap_sem); | ||
| 1879 | return error; | ||
| 1880 | } | ||
| 1881 | #endif /* CONFIG_CHECKPOINT_RESTORE */ | ||
| 1882 | |||
| 1693 | static int prctl_set_mm(int opt, unsigned long addr, | 1883 | static int prctl_set_mm(int opt, unsigned long addr, |
| 1694 | unsigned long arg4, unsigned long arg5) | 1884 | unsigned long arg4, unsigned long arg5) |
| 1695 | { | 1885 | { |
| 1696 | unsigned long rlim = rlimit(RLIMIT_DATA); | ||
| 1697 | struct mm_struct *mm = current->mm; | 1886 | struct mm_struct *mm = current->mm; |
| 1698 | struct vm_area_struct *vma; | 1887 | struct vm_area_struct *vma; |
| 1699 | int error; | 1888 | int error; |
| 1700 | 1889 | ||
| 1701 | if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) | 1890 | if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && |
| 1891 | opt != PR_SET_MM_MAP && | ||
| 1892 | opt != PR_SET_MM_MAP_SIZE))) | ||
| 1702 | return -EINVAL; | 1893 | return -EINVAL; |
| 1703 | 1894 | ||
| 1895 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
| 1896 | if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) | ||
| 1897 | return prctl_set_mm_map(opt, (const void __user *)addr, arg4); | ||
| 1898 | #endif | ||
| 1899 | |||
| 1704 | if (!capable(CAP_SYS_RESOURCE)) | 1900 | if (!capable(CAP_SYS_RESOURCE)) |
| 1705 | return -EPERM; | 1901 | return -EPERM; |
| 1706 | 1902 | ||
| 1707 | if (opt == PR_SET_MM_EXE_FILE) | 1903 | if (opt == PR_SET_MM_EXE_FILE) { |
| 1708 | return prctl_set_mm_exe_file(mm, (unsigned int)addr); | 1904 | down_write(&mm->mmap_sem); |
| 1905 | error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr); | ||
| 1906 | up_write(&mm->mmap_sem); | ||
| 1907 | return error; | ||
| 1908 | } | ||
| 1709 | 1909 | ||
| 1710 | if (addr >= TASK_SIZE || addr < mmap_min_addr) | 1910 | if (addr >= TASK_SIZE || addr < mmap_min_addr) |
| 1711 | return -EINVAL; | 1911 | return -EINVAL; |
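From userspace, the new PR_SET_MM_MAP interface is reached through prctl(PR_SET_MM, ...), passing a filled struct prctl_mm_map and its size; PR_SET_MM_MAP_SIZE lets the caller verify it agrees with the kernel about the structure size. The sketch below is a hedged illustration of that calling convention for a checkpoint/restore tool, not code from this patch.

```c
#include <string.h>
#include <sys/prctl.h>
#include <linux/prctl.h>	/* struct prctl_mm_map, PR_SET_MM_MAP* */

static int restore_mm_layout(const struct prctl_mm_map *saved)
{
	struct prctl_mm_map map;
	unsigned int size;

	/* Ask the kernel what size it expects, to catch ABI mismatches. */
	if (prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0))
		return -1;
	if (size != sizeof(map))
		return -1;

	memcpy(&map, saved, sizeof(map));
	map.exe_fd = (unsigned int)-1;	/* keep the current exe link */

	return prctl(PR_SET_MM, PR_SET_MM_MAP,
		     (unsigned long)&map, sizeof(map), 0);
}
```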
| @@ -1733,9 +1933,8 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
| 1733 | if (addr <= mm->end_data) | 1933 | if (addr <= mm->end_data) |
| 1734 | goto out; | 1934 | goto out; |
| 1735 | 1935 | ||
| 1736 | if (rlim < RLIM_INFINITY && | 1936 | if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr, |
| 1737 | (mm->brk - addr) + | 1937 | mm->end_data, mm->start_data)) |
| 1738 | (mm->end_data - mm->start_data) > rlim) | ||
| 1739 | goto out; | 1938 | goto out; |
| 1740 | 1939 | ||
| 1741 | mm->start_brk = addr; | 1940 | mm->start_brk = addr; |
| @@ -1745,9 +1944,8 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
| 1745 | if (addr <= mm->end_data) | 1944 | if (addr <= mm->end_data) |
| 1746 | goto out; | 1945 | goto out; |
| 1747 | 1946 | ||
| 1748 | if (rlim < RLIM_INFINITY && | 1947 | if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk, |
| 1749 | (addr - mm->start_brk) + | 1948 | mm->end_data, mm->start_data)) |
| 1750 | (mm->end_data - mm->start_data) > rlim) | ||
| 1751 | goto out; | 1949 | goto out; |
| 1752 | 1950 | ||
| 1753 | mm->brk = addr; | 1951 | mm->brk = addr; |
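The two hunks above replace an open-coded RLIMIT_DATA test with check_data_rlimit(). Reconstructed from the lines it replaces, the helper presumably behaves roughly as below; this is a sketch of the expected semantics, not the mm.h implementation.

```c
/* Presumed semantics, reconstructed from the removed open-coded check. */
static int check_data_rlimit_sketch(unsigned long rlim, unsigned long brk,
				    unsigned long start_brk,
				    unsigned long end_data,
				    unsigned long start_data)
{
	if (rlim < RLIM_INFINITY &&
	    (brk - start_brk) + (end_data - start_data) > rlim)
		return -ENOSPC;	/* assumed error code */
	return 0;
}
```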
| @@ -2011,6 +2209,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
| 2011 | me->mm->def_flags &= ~VM_NOHUGEPAGE; | 2209 | me->mm->def_flags &= ~VM_NOHUGEPAGE; |
| 2012 | up_write(&me->mm->mmap_sem); | 2210 | up_write(&me->mm->mmap_sem); |
| 2013 | break; | 2211 | break; |
| 2212 | case PR_MPX_ENABLE_MANAGEMENT: | ||
| 2213 | error = MPX_ENABLE_MANAGEMENT(me); | ||
| 2214 | break; | ||
| 2215 | case PR_MPX_DISABLE_MANAGEMENT: | ||
| 2216 | error = MPX_DISABLE_MANAGEMENT(me); | ||
| 2217 | break; | ||
| 2014 | default: | 2218 | default: |
| 2015 | error = -EINVAL; | 2219 | error = -EINVAL; |
| 2016 | break; | 2220 | break; |
| @@ -2023,6 +2227,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, | |||
| 2023 | { | 2227 | { |
| 2024 | int err = 0; | 2228 | int err = 0; |
| 2025 | int cpu = raw_smp_processor_id(); | 2229 | int cpu = raw_smp_processor_id(); |
| 2230 | |||
| 2026 | if (cpup) | 2231 | if (cpup) |
| 2027 | err |= put_user(cpu, cpup); | 2232 | err |= put_user(cpu, cpup); |
| 2028 | if (nodep) | 2233 | if (nodep) |
| @@ -2135,7 +2340,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) | |||
| 2135 | /* Check to see if any memory value is too large for 32-bit and scale | 2340 | /* Check to see if any memory value is too large for 32-bit and scale |
| 2136 | * down if needed | 2341 | * down if needed |
| 2137 | */ | 2342 | */ |
| 2138 | if ((s.totalram >> 32) || (s.totalswap >> 32)) { | 2343 | if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { |
| 2139 | int bitcount = 0; | 2344 | int bitcount = 0; |
| 2140 | 2345 | ||
| 2141 | while (s.mem_unit < PAGE_SIZE) { | 2346 | while (s.mem_unit < PAGE_SIZE) { |
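The compat sysinfo change swaps an explicit ">> 32" for upper_32_bits(), which stays well defined even when the operand is only 32 bits wide (where a plain shift by 32 is undefined and provokes compiler warnings). A sketch of the usual form of that macro, offered here only as an assumption about its definition:

```c
#include <stdint.h>

/* Assumed shape of upper_32_bits(): two 16-bit shifts avoid a >>32. */
#define upper_32_bits_sketch(n)	((uint32_t)(((n) >> 16) >> 16))
```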
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 391d4ddb6f4b..5adcb0ae3a58 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -156,6 +156,9 @@ cond_syscall(sys_process_vm_writev); | |||
| 156 | cond_syscall(compat_sys_process_vm_readv); | 156 | cond_syscall(compat_sys_process_vm_readv); |
| 157 | cond_syscall(compat_sys_process_vm_writev); | 157 | cond_syscall(compat_sys_process_vm_writev); |
| 158 | cond_syscall(sys_uselib); | 158 | cond_syscall(sys_uselib); |
| 159 | cond_syscall(sys_fadvise64); | ||
| 160 | cond_syscall(sys_fadvise64_64); | ||
| 161 | cond_syscall(sys_madvise); | ||
| 159 | 162 | ||
| 160 | /* arch-specific weak syscall entries */ | 163 | /* arch-specific weak syscall entries */ |
| 161 | cond_syscall(sys_pciconfig_read); | 164 | cond_syscall(sys_pciconfig_read); |
| @@ -166,6 +169,8 @@ cond_syscall(ppc_rtas); | |||
| 166 | cond_syscall(sys_spu_run); | 169 | cond_syscall(sys_spu_run); |
| 167 | cond_syscall(sys_spu_create); | 170 | cond_syscall(sys_spu_create); |
| 168 | cond_syscall(sys_subpage_prot); | 171 | cond_syscall(sys_subpage_prot); |
| 172 | cond_syscall(sys_s390_pci_mmio_read); | ||
| 173 | cond_syscall(sys_s390_pci_mmio_write); | ||
| 169 | 174 | ||
| 170 | /* mmu depending weak syscall entries */ | 175 | /* mmu depending weak syscall entries */ |
| 171 | cond_syscall(sys_mprotect); | 176 | cond_syscall(sys_mprotect); |
| @@ -218,3 +223,9 @@ cond_syscall(sys_kcmp); | |||
| 218 | 223 | ||
| 219 | /* operate on Secure Computing state */ | 224 | /* operate on Secure Computing state */ |
| 220 | cond_syscall(sys_seccomp); | 225 | cond_syscall(sys_seccomp); |
| 226 | |||
| 227 | /* access BPF programs and maps */ | ||
| 228 | cond_syscall(sys_bpf); | ||
| 229 | |||
| 230 | /* execveat */ | ||
| 231 | cond_syscall(sys_execveat); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75875a741b5e..137c7f69b264 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = { | |||
| 387 | .data = &sysctl_numa_balancing_scan_size, | 387 | .data = &sysctl_numa_balancing_scan_size, |
| 388 | .maxlen = sizeof(unsigned int), | 388 | .maxlen = sizeof(unsigned int), |
| 389 | .mode = 0644, | 389 | .mode = 0644, |
| 390 | .proc_handler = proc_dointvec, | 390 | .proc_handler = proc_dointvec_minmax, |
| 391 | .extra1 = &one, | ||
| 391 | }, | 392 | }, |
| 392 | { | 393 | { |
| 393 | .procname = "numa_balancing", | 394 | .procname = "numa_balancing", |
| @@ -622,6 +623,13 @@ static struct ctl_table kern_table[] = { | |||
| 622 | .mode = 0644, | 623 | .mode = 0644, |
| 623 | .proc_handler = proc_dointvec, | 624 | .proc_handler = proc_dointvec, |
| 624 | }, | 625 | }, |
| 626 | { | ||
| 627 | .procname = "tracepoint_printk", | ||
| 628 | .data = &tracepoint_printk, | ||
| 629 | .maxlen = sizeof(tracepoint_printk), | ||
| 630 | .mode = 0644, | ||
| 631 | .proc_handler = proc_dointvec, | ||
| 632 | }, | ||
| 625 | #endif | 633 | #endif |
| 626 | #ifdef CONFIG_KEXEC | 634 | #ifdef CONFIG_KEXEC |
| 627 | { | 635 | { |
| @@ -1055,15 +1063,6 @@ static struct ctl_table kern_table[] = { | |||
| 1055 | .child = key_sysctls, | 1063 | .child = key_sysctls, |
| 1056 | }, | 1064 | }, |
| 1057 | #endif | 1065 | #endif |
| 1058 | #ifdef CONFIG_RCU_TORTURE_TEST | ||
| 1059 | { | ||
| 1060 | .procname = "rcutorture_runnable", | ||
| 1061 | .data = &rcutorture_runnable, | ||
| 1062 | .maxlen = sizeof(int), | ||
| 1063 | .mode = 0644, | ||
| 1064 | .proc_handler = proc_dointvec, | ||
| 1065 | }, | ||
| 1066 | #endif | ||
| 1067 | #ifdef CONFIG_PERF_EVENTS | 1066 | #ifdef CONFIG_PERF_EVENTS |
| 1068 | /* | 1067 | /* |
| 1069 | * User-space scripts rely on the existence of this file | 1068 | * User-space scripts rely on the existence of this file |
| @@ -1112,6 +1111,15 @@ static struct ctl_table kern_table[] = { | |||
| 1112 | .proc_handler = proc_dointvec, | 1111 | .proc_handler = proc_dointvec, |
| 1113 | }, | 1112 | }, |
| 1114 | #endif | 1113 | #endif |
| 1114 | { | ||
| 1115 | .procname = "panic_on_warn", | ||
| 1116 | .data = &panic_on_warn, | ||
| 1117 | .maxlen = sizeof(int), | ||
| 1118 | .mode = 0644, | ||
| 1119 | .proc_handler = proc_dointvec_minmax, | ||
| 1120 | .extra1 = &zero, | ||
| 1121 | .extra2 = &one, | ||
| 1122 | }, | ||
| 1115 | { } | 1123 | { } |
| 1116 | }; | 1124 | }; |
| 1117 | 1125 | ||
| @@ -1460,13 +1468,6 @@ static struct ctl_table vm_table[] = { | |||
| 1460 | .extra2 = &one, | 1468 | .extra2 = &one, |
| 1461 | }, | 1469 | }, |
| 1462 | #endif | 1470 | #endif |
| 1463 | { | ||
| 1464 | .procname = "scan_unevictable_pages", | ||
| 1465 | .data = &scan_unevictable_pages, | ||
| 1466 | .maxlen = sizeof(scan_unevictable_pages), | ||
| 1467 | .mode = 0644, | ||
| 1468 | .proc_handler = scan_unevictable_handler, | ||
| 1469 | }, | ||
| 1470 | #ifdef CONFIG_MEMORY_FAILURE | 1471 | #ifdef CONFIG_MEMORY_FAILURE |
| 1471 | { | 1472 | { |
| 1472 | .procname = "memory_failure_early_kill", | 1473 | .procname = "memory_failure_early_kill", |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e4ba9a5a5ccb..7e7746a42a62 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
| @@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = { | |||
| 137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, | 137 | { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, |
| 138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, | 138 | { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, |
| 139 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, | 139 | { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, |
| 140 | { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, | ||
| 140 | {} | 141 | {} |
| 141 | }; | 142 | }; |
| 142 | 143 | ||
| @@ -390,7 +391,6 @@ static const struct bin_table bin_net_ipv4_table[] = { | |||
| 390 | { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, | 391 | { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, |
| 391 | { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, | 392 | { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, |
| 392 | { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, | 393 | { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, |
| 393 | { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, | ||
| 394 | { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, | 394 | { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, |
| 395 | { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, | 395 | { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, |
| 396 | { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, | 396 | { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 13d2f7cd65db..670fff88a961 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
| 459 | stats = nla_data(na); | 459 | stats = nla_data(na); |
| 460 | memset(stats, 0, sizeof(*stats)); | 460 | memset(stats, 0, sizeof(*stats)); |
| 461 | 461 | ||
| 462 | rc = cgroupstats_build(stats, f.file->f_dentry); | 462 | rc = cgroupstats_build(stats, f.file->f_path.dentry); |
| 463 | if (rc < 0) { | 463 | if (rc < 0) { |
| 464 | nlmsg_free(rep_skb); | 464 | nlmsg_free(rep_skb); |
| 465 | goto err; | 465 | goto err; |
| @@ -638,7 +638,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
| 638 | fill_tgid_exit(tsk); | 638 | fill_tgid_exit(tsk); |
| 639 | } | 639 | } |
| 640 | 640 | ||
| 641 | listeners = __this_cpu_ptr(&listener_array); | 641 | listeners = raw_cpu_ptr(&listener_array); |
| 642 | if (list_empty(&listeners->list)) | 642 | if (list_empty(&listeners->list)) |
| 643 | return; | 643 | return; |
| 644 | 644 | ||
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 7347426fa68d..f622cf28628a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
| @@ -13,7 +13,7 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | |||
| 13 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o | 13 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o |
| 14 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | 14 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o |
| 15 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | 15 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o |
| 16 | obj-$(CONFIG_TEST_UDELAY) += udelay_test.o | 16 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o |
| 17 | 17 | ||
| 18 | $(obj)/time.o: $(obj)/timeconst.h | 18 | $(obj)/time.o: $(obj)/timeconst.h |
| 19 | 19 | ||
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9c94c19f1305..55449909f114 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, | |||
| 72 | * Also omit the add if it would overflow the u64 boundary. | 72 | * Also omit the add if it would overflow the u64 boundary. |
| 73 | */ | 73 | */ |
| 74 | if ((~0ULL - clc > rnd) && | 74 | if ((~0ULL - clc > rnd) && |
| 75 | (!ismax || evt->mult <= (1U << evt->shift))) | 75 | (!ismax || evt->mult <= (1ULL << evt->shift))) |
| 76 | clc += rnd; | 76 | clc += rnd; |
| 77 | 77 | ||
| 78 | do_div(clc, evt->mult); | 78 | do_div(clc, evt->mult); |
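The cev_delta2ns change widens the shifted constant from `1U` to `1ULL`, presumably so the comparison against `evt->mult` happens in 64 bits and a shift count of 32 cannot push a 32-bit constant out of range. A standalone illustration (not kernel code, values are made up):

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t mult = 4000000000u;	/* hypothetical clockevent mult */
	unsigned int shift = 32;	/* hypothetical shift value */

	/* Defined: 1ULL << 32 == 4294967296, so any u32 mult passes. */
	printf("%d\n", mult <= (1ULL << shift));

	/*
	 * The former 32-bit form, mult <= (1U << shift), would be
	 * undefined here because the shift count equals the width of
	 * unsigned int on common platforms.
	 */
	return 0;
}
```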
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 2e949cc9c9f1..b79f39bda7e1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
| 792 | /* Initialize mult/shift and max_idle_ns */ | 792 | /* Initialize mult/shift and max_idle_ns */ |
| 793 | __clocksource_updatefreq_scale(cs, scale, freq); | 793 | __clocksource_updatefreq_scale(cs, scale, freq); |
| 794 | 794 | ||
| 795 | /* Add clocksource to the clcoksource list */ | 795 | /* Add clocksource to the clocksource list */ |
| 796 | mutex_lock(&clocksource_mutex); | 796 | mutex_lock(&clocksource_mutex); |
| 797 | clocksource_enqueue(cs); | 797 | clocksource_enqueue(cs); |
| 798 | clocksource_enqueue_watchdog(cs); | 798 | clocksource_enqueue_watchdog(cs); |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c2fe7de2842..37e50aadd471 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -558,7 +558,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) | |||
| 558 | static int hrtimer_reprogram(struct hrtimer *timer, | 558 | static int hrtimer_reprogram(struct hrtimer *timer, |
| 559 | struct hrtimer_clock_base *base) | 559 | struct hrtimer_clock_base *base) |
| 560 | { | 560 | { |
| 561 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 561 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); |
| 562 | ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | 562 | ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); |
| 563 | int res; | 563 | int res; |
| 564 | 564 | ||
| @@ -629,7 +629,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | |||
| 629 | */ | 629 | */ |
| 630 | static void retrigger_next_event(void *arg) | 630 | static void retrigger_next_event(void *arg) |
| 631 | { | 631 | { |
| 632 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | 632 | struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); |
| 633 | 633 | ||
| 634 | if (!hrtimer_hres_active()) | 634 | if (!hrtimer_hres_active()) |
| 635 | return; | 635 | return; |
| @@ -903,7 +903,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) | |||
| 903 | */ | 903 | */ |
| 904 | debug_deactivate(timer); | 904 | debug_deactivate(timer); |
| 905 | timer_stats_hrtimer_clear_start_info(timer); | 905 | timer_stats_hrtimer_clear_start_info(timer); |
| 906 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); | 906 | reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); |
| 907 | /* | 907 | /* |
| 908 | * We must preserve the CALLBACK state flag here, | 908 | * We must preserve the CALLBACK state flag here, |
| 909 | * otherwise we could move the timer base in | 909 | * otherwise we could move the timer base in |
| @@ -963,7 +963,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
| 963 | * on dynticks target. | 963 | * on dynticks target. |
| 964 | */ | 964 | */ |
| 965 | wake_up_nohz_cpu(new_base->cpu_base->cpu); | 965 | wake_up_nohz_cpu(new_base->cpu_base->cpu); |
| 966 | } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && | 966 | } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && |
| 967 | hrtimer_reprogram(timer, new_base)) { | 967 | hrtimer_reprogram(timer, new_base)) { |
| 968 | /* | 968 | /* |
| 969 | * Only allow reprogramming if the new base is on this CPU. | 969 | * Only allow reprogramming if the new base is on this CPU. |
| @@ -1103,7 +1103,7 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); | |||
| 1103 | */ | 1103 | */ |
| 1104 | ktime_t hrtimer_get_next_event(void) | 1104 | ktime_t hrtimer_get_next_event(void) |
| 1105 | { | 1105 | { |
| 1106 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1106 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); |
| 1107 | struct hrtimer_clock_base *base = cpu_base->clock_base; | 1107 | struct hrtimer_clock_base *base = cpu_base->clock_base; |
| 1108 | ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; | 1108 | ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; |
| 1109 | unsigned long flags; | 1109 | unsigned long flags; |
| @@ -1144,7 +1144,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
| 1144 | 1144 | ||
| 1145 | memset(timer, 0, sizeof(struct hrtimer)); | 1145 | memset(timer, 0, sizeof(struct hrtimer)); |
| 1146 | 1146 | ||
| 1147 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); | 1147 | cpu_base = raw_cpu_ptr(&hrtimer_bases); |
| 1148 | 1148 | ||
| 1149 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) | 1149 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) |
| 1150 | clock_id = CLOCK_MONOTONIC; | 1150 | clock_id = CLOCK_MONOTONIC; |
| @@ -1187,7 +1187,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
| 1187 | struct hrtimer_cpu_base *cpu_base; | 1187 | struct hrtimer_cpu_base *cpu_base; |
| 1188 | int base = hrtimer_clockid_to_base(which_clock); | 1188 | int base = hrtimer_clockid_to_base(which_clock); |
| 1189 | 1189 | ||
| 1190 | cpu_base = &__raw_get_cpu_var(hrtimer_bases); | 1190 | cpu_base = raw_cpu_ptr(&hrtimer_bases); |
| 1191 | *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); | 1191 | *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); |
| 1192 | 1192 | ||
| 1193 | return 0; | 1193 | return 0; |
| @@ -1242,7 +1242,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) | |||
| 1242 | */ | 1242 | */ |
| 1243 | void hrtimer_interrupt(struct clock_event_device *dev) | 1243 | void hrtimer_interrupt(struct clock_event_device *dev) |
| 1244 | { | 1244 | { |
| 1245 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1245 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); |
| 1246 | ktime_t expires_next, now, entry_time, delta; | 1246 | ktime_t expires_next, now, entry_time, delta; |
| 1247 | int i, retries = 0; | 1247 | int i, retries = 0; |
| 1248 | 1248 | ||
| @@ -1376,7 +1376,7 @@ static void __hrtimer_peek_ahead_timers(void) | |||
| 1376 | if (!hrtimer_hres_active()) | 1376 | if (!hrtimer_hres_active()) |
| 1377 | return; | 1377 | return; |
| 1378 | 1378 | ||
| 1379 | td = &__get_cpu_var(tick_cpu_device); | 1379 | td = this_cpu_ptr(&tick_cpu_device); |
| 1380 | if (td && td->evtdev) | 1380 | if (td && td->evtdev) |
| 1381 | hrtimer_interrupt(td->evtdev); | 1381 | hrtimer_interrupt(td->evtdev); |
| 1382 | } | 1382 | } |
| @@ -1440,7 +1440,7 @@ void hrtimer_run_pending(void) | |||
| 1440 | void hrtimer_run_queues(void) | 1440 | void hrtimer_run_queues(void) |
| 1441 | { | 1441 | { |
| 1442 | struct timerqueue_node *node; | 1442 | struct timerqueue_node *node; |
| 1443 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1443 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); |
| 1444 | struct hrtimer_clock_base *base; | 1444 | struct hrtimer_clock_base *base; |
| 1445 | int index, gettime = 1; | 1445 | int index, gettime = 1; |
| 1446 | 1446 | ||
| @@ -1679,7 +1679,7 @@ static void migrate_hrtimers(int scpu) | |||
| 1679 | 1679 | ||
| 1680 | local_irq_disable(); | 1680 | local_irq_disable(); |
| 1681 | old_base = &per_cpu(hrtimer_bases, scpu); | 1681 | old_base = &per_cpu(hrtimer_bases, scpu); |
| 1682 | new_base = &__get_cpu_var(hrtimer_bases); | 1682 | new_base = this_cpu_ptr(&hrtimer_bases); |
| 1683 | /* | 1683 | /* |
| 1684 | * The caller is globally serialized and nobody else | 1684 | * The caller is globally serialized and nobody else |
| 1685 | * takes two locks at once, deadlock is not possible. | 1685 | * takes two locks at once, deadlock is not possible. |
| @@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, | |||
| 1776 | */ | 1776 | */ |
| 1777 | if (!expires) { | 1777 | if (!expires) { |
| 1778 | schedule(); | 1778 | schedule(); |
| 1779 | __set_current_state(TASK_RUNNING); | ||
| 1780 | return -EINTR; | 1779 | return -EINTR; |
| 1781 | } | 1780 | } |
| 1782 | 1781 | ||
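The hrtimer.c hunks are a mechanical migration from the old `__get_cpu_var()` / `__raw_get_cpu_var()` accessors to `this_cpu_ptr()` / `raw_cpu_ptr()`, which take the address of the per-CPU variable rather than expanding to an lvalue. A sketch of the pattern on a made-up per-CPU variable (only the accessor names are the real kernel API):

```c
#include <linux/percpu.h>

struct demo_base { int nr; };

static DEFINE_PER_CPU(struct demo_base, demo_bases);	/* hypothetical */

static void demo_touch(void)
{
	struct demo_base *b;

	/* Old style (removed):  b = &__get_cpu_var(demo_bases); */
	b = this_cpu_ptr(&demo_bases);
	b->nr++;

	/* Old style (removed):  b = &__raw_get_cpu_var(demo_bases);
	 * raw_cpu_ptr() skips the debug sanity checks that the non-raw
	 * accessors can perform under CONFIG_DEBUG_PREEMPT. */
	b = raw_cpu_ptr(&demo_bases);
	b->nr++;
}
```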
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..a16b67859e2a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
| @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, | |||
| 272 | if (same_thread_group(tsk, current)) | 272 | if (same_thread_group(tsk, current)) |
| 273 | err = cpu_clock_sample(which_clock, tsk, &rtn); | 273 | err = cpu_clock_sample(which_clock, tsk, &rtn); |
| 274 | } else { | 274 | } else { |
| 275 | unsigned long flags; | ||
| 276 | struct sighand_struct *sighand; | ||
| 277 | |||
| 278 | /* | ||
| 279 | * while_each_thread() is not yet entirely RCU safe, | ||
| 280 | * keep locking the group while sampling process | ||
| 281 | * clock for now. | ||
| 282 | */ | ||
| 283 | sighand = lock_task_sighand(tsk, &flags); | ||
| 284 | if (!sighand) | ||
| 285 | return err; | ||
| 286 | |||
| 287 | if (tsk == current || thread_group_leader(tsk)) | 275 | if (tsk == current || thread_group_leader(tsk)) |
| 288 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); | 276 | err = cpu_clock_sample_group(which_clock, tsk, &rtn); |
| 289 | |||
| 290 | unlock_task_sighand(tsk, &flags); | ||
| 291 | } | 277 | } |
| 292 | 278 | ||
| 293 | if (!err) | 279 | if (!err) |
| @@ -567,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
| 567 | *sample = cputime_to_expires(cputime.utime); | 553 | *sample = cputime_to_expires(cputime.utime); |
| 568 | break; | 554 | break; |
| 569 | case CPUCLOCK_SCHED: | 555 | case CPUCLOCK_SCHED: |
| 570 | *sample = cputime.sum_exec_runtime + task_delta_exec(p); | 556 | *sample = cputime.sum_exec_runtime; |
| 571 | break; | 557 | break; |
| 572 | } | 558 | } |
| 573 | return 0; | 559 | return 0; |
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 42b463ad90f2..31ea01f42e1f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c | |||
| @@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
| 636 | goto out; | 636 | goto out; |
| 637 | } | 637 | } |
| 638 | } else { | 638 | } else { |
| 639 | memset(&event.sigev_value, 0, sizeof(event.sigev_value)); | ||
| 639 | event.sigev_notify = SIGEV_SIGNAL; | 640 | event.sigev_notify = SIGEV_SIGNAL; |
| 640 | event.sigev_signo = SIGALRM; | 641 | event.sigev_signo = SIGALRM; |
| 641 | event.sigev_value.sival_int = new_timer->it_id; | 642 | event.sigev_value.sival_int = new_timer->it_id; |
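The timer_create hunk adds a `memset()` of `event.sigev_value` before only `sival_int` is assigned; since `sigev_value` is a union, clearing it first means its unused bytes cannot carry stale stack contents into whatever later copies the value out (for example the queued signal). A standalone illustration of the general issue with partially initialized unions (types and values below are illustrative only):

```c
#include <stdio.h>
#include <string.h>

union demo_sigval {
	int	sival_int;
	void	*sival_ptr;
};

static void dump(const char *label, const union demo_sigval *v)
{
	const unsigned char *p = (const unsigned char *)v;
	size_t i;

	printf("%-14s", label);
	for (i = 0; i < sizeof(*v); i++)
		printf("%02x ", p[i]);
	printf("\n");
}

int main(void)
{
	union demo_sigval v;

	memset(&v, 0xaa, sizeof(v));	/* stand-in for stale stack contents */
	v.sival_int = 42;		/* partial initialization only */
	dump("no memset:", &v);		/* trailing bytes still 0xaa on LP64 */

	memset(&v, 0, sizeof(v));	/* what the patch adds */
	v.sival_int = 42;
	dump("with memset:", &v);
	return 0;
}
```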
diff --git a/kernel/time/udelay_test.c b/kernel/time/test_udelay.c index e622ba365a13..e622ba365a13 100644 --- a/kernel/time/udelay_test.c +++ b/kernel/time/test_udelay.c | |||
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 64c5990fd500..066f0ec05e48 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -554,7 +554,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
| 554 | void tick_check_oneshot_broadcast_this_cpu(void) | 554 | void tick_check_oneshot_broadcast_this_cpu(void) |
| 555 | { | 555 | { |
| 556 | if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { | 556 | if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { |
| 557 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 557 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
| 558 | 558 | ||
| 559 | /* | 559 | /* |
| 560 | * We might be in the middle of switching over from | 560 | * We might be in the middle of switching over from |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 0a0608edeb26..7efeedf53ebd 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
| @@ -224,7 +224,7 @@ static void tick_setup_device(struct tick_device *td, | |||
| 224 | 224 | ||
| 225 | void tick_install_replacement(struct clock_event_device *newdev) | 225 | void tick_install_replacement(struct clock_event_device *newdev) |
| 226 | { | 226 | { |
| 227 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 227 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
| 228 | int cpu = smp_processor_id(); | 228 | int cpu = smp_processor_id(); |
| 229 | 229 | ||
| 230 | clockevents_exchange_device(td->evtdev, newdev); | 230 | clockevents_exchange_device(td->evtdev, newdev); |
| @@ -374,14 +374,14 @@ void tick_shutdown(unsigned int *cpup) | |||
| 374 | 374 | ||
| 375 | void tick_suspend(void) | 375 | void tick_suspend(void) |
| 376 | { | 376 | { |
| 377 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 377 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
| 378 | 378 | ||
| 379 | clockevents_shutdown(td->evtdev); | 379 | clockevents_shutdown(td->evtdev); |
| 380 | } | 380 | } |
| 381 | 381 | ||
| 382 | void tick_resume(void) | 382 | void tick_resume(void) |
| 383 | { | 383 | { |
| 384 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 384 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
| 385 | int broadcast = tick_resume_broadcast(); | 385 | int broadcast = tick_resume_broadcast(); |
| 386 | 386 | ||
| 387 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); | 387 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); |
| @@ -400,4 +400,5 @@ void tick_resume(void) | |||
| 400 | void __init tick_init(void) | 400 | void __init tick_init(void) |
| 401 | { | 401 | { |
| 402 | tick_broadcast_init(); | 402 | tick_broadcast_init(); |
| 403 | tick_nohz_init(); | ||
| 403 | } | 404 | } |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c19c1d84b6f3..366aeb4f2c66 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
| @@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; } | |||
| 99 | static inline bool tick_broadcast_oneshot_available(void) { return false; } | 99 | static inline bool tick_broadcast_oneshot_available(void) { return false; } |
| 100 | #endif /* !TICK_ONESHOT */ | 100 | #endif /* !TICK_ONESHOT */ |
| 101 | 101 | ||
| 102 | /* NO_HZ_FULL internal */ | ||
| 103 | #ifdef CONFIG_NO_HZ_FULL | ||
| 104 | extern void tick_nohz_init(void); | ||
| 105 | # else | ||
| 106 | static inline void tick_nohz_init(void) { } | ||
| 107 | #endif | ||
| 108 | |||
| 102 | /* | 109 | /* |
| 103 | * Broadcasting support | 110 | * Broadcasting support |
| 104 | */ | 111 | */ |
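The new tick-internal.h block uses the usual Kconfig stub idiom, which is what lets the tick-common.c hunk above call tick_nohz_init() from tick_init() without an #ifdef. The same idiom with placeholder names:

```c
/* Standard Kconfig stub pattern; names here are placeholders. */
#ifdef CONFIG_DEMO_FEATURE
extern void demo_feature_init(void);
#else
static inline void demo_feature_init(void) { }	/* compiles away */
#endif
```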
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 824109060a33..7ce740e78e1b 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
| @@ -59,7 +59,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, | |||
| 59 | */ | 59 | */ |
| 60 | int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) | 60 | int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) |
| 61 | { | 61 | { |
| 62 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 62 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
| 63 | struct clock_event_device *dev = td->evtdev; | 63 | struct clock_event_device *dev = td->evtdev; |
| 64 | 64 | ||
| 65 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || | 65 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f654a8a298fa..1363d58f07e9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -205,7 +205,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); | |||
| 205 | */ | 205 | */ |
| 206 | void __tick_nohz_full_check(void) | 206 | void __tick_nohz_full_check(void) |
| 207 | { | 207 | { |
| 208 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 208 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 209 | 209 | ||
| 210 | if (tick_nohz_full_cpu(smp_processor_id())) { | 210 | if (tick_nohz_full_cpu(smp_processor_id())) { |
| 211 | if (ts->tick_stopped && !is_idle_task(current)) { | 211 | if (ts->tick_stopped && !is_idle_task(current)) { |
| @@ -235,7 +235,7 @@ void tick_nohz_full_kick(void) | |||
| 235 | if (!tick_nohz_full_cpu(smp_processor_id())) | 235 | if (!tick_nohz_full_cpu(smp_processor_id())) |
| 236 | return; | 236 | return; |
| 237 | 237 | ||
| 238 | irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); | 238 | irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); |
| 239 | } | 239 | } |
| 240 | 240 | ||
| 241 | /* | 241 | /* |
| @@ -295,22 +295,12 @@ out: | |||
| 295 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ | 295 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ |
| 296 | static int __init tick_nohz_full_setup(char *str) | 296 | static int __init tick_nohz_full_setup(char *str) |
| 297 | { | 297 | { |
| 298 | int cpu; | ||
| 299 | |||
| 300 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); | 298 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); |
| 301 | alloc_bootmem_cpumask_var(&housekeeping_mask); | ||
| 302 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { | 299 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { |
| 303 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); | 300 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); |
| 301 | free_bootmem_cpumask_var(tick_nohz_full_mask); | ||
| 304 | return 1; | 302 | return 1; |
| 305 | } | 303 | } |
| 306 | |||
| 307 | cpu = smp_processor_id(); | ||
| 308 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { | ||
| 309 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | ||
| 310 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); | ||
| 311 | } | ||
| 312 | cpumask_andnot(housekeeping_mask, | ||
| 313 | cpu_possible_mask, tick_nohz_full_mask); | ||
| 314 | tick_nohz_full_running = true; | 304 | tick_nohz_full_running = true; |
| 315 | 305 | ||
| 316 | return 1; | 306 | return 1; |
| @@ -349,18 +339,11 @@ static int tick_nohz_init_all(void) | |||
| 349 | 339 | ||
| 350 | #ifdef CONFIG_NO_HZ_FULL_ALL | 340 | #ifdef CONFIG_NO_HZ_FULL_ALL |
| 351 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { | 341 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { |
| 352 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); | 342 | WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); |
| 353 | return err; | ||
| 354 | } | ||
| 355 | if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { | ||
| 356 | pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n"); | ||
| 357 | return err; | 343 | return err; |
| 358 | } | 344 | } |
| 359 | err = 0; | 345 | err = 0; |
| 360 | cpumask_setall(tick_nohz_full_mask); | 346 | cpumask_setall(tick_nohz_full_mask); |
| 361 | cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); | ||
| 362 | cpumask_clear(housekeeping_mask); | ||
| 363 | cpumask_set_cpu(smp_processor_id(), housekeeping_mask); | ||
| 364 | tick_nohz_full_running = true; | 347 | tick_nohz_full_running = true; |
| 365 | #endif | 348 | #endif |
| 366 | return err; | 349 | return err; |
| @@ -375,6 +358,37 @@ void __init tick_nohz_init(void) | |||
| 375 | return; | 358 | return; |
| 376 | } | 359 | } |
| 377 | 360 | ||
| 361 | if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { | ||
| 362 | WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); | ||
| 363 | cpumask_clear(tick_nohz_full_mask); | ||
| 364 | tick_nohz_full_running = false; | ||
| 365 | return; | ||
| 366 | } | ||
| 367 | |||
| 368 | /* | ||
| 369 | * Full dynticks uses irq work to drive the tick rescheduling on safe | ||
| 370 | * locking contexts. But then we need irq work to raise its own | ||
| 371 | * interrupts to avoid circular dependency on the tick | ||
| 372 | */ | ||
| 373 | if (!arch_irq_work_has_interrupt()) { | ||
| 374 | pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " | ||
| 375 | "support irq work self-IPIs\n"); | ||
| 376 | cpumask_clear(tick_nohz_full_mask); | ||
| 377 | cpumask_copy(housekeeping_mask, cpu_possible_mask); | ||
| 378 | tick_nohz_full_running = false; | ||
| 379 | return; | ||
| 380 | } | ||
| 381 | |||
| 382 | cpu = smp_processor_id(); | ||
| 383 | |||
| 384 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { | ||
| 385 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | ||
| 386 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); | ||
| 387 | } | ||
| 388 | |||
| 389 | cpumask_andnot(housekeeping_mask, | ||
| 390 | cpu_possible_mask, tick_nohz_full_mask); | ||
| 391 | |||
| 378 | for_each_cpu(cpu, tick_nohz_full_mask) | 392 | for_each_cpu(cpu, tick_nohz_full_mask) |
| 379 | context_tracking_cpu_set(cpu); | 393 | context_tracking_cpu_set(cpu); |
| 380 | 394 | ||
| @@ -559,7 +573,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 559 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; | 573 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
| 560 | ktime_t last_update, expires, ret = { .tv64 = 0 }; | 574 | ktime_t last_update, expires, ret = { .tv64 = 0 }; |
| 561 | unsigned long rcu_delta_jiffies; | 575 | unsigned long rcu_delta_jiffies; |
| 562 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 576 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
| 563 | u64 time_delta; | 577 | u64 time_delta; |
| 564 | 578 | ||
| 565 | time_delta = timekeeping_max_deferment(); | 579 | time_delta = timekeeping_max_deferment(); |
| @@ -571,8 +585,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 571 | last_jiffies = jiffies; | 585 | last_jiffies = jiffies; |
| 572 | } while (read_seqretry(&jiffies_lock, seq)); | 586 | } while (read_seqretry(&jiffies_lock, seq)); |
| 573 | 587 | ||
| 574 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || | 588 | if (rcu_needs_cpu(&rcu_delta_jiffies) || |
| 575 | arch_needs_cpu(cpu) || irq_work_needs_cpu()) { | 589 | arch_needs_cpu() || irq_work_needs_cpu()) { |
| 576 | next_jiffies = last_jiffies + 1; | 590 | next_jiffies = last_jiffies + 1; |
| 577 | delta_jiffies = 1; | 591 | delta_jiffies = 1; |
| 578 | } else { | 592 | } else { |
| @@ -827,13 +841,12 @@ void tick_nohz_idle_enter(void) | |||
| 827 | 841 | ||
| 828 | local_irq_disable(); | 842 | local_irq_disable(); |
| 829 | 843 | ||
| 830 | ts = &__get_cpu_var(tick_cpu_sched); | 844 | ts = this_cpu_ptr(&tick_cpu_sched); |
| 831 | ts->inidle = 1; | 845 | ts->inidle = 1; |
| 832 | __tick_nohz_idle_enter(ts); | 846 | __tick_nohz_idle_enter(ts); |
| 833 | 847 | ||
| 834 | local_irq_enable(); | 848 | local_irq_enable(); |
| 835 | } | 849 | } |
| 836 | EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); | ||
| 837 | 850 | ||
| 838 | /** | 851 | /** |
| 839 | * tick_nohz_irq_exit - update next tick event from interrupt exit | 852 | * tick_nohz_irq_exit - update next tick event from interrupt exit |
| @@ -845,7 +858,7 @@ EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); | |||
| 845 | */ | 858 | */ |
| 846 | void tick_nohz_irq_exit(void) | 859 | void tick_nohz_irq_exit(void) |
| 847 | { | 860 | { |
| 848 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 861 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 849 | 862 | ||
| 850 | if (ts->inidle) | 863 | if (ts->inidle) |
| 851 | __tick_nohz_idle_enter(ts); | 864 | __tick_nohz_idle_enter(ts); |
| @@ -860,7 +873,7 @@ void tick_nohz_irq_exit(void) | |||
| 860 | */ | 873 | */ |
| 861 | ktime_t tick_nohz_get_sleep_length(void) | 874 | ktime_t tick_nohz_get_sleep_length(void) |
| 862 | { | 875 | { |
| 863 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 876 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 864 | 877 | ||
| 865 | return ts->sleep_length; | 878 | return ts->sleep_length; |
| 866 | } | 879 | } |
| @@ -938,7 +951,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) | |||
| 938 | */ | 951 | */ |
| 939 | void tick_nohz_idle_exit(void) | 952 | void tick_nohz_idle_exit(void) |
| 940 | { | 953 | { |
| 941 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 954 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 942 | ktime_t now; | 955 | ktime_t now; |
| 943 | 956 | ||
| 944 | local_irq_disable(); | 957 | local_irq_disable(); |
| @@ -960,7 +973,6 @@ void tick_nohz_idle_exit(void) | |||
| 960 | 973 | ||
| 961 | local_irq_enable(); | 974 | local_irq_enable(); |
| 962 | } | 975 | } |
| 963 | EXPORT_SYMBOL_GPL(tick_nohz_idle_exit); | ||
| 964 | 976 | ||
| 965 | static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) | 977 | static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) |
| 966 | { | 978 | { |
| @@ -973,7 +985,7 @@ static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) | |||
| 973 | */ | 985 | */ |
| 974 | static void tick_nohz_handler(struct clock_event_device *dev) | 986 | static void tick_nohz_handler(struct clock_event_device *dev) |
| 975 | { | 987 | { |
| 976 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 988 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 977 | struct pt_regs *regs = get_irq_regs(); | 989 | struct pt_regs *regs = get_irq_regs(); |
| 978 | ktime_t now = ktime_get(); | 990 | ktime_t now = ktime_get(); |
| 979 | 991 | ||
| @@ -982,6 +994,10 @@ static void tick_nohz_handler(struct clock_event_device *dev) | |||
| 982 | tick_sched_do_timer(now); | 994 | tick_sched_do_timer(now); |
| 983 | tick_sched_handle(ts, regs); | 995 | tick_sched_handle(ts, regs); |
| 984 | 996 | ||
| 997 | /* No need to reprogram if we are running tickless */ | ||
| 998 | if (unlikely(ts->tick_stopped)) | ||
| 999 | return; | ||
| 1000 | |||
| 985 | while (tick_nohz_reprogram(ts, now)) { | 1001 | while (tick_nohz_reprogram(ts, now)) { |
| 986 | now = ktime_get(); | 1002 | now = ktime_get(); |
| 987 | tick_do_update_jiffies64(now); | 1003 | tick_do_update_jiffies64(now); |
| @@ -993,7 +1009,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) | |||
| 993 | */ | 1009 | */ |
| 994 | static void tick_nohz_switch_to_nohz(void) | 1010 | static void tick_nohz_switch_to_nohz(void) |
| 995 | { | 1011 | { |
| 996 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 1012 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 997 | ktime_t next; | 1013 | ktime_t next; |
| 998 | 1014 | ||
| 999 | if (!tick_nohz_enabled) | 1015 | if (!tick_nohz_enabled) |
| @@ -1055,7 +1071,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) | |||
| 1055 | 1071 | ||
| 1056 | static inline void tick_nohz_irq_enter(void) | 1072 | static inline void tick_nohz_irq_enter(void) |
| 1057 | { | 1073 | { |
| 1058 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 1074 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 1059 | ktime_t now; | 1075 | ktime_t now; |
| 1060 | 1076 | ||
| 1061 | if (!ts->idle_active && !ts->tick_stopped) | 1077 | if (!ts->idle_active && !ts->tick_stopped) |
| @@ -1109,6 +1125,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
| 1109 | if (regs) | 1125 | if (regs) |
| 1110 | tick_sched_handle(ts, regs); | 1126 | tick_sched_handle(ts, regs); |
| 1111 | 1127 | ||
| 1128 | /* No need to reprogram if we are in idle or full dynticks mode */ | ||
| 1129 | if (unlikely(ts->tick_stopped)) | ||
| 1130 | return HRTIMER_NORESTART; | ||
| 1131 | |||
| 1112 | hrtimer_forward(timer, now, tick_period); | 1132 | hrtimer_forward(timer, now, tick_period); |
| 1113 | 1133 | ||
| 1114 | return HRTIMER_RESTART; | 1134 | return HRTIMER_RESTART; |
| @@ -1129,7 +1149,7 @@ early_param("skew_tick", skew_tick); | |||
| 1129 | */ | 1149 | */ |
| 1130 | void tick_setup_sched_timer(void) | 1150 | void tick_setup_sched_timer(void) |
| 1131 | { | 1151 | { |
| 1132 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 1152 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 1133 | ktime_t now = ktime_get(); | 1153 | ktime_t now = ktime_get(); |
| 1134 | 1154 | ||
| 1135 | /* | 1155 | /* |
| @@ -1198,7 +1218,7 @@ void tick_clock_notify(void) | |||
| 1198 | */ | 1218 | */ |
| 1199 | void tick_oneshot_notify(void) | 1219 | void tick_oneshot_notify(void) |
| 1200 | { | 1220 | { |
| 1201 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 1221 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 1202 | 1222 | ||
| 1203 | set_bit(0, &ts->check_clocks); | 1223 | set_bit(0, &ts->check_clocks); |
| 1204 | } | 1224 | } |
| @@ -1213,7 +1233,7 @@ void tick_oneshot_notify(void) | |||
| 1213 | */ | 1233 | */ |
| 1214 | int tick_check_oneshot_change(int allow_nohz) | 1234 | int tick_check_oneshot_change(int allow_nohz) |
| 1215 | { | 1235 | { |
| 1216 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 1236 | struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); |
| 1217 | 1237 | ||
| 1218 | if (!test_and_clear_bit(0, &ts->check_clocks)) | 1238 | if (!test_and_clear_bit(0, &ts->check_clocks)) |
| 1219 | return 0; | 1239 | return 0; |
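With these hunks, tick_nohz_init() takes over the housekeeping-mask setup that used to sit in the early __setup handler: allocate the mask, back out of full dynticks if irq-work self-IPIs are unavailable, keep the boot CPU out of the nohz_full set for timekeeping, and derive housekeeping as the complement. A rough sketch of that cpumask bookkeeping (parameter names are illustrative; this is an out-of-context fragment, not a drop-in replacement for the function):

```c
#include <linux/cpumask.h>
#include <linux/smp.h>

static void demo_build_housekeeping(struct cpumask *nohz_full,
				    struct cpumask *housekeeping)
{
	int cpu = smp_processor_id();

	/* The CPU doing timekeeping must keep its tick. */
	if (cpumask_test_cpu(cpu, nohz_full))
		cpumask_clear_cpu(cpu, nohz_full);

	/* Housekeeping = every possible CPU that is not full-dynticks. */
	cpumask_andnot(housekeeping, cpu_possible_mask, nohz_full);
}
```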
diff --git a/kernel/time/time.c b/kernel/time/time.c index a9ae20fb0b11..6390517e77d4 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
| @@ -304,7 +304,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) | |||
| 304 | } | 304 | } |
| 305 | EXPORT_SYMBOL(timespec_trunc); | 305 | EXPORT_SYMBOL(timespec_trunc); |
| 306 | 306 | ||
| 307 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. | 307 | /* |
| 308 | * mktime64 - Converts date to seconds. | ||
| 309 | * Converts Gregorian date to seconds since 1970-01-01 00:00:00. | ||
| 308 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | 310 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
| 309 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. | 311 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. |
| 310 | * | 312 | * |
| @@ -314,15 +316,10 @@ EXPORT_SYMBOL(timespec_trunc); | |||
| 314 | * -year/100+year/400 terms, and add 10.] | 316 | * -year/100+year/400 terms, and add 10.] |
| 315 | * | 317 | * |
| 316 | * This algorithm was first published by Gauss (I think). | 318 | * This algorithm was first published by Gauss (I think). |
| 317 | * | ||
| 318 | * WARNING: this function will overflow on 2106-02-07 06:28:16 on | ||
| 319 | * machines where long is 32-bit! (However, as time_t is signed, we | ||
| 320 | * will already get problems at other places on 2038-01-19 03:14:08) | ||
| 321 | */ | 319 | */ |
| 322 | unsigned long | 320 | time64_t mktime64(const unsigned int year0, const unsigned int mon0, |
| 323 | mktime(const unsigned int year0, const unsigned int mon0, | 321 | const unsigned int day, const unsigned int hour, |
| 324 | const unsigned int day, const unsigned int hour, | 322 | const unsigned int min, const unsigned int sec) |
| 325 | const unsigned int min, const unsigned int sec) | ||
| 326 | { | 323 | { |
| 327 | unsigned int mon = mon0, year = year0; | 324 | unsigned int mon = mon0, year = year0; |
| 328 | 325 | ||
| @@ -332,15 +329,14 @@ mktime(const unsigned int year0, const unsigned int mon0, | |||
| 332 | year -= 1; | 329 | year -= 1; |
| 333 | } | 330 | } |
| 334 | 331 | ||
| 335 | return ((((unsigned long) | 332 | return ((((time64_t) |
| 336 | (year/4 - year/100 + year/400 + 367*mon/12 + day) + | 333 | (year/4 - year/100 + year/400 + 367*mon/12 + day) + |
| 337 | year*365 - 719499 | 334 | year*365 - 719499 |
| 338 | )*24 + hour /* now have hours */ | 335 | )*24 + hour /* now have hours */ |
| 339 | )*60 + min /* now have minutes */ | 336 | )*60 + min /* now have minutes */ |
| 340 | )*60 + sec; /* finally seconds */ | 337 | )*60 + sec; /* finally seconds */ |
| 341 | } | 338 | } |
| 342 | 339 | EXPORT_SYMBOL(mktime64); | |
| 343 | EXPORT_SYMBOL(mktime); | ||
| 344 | 340 | ||
| 345 | /** | 341 | /** |
| 346 | * set_normalized_timespec - set timespec sec and nsec parts and normalize | 342 | * set_normalized_timespec - set timespec sec and nsec parts and normalize |
| @@ -745,6 +741,7 @@ u64 nsecs_to_jiffies64(u64 n) | |||
| 745 | return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); | 741 | return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); |
| 746 | #endif | 742 | #endif |
| 747 | } | 743 | } |
| 744 | EXPORT_SYMBOL(nsecs_to_jiffies64); | ||
| 748 | 745 | ||
| 749 | /** | 746 | /** |
| 750 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies | 747 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies |
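mktime64() keeps the Gauss-style day-count arithmetic but returns time64_t, so the result no longer wraps in 2106 on 32-bit machines (hence the old warning comment is dropped along with the rename). A standalone re-implementation of the same arithmetic for illustration, with a quick self-check against a known epoch value; the month/year shuffle mirrors the kernel's trick of counting January and February as months 11 and 12 of the previous year:

```c
#include <stdint.h>
#include <stdio.h>

static int64_t demo_mktime64(unsigned int year, unsigned int mon,
			     unsigned int day, unsigned int hour,
			     unsigned int min, unsigned int sec)
{
	/* Treat Jan and Feb as months 11 and 12 of the previous year,
	 * so the leap day sits at the end of the counting year. */
	if (0 >= (int)(mon -= 2)) {
		mon += 12;
		year -= 1;
	}

	return ((((int64_t)
		  (year/4 - year/100 + year/400 + 367*mon/12 + day) +
		  year*365 - 719499
		 ) * 24 + hour		/* now have hours */
		) * 60 + min		/* now have minutes */
	       ) * 60 + sec;		/* finally seconds */
}

int main(void)
{
	/* 2000-03-01 00:00:00 UTC is 951868800 seconds after the epoch. */
	printf("%lld\n", (long long)demo_mktime64(2000, 3, 1, 0, 0, 0));
	return 0;
}
```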
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ec1791fae965..6a931852082f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -417,7 +417,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); | |||
| 417 | */ | 417 | */ |
| 418 | static inline void tk_update_ktime_data(struct timekeeper *tk) | 418 | static inline void tk_update_ktime_data(struct timekeeper *tk) |
| 419 | { | 419 | { |
| 420 | s64 nsec; | 420 | u64 seconds; |
| 421 | u32 nsec; | ||
| 421 | 422 | ||
| 422 | /* | 423 | /* |
| 423 | * The xtime based monotonic readout is: | 424 | * The xtime based monotonic readout is: |
| @@ -426,13 +427,22 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) | |||
| 426 | * nsec = base_mono + now(); | 427 | * nsec = base_mono + now(); |
| 427 | * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec | 428 | * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec |
| 428 | */ | 429 | */ |
| 429 | nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); | 430 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); |
| 430 | nsec *= NSEC_PER_SEC; | 431 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; |
| 431 | nsec += tk->wall_to_monotonic.tv_nsec; | 432 | tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); |
| 432 | tk->tkr.base_mono = ns_to_ktime(nsec); | ||
| 433 | 433 | ||
| 434 | /* Update the monotonic raw base */ | 434 | /* Update the monotonic raw base */ |
| 435 | tk->base_raw = timespec64_to_ktime(tk->raw_time); | 435 | tk->base_raw = timespec64_to_ktime(tk->raw_time); |
| 436 | |||
| 437 | /* | ||
| 438 | * The sum of the nanoseconds portions of xtime and | ||
| 439 | * wall_to_monotonic can be greater/equal one second. Take | ||
| 440 | * this into account before updating tk->ktime_sec. | ||
| 441 | */ | ||
| 442 | nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); | ||
| 443 | if (nsec >= NSEC_PER_SEC) | ||
| 444 | seconds++; | ||
| 445 | tk->ktime_sec = seconds; | ||
| 436 | } | 446 | } |
| 437 | 447 | ||
| 438 | /* must hold timekeeper_lock */ | 448 | /* must hold timekeeper_lock */ |
| @@ -519,9 +529,9 @@ EXPORT_SYMBOL(__getnstimeofday64); | |||
| 519 | 529 | ||
| 520 | /** | 530 | /** |
| 521 | * getnstimeofday64 - Returns the time of day in a timespec64. | 531 | * getnstimeofday64 - Returns the time of day in a timespec64. |
| 522 | * @ts: pointer to the timespec to be set | 532 | * @ts: pointer to the timespec64 to be set |
| 523 | * | 533 | * |
| 524 | * Returns the time of day in a timespec (WARN if suspended). | 534 | * Returns the time of day in a timespec64 (WARN if suspended). |
| 525 | */ | 535 | */ |
| 526 | void getnstimeofday64(struct timespec64 *ts) | 536 | void getnstimeofday64(struct timespec64 *ts) |
| 527 | { | 537 | { |
| @@ -623,7 +633,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw); | |||
| 623 | * | 633 | * |
| 624 | * The function calculates the monotonic clock from the realtime | 634 | * The function calculates the monotonic clock from the realtime |
| 625 | * clock and the wall_to_monotonic offset and stores the result | 635 | * clock and the wall_to_monotonic offset and stores the result |
| 626 | * in normalized timespec format in the variable pointed to by @ts. | 636 | * in normalized timespec64 format in the variable pointed to by @ts. |
| 627 | */ | 637 | */ |
| 628 | void ktime_get_ts64(struct timespec64 *ts) | 638 | void ktime_get_ts64(struct timespec64 *ts) |
| 629 | { | 639 | { |
| @@ -648,6 +658,54 @@ void ktime_get_ts64(struct timespec64 *ts) | |||
| 648 | } | 658 | } |
| 649 | EXPORT_SYMBOL_GPL(ktime_get_ts64); | 659 | EXPORT_SYMBOL_GPL(ktime_get_ts64); |
| 650 | 660 | ||
| 661 | /** | ||
| 662 | * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC | ||
| 663 | * | ||
| 664 | * Returns the seconds portion of CLOCK_MONOTONIC with a single non | ||
| 665 | * serialized read. tk->ktime_sec is of type 'unsigned long' so this | ||
| 666 | * works on both 32 and 64 bit systems. On 32 bit systems the readout | ||
| 667 | * covers ~136 years of uptime which should be enough to prevent | ||
| 668 | * premature wrap arounds. | ||
| 669 | */ | ||
| 670 | time64_t ktime_get_seconds(void) | ||
| 671 | { | ||
| 672 | struct timekeeper *tk = &tk_core.timekeeper; | ||
| 673 | |||
| 674 | WARN_ON(timekeeping_suspended); | ||
| 675 | return tk->ktime_sec; | ||
| 676 | } | ||
| 677 | EXPORT_SYMBOL_GPL(ktime_get_seconds); | ||
| 678 | |||
| 679 | /** | ||
| 680 | * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME | ||
| 681 | * | ||
| 682 | * Returns the wall clock seconds since 1970. This replaces the | ||
| 683 | * get_seconds() interface which is not y2038 safe on 32bit systems. | ||
| 684 | * | ||
| 685 | * For 64bit systems the fast access to tk->xtime_sec is preserved. On | ||
| 686 | * 32bit systems the access must be protected with the sequence | ||
| 687 | * counter to provide "atomic" access to the 64bit tk->xtime_sec | ||
| 688 | * value. | ||
| 689 | */ | ||
| 690 | time64_t ktime_get_real_seconds(void) | ||
| 691 | { | ||
| 692 | struct timekeeper *tk = &tk_core.timekeeper; | ||
| 693 | time64_t seconds; | ||
| 694 | unsigned int seq; | ||
| 695 | |||
| 696 | if (IS_ENABLED(CONFIG_64BIT)) | ||
| 697 | return tk->xtime_sec; | ||
| 698 | |||
| 699 | do { | ||
| 700 | seq = read_seqcount_begin(&tk_core.seq); | ||
| 701 | seconds = tk->xtime_sec; | ||
| 702 | |||
| 703 | } while (read_seqcount_retry(&tk_core.seq, seq)); | ||
| 704 | |||
| 705 | return seconds; | ||
| 706 | } | ||
| 707 | EXPORT_SYMBOL_GPL(ktime_get_real_seconds); | ||
| 708 | |||
| 651 | #ifdef CONFIG_NTP_PPS | 709 | #ifdef CONFIG_NTP_PPS |
| 652 | 710 | ||
| 653 | /** | 711 | /** |
| @@ -703,18 +761,18 @@ void do_gettimeofday(struct timeval *tv) | |||
| 703 | EXPORT_SYMBOL(do_gettimeofday); | 761 | EXPORT_SYMBOL(do_gettimeofday); |
| 704 | 762 | ||
| 705 | /** | 763 | /** |
| 706 | * do_settimeofday - Sets the time of day | 764 | * do_settimeofday64 - Sets the time of day. |
| 707 | * @tv: pointer to the timespec variable containing the new time | 765 | * @ts: pointer to the timespec64 variable containing the new time |
| 708 | * | 766 | * |
| 709 | * Sets the time of day to the new time and update NTP and notify hrtimers | 767 | * Sets the time of day to the new time and update NTP and notify hrtimers |
| 710 | */ | 768 | */ |
| 711 | int do_settimeofday(const struct timespec *tv) | 769 | int do_settimeofday64(const struct timespec64 *ts) |
| 712 | { | 770 | { |
| 713 | struct timekeeper *tk = &tk_core.timekeeper; | 771 | struct timekeeper *tk = &tk_core.timekeeper; |
| 714 | struct timespec64 ts_delta, xt, tmp; | 772 | struct timespec64 ts_delta, xt; |
| 715 | unsigned long flags; | 773 | unsigned long flags; |
| 716 | 774 | ||
| 717 | if (!timespec_valid_strict(tv)) | 775 | if (!timespec64_valid_strict(ts)) |
| 718 | return -EINVAL; | 776 | return -EINVAL; |
| 719 | 777 | ||
| 720 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 778 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
| @@ -723,13 +781,12 @@ int do_settimeofday(const struct timespec *tv) | |||
| 723 | timekeeping_forward_now(tk); | 781 | timekeeping_forward_now(tk); |
| 724 | 782 | ||
| 725 | xt = tk_xtime(tk); | 783 | xt = tk_xtime(tk); |
| 726 | ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; | 784 | ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; |
| 727 | ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; | 785 | ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; |
| 728 | 786 | ||
| 729 | tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); | 787 | tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); |
| 730 | 788 | ||
| 731 | tmp = timespec_to_timespec64(*tv); | 789 | tk_set_xtime(tk, ts); |
| 732 | tk_set_xtime(tk, &tmp); | ||
| 733 | 790 | ||
| 734 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); | 791 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
| 735 | 792 | ||
| @@ -741,7 +798,7 @@ int do_settimeofday(const struct timespec *tv) | |||
| 741 | 798 | ||
| 742 | return 0; | 799 | return 0; |
| 743 | } | 800 | } |
| 744 | EXPORT_SYMBOL(do_settimeofday); | 801 | EXPORT_SYMBOL(do_settimeofday64); |
| 745 | 802 | ||
| 746 | /** | 803 | /** |
| 747 | * timekeeping_inject_offset - Adds or subtracts from the current time. | 804 | * timekeeping_inject_offset - Adds or subtracts from the current time. |
| @@ -895,12 +952,12 @@ int timekeeping_notify(struct clocksource *clock) | |||
| 895 | } | 952 | } |
| 896 | 953 | ||
| 897 | /** | 954 | /** |
| 898 | * getrawmonotonic - Returns the raw monotonic time in a timespec | 955 | * getrawmonotonic64 - Returns the raw monotonic time in a timespec |
| 899 | * @ts: pointer to the timespec to be set | 956 | * @ts: pointer to the timespec64 to be set |
| 900 | * | 957 | * |
| 901 | * Returns the raw monotonic time (completely un-modified by ntp) | 958 | * Returns the raw monotonic time (completely un-modified by ntp) |
| 902 | */ | 959 | */ |
| 903 | void getrawmonotonic(struct timespec *ts) | 960 | void getrawmonotonic64(struct timespec64 *ts) |
| 904 | { | 961 | { |
| 905 | struct timekeeper *tk = &tk_core.timekeeper; | 962 | struct timekeeper *tk = &tk_core.timekeeper; |
| 906 | struct timespec64 ts64; | 963 | struct timespec64 ts64; |
| @@ -915,9 +972,10 @@ void getrawmonotonic(struct timespec *ts) | |||
| 915 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 972 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
| 916 | 973 | ||
| 917 | timespec64_add_ns(&ts64, nsecs); | 974 | timespec64_add_ns(&ts64, nsecs); |
| 918 | *ts = timespec64_to_timespec(ts64); | 975 | *ts = ts64; |
| 919 | } | 976 | } |
| 920 | EXPORT_SYMBOL(getrawmonotonic); | 977 | EXPORT_SYMBOL(getrawmonotonic64); |
| 978 | |||
| 921 | 979 | ||
| 922 | /** | 980 | /** |
| 923 | * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres | 981 | * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres |
| @@ -1068,8 +1126,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
| 1068 | } | 1126 | } |
| 1069 | 1127 | ||
| 1070 | /** | 1128 | /** |
| 1071 | * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values | 1129 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values |
| 1072 | * @delta: pointer to a timespec delta value | 1130 | * @delta: pointer to a timespec64 delta value |
| 1073 | * | 1131 | * |
| 1074 | * This hook is for architectures that cannot support read_persistent_clock | 1132 | * This hook is for architectures that cannot support read_persistent_clock |
| 1075 | * because their RTC/persistent clock is only accessible when irqs are enabled. | 1133 | * because their RTC/persistent clock is only accessible when irqs are enabled. |
| @@ -1077,10 +1135,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
| 1077 | * This function should only be called by rtc_resume(), and allows | 1135 | * This function should only be called by rtc_resume(), and allows |
| 1078 | * a suspend offset to be injected into the timekeeping values. | 1136 | * a suspend offset to be injected into the timekeeping values. |
| 1079 | */ | 1137 | */ |
| 1080 | void timekeeping_inject_sleeptime(struct timespec *delta) | 1138 | void timekeeping_inject_sleeptime64(struct timespec64 *delta) |
| 1081 | { | 1139 | { |
| 1082 | struct timekeeper *tk = &tk_core.timekeeper; | 1140 | struct timekeeper *tk = &tk_core.timekeeper; |
| 1083 | struct timespec64 tmp; | ||
| 1084 | unsigned long flags; | 1141 | unsigned long flags; |
| 1085 | 1142 | ||
| 1086 | /* | 1143 | /* |
| @@ -1095,8 +1152,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
| 1095 | 1152 | ||
| 1096 | timekeeping_forward_now(tk); | 1153 | timekeeping_forward_now(tk); |
| 1097 | 1154 | ||
| 1098 | tmp = timespec_to_timespec64(*delta); | 1155 | __timekeeping_inject_sleeptime(tk, delta); |
| 1099 | __timekeeping_inject_sleeptime(tk, &tmp); | ||
| 1100 | 1156 | ||
| 1101 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); | 1157 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
| 1102 | 1158 | ||
| @@ -1332,6 +1388,12 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, | |||
| 1332 | * | 1388 | * |
| 1333 | * XXX - TODO: Doc ntp_error calculation. | 1389 | * XXX - TODO: Doc ntp_error calculation. |
| 1334 | */ | 1390 | */ |
| 1391 | if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { | ||
| 1392 | /* NTP adjustment caused clocksource mult overflow */ | ||
| 1393 | WARN_ON_ONCE(1); | ||
| 1394 | return; | ||
| 1395 | } | ||
| 1396 | |||
| 1335 | tk->tkr.mult += mult_adj; | 1397 | tk->tkr.mult += mult_adj; |
| 1336 | tk->xtime_interval += interval; | 1398 | tk->xtime_interval += interval; |
| 1337 | tk->tkr.xtime_nsec -= offset; | 1399 | tk->tkr.xtime_nsec -= offset; |
| @@ -1397,7 +1459,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
| 1397 | } | 1459 | } |
| 1398 | 1460 | ||
| 1399 | if (unlikely(tk->tkr.clock->maxadj && | 1461 | if (unlikely(tk->tkr.clock->maxadj && |
| 1400 | (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { | 1462 | (abs(tk->tkr.mult - tk->tkr.clock->mult) |
| 1463 | > tk->tkr.clock->maxadj))) { | ||
| 1401 | printk_once(KERN_WARNING | 1464 | printk_once(KERN_WARNING |
| 1402 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 1465 | "Adjusting %s more than 11%% (%ld vs %ld)\n", |
| 1403 | tk->tkr.clock->name, (long)tk->tkr.mult, | 1466 | tk->tkr.clock->name, (long)tk->tkr.mult, |
| @@ -1646,7 +1709,7 @@ struct timespec current_kernel_time(void) | |||
| 1646 | } | 1709 | } |
| 1647 | EXPORT_SYMBOL(current_kernel_time); | 1710 | EXPORT_SYMBOL(current_kernel_time); |
| 1648 | 1711 | ||
| 1649 | struct timespec get_monotonic_coarse(void) | 1712 | struct timespec64 get_monotonic_coarse64(void) |
| 1650 | { | 1713 | { |
| 1651 | struct timekeeper *tk = &tk_core.timekeeper; | 1714 | struct timekeeper *tk = &tk_core.timekeeper; |
| 1652 | struct timespec64 now, mono; | 1715 | struct timespec64 now, mono; |
| @@ -1662,7 +1725,7 @@ struct timespec get_monotonic_coarse(void) | |||
| 1662 | set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, | 1725 | set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, |
| 1663 | now.tv_nsec + mono.tv_nsec); | 1726 | now.tv_nsec + mono.tv_nsec); |
| 1664 | 1727 | ||
| 1665 | return timespec64_to_timespec(now); | 1728 | return now; |
| 1666 | } | 1729 | } |
| 1667 | 1730 | ||
| 1668 | /* | 1731 | /* |
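Two defensive checks show up in the adjustment path above: timekeeping_apply_adjustment() now refuses a positive mult_adj that would wrap the u32 multiplier, using the usual `a + b < b` unsigned-overflow test, and the 11% warning compares the absolute deviation from the clocksource's base mult rather than only the upper side. A standalone illustration of the overflow test with made-up values:

```c
#include <stdint.h>
#include <stdio.h>

static int would_overflow(uint32_t mult, int32_t mult_adj)
{
	/* For a positive adjustment, u32 addition wraps exactly when
	 * the sum ends up smaller than the adjustment itself. */
	return mult_adj > 0 &&
	       (uint32_t)(mult + mult_adj) < (uint32_t)mult_adj;
}

int main(void)
{
	printf("%d\n", would_overflow(0xfffffff0u, 0x20));	/* 1: would wrap */
	printf("%d\n", would_overflow(0x01000000u, 0x20));	/* 0: safe */
	return 0;
}
```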
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index aca5dfe2fa3d..2d3f5c504939 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
| @@ -655,7 +655,7 @@ static inline void debug_assert_init(struct timer_list *timer) | |||
| 655 | static void do_init_timer(struct timer_list *timer, unsigned int flags, | 655 | static void do_init_timer(struct timer_list *timer, unsigned int flags, |
| 656 | const char *name, struct lock_class_key *key) | 656 | const char *name, struct lock_class_key *key) |
| 657 | { | 657 | { |
| 658 | struct tvec_base *base = __raw_get_cpu_var(tvec_bases); | 658 | struct tvec_base *base = raw_cpu_read(tvec_bases); |
| 659 | 659 | ||
| 660 | timer->entry.next = NULL; | 660 | timer->entry.next = NULL; |
| 661 | timer->base = (void *)((unsigned long)base | flags); | 661 | timer->base = (void *)((unsigned long)base | flags); |
| @@ -1377,15 +1377,14 @@ unsigned long get_next_timer_interrupt(unsigned long now) | |||
| 1377 | void update_process_times(int user_tick) | 1377 | void update_process_times(int user_tick) |
| 1378 | { | 1378 | { |
| 1379 | struct task_struct *p = current; | 1379 | struct task_struct *p = current; |
| 1380 | int cpu = smp_processor_id(); | ||
| 1381 | 1380 | ||
| 1382 | /* Note: this timer irq context must be accounted for as well. */ | 1381 | /* Note: this timer irq context must be accounted for as well. */ |
| 1383 | account_process_tick(p, user_tick); | 1382 | account_process_tick(p, user_tick); |
| 1384 | run_local_timers(); | 1383 | run_local_timers(); |
| 1385 | rcu_check_callbacks(cpu, user_tick); | 1384 | rcu_check_callbacks(user_tick); |
| 1386 | #ifdef CONFIG_IRQ_WORK | 1385 | #ifdef CONFIG_IRQ_WORK |
| 1387 | if (in_irq()) | 1386 | if (in_irq()) |
| 1388 | irq_work_run(); | 1387 | irq_work_tick(); |
| 1389 | #endif | 1388 | #endif |
| 1390 | scheduler_tick(); | 1389 | scheduler_tick(); |
| 1391 | run_posix_cpu_timers(p); | 1390 | run_posix_cpu_timers(p); |
diff --git a/kernel/torture.c b/kernel/torture.c index d600af21f022..dd70993c266c 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
| @@ -211,18 +211,16 @@ EXPORT_SYMBOL_GPL(torture_onoff_cleanup); | |||
| 211 | /* | 211 | /* |
| 212 | * Print online/offline testing statistics. | 212 | * Print online/offline testing statistics. |
| 213 | */ | 213 | */ |
| 214 | char *torture_onoff_stats(char *page) | 214 | void torture_onoff_stats(void) |
| 215 | { | 215 | { |
| 216 | #ifdef CONFIG_HOTPLUG_CPU | 216 | #ifdef CONFIG_HOTPLUG_CPU |
| 217 | page += sprintf(page, | 217 | pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", |
| 218 | "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", | 218 | n_online_successes, n_online_attempts, |
| 219 | n_online_successes, n_online_attempts, | 219 | n_offline_successes, n_offline_attempts, |
| 220 | n_offline_successes, n_offline_attempts, | 220 | min_online, max_online, |
| 221 | min_online, max_online, | 221 | min_offline, max_offline, |
| 222 | min_offline, max_offline, | 222 | sum_online, sum_offline, HZ); |
| 223 | sum_online, sum_offline, HZ); | ||
| 224 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 223 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 225 | return page; | ||
| 226 | } | 224 | } |
| 227 | EXPORT_SYMBOL_GPL(torture_onoff_stats); | 225 | EXPORT_SYMBOL_GPL(torture_onoff_stats); |
| 228 | 226 | ||
| @@ -635,8 +633,13 @@ EXPORT_SYMBOL_GPL(torture_init_end); | |||
| 635 | * | 633 | * |
| 636 | * This must be called before the caller starts shutting down its own | 634 | * This must be called before the caller starts shutting down its own |
| 637 | * kthreads. | 635 | * kthreads. |
| 636 | * | ||
| 637 | * Both torture_cleanup_begin() and torture_cleanup_end() must be paired, | ||
| 638 | * in order to correctly perform the cleanup. They are separated because | ||
| 639 | * threads can still need to reference the torture_type type, thus nullify | ||
| 640 | * only after completing all other relevant calls. | ||
| 638 | */ | 641 | */ |
| 639 | bool torture_cleanup(void) | 642 | bool torture_cleanup_begin(void) |
| 640 | { | 643 | { |
| 641 | mutex_lock(&fullstop_mutex); | 644 | mutex_lock(&fullstop_mutex); |
| 642 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { | 645 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { |
| @@ -651,12 +654,17 @@ bool torture_cleanup(void) | |||
| 651 | torture_shuffle_cleanup(); | 654 | torture_shuffle_cleanup(); |
| 652 | torture_stutter_cleanup(); | 655 | torture_stutter_cleanup(); |
| 653 | torture_onoff_cleanup(); | 656 | torture_onoff_cleanup(); |
| 657 | return false; | ||
| 658 | } | ||
| 659 | EXPORT_SYMBOL_GPL(torture_cleanup_begin); | ||
| 660 | |||
| 661 | void torture_cleanup_end(void) | ||
| 662 | { | ||
| 654 | mutex_lock(&fullstop_mutex); | 663 | mutex_lock(&fullstop_mutex); |
| 655 | torture_type = NULL; | 664 | torture_type = NULL; |
| 656 | mutex_unlock(&fullstop_mutex); | 665 | mutex_unlock(&fullstop_mutex); |
| 657 | return false; | ||
| 658 | } | 666 | } |
| 659 | EXPORT_SYMBOL_GPL(torture_cleanup); | 667 | EXPORT_SYMBOL_GPL(torture_cleanup_end); |
| 660 | 668 | ||
| 661 | /* | 669 | /* |
| 662 | * Is it time for the current torture test to stop? | 670 | * Is it time for the current torture test to stop? |
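Splitting torture_cleanup() in two lets a torture-test module stop its own kthreads while torture_type is still valid, as the added kerneldoc comment says. A hedged sketch of the expected calling pattern; `demo_stop_kthreads()` is a placeholder for the caller's own teardown, not a real kernel function:

```c
#include <linux/torture.h>

static void demo_stop_kthreads(void)
{
	/* caller-specific kthread teardown would go here */
}

static void demo_torture_cleanup(void)
{
	if (torture_cleanup_begin())
		return;			/* a shutdown was already under way */

	demo_stop_kthreads();		/* torture_type is still set here */

	torture_cleanup_end();		/* now torture_type is NULLed */
}
```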
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 67d6369ddf83..979ccde26720 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | |||
| 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o | 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o |
| 56 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 56 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
| 57 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o | 57 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o |
| 58 | ifeq ($(CONFIG_PM_RUNTIME),y) | 58 | ifeq ($(CONFIG_PM),y) |
| 59 | obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o | 59 | obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o |
| 60 | endif | 60 | endif |
| 61 | ifeq ($(CONFIG_TRACING),y) | 61 | ifeq ($(CONFIG_TRACING),y) |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c1bd4ada2a04..483cecfa5c17 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent, | |||
| 1142 | r->sector_from = be64_to_cpu(sector_from); | 1142 | r->sector_from = be64_to_cpu(sector_from); |
| 1143 | } | 1143 | } |
| 1144 | 1144 | ||
| 1145 | typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); | 1145 | typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); |
| 1146 | 1146 | ||
| 1147 | static int blk_log_action_classic(struct trace_iterator *iter, const char *act) | 1147 | static void blk_log_action_classic(struct trace_iterator *iter, const char *act) |
| 1148 | { | 1148 | { |
| 1149 | char rwbs[RWBS_LEN]; | 1149 | char rwbs[RWBS_LEN]; |
| 1150 | unsigned long long ts = iter->ts; | 1150 | unsigned long long ts = iter->ts; |
| @@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act) | |||
| 1154 | 1154 | ||
| 1155 | fill_rwbs(rwbs, t); | 1155 | fill_rwbs(rwbs, t); |
| 1156 | 1156 | ||
| 1157 | return trace_seq_printf(&iter->seq, | 1157 | trace_seq_printf(&iter->seq, |
| 1158 | "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", | 1158 | "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", |
| 1159 | MAJOR(t->device), MINOR(t->device), iter->cpu, | 1159 | MAJOR(t->device), MINOR(t->device), iter->cpu, |
| 1160 | secs, nsec_rem, iter->ent->pid, act, rwbs); | 1160 | secs, nsec_rem, iter->ent->pid, act, rwbs); |
| 1161 | } | 1161 | } |
| 1162 | 1162 | ||
| 1163 | static int blk_log_action(struct trace_iterator *iter, const char *act) | 1163 | static void blk_log_action(struct trace_iterator *iter, const char *act) |
| 1164 | { | 1164 | { |
| 1165 | char rwbs[RWBS_LEN]; | 1165 | char rwbs[RWBS_LEN]; |
| 1166 | const struct blk_io_trace *t = te_blk_io_trace(iter->ent); | 1166 | const struct blk_io_trace *t = te_blk_io_trace(iter->ent); |
| 1167 | 1167 | ||
| 1168 | fill_rwbs(rwbs, t); | 1168 | fill_rwbs(rwbs, t); |
| 1169 | return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", | 1169 | trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", |
| 1170 | MAJOR(t->device), MINOR(t->device), act, rwbs); | 1170 | MAJOR(t->device), MINOR(t->device), act, rwbs); |
| 1171 | } | 1171 | } |
| 1172 | 1172 | ||
| 1173 | static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) | 1173 | static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) |
| 1174 | { | 1174 | { |
| 1175 | const unsigned char *pdu_buf; | 1175 | const unsigned char *pdu_buf; |
| 1176 | int pdu_len; | 1176 | int pdu_len; |
| 1177 | int i, end, ret; | 1177 | int i, end; |
| 1178 | 1178 | ||
| 1179 | pdu_buf = pdu_start(ent); | 1179 | pdu_buf = pdu_start(ent); |
| 1180 | pdu_len = te_blk_io_trace(ent)->pdu_len; | 1180 | pdu_len = te_blk_io_trace(ent)->pdu_len; |
| 1181 | 1181 | ||
| 1182 | if (!pdu_len) | 1182 | if (!pdu_len) |
| 1183 | return 1; | 1183 | return; |
| 1184 | 1184 | ||
| 1185 | /* find the last zero that needs to be printed */ | 1185 | /* find the last zero that needs to be printed */ |
| 1186 | for (end = pdu_len - 1; end >= 0; end--) | 1186 | for (end = pdu_len - 1; end >= 0; end--) |
| @@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) | |||
| 1188 | break; | 1188 | break; |
| 1189 | end++; | 1189 | end++; |
| 1190 | 1190 | ||
| 1191 | if (!trace_seq_putc(s, '(')) | 1191 | trace_seq_putc(s, '('); |
| 1192 | return 0; | ||
| 1193 | 1192 | ||
| 1194 | for (i = 0; i < pdu_len; i++) { | 1193 | for (i = 0; i < pdu_len; i++) { |
| 1195 | 1194 | ||
| 1196 | ret = trace_seq_printf(s, "%s%02x", | 1195 | trace_seq_printf(s, "%s%02x", |
| 1197 | i == 0 ? "" : " ", pdu_buf[i]); | 1196 | i == 0 ? "" : " ", pdu_buf[i]); |
| 1198 | if (!ret) | ||
| 1199 | return ret; | ||
| 1200 | 1197 | ||
| 1201 | /* | 1198 | /* |
| 1202 | * stop when the rest is just zeroes and indicate so | 1199 | * stop when the rest is just zeroes and indicate so |
| 1203 | * with a ".." appended | 1200 | * with a ".." appended |
| 1204 | */ | 1201 | */ |
| 1205 | if (i == end && end != pdu_len - 1) | 1202 | if (i == end && end != pdu_len - 1) { |
| 1206 | return trace_seq_puts(s, " ..) "); | 1203 | trace_seq_puts(s, " ..) "); |
| 1204 | return; | ||
| 1205 | } | ||
| 1207 | } | 1206 | } |
| 1208 | 1207 | ||
| 1209 | return trace_seq_puts(s, ") "); | 1208 | trace_seq_puts(s, ") "); |
| 1210 | } | 1209 | } |
| 1211 | 1210 | ||
| 1212 | static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) | 1211 | static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) |
| 1213 | { | 1212 | { |
| 1214 | char cmd[TASK_COMM_LEN]; | 1213 | char cmd[TASK_COMM_LEN]; |
| 1215 | 1214 | ||
| 1216 | trace_find_cmdline(ent->pid, cmd); | 1215 | trace_find_cmdline(ent->pid, cmd); |
| 1217 | 1216 | ||
| 1218 | if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { | 1217 | if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { |
| 1219 | int ret; | 1218 | trace_seq_printf(s, "%u ", t_bytes(ent)); |
| 1220 | 1219 | blk_log_dump_pdu(s, ent); | |
| 1221 | ret = trace_seq_printf(s, "%u ", t_bytes(ent)); | 1220 | trace_seq_printf(s, "[%s]\n", cmd); |
| 1222 | if (!ret) | ||
| 1223 | return 0; | ||
| 1224 | ret = blk_log_dump_pdu(s, ent); | ||
| 1225 | if (!ret) | ||
| 1226 | return 0; | ||
| 1227 | return trace_seq_printf(s, "[%s]\n", cmd); | ||
| 1228 | } else { | 1221 | } else { |
| 1229 | if (t_sec(ent)) | 1222 | if (t_sec(ent)) |
| 1230 | return trace_seq_printf(s, "%llu + %u [%s]\n", | 1223 | trace_seq_printf(s, "%llu + %u [%s]\n", |
| 1231 | t_sector(ent), t_sec(ent), cmd); | 1224 | t_sector(ent), t_sec(ent), cmd); |
| 1232 | return trace_seq_printf(s, "[%s]\n", cmd); | 1225 | else |
| 1226 | trace_seq_printf(s, "[%s]\n", cmd); | ||
| 1233 | } | 1227 | } |
| 1234 | } | 1228 | } |
| 1235 | 1229 | ||
| 1236 | static int blk_log_with_error(struct trace_seq *s, | 1230 | static void blk_log_with_error(struct trace_seq *s, |
| 1237 | const struct trace_entry *ent) | 1231 | const struct trace_entry *ent) |
| 1238 | { | 1232 | { |
| 1239 | if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { | 1233 | if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { |
| 1240 | int ret; | 1234 | blk_log_dump_pdu(s, ent); |
| 1241 | 1235 | trace_seq_printf(s, "[%d]\n", t_error(ent)); | |
| 1242 | ret = blk_log_dump_pdu(s, ent); | ||
| 1243 | if (ret) | ||
| 1244 | return trace_seq_printf(s, "[%d]\n", t_error(ent)); | ||
| 1245 | return 0; | ||
| 1246 | } else { | 1236 | } else { |
| 1247 | if (t_sec(ent)) | 1237 | if (t_sec(ent)) |
| 1248 | return trace_seq_printf(s, "%llu + %u [%d]\n", | 1238 | trace_seq_printf(s, "%llu + %u [%d]\n", |
| 1249 | t_sector(ent), | 1239 | t_sector(ent), |
| 1250 | t_sec(ent), t_error(ent)); | 1240 | t_sec(ent), t_error(ent)); |
| 1251 | return trace_seq_printf(s, "%llu [%d]\n", | 1241 | else |
| 1252 | t_sector(ent), t_error(ent)); | 1242 | trace_seq_printf(s, "%llu [%d]\n", |
| 1243 | t_sector(ent), t_error(ent)); | ||
| 1253 | } | 1244 | } |
| 1254 | } | 1245 | } |
| 1255 | 1246 | ||
| 1256 | static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) | 1247 | static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) |
| 1257 | { | 1248 | { |
| 1258 | struct blk_io_trace_remap r = { .device_from = 0, }; | 1249 | struct blk_io_trace_remap r = { .device_from = 0, }; |
| 1259 | 1250 | ||
| 1260 | get_pdu_remap(ent, &r); | 1251 | get_pdu_remap(ent, &r); |
| 1261 | return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", | 1252 | trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", |
| 1262 | t_sector(ent), t_sec(ent), | 1253 | t_sector(ent), t_sec(ent), |
| 1263 | MAJOR(r.device_from), MINOR(r.device_from), | 1254 | MAJOR(r.device_from), MINOR(r.device_from), |
| 1264 | (unsigned long long)r.sector_from); | 1255 | (unsigned long long)r.sector_from); |
| 1265 | } | 1256 | } |
| 1266 | 1257 | ||
| 1267 | static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) | 1258 | static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) |
| 1268 | { | 1259 | { |
| 1269 | char cmd[TASK_COMM_LEN]; | 1260 | char cmd[TASK_COMM_LEN]; |
| 1270 | 1261 | ||
| 1271 | trace_find_cmdline(ent->pid, cmd); | 1262 | trace_find_cmdline(ent->pid, cmd); |
| 1272 | 1263 | ||
| 1273 | return trace_seq_printf(s, "[%s]\n", cmd); | 1264 | trace_seq_printf(s, "[%s]\n", cmd); |
| 1274 | } | 1265 | } |
| 1275 | 1266 | ||
| 1276 | static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) | 1267 | static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) |
| 1277 | { | 1268 | { |
| 1278 | char cmd[TASK_COMM_LEN]; | 1269 | char cmd[TASK_COMM_LEN]; |
| 1279 | 1270 | ||
| 1280 | trace_find_cmdline(ent->pid, cmd); | 1271 | trace_find_cmdline(ent->pid, cmd); |
| 1281 | 1272 | ||
| 1282 | return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); | 1273 | trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); |
| 1283 | } | 1274 | } |
| 1284 | 1275 | ||
| 1285 | static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) | 1276 | static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) |
| 1286 | { | 1277 | { |
| 1287 | char cmd[TASK_COMM_LEN]; | 1278 | char cmd[TASK_COMM_LEN]; |
| 1288 | 1279 | ||
| 1289 | trace_find_cmdline(ent->pid, cmd); | 1280 | trace_find_cmdline(ent->pid, cmd); |
| 1290 | 1281 | ||
| 1291 | return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), | 1282 | trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), |
| 1292 | get_pdu_int(ent), cmd); | 1283 | get_pdu_int(ent), cmd); |
| 1293 | } | 1284 | } |
| 1294 | 1285 | ||
| 1295 | static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) | 1286 | static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) |
| 1296 | { | 1287 | { |
| 1297 | int ret; | ||
| 1298 | const struct blk_io_trace *t = te_blk_io_trace(ent); | 1288 | const struct blk_io_trace *t = te_blk_io_trace(ent); |
| 1299 | 1289 | ||
| 1300 | ret = trace_seq_putmem(s, t + 1, t->pdu_len); | 1290 | trace_seq_putmem(s, t + 1, t->pdu_len); |
| 1301 | if (ret) | 1291 | trace_seq_putc(s, '\n'); |
| 1302 | return trace_seq_putc(s, '\n'); | ||
| 1303 | return ret; | ||
| 1304 | } | 1292 | } |
| 1305 | 1293 | ||
| 1306 | /* | 1294 | /* |
| @@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr) | |||
| 1339 | 1327 | ||
| 1340 | static const struct { | 1328 | static const struct { |
| 1341 | const char *act[2]; | 1329 | const char *act[2]; |
| 1342 | int (*print)(struct trace_seq *s, const struct trace_entry *ent); | 1330 | void (*print)(struct trace_seq *s, const struct trace_entry *ent); |
| 1343 | } what2act[] = { | 1331 | } what2act[] = { |
| 1344 | [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, | 1332 | [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, |
| 1345 | [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, | 1333 | [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, |
| @@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, | |||
| 1364 | struct trace_seq *s = &iter->seq; | 1352 | struct trace_seq *s = &iter->seq; |
| 1365 | const struct blk_io_trace *t; | 1353 | const struct blk_io_trace *t; |
| 1366 | u16 what; | 1354 | u16 what; |
| 1367 | int ret; | ||
| 1368 | bool long_act; | 1355 | bool long_act; |
| 1369 | blk_log_action_t *log_action; | 1356 | blk_log_action_t *log_action; |
| 1370 | 1357 | ||
| @@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, | |||
| 1374 | log_action = classic ? &blk_log_action_classic : &blk_log_action; | 1361 | log_action = classic ? &blk_log_action_classic : &blk_log_action; |
| 1375 | 1362 | ||
| 1376 | if (t->action == BLK_TN_MESSAGE) { | 1363 | if (t->action == BLK_TN_MESSAGE) { |
| 1377 | ret = log_action(iter, long_act ? "message" : "m"); | 1364 | log_action(iter, long_act ? "message" : "m"); |
| 1378 | if (ret) | 1365 | blk_log_msg(s, iter->ent); |
| 1379 | ret = blk_log_msg(s, iter->ent); | ||
| 1380 | goto out; | ||
| 1381 | } | 1366 | } |
| 1382 | 1367 | ||
| 1383 | if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) | 1368 | if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) |
| 1384 | ret = trace_seq_printf(s, "Unknown action %x\n", what); | 1369 | trace_seq_printf(s, "Unknown action %x\n", what); |
| 1385 | else { | 1370 | else { |
| 1386 | ret = log_action(iter, what2act[what].act[long_act]); | 1371 | log_action(iter, what2act[what].act[long_act]); |
| 1387 | if (ret) | 1372 | what2act[what].print(s, iter->ent); |
| 1388 | ret = what2act[what].print(s, iter->ent); | ||
| 1389 | } | 1373 | } |
| 1390 | out: | 1374 | |
| 1391 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 1375 | return trace_handle_return(s); |
| 1392 | } | 1376 | } |
| 1393 | 1377 | ||
| 1394 | static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, | 1378 | static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, |
| @@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, | |||
| 1397 | return print_one_line(iter, false); | 1381 | return print_one_line(iter, false); |
| 1398 | } | 1382 | } |
| 1399 | 1383 | ||
| 1400 | static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) | 1384 | static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) |
| 1401 | { | 1385 | { |
| 1402 | struct trace_seq *s = &iter->seq; | 1386 | struct trace_seq *s = &iter->seq; |
| 1403 | struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; | 1387 | struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; |
| @@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) | |||
| 1407 | .time = iter->ts, | 1391 | .time = iter->ts, |
| 1408 | }; | 1392 | }; |
| 1409 | 1393 | ||
| 1410 | if (!trace_seq_putmem(s, &old, offset)) | 1394 | trace_seq_putmem(s, &old, offset); |
| 1411 | return 0; | 1395 | trace_seq_putmem(s, &t->sector, |
| 1412 | return trace_seq_putmem(s, &t->sector, | 1396 | sizeof(old) - offset + t->pdu_len); |
| 1413 | sizeof(old) - offset + t->pdu_len); | ||
| 1414 | } | 1397 | } |
| 1415 | 1398 | ||
| 1416 | static enum print_line_t | 1399 | static enum print_line_t |
| 1417 | blk_trace_event_print_binary(struct trace_iterator *iter, int flags, | 1400 | blk_trace_event_print_binary(struct trace_iterator *iter, int flags, |
| 1418 | struct trace_event *event) | 1401 | struct trace_event *event) |
| 1419 | { | 1402 | { |
| 1420 | return blk_trace_synthesize_old_trace(iter) ? | 1403 | blk_trace_synthesize_old_trace(iter); |
| 1421 | TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 1404 | |
| 1405 | return trace_handle_return(&iter->seq); | ||
| 1422 | } | 1406 | } |
| 1423 | 1407 | ||
| 1424 | static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) | 1408 | static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) |
| @@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q) | |||
| 1493 | if (atomic_dec_and_test(&blk_probes_ref)) | 1477 | if (atomic_dec_and_test(&blk_probes_ref)) |
| 1494 | blk_unregister_tracepoints(); | 1478 | blk_unregister_tracepoints(); |
| 1495 | 1479 | ||
| 1496 | spin_lock_irq(&running_trace_lock); | ||
| 1497 | list_del(&bt->running_list); | ||
| 1498 | spin_unlock_irq(&running_trace_lock); | ||
| 1499 | blk_trace_free(bt); | 1480 | blk_trace_free(bt); |
| 1500 | return 0; | 1481 | return 0; |
| 1501 | } | 1482 | } |
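The blktrace.c conversion above follows the trace_seq API change: trace_seq_printf(), trace_seq_puts() and friends no longer report how much they wrote, so the blk_log_*() helpers become void and the caller decides HANDLED versus PARTIAL_LINE from the sequence state via trace_handle_return(), exactly as the new print_one_line() does. A short sketch of the resulting shape of an output callback, assuming trace_handle_return() reports TRACE_TYPE_PARTIAL_LINE precisely when the trace_seq overflowed:

	/* Illustrative only: post-conversion control flow for an output callback. */
	static enum print_line_t example_output(struct trace_iterator *iter)
	{
		struct trace_seq *s = &iter->seq;

		trace_seq_printf(s, "%d,%d ", 8, 0);	/* no return value to test any more */
		trace_seq_putc(s, '\n');

		return trace_handle_return(s);	/* HANDLED unless the buffer overflowed */
	}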
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5916a8e59e87..929a733d302e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -113,6 +113,9 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | |||
| 113 | static struct ftrace_ops global_ops; | 113 | static struct ftrace_ops global_ops; |
| 114 | static struct ftrace_ops control_ops; | 114 | static struct ftrace_ops control_ops; |
| 115 | 115 | ||
| 116 | static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, | ||
| 117 | struct ftrace_ops *op, struct pt_regs *regs); | ||
| 118 | |||
| 116 | #if ARCH_SUPPORTS_FTRACE_OPS | 119 | #if ARCH_SUPPORTS_FTRACE_OPS |
| 117 | static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | 120 | static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, |
| 118 | struct ftrace_ops *op, struct pt_regs *regs); | 121 | struct ftrace_ops *op, struct pt_regs *regs); |
| @@ -251,18 +254,24 @@ static void update_ftrace_function(void) | |||
| 251 | ftrace_func_t func; | 254 | ftrace_func_t func; |
| 252 | 255 | ||
| 253 | /* | 256 | /* |
| 257 | * Prepare the ftrace_ops that the arch callback will use. | ||
| 258 | * If there's only one ftrace_ops registered, the ftrace_ops_list | ||
| 259 | * will point to the ops we want. | ||
| 260 | */ | ||
| 261 | set_function_trace_op = ftrace_ops_list; | ||
| 262 | |||
| 263 | /* If there's no ftrace_ops registered, just call the stub function */ | ||
| 264 | if (ftrace_ops_list == &ftrace_list_end) { | ||
| 265 | func = ftrace_stub; | ||
| 266 | |||
| 267 | /* | ||
| 254 | * If we are at the end of the list and this ops is | 268 | * If we are at the end of the list and this ops is |
| 255 | * recursion safe and not dynamic and the arch supports passing ops, | 269 | * recursion safe and not dynamic and the arch supports passing ops, |
| 256 | * then have the mcount trampoline call the function directly. | 270 | * then have the mcount trampoline call the function directly. |
| 257 | */ | 271 | */ |
| 258 | if (ftrace_ops_list == &ftrace_list_end || | 272 | } else if (ftrace_ops_list->next == &ftrace_list_end) { |
| 259 | (ftrace_ops_list->next == &ftrace_list_end && | 273 | func = ftrace_ops_get_func(ftrace_ops_list); |
| 260 | !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && | 274 | |
| 261 | (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && | ||
| 262 | !FTRACE_FORCE_LIST_FUNC)) { | ||
| 263 | /* Set the ftrace_ops that the arch callback uses */ | ||
| 264 | set_function_trace_op = ftrace_ops_list; | ||
| 265 | func = ftrace_ops_list->func; | ||
| 266 | } else { | 275 | } else { |
| 267 | /* Just use the default ftrace_ops */ | 276 | /* Just use the default ftrace_ops */ |
| 268 | set_function_trace_op = &ftrace_list_end; | 277 | set_function_trace_op = &ftrace_list_end; |
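The restructured update_ftrace_function() above sets set_function_trace_op up front and, when exactly one ftrace_ops is registered, asks ftrace_ops_get_func() which callback the mcount trampoline should call. A simplified sketch of the decision that helper is expected to make, not the exact in-tree body; ftrace_ops_recurs_func is the recursion-guard wrapper declared near the top of this file:

	/* Simplified: call the ops directly only when that is known to be safe. */
	ftrace_func_t example_ops_get_func(struct ftrace_ops *ops)
	{
		if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
			return ftrace_ops_recurs_func;	/* wrap ops that do not guard against recursion */

		return ops->func;
	}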
| @@ -378,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, | |||
| 378 | return ret; | 387 | return ret; |
| 379 | } | 388 | } |
| 380 | 389 | ||
| 390 | static void ftrace_update_trampoline(struct ftrace_ops *ops); | ||
| 391 | |||
| 381 | static int __register_ftrace_function(struct ftrace_ops *ops) | 392 | static int __register_ftrace_function(struct ftrace_ops *ops) |
| 382 | { | 393 | { |
| 383 | if (ops->flags & FTRACE_OPS_FL_DELETED) | 394 | if (ops->flags & FTRACE_OPS_FL_DELETED) |
| @@ -407,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
| 407 | if (control_ops_alloc(ops)) | 418 | if (control_ops_alloc(ops)) |
| 408 | return -ENOMEM; | 419 | return -ENOMEM; |
| 409 | add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); | 420 | add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); |
| 421 | /* The control_ops needs the trampoline update */ | ||
| 422 | ops = &control_ops; | ||
| 410 | } else | 423 | } else |
| 411 | add_ftrace_ops(&ftrace_ops_list, ops); | 424 | add_ftrace_ops(&ftrace_ops_list, ops); |
| 412 | 425 | ||
| 426 | ftrace_update_trampoline(ops); | ||
| 427 | |||
| 413 | if (ftrace_enabled) | 428 | if (ftrace_enabled) |
| 414 | update_ftrace_function(); | 429 | update_ftrace_function(); |
| 415 | 430 | ||
| @@ -556,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2) | |||
| 556 | static int function_stat_headers(struct seq_file *m) | 571 | static int function_stat_headers(struct seq_file *m) |
| 557 | { | 572 | { |
| 558 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 573 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 559 | seq_printf(m, " Function " | 574 | seq_puts(m, " Function " |
| 560 | "Hit Time Avg s^2\n" | 575 | "Hit Time Avg s^2\n" |
| 561 | " -------- " | 576 | " -------- " |
| 562 | "--- ---- --- ---\n"); | 577 | "--- ---- --- ---\n"); |
| 563 | #else | 578 | #else |
| 564 | seq_printf(m, " Function Hit\n" | 579 | seq_puts(m, " Function Hit\n" |
| 565 | " -------- ---\n"); | 580 | " -------- ---\n"); |
| 566 | #endif | 581 | #endif |
| 567 | return 0; | 582 | return 0; |
| 568 | } | 583 | } |
| @@ -589,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
| 589 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); | 604 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); |
| 590 | 605 | ||
| 591 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 606 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 592 | seq_printf(m, " "); | 607 | seq_puts(m, " "); |
| 593 | avg = rec->time; | 608 | avg = rec->time; |
| 594 | do_div(avg, rec->counter); | 609 | do_div(avg, rec->counter); |
| 595 | 610 | ||
| @@ -1048,6 +1063,12 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; | |||
| 1048 | 1063 | ||
| 1049 | static struct ftrace_ops *removed_ops; | 1064 | static struct ftrace_ops *removed_ops; |
| 1050 | 1065 | ||
| 1066 | /* | ||
| 1067 | * Set when doing a global update, like enabling all recs or disabling them. | ||
| 1068 | * It is not set when just updating a single ftrace_ops. | ||
| 1069 | */ | ||
| 1070 | static bool update_all_ops; | ||
| 1071 | |||
| 1051 | #ifndef CONFIG_FTRACE_MCOUNT_RECORD | 1072 | #ifndef CONFIG_FTRACE_MCOUNT_RECORD |
| 1052 | # error Dynamic ftrace depends on MCOUNT_RECORD | 1073 | # error Dynamic ftrace depends on MCOUNT_RECORD |
| 1053 | #endif | 1074 | #endif |
| @@ -1096,6 +1117,43 @@ static struct ftrace_ops global_ops = { | |||
| 1096 | FTRACE_OPS_FL_INITIALIZED, | 1117 | FTRACE_OPS_FL_INITIALIZED, |
| 1097 | }; | 1118 | }; |
| 1098 | 1119 | ||
| 1120 | /* | ||
| 1121 | * This is used by __kernel_text_address() to return true if the | ||
| 1122 | * address is on a dynamically allocated trampoline that would | ||
| 1123 | * not return true for either core_kernel_text() or | ||
| 1124 | * is_module_text_address(). | ||
| 1125 | */ | ||
| 1126 | bool is_ftrace_trampoline(unsigned long addr) | ||
| 1127 | { | ||
| 1128 | struct ftrace_ops *op; | ||
| 1129 | bool ret = false; | ||
| 1130 | |||
| 1131 | /* | ||
| 1132 | * Some of the ops may be dynamically allocated, | ||
| 1133 | * they are freed after a synchronize_sched(). | ||
| 1134 | */ | ||
| 1135 | preempt_disable_notrace(); | ||
| 1136 | |||
| 1137 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 1138 | /* | ||
| 1139 | * This is to check for dynamically allocated trampolines. | ||
| 1140 | * Trampolines that are in kernel text will have | ||
| 1141 | * core_kernel_text() return true. | ||
| 1142 | */ | ||
| 1143 | if (op->trampoline && op->trampoline_size) | ||
| 1144 | if (addr >= op->trampoline && | ||
| 1145 | addr < op->trampoline + op->trampoline_size) { | ||
| 1146 | ret = true; | ||
| 1147 | goto out; | ||
| 1148 | } | ||
| 1149 | } while_for_each_ftrace_op(op); | ||
| 1150 | |||
| 1151 | out: | ||
| 1152 | preempt_enable_notrace(); | ||
| 1153 | |||
| 1154 | return ret; | ||
| 1155 | } | ||
| 1156 | |||
| 1099 | struct ftrace_page { | 1157 | struct ftrace_page { |
| 1100 | struct ftrace_page *next; | 1158 | struct ftrace_page *next; |
| 1101 | struct dyn_ftrace *records; | 1159 | struct dyn_ftrace *records; |
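is_ftrace_trampoline() above lets the kernel recognize return addresses that sit on dynamically allocated ftrace trampolines, which neither core_kernel_text() nor is_module_text_address() can see. A sketch of the intended caller-side check; the wrapper name is illustrative, only is_ftrace_trampoline() comes from this diff:

	/* Sketch: fold trampoline pages into a kernel-text address test. */
	static bool example_text_address(unsigned long addr)
	{
		if (core_kernel_text(addr) || is_module_text_address(addr))
			return true;

		/* addresses inside ftrace trampolines are legitimate kernel text too */
		return is_ftrace_trampoline(addr);
	}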
| @@ -1300,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); | |||
| 1300 | static void | 1358 | static void |
| 1301 | ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); | 1359 | ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); |
| 1302 | 1360 | ||
| 1361 | static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, | ||
| 1362 | struct ftrace_hash *new_hash); | ||
| 1363 | |||
| 1303 | static int | 1364 | static int |
| 1304 | ftrace_hash_move(struct ftrace_ops *ops, int enable, | 1365 | ftrace_hash_move(struct ftrace_ops *ops, int enable, |
| 1305 | struct ftrace_hash **dst, struct ftrace_hash *src) | 1366 | struct ftrace_hash **dst, struct ftrace_hash *src) |
| @@ -1307,12 +1368,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
| 1307 | struct ftrace_func_entry *entry; | 1368 | struct ftrace_func_entry *entry; |
| 1308 | struct hlist_node *tn; | 1369 | struct hlist_node *tn; |
| 1309 | struct hlist_head *hhd; | 1370 | struct hlist_head *hhd; |
| 1310 | struct ftrace_hash *old_hash; | ||
| 1311 | struct ftrace_hash *new_hash; | 1371 | struct ftrace_hash *new_hash; |
| 1312 | int size = src->count; | 1372 | int size = src->count; |
| 1313 | int bits = 0; | 1373 | int bits = 0; |
| 1374 | int ret; | ||
| 1314 | int i; | 1375 | int i; |
| 1315 | 1376 | ||
| 1377 | /* Reject setting notrace hash on IPMODIFY ftrace_ops */ | ||
| 1378 | if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) | ||
| 1379 | return -EINVAL; | ||
| 1380 | |||
| 1316 | /* | 1381 | /* |
| 1317 | * If the new source is empty, just free dst and assign it | 1382 | * If the new source is empty, just free dst and assign it |
| 1318 | * the empty_hash. | 1383 | * the empty_hash. |
| @@ -1346,21 +1411,44 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
| 1346 | } | 1411 | } |
| 1347 | 1412 | ||
| 1348 | update: | 1413 | update: |
| 1414 | /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ | ||
| 1415 | if (enable) { | ||
| 1416 | /* IPMODIFY should be updated only when filter_hash updating */ | ||
| 1417 | ret = ftrace_hash_ipmodify_update(ops, new_hash); | ||
| 1418 | if (ret < 0) { | ||
| 1419 | free_ftrace_hash(new_hash); | ||
| 1420 | return ret; | ||
| 1421 | } | ||
| 1422 | } | ||
| 1423 | |||
| 1349 | /* | 1424 | /* |
| 1350 | * Remove the current set, update the hash and add | 1425 | * Remove the current set, update the hash and add |
| 1351 | * them back. | 1426 | * them back. |
| 1352 | */ | 1427 | */ |
| 1353 | ftrace_hash_rec_disable_modify(ops, enable); | 1428 | ftrace_hash_rec_disable_modify(ops, enable); |
| 1354 | 1429 | ||
| 1355 | old_hash = *dst; | ||
| 1356 | rcu_assign_pointer(*dst, new_hash); | 1430 | rcu_assign_pointer(*dst, new_hash); |
| 1357 | free_ftrace_hash_rcu(old_hash); | ||
| 1358 | 1431 | ||
| 1359 | ftrace_hash_rec_enable_modify(ops, enable); | 1432 | ftrace_hash_rec_enable_modify(ops, enable); |
| 1360 | 1433 | ||
| 1361 | return 0; | 1434 | return 0; |
| 1362 | } | 1435 | } |
| 1363 | 1436 | ||
| 1437 | static bool hash_contains_ip(unsigned long ip, | ||
| 1438 | struct ftrace_ops_hash *hash) | ||
| 1439 | { | ||
| 1440 | /* | ||
| 1441 | * The function record is a match if it exists in the filter | ||
| 1442 | * hash and not in the notrace hash. Note, an empty hash is | ||
| 1443 | * considered a match for the filter hash, but an empty | ||
| 1444 | * notrace hash is considered not in the notrace hash. | ||
| 1445 | */ | ||
| 1446 | return (ftrace_hash_empty(hash->filter_hash) || | ||
| 1447 | ftrace_lookup_ip(hash->filter_hash, ip)) && | ||
| 1448 | (ftrace_hash_empty(hash->notrace_hash) || | ||
| 1449 | !ftrace_lookup_ip(hash->notrace_hash, ip)); | ||
| 1450 | } | ||
| 1451 | |||
| 1364 | /* | 1452 | /* |
| 1365 | * Test the hashes for this ops to see if we want to call | 1453 | * Test the hashes for this ops to see if we want to call |
| 1366 | * the ops->func or not. | 1454 | * the ops->func or not. |
| @@ -1376,8 +1464,7 @@ update: | |||
| 1376 | static int | 1464 | static int |
| 1377 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) | 1465 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) |
| 1378 | { | 1466 | { |
| 1379 | struct ftrace_hash *filter_hash; | 1467 | struct ftrace_ops_hash hash; |
| 1380 | struct ftrace_hash *notrace_hash; | ||
| 1381 | int ret; | 1468 | int ret; |
| 1382 | 1469 | ||
| 1383 | #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS | 1470 | #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS |
| @@ -1390,13 +1477,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) | |||
| 1390 | return 0; | 1477 | return 0; |
| 1391 | #endif | 1478 | #endif |
| 1392 | 1479 | ||
| 1393 | filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); | 1480 | hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); |
| 1394 | notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); | 1481 | hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); |
| 1395 | 1482 | ||
| 1396 | if ((ftrace_hash_empty(filter_hash) || | 1483 | if (hash_contains_ip(ip, &hash)) |
| 1397 | ftrace_lookup_ip(filter_hash, ip)) && | ||
| 1398 | (ftrace_hash_empty(notrace_hash) || | ||
| 1399 | !ftrace_lookup_ip(notrace_hash, ip))) | ||
| 1400 | ret = 1; | 1484 | ret = 1; |
| 1401 | else | 1485 | else |
| 1402 | ret = 0; | 1486 | ret = 0; |
| @@ -1508,46 +1592,6 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) | |||
| 1508 | return keep_regs; | 1592 | return keep_regs; |
| 1509 | } | 1593 | } |
| 1510 | 1594 | ||
| 1511 | static void ftrace_remove_tramp(struct ftrace_ops *ops, | ||
| 1512 | struct dyn_ftrace *rec) | ||
| 1513 | { | ||
| 1514 | /* If TRAMP is not set, no ops should have a trampoline for this */ | ||
| 1515 | if (!(rec->flags & FTRACE_FL_TRAMP)) | ||
| 1516 | return; | ||
| 1517 | |||
| 1518 | rec->flags &= ~FTRACE_FL_TRAMP; | ||
| 1519 | |||
| 1520 | if ((!ftrace_hash_empty(ops->func_hash->filter_hash) && | ||
| 1521 | !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) || | ||
| 1522 | ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) | ||
| 1523 | return; | ||
| 1524 | /* | ||
| 1525 | * The tramp_hash entry will be removed at time | ||
| 1526 | * of update. | ||
| 1527 | */ | ||
| 1528 | ops->nr_trampolines--; | ||
| 1529 | } | ||
| 1530 | |||
| 1531 | static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops) | ||
| 1532 | { | ||
| 1533 | struct ftrace_ops *op; | ||
| 1534 | |||
| 1535 | /* If TRAMP is not set, no ops should have a trampoline for this */ | ||
| 1536 | if (!(rec->flags & FTRACE_FL_TRAMP)) | ||
| 1537 | return; | ||
| 1538 | |||
| 1539 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 1540 | /* | ||
| 1541 | * This function is called to clear other tramps | ||
| 1542 | * not the one that is being updated. | ||
| 1543 | */ | ||
| 1544 | if (op == ops) | ||
| 1545 | continue; | ||
| 1546 | if (op->nr_trampolines) | ||
| 1547 | ftrace_remove_tramp(op, rec); | ||
| 1548 | } while_for_each_ftrace_op(op); | ||
| 1549 | } | ||
| 1550 | |||
| 1551 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | 1595 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, |
| 1552 | int filter_hash, | 1596 | int filter_hash, |
| 1553 | bool inc) | 1597 | bool inc) |
| @@ -1636,18 +1680,16 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1636 | * function, and the ops has a trampoline registered | 1680 | * function, and the ops has a trampoline registered |
| 1637 | * for it, then we can call it directly. | 1681 | * for it, then we can call it directly. |
| 1638 | */ | 1682 | */ |
| 1639 | if (ftrace_rec_count(rec) == 1 && ops->trampoline) { | 1683 | if (ftrace_rec_count(rec) == 1 && ops->trampoline) |
| 1640 | rec->flags |= FTRACE_FL_TRAMP; | 1684 | rec->flags |= FTRACE_FL_TRAMP; |
| 1641 | ops->nr_trampolines++; | 1685 | else |
| 1642 | } else { | ||
| 1643 | /* | 1686 | /* |
| 1644 | * If we are adding another function callback | 1687 | * If we are adding another function callback |
| 1645 | * to this function, and the previous had a | 1688 | * to this function, and the previous had a |
| 1646 | * custom trampoline in use, then we need to go | 1689 | * custom trampoline in use, then we need to go |
| 1647 | * back to the default trampoline. | 1690 | * back to the default trampoline. |
| 1648 | */ | 1691 | */ |
| 1649 | ftrace_clear_tramps(rec, ops); | 1692 | rec->flags &= ~FTRACE_FL_TRAMP; |
| 1650 | } | ||
| 1651 | 1693 | ||
| 1652 | /* | 1694 | /* |
| 1653 | * If any ops wants regs saved for this function | 1695 | * If any ops wants regs saved for this function |
| @@ -1660,9 +1702,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1660 | return; | 1702 | return; |
| 1661 | rec->flags--; | 1703 | rec->flags--; |
| 1662 | 1704 | ||
| 1663 | if (ops->trampoline && !ftrace_rec_count(rec)) | ||
| 1664 | ftrace_remove_tramp(ops, rec); | ||
| 1665 | |||
| 1666 | /* | 1705 | /* |
| 1667 | * If the rec had REGS enabled and the ops that is | 1706 | * If the rec had REGS enabled and the ops that is |
| 1668 | * being removed had REGS set, then see if there is | 1707 | * being removed had REGS set, then see if there is |
| @@ -1677,6 +1716,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1677 | } | 1716 | } |
| 1678 | 1717 | ||
| 1679 | /* | 1718 | /* |
| 1719 | * If the rec had TRAMP enabled, then it needs to | ||
| 1720 | * be cleared. As TRAMP can only be enabled iff | ||
| 1721 | * there is only a single ops attached to it. | ||
| 1722 | * In other words, always disable it on decrementing. | ||
| 1723 | * In the future, we may set it if rec count is | ||
| 1724 | * decremented to one, and the ops that is left | ||
| 1725 | * has a trampoline. | ||
| 1726 | */ | ||
| 1727 | rec->flags &= ~FTRACE_FL_TRAMP; | ||
| 1728 | |||
| 1729 | /* | ||
| 1680 | * flags will be cleared in ftrace_check_record() | 1730 | * flags will be cleared in ftrace_check_record() |
| 1681 | * if rec count is zero. | 1731 | * if rec count is zero. |
| 1682 | */ | 1732 | */ |
| @@ -1735,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, | |||
| 1735 | ftrace_hash_rec_update_modify(ops, filter_hash, 1); | 1785 | ftrace_hash_rec_update_modify(ops, filter_hash, 1); |
| 1736 | } | 1786 | } |
| 1737 | 1787 | ||
| 1788 | /* | ||
| 1789 | * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK | ||
| 1790 | * or not needed to update, -EBUSY if it detects a conflict of the flag | ||
| 1791 | * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs. | ||
| 1792 | * Note that old_hash and new_hash have the following meanings: | ||
| 1793 | * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected) | ||
| 1794 | * - If the hash is EMPTY_HASH, it hits nothing | ||
| 1795 | * - Anything else hits the recs which match the hash entries. | ||
| 1796 | */ | ||
| 1797 | static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, | ||
| 1798 | struct ftrace_hash *old_hash, | ||
| 1799 | struct ftrace_hash *new_hash) | ||
| 1800 | { | ||
| 1801 | struct ftrace_page *pg; | ||
| 1802 | struct dyn_ftrace *rec, *end = NULL; | ||
| 1803 | int in_old, in_new; | ||
| 1804 | |||
| 1805 | /* Only update if the ops has been registered */ | ||
| 1806 | if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) | ||
| 1807 | return 0; | ||
| 1808 | |||
| 1809 | if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) | ||
| 1810 | return 0; | ||
| 1811 | |||
| 1812 | /* | ||
| 1813 | * Since the IPMODIFY is a very address sensitive action, we do not | ||
| 1814 | * allow ftrace_ops to set all functions to new hash. | ||
| 1815 | */ | ||
| 1816 | if (!new_hash || !old_hash) | ||
| 1817 | return -EINVAL; | ||
| 1818 | |||
| 1819 | /* Update rec->flags */ | ||
| 1820 | do_for_each_ftrace_rec(pg, rec) { | ||
| 1821 | /* We need to update only differences of filter_hash */ | ||
| 1822 | in_old = !!ftrace_lookup_ip(old_hash, rec->ip); | ||
| 1823 | in_new = !!ftrace_lookup_ip(new_hash, rec->ip); | ||
| 1824 | if (in_old == in_new) | ||
| 1825 | continue; | ||
| 1826 | |||
| 1827 | if (in_new) { | ||
| 1828 | /* New entries must ensure no others are using it */ | ||
| 1829 | if (rec->flags & FTRACE_FL_IPMODIFY) | ||
| 1830 | goto rollback; | ||
| 1831 | rec->flags |= FTRACE_FL_IPMODIFY; | ||
| 1832 | } else /* Removed entry */ | ||
| 1833 | rec->flags &= ~FTRACE_FL_IPMODIFY; | ||
| 1834 | } while_for_each_ftrace_rec(); | ||
| 1835 | |||
| 1836 | return 0; | ||
| 1837 | |||
| 1838 | rollback: | ||
| 1839 | end = rec; | ||
| 1840 | |||
| 1841 | /* Roll back what we did above */ | ||
| 1842 | do_for_each_ftrace_rec(pg, rec) { | ||
| 1843 | if (rec == end) | ||
| 1844 | goto err_out; | ||
| 1845 | |||
| 1846 | in_old = !!ftrace_lookup_ip(old_hash, rec->ip); | ||
| 1847 | in_new = !!ftrace_lookup_ip(new_hash, rec->ip); | ||
| 1848 | if (in_old == in_new) | ||
| 1849 | continue; | ||
| 1850 | |||
| 1851 | if (in_new) | ||
| 1852 | rec->flags &= ~FTRACE_FL_IPMODIFY; | ||
| 1853 | else | ||
| 1854 | rec->flags |= FTRACE_FL_IPMODIFY; | ||
| 1855 | } while_for_each_ftrace_rec(); | ||
| 1856 | |||
| 1857 | err_out: | ||
| 1858 | return -EBUSY; | ||
| 1859 | } | ||
| 1860 | |||
| 1861 | static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) | ||
| 1862 | { | ||
| 1863 | struct ftrace_hash *hash = ops->func_hash->filter_hash; | ||
| 1864 | |||
| 1865 | if (ftrace_hash_empty(hash)) | ||
| 1866 | hash = NULL; | ||
| 1867 | |||
| 1868 | return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); | ||
| 1869 | } | ||
| 1870 | |||
| 1871 | /* Disabling always succeeds */ | ||
| 1872 | static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) | ||
| 1873 | { | ||
| 1874 | struct ftrace_hash *hash = ops->func_hash->filter_hash; | ||
| 1875 | |||
| 1876 | if (ftrace_hash_empty(hash)) | ||
| 1877 | hash = NULL; | ||
| 1878 | |||
| 1879 | __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); | ||
| 1880 | } | ||
| 1881 | |||
| 1882 | static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, | ||
| 1883 | struct ftrace_hash *new_hash) | ||
| 1884 | { | ||
| 1885 | struct ftrace_hash *old_hash = ops->func_hash->filter_hash; | ||
| 1886 | |||
| 1887 | if (ftrace_hash_empty(old_hash)) | ||
| 1888 | old_hash = NULL; | ||
| 1889 | |||
| 1890 | if (ftrace_hash_empty(new_hash)) | ||
| 1891 | new_hash = NULL; | ||
| 1892 | |||
| 1893 | return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); | ||
| 1894 | } | ||
| 1895 | |||
| 1738 | static void print_ip_ins(const char *fmt, unsigned char *p) | 1896 | static void print_ip_ins(const char *fmt, unsigned char *p) |
| 1739 | { | 1897 | { |
| 1740 | int i; | 1898 | int i; |
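The IPMODIFY machinery added above enforces that at most one ftrace_ops with FTRACE_OPS_FL_IPMODIFY can claim a given function record, since such an ops rewrites the ip register so that a different function runs in place of the traced one (live patching is the expected user); a second claimant is rejected with -EBUSY and the per-record flags are rolled back. A hedged sketch of what such a user might look like on x86; everything here other than the flags and the handler signature is hypothetical:

	static void my_replacement(void);	/* hypothetical substitute implementation */

	static void my_ipmodify_handler(unsigned long ip, unsigned long parent_ip,
					struct ftrace_ops *op, struct pt_regs *regs)
	{
		regs->ip = (unsigned long)my_replacement;	/* redirect the traced call */
	}

	static struct ftrace_ops my_ops = {
		.func	= my_ipmodify_handler,
		.flags	= FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
	};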
| @@ -1745,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p) | |||
| 1745 | printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); | 1903 | printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); |
| 1746 | } | 1904 | } |
| 1747 | 1905 | ||
| 1906 | static struct ftrace_ops * | ||
| 1907 | ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); | ||
| 1908 | |||
| 1748 | /** | 1909 | /** |
| 1749 | * ftrace_bug - report and shutdown function tracer | 1910 | * ftrace_bug - report and shutdown function tracer |
| 1750 | * @failed: The failed type (EFAULT, EINVAL, EPERM) | 1911 | * @failed: The failed type (EFAULT, EINVAL, EPERM) |
| 1751 | * @ip: The address that failed | 1912 | * @rec: The record that failed |
| 1752 | * | 1913 | * |
| 1753 | * The arch code that enables or disables the function tracing | 1914 | * The arch code that enables or disables the function tracing |
| 1754 | * can call ftrace_bug() when it has detected a problem in | 1915 | * can call ftrace_bug() when it has detected a problem in |
| @@ -1757,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p) | |||
| 1757 | * EINVAL - if what is read at @ip is not what was expected | 1918 | * EINVAL - if what is read at @ip is not what was expected |
| 1758 | * EPERM - if the problem happens on writing to the @ip address | 1919 | * EPERM - if the problem happens on writing to the @ip address |
| 1759 | */ | 1920 | */ |
| 1760 | void ftrace_bug(int failed, unsigned long ip) | 1921 | void ftrace_bug(int failed, struct dyn_ftrace *rec) |
| 1761 | { | 1922 | { |
| 1923 | unsigned long ip = rec ? rec->ip : 0; | ||
| 1924 | |||
| 1762 | switch (failed) { | 1925 | switch (failed) { |
| 1763 | case -EFAULT: | 1926 | case -EFAULT: |
| 1764 | FTRACE_WARN_ON_ONCE(1); | 1927 | FTRACE_WARN_ON_ONCE(1); |
| @@ -1770,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip) | |||
| 1770 | pr_info("ftrace failed to modify "); | 1933 | pr_info("ftrace failed to modify "); |
| 1771 | print_ip_sym(ip); | 1934 | print_ip_sym(ip); |
| 1772 | print_ip_ins(" actual: ", (unsigned char *)ip); | 1935 | print_ip_ins(" actual: ", (unsigned char *)ip); |
| 1773 | printk(KERN_CONT "\n"); | 1936 | pr_cont("\n"); |
| 1774 | break; | 1937 | break; |
| 1775 | case -EPERM: | 1938 | case -EPERM: |
| 1776 | FTRACE_WARN_ON_ONCE(1); | 1939 | FTRACE_WARN_ON_ONCE(1); |
| @@ -1782,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip) | |||
| 1782 | pr_info("ftrace faulted on unknown error "); | 1945 | pr_info("ftrace faulted on unknown error "); |
| 1783 | print_ip_sym(ip); | 1946 | print_ip_sym(ip); |
| 1784 | } | 1947 | } |
| 1948 | if (rec) { | ||
| 1949 | struct ftrace_ops *ops = NULL; | ||
| 1950 | |||
| 1951 | pr_info("ftrace record flags: %lx\n", rec->flags); | ||
| 1952 | pr_cont(" (%ld)%s", ftrace_rec_count(rec), | ||
| 1953 | rec->flags & FTRACE_FL_REGS ? " R" : " "); | ||
| 1954 | if (rec->flags & FTRACE_FL_TRAMP_EN) { | ||
| 1955 | ops = ftrace_find_tramp_ops_any(rec); | ||
| 1956 | if (ops) | ||
| 1957 | pr_cont("\ttramp: %pS", | ||
| 1958 | (void *)ops->trampoline); | ||
| 1959 | else | ||
| 1960 | pr_cont("\ttramp: ERROR!"); | ||
| 1961 | |||
| 1962 | } | ||
| 1963 | ip = ftrace_get_addr_curr(rec); | ||
| 1964 | pr_cont(" expected tramp: %lx\n", ip); | ||
| 1965 | } | ||
| 1785 | } | 1966 | } |
| 1786 | 1967 | ||
| 1787 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | 1968 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) |
| @@ -1895,21 +2076,86 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) | |||
| 1895 | } | 2076 | } |
| 1896 | 2077 | ||
| 1897 | static struct ftrace_ops * | 2078 | static struct ftrace_ops * |
| 2079 | ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) | ||
| 2080 | { | ||
| 2081 | struct ftrace_ops *op; | ||
| 2082 | unsigned long ip = rec->ip; | ||
| 2083 | |||
| 2084 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 2085 | |||
| 2086 | if (!op->trampoline) | ||
| 2087 | continue; | ||
| 2088 | |||
| 2089 | if (hash_contains_ip(ip, op->func_hash)) | ||
| 2090 | return op; | ||
| 2091 | } while_for_each_ftrace_op(op); | ||
| 2092 | |||
| 2093 | return NULL; | ||
| 2094 | } | ||
| 2095 | |||
| 2096 | static struct ftrace_ops * | ||
| 1898 | ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) | 2097 | ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) |
| 1899 | { | 2098 | { |
| 1900 | struct ftrace_ops *op; | 2099 | struct ftrace_ops *op; |
| 2100 | unsigned long ip = rec->ip; | ||
| 1901 | 2101 | ||
| 1902 | /* Removed ops need to be tested first */ | 2102 | /* |
| 1903 | if (removed_ops && removed_ops->tramp_hash) { | 2103 | * Need to check removed ops first. |
| 1904 | if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) | 2104 | * If they are being removed, and this rec has a tramp, |
| 2105 | * and this rec is in the ops list, then it would be the | ||
| 2106 | * one with the tramp. | ||
| 2107 | */ | ||
| 2108 | if (removed_ops) { | ||
| 2109 | if (hash_contains_ip(ip, &removed_ops->old_hash)) | ||
| 1905 | return removed_ops; | 2110 | return removed_ops; |
| 1906 | } | 2111 | } |
| 1907 | 2112 | ||
| 2113 | /* | ||
| 2114 | * Need to find the current trampoline for a rec. | ||
| 2115 | * Now, a trampoline is only attached to a rec if there | ||
| 2116 | * was a single 'ops' attached to it. But this can be called | ||
| 2117 | * when we are adding another op to the rec or removing the | ||
| 2118 | * current one. Thus, if the op is being added, we can | ||
| 2119 | * ignore it because it hasn't attached itself to the rec | ||
| 2120 | * yet. | ||
| 2121 | * | ||
| 2122 | * If an ops is being modified (hooking to different functions) | ||
| 2123 | * then we don't care about the new functions that are being | ||
| 2124 | * added, just the old ones (that are probably being removed). | ||
| 2125 | * | ||
| 2126 | * If we are adding an ops to a function that already is using | ||
| 2127 | * a trampoline, it needs to be removed (trampolines are only | ||
| 2128 | * for single ops connected), then an ops that is not being | ||
| 2129 | * modified also needs to be checked. | ||
| 2130 | */ | ||
| 1908 | do_for_each_ftrace_op(op, ftrace_ops_list) { | 2131 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
| 1909 | if (!op->tramp_hash) | 2132 | |
| 2133 | if (!op->trampoline) | ||
| 1910 | continue; | 2134 | continue; |
| 1911 | 2135 | ||
| 1912 | if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) | 2136 | /* |
| 2137 | * If the ops is being added, it hasn't gotten to | ||
| 2138 | * the point to be removed from this tree yet. | ||
| 2139 | */ | ||
| 2140 | if (op->flags & FTRACE_OPS_FL_ADDING) | ||
| 2141 | continue; | ||
| 2142 | |||
| 2143 | |||
| 2144 | /* | ||
| 2145 | * If the ops is being modified and is in the old | ||
| 2146 | * hash, then it is probably being removed from this | ||
| 2147 | * function. | ||
| 2148 | */ | ||
| 2149 | if ((op->flags & FTRACE_OPS_FL_MODIFYING) && | ||
| 2150 | hash_contains_ip(ip, &op->old_hash)) | ||
| 2151 | return op; | ||
| 2152 | /* | ||
| 2153 | * If the ops is not being added or modified, and it's | ||
| 2154 | * in its normal filter hash, then this must be the one | ||
| 2155 | * we want! | ||
| 2156 | */ | ||
| 2157 | if (!(op->flags & FTRACE_OPS_FL_MODIFYING) && | ||
| 2158 | hash_contains_ip(ip, op->func_hash)) | ||
| 1913 | return op; | 2159 | return op; |
| 1914 | 2160 | ||
| 1915 | } while_for_each_ftrace_op(op); | 2161 | } while_for_each_ftrace_op(op); |
| @@ -1921,10 +2167,11 @@ static struct ftrace_ops * | |||
| 1921 | ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) | 2167 | ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) |
| 1922 | { | 2168 | { |
| 1923 | struct ftrace_ops *op; | 2169 | struct ftrace_ops *op; |
| 2170 | unsigned long ip = rec->ip; | ||
| 1924 | 2171 | ||
| 1925 | do_for_each_ftrace_op(op, ftrace_ops_list) { | 2172 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
| 1926 | /* pass rec in as regs to have non-NULL val */ | 2173 | /* pass rec in as regs to have non-NULL val */ |
| 1927 | if (ftrace_ops_test(op, rec->ip, rec)) | 2174 | if (hash_contains_ip(ip, op->func_hash)) |
| 1928 | return op; | 2175 | return op; |
| 1929 | } while_for_each_ftrace_op(op); | 2176 | } while_for_each_ftrace_op(op); |
| 1930 | 2177 | ||
| @@ -2038,7 +2285,7 @@ void __weak ftrace_replace_code(int enable) | |||
| 2038 | do_for_each_ftrace_rec(pg, rec) { | 2285 | do_for_each_ftrace_rec(pg, rec) { |
| 2039 | failed = __ftrace_replace_code(rec, enable); | 2286 | failed = __ftrace_replace_code(rec, enable); |
| 2040 | if (failed) { | 2287 | if (failed) { |
| 2041 | ftrace_bug(failed, rec->ip); | 2288 | ftrace_bug(failed, rec); |
| 2042 | /* Stop processing */ | 2289 | /* Stop processing */ |
| 2043 | return; | 2290 | return; |
| 2044 | } | 2291 | } |
| @@ -2120,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) | |||
| 2120 | static int | 2367 | static int |
| 2121 | ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) | 2368 | ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) |
| 2122 | { | 2369 | { |
| 2123 | unsigned long ip; | ||
| 2124 | int ret; | 2370 | int ret; |
| 2125 | 2371 | ||
| 2126 | ip = rec->ip; | ||
| 2127 | |||
| 2128 | if (unlikely(ftrace_disabled)) | 2372 | if (unlikely(ftrace_disabled)) |
| 2129 | return 0; | 2373 | return 0; |
| 2130 | 2374 | ||
| 2131 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); | 2375 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); |
| 2132 | if (ret) { | 2376 | if (ret) { |
| 2133 | ftrace_bug(ret, ip); | 2377 | ftrace_bug(ret, rec); |
| 2134 | return 0; | 2378 | return 0; |
| 2135 | } | 2379 | } |
| 2136 | return 1; | 2380 | return 1; |
| @@ -2231,92 +2475,6 @@ void __weak arch_ftrace_update_code(int command) | |||
| 2231 | ftrace_run_stop_machine(command); | 2475 | ftrace_run_stop_machine(command); |
| 2232 | } | 2476 | } |
| 2233 | 2477 | ||
| 2234 | static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) | ||
| 2235 | { | ||
| 2236 | struct ftrace_page *pg; | ||
| 2237 | struct dyn_ftrace *rec; | ||
| 2238 | int size, bits; | ||
| 2239 | int ret; | ||
| 2240 | |||
| 2241 | size = ops->nr_trampolines; | ||
| 2242 | bits = 0; | ||
| 2243 | /* | ||
| 2244 | * Make the hash size about 1/2 the # found | ||
| 2245 | */ | ||
| 2246 | for (size /= 2; size; size >>= 1) | ||
| 2247 | bits++; | ||
| 2248 | |||
| 2249 | ops->tramp_hash = alloc_ftrace_hash(bits); | ||
| 2250 | /* | ||
| 2251 | * TODO: a failed allocation is going to screw up | ||
| 2252 | * the accounting of what needs to be modified | ||
| 2253 | * and not. For now, we kill ftrace if we fail | ||
| 2254 | * to allocate here. But there are ways around this, | ||
| 2255 | * but that will take a little more work. | ||
| 2256 | */ | ||
| 2257 | if (!ops->tramp_hash) | ||
| 2258 | return -ENOMEM; | ||
| 2259 | |||
| 2260 | do_for_each_ftrace_rec(pg, rec) { | ||
| 2261 | if (ftrace_rec_count(rec) == 1 && | ||
| 2262 | ftrace_ops_test(ops, rec->ip, rec)) { | ||
| 2263 | |||
| 2264 | /* | ||
| 2265 | * If another ops adds to a rec, the rec will | ||
| 2266 | * lose its trampoline and never get it back | ||
| 2267 | * until all ops are off of it. | ||
| 2268 | */ | ||
| 2269 | if (!(rec->flags & FTRACE_FL_TRAMP)) | ||
| 2270 | continue; | ||
| 2271 | |||
| 2272 | /* This record had better have a trampoline */ | ||
| 2273 | if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN))) | ||
| 2274 | return -1; | ||
| 2275 | |||
| 2276 | ret = add_hash_entry(ops->tramp_hash, rec->ip); | ||
| 2277 | if (ret < 0) | ||
| 2278 | return ret; | ||
| 2279 | } | ||
| 2280 | } while_for_each_ftrace_rec(); | ||
| 2281 | |||
| 2282 | /* The number of recs in the hash must match nr_trampolines */ | ||
| 2283 | if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines)) | ||
| 2284 | pr_warn("count=%ld trampolines=%d\n", | ||
| 2285 | ops->tramp_hash->count, | ||
| 2286 | ops->nr_trampolines); | ||
| 2287 | |||
| 2288 | return 0; | ||
| 2289 | } | ||
| 2290 | |||
| 2291 | static int ftrace_save_tramp_hashes(void) | ||
| 2292 | { | ||
| 2293 | struct ftrace_ops *op; | ||
| 2294 | int ret; | ||
| 2295 | |||
| 2296 | /* | ||
| 2297 | * Now that any trampoline is being used, we need to save the | ||
| 2298 | * hashes for the ops that have them. This allows the mapping | ||
| 2299 | * back from the record to the ops that has the trampoline to | ||
| 2300 | * know what code is being replaced. Modifying code must always | ||
| 2301 | * verify what it is changing. | ||
| 2302 | */ | ||
| 2303 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
| 2304 | |||
| 2305 | /* The tramp_hash is recreated each time. */ | ||
| 2306 | free_ftrace_hash(op->tramp_hash); | ||
| 2307 | op->tramp_hash = NULL; | ||
| 2308 | |||
| 2309 | if (op->nr_trampolines) { | ||
| 2310 | ret = ftrace_save_ops_tramp_hash(op); | ||
| 2311 | if (ret) | ||
| 2312 | return ret; | ||
| 2313 | } | ||
| 2314 | |||
| 2315 | } while_for_each_ftrace_op(op); | ||
| 2316 | |||
| 2317 | return 0; | ||
| 2318 | } | ||
| 2319 | |||
| 2320 | static void ftrace_run_update_code(int command) | 2478 | static void ftrace_run_update_code(int command) |
| 2321 | { | 2479 | { |
| 2322 | int ret; | 2480 | int ret; |
| @@ -2336,14 +2494,25 @@ static void ftrace_run_update_code(int command) | |||
| 2336 | 2494 | ||
| 2337 | ret = ftrace_arch_code_modify_post_process(); | 2495 | ret = ftrace_arch_code_modify_post_process(); |
| 2338 | FTRACE_WARN_ON(ret); | 2496 | FTRACE_WARN_ON(ret); |
| 2497 | } | ||
| 2339 | 2498 | ||
| 2340 | ret = ftrace_save_tramp_hashes(); | 2499 | static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, |
| 2341 | FTRACE_WARN_ON(ret); | 2500 | struct ftrace_hash *old_hash) |
| 2501 | { | ||
| 2502 | ops->flags |= FTRACE_OPS_FL_MODIFYING; | ||
| 2503 | ops->old_hash.filter_hash = old_hash; | ||
| 2504 | ftrace_run_update_code(command); | ||
| 2505 | ops->old_hash.filter_hash = NULL; | ||
| 2506 | ops->flags &= ~FTRACE_OPS_FL_MODIFYING; | ||
| 2342 | } | 2507 | } |
| 2343 | 2508 | ||
| 2344 | static ftrace_func_t saved_ftrace_func; | 2509 | static ftrace_func_t saved_ftrace_func; |
| 2345 | static int ftrace_start_up; | 2510 | static int ftrace_start_up; |
| 2346 | 2511 | ||
| 2512 | void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) | ||
| 2513 | { | ||
| 2514 | } | ||
| 2515 | |||
| 2347 | static void control_ops_free(struct ftrace_ops *ops) | 2516 | static void control_ops_free(struct ftrace_ops *ops) |
| 2348 | { | 2517 | { |
| 2349 | free_percpu(ops->disabled); | 2518 | free_percpu(ops->disabled); |
| @@ -2362,6 +2531,13 @@ static void ftrace_startup_enable(int command) | |||
| 2362 | ftrace_run_update_code(command); | 2531 | ftrace_run_update_code(command); |
| 2363 | } | 2532 | } |
| 2364 | 2533 | ||
| 2534 | static void ftrace_startup_all(int command) | ||
| 2535 | { | ||
| 2536 | update_all_ops = true; | ||
| 2537 | ftrace_startup_enable(command); | ||
| 2538 | update_all_ops = false; | ||
| 2539 | } | ||
| 2540 | |||
| 2365 | static int ftrace_startup(struct ftrace_ops *ops, int command) | 2541 | static int ftrace_startup(struct ftrace_ops *ops, int command) |
| 2366 | { | 2542 | { |
| 2367 | int ret; | 2543 | int ret; |
| @@ -2376,12 +2552,31 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) | |||
| 2376 | ftrace_start_up++; | 2552 | ftrace_start_up++; |
| 2377 | command |= FTRACE_UPDATE_CALLS; | 2553 | command |= FTRACE_UPDATE_CALLS; |
| 2378 | 2554 | ||
| 2379 | ops->flags |= FTRACE_OPS_FL_ENABLED; | 2555 | /* |
| 2556 | * Note that ftrace probes uses this to start up | ||
| 2557 | * and modify functions it will probe. But we still | ||
| 2558 | * set the ADDING flag for modification, as probes | ||
| 2559 | * do not have trampolines. If they add them in the | ||
| 2560 | * future, then the probes will need to distinguish | ||
| 2561 | * between adding and updating probes. | ||
| 2562 | */ | ||
| 2563 | ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; | ||
| 2564 | |||
| 2565 | ret = ftrace_hash_ipmodify_enable(ops); | ||
| 2566 | if (ret < 0) { | ||
| 2567 | /* Rollback registration process */ | ||
| 2568 | __unregister_ftrace_function(ops); | ||
| 2569 | ftrace_start_up--; | ||
| 2570 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
| 2571 | return ret; | ||
| 2572 | } | ||
| 2380 | 2573 | ||
| 2381 | ftrace_hash_rec_enable(ops, 1); | 2574 | ftrace_hash_rec_enable(ops, 1); |
| 2382 | 2575 | ||
| 2383 | ftrace_startup_enable(command); | 2576 | ftrace_startup_enable(command); |
| 2384 | 2577 | ||
| 2578 | ops->flags &= ~FTRACE_OPS_FL_ADDING; | ||
| 2579 | |||
| 2385 | return 0; | 2580 | return 0; |
| 2386 | } | 2581 | } |
| 2387 | 2582 | ||
| @@ -2404,6 +2599,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
| 2404 | */ | 2599 | */ |
| 2405 | WARN_ON_ONCE(ftrace_start_up < 0); | 2600 | WARN_ON_ONCE(ftrace_start_up < 0); |
| 2406 | 2601 | ||
| 2602 | /* Disabling ipmodify never fails */ | ||
| 2603 | ftrace_hash_ipmodify_disable(ops); | ||
| 2407 | ftrace_hash_rec_disable(ops, 1); | 2604 | ftrace_hash_rec_disable(ops, 1); |
| 2408 | 2605 | ||
| 2409 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | 2606 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; |
| @@ -2431,11 +2628,35 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
| 2431 | * If the ops uses a trampoline, then it needs to be | 2628 | * If the ops uses a trampoline, then it needs to be |
| 2432 | * tested first on update. | 2629 | * tested first on update. |
| 2433 | */ | 2630 | */ |
| 2631 | ops->flags |= FTRACE_OPS_FL_REMOVING; | ||
| 2434 | removed_ops = ops; | 2632 | removed_ops = ops; |
| 2435 | 2633 | ||
| 2634 | /* The trampoline logic checks the old hashes */ | ||
| 2635 | ops->old_hash.filter_hash = ops->func_hash->filter_hash; | ||
| 2636 | ops->old_hash.notrace_hash = ops->func_hash->notrace_hash; | ||
| 2637 | |||
| 2436 | ftrace_run_update_code(command); | 2638 | ftrace_run_update_code(command); |
| 2437 | 2639 | ||
| 2640 | /* | ||
| 2641 | * If there's no more ops registered with ftrace, run a | ||
| 2642 | * sanity check to make sure all rec flags are cleared. | ||
| 2643 | */ | ||
| 2644 | if (ftrace_ops_list == &ftrace_list_end) { | ||
| 2645 | struct ftrace_page *pg; | ||
| 2646 | struct dyn_ftrace *rec; | ||
| 2647 | |||
| 2648 | do_for_each_ftrace_rec(pg, rec) { | ||
| 2649 | if (FTRACE_WARN_ON_ONCE(rec->flags)) | ||
| 2650 | pr_warn(" %pS flags:%lx\n", | ||
| 2651 | (void *)rec->ip, rec->flags); | ||
| 2652 | } while_for_each_ftrace_rec(); | ||
| 2653 | } | ||
| 2654 | |||
| 2655 | ops->old_hash.filter_hash = NULL; | ||
| 2656 | ops->old_hash.notrace_hash = NULL; | ||
| 2657 | |||
| 2438 | removed_ops = NULL; | 2658 | removed_ops = NULL; |
| 2659 | ops->flags &= ~FTRACE_OPS_FL_REMOVING; | ||
| 2439 | 2660 | ||
| 2440 | /* | 2661 | /* |
| 2441 | * Dynamic ops may be freed, we must make sure that all | 2662 | * Dynamic ops may be freed, we must make sure that all |
| @@ -2454,6 +2675,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
| 2454 | if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { | 2675 | if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { |
| 2455 | schedule_on_each_cpu(ftrace_sync); | 2676 | schedule_on_each_cpu(ftrace_sync); |
| 2456 | 2677 | ||
| 2678 | arch_ftrace_trampoline_free(ops); | ||
| 2679 | |||
| 2457 | if (ops->flags & FTRACE_OPS_FL_CONTROL) | 2680 | if (ops->flags & FTRACE_OPS_FL_CONTROL) |
| 2458 | control_ops_free(ops); | 2681 | control_ops_free(ops); |
| 2459 | } | 2682 | } |
| @@ -2606,7 +2829,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) | |||
| 2606 | if (ftrace_start_up && cnt) { | 2829 | if (ftrace_start_up && cnt) { |
| 2607 | int failed = __ftrace_replace_code(p, 1); | 2830 | int failed = __ftrace_replace_code(p, 1); |
| 2608 | if (failed) | 2831 | if (failed) |
| 2609 | ftrace_bug(failed, p->ip); | 2832 | ftrace_bug(failed, p); |
| 2610 | } | 2833 | } |
| 2611 | } | 2834 | } |
| 2612 | } | 2835 | } |
| @@ -2931,6 +3154,22 @@ static void t_stop(struct seq_file *m, void *p) | |||
| 2931 | mutex_unlock(&ftrace_lock); | 3154 | mutex_unlock(&ftrace_lock); |
| 2932 | } | 3155 | } |
| 2933 | 3156 | ||
| 3157 | void * __weak | ||
| 3158 | arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) | ||
| 3159 | { | ||
| 3160 | return NULL; | ||
| 3161 | } | ||
| 3162 | |||
| 3163 | static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, | ||
| 3164 | struct dyn_ftrace *rec) | ||
| 3165 | { | ||
| 3166 | void *ptr; | ||
| 3167 | |||
| 3168 | ptr = arch_ftrace_trampoline_func(ops, rec); | ||
| 3169 | if (ptr) | ||
| 3170 | seq_printf(m, " ->%pS", ptr); | ||
| 3171 | } | ||
| 3172 | |||
| 2934 | static int t_show(struct seq_file *m, void *v) | 3173 | static int t_show(struct seq_file *m, void *v) |
| 2935 | { | 3174 | { |
| 2936 | struct ftrace_iterator *iter = m->private; | 3175 | struct ftrace_iterator *iter = m->private; |
| @@ -2941,9 +3180,9 @@ static int t_show(struct seq_file *m, void *v) | |||
| 2941 | 3180 | ||
| 2942 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 3181 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
| 2943 | if (iter->flags & FTRACE_ITER_NOTRACE) | 3182 | if (iter->flags & FTRACE_ITER_NOTRACE) |
| 2944 | seq_printf(m, "#### no functions disabled ####\n"); | 3183 | seq_puts(m, "#### no functions disabled ####\n"); |
| 2945 | else | 3184 | else |
| 2946 | seq_printf(m, "#### all functions enabled ####\n"); | 3185 | seq_puts(m, "#### all functions enabled ####\n"); |
| 2947 | return 0; | 3186 | return 0; |
| 2948 | } | 3187 | } |
| 2949 | 3188 | ||
| @@ -2954,22 +3193,25 @@ static int t_show(struct seq_file *m, void *v) | |||
| 2954 | 3193 | ||
| 2955 | seq_printf(m, "%ps", (void *)rec->ip); | 3194 | seq_printf(m, "%ps", (void *)rec->ip); |
| 2956 | if (iter->flags & FTRACE_ITER_ENABLED) { | 3195 | if (iter->flags & FTRACE_ITER_ENABLED) { |
| 2957 | seq_printf(m, " (%ld)%s", | 3196 | struct ftrace_ops *ops = NULL; |
| 3197 | |||
| 3198 | seq_printf(m, " (%ld)%s%s", | ||
| 2958 | ftrace_rec_count(rec), | 3199 | ftrace_rec_count(rec), |
| 2959 | rec->flags & FTRACE_FL_REGS ? " R" : " "); | 3200 | rec->flags & FTRACE_FL_REGS ? " R" : " ", |
| 3201 | rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); | ||
| 2960 | if (rec->flags & FTRACE_FL_TRAMP_EN) { | 3202 | if (rec->flags & FTRACE_FL_TRAMP_EN) { |
| 2961 | struct ftrace_ops *ops; | 3203 | ops = ftrace_find_tramp_ops_any(rec); |
| 2962 | 3204 | if (ops) | |
| 2963 | ops = ftrace_find_tramp_ops_curr(rec); | ||
| 2964 | if (ops && ops->trampoline) | ||
| 2965 | seq_printf(m, "\ttramp: %pS", | 3205 | seq_printf(m, "\ttramp: %pS", |
| 2966 | (void *)ops->trampoline); | 3206 | (void *)ops->trampoline); |
| 2967 | else | 3207 | else |
| 2968 | seq_printf(m, "\ttramp: ERROR!"); | 3208 | seq_puts(m, "\ttramp: ERROR!"); |
| 3209 | |||
| 2969 | } | 3210 | } |
| 3211 | add_trampoline_func(m, ops, rec); | ||
| 2970 | } | 3212 | } |
| 2971 | 3213 | ||
| 2972 | seq_printf(m, "\n"); | 3214 | seq_putc(m, '\n'); |
| 2973 | 3215 | ||
| 2974 | return 0; | 3216 | return 0; |
| 2975 | } | 3217 | } |
| @@ -3003,9 +3245,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file) | |||
| 3003 | { | 3245 | { |
| 3004 | struct ftrace_iterator *iter; | 3246 | struct ftrace_iterator *iter; |
| 3005 | 3247 | ||
| 3006 | if (unlikely(ftrace_disabled)) | ||
| 3007 | return -ENODEV; | ||
| 3008 | |||
| 3009 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); | 3248 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
| 3010 | if (iter) { | 3249 | if (iter) { |
| 3011 | iter->pg = ftrace_pages_start; | 3250 | iter->pg = ftrace_pages_start; |
| @@ -3340,7 +3579,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = | |||
| 3340 | 3579 | ||
| 3341 | static int ftrace_probe_registered; | 3580 | static int ftrace_probe_registered; |
| 3342 | 3581 | ||
| 3343 | static void __enable_ftrace_function_probe(void) | 3582 | static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) |
| 3344 | { | 3583 | { |
| 3345 | int ret; | 3584 | int ret; |
| 3346 | int i; | 3585 | int i; |
| @@ -3348,7 +3587,8 @@ static void __enable_ftrace_function_probe(void) | |||
| 3348 | if (ftrace_probe_registered) { | 3587 | if (ftrace_probe_registered) { |
| 3349 | /* still need to update the function call sites */ | 3588 | /* still need to update the function call sites */ |
| 3350 | if (ftrace_enabled) | 3589 | if (ftrace_enabled) |
| 3351 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); | 3590 | ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, |
| 3591 | old_hash); | ||
| 3352 | return; | 3592 | return; |
| 3353 | } | 3593 | } |
| 3354 | 3594 | ||
| @@ -3399,6 +3639,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3399 | { | 3639 | { |
| 3400 | struct ftrace_func_probe *entry; | 3640 | struct ftrace_func_probe *entry; |
| 3401 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; | 3641 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; |
| 3642 | struct ftrace_hash *old_hash = *orig_hash; | ||
| 3402 | struct ftrace_hash *hash; | 3643 | struct ftrace_hash *hash; |
| 3403 | struct ftrace_page *pg; | 3644 | struct ftrace_page *pg; |
| 3404 | struct dyn_ftrace *rec; | 3645 | struct dyn_ftrace *rec; |
| @@ -3417,7 +3658,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3417 | 3658 | ||
| 3418 | mutex_lock(&trace_probe_ops.func_hash->regex_lock); | 3659 | mutex_lock(&trace_probe_ops.func_hash->regex_lock); |
| 3419 | 3660 | ||
| 3420 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | 3661 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); |
| 3421 | if (!hash) { | 3662 | if (!hash) { |
| 3422 | count = -ENOMEM; | 3663 | count = -ENOMEM; |
| 3423 | goto out; | 3664 | goto out; |
| @@ -3476,10 +3717,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3476 | } while_for_each_ftrace_rec(); | 3717 | } while_for_each_ftrace_rec(); |
| 3477 | 3718 | ||
| 3478 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); | 3719 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); |
| 3479 | if (ret < 0) | ||
| 3480 | count = ret; | ||
| 3481 | 3720 | ||
| 3482 | __enable_ftrace_function_probe(); | 3721 | __enable_ftrace_function_probe(old_hash); |
| 3722 | |||
| 3723 | if (!ret) | ||
| 3724 | free_ftrace_hash_rcu(old_hash); | ||
| 3725 | else | ||
| 3726 | count = ret; | ||
| 3483 | 3727 | ||
| 3484 | out_unlock: | 3728 | out_unlock: |
| 3485 | mutex_unlock(&ftrace_lock); | 3729 | mutex_unlock(&ftrace_lock); |
| @@ -3503,6 +3747,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3503 | struct ftrace_func_probe *entry; | 3747 | struct ftrace_func_probe *entry; |
| 3504 | struct ftrace_func_probe *p; | 3748 | struct ftrace_func_probe *p; |
| 3505 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; | 3749 | struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; |
| 3750 | struct ftrace_hash *old_hash = *orig_hash; | ||
| 3506 | struct list_head free_list; | 3751 | struct list_head free_list; |
| 3507 | struct ftrace_hash *hash; | 3752 | struct ftrace_hash *hash; |
| 3508 | struct hlist_node *tmp; | 3753 | struct hlist_node *tmp; |
| @@ -3510,6 +3755,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3510 | int type = MATCH_FULL; | 3755 | int type = MATCH_FULL; |
| 3511 | int i, len = 0; | 3756 | int i, len = 0; |
| 3512 | char *search; | 3757 | char *search; |
| 3758 | int ret; | ||
| 3513 | 3759 | ||
| 3514 | if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) | 3760 | if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) |
| 3515 | glob = NULL; | 3761 | glob = NULL; |
| @@ -3568,8 +3814,11 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 3568 | * Remove after the disable is called. Otherwise, if the last | 3814 | * Remove after the disable is called. Otherwise, if the last |
| 3569 | * probe is removed, a null hash means *all enabled*. | 3815 | * probe is removed, a null hash means *all enabled*. |
| 3570 | */ | 3816 | */ |
| 3571 | ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); | 3817 | ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); |
| 3572 | synchronize_sched(); | 3818 | synchronize_sched(); |
| 3819 | if (!ret) | ||
| 3820 | free_ftrace_hash_rcu(old_hash); | ||
| 3821 | |||
| 3573 | list_for_each_entry_safe(entry, p, &free_list, free_list) { | 3822 | list_for_each_entry_safe(entry, p, &free_list, free_list) { |
| 3574 | list_del(&entry->free_list); | 3823 | list_del(&entry->free_list); |
| 3575 | ftrace_free_entry(entry); | 3824 | ftrace_free_entry(entry); |
| @@ -3756,10 +4005,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) | |||
| 3756 | return add_hash_entry(hash, ip); | 4005 | return add_hash_entry(hash, ip); |
| 3757 | } | 4006 | } |
| 3758 | 4007 | ||
| 3759 | static void ftrace_ops_update_code(struct ftrace_ops *ops) | 4008 | static void ftrace_ops_update_code(struct ftrace_ops *ops, |
| 4009 | struct ftrace_hash *old_hash) | ||
| 3760 | { | 4010 | { |
| 3761 | if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) | 4011 | if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) |
| 3762 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); | 4012 | ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); |
| 3763 | } | 4013 | } |
| 3764 | 4014 | ||
| 3765 | static int | 4015 | static int |
| @@ -3767,6 +4017,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
| 3767 | unsigned long ip, int remove, int reset, int enable) | 4017 | unsigned long ip, int remove, int reset, int enable) |
| 3768 | { | 4018 | { |
| 3769 | struct ftrace_hash **orig_hash; | 4019 | struct ftrace_hash **orig_hash; |
| 4020 | struct ftrace_hash *old_hash; | ||
| 3770 | struct ftrace_hash *hash; | 4021 | struct ftrace_hash *hash; |
| 3771 | int ret; | 4022 | int ret; |
| 3772 | 4023 | ||
| @@ -3801,10 +4052,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
| 3801 | } | 4052 | } |
| 3802 | 4053 | ||
| 3803 | mutex_lock(&ftrace_lock); | 4054 | mutex_lock(&ftrace_lock); |
| 4055 | old_hash = *orig_hash; | ||
| 3804 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | 4056 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); |
| 3805 | if (!ret) | 4057 | if (!ret) { |
| 3806 | ftrace_ops_update_code(ops); | 4058 | ftrace_ops_update_code(ops, old_hash); |
| 3807 | 4059 | free_ftrace_hash_rcu(old_hash); | |
| 4060 | } | ||
| 3808 | mutex_unlock(&ftrace_lock); | 4061 | mutex_unlock(&ftrace_lock); |
| 3809 | 4062 | ||
| 3810 | out_regex_unlock: | 4063 | out_regex_unlock: |
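The `old_hash = *orig_hash; ftrace_hash_move(...); free_ftrace_hash_rcu(old_hash)` sequence that now appears in ftrace_set_hash(), ftrace_regex_release() and the probe code is the usual RCU publish-then-reclaim idiom: install the new hash, update the call sites, and only then retire the copy that readers may still hold. A minimal sketch of the same idiom with a generic object (struct cfg, cfg_replace() and active_cfg are illustrative names, not kernel APIs; ftrace itself defers the free through free_ftrace_hash_rcu() instead of blocking in synchronize_rcu()):

    #include <linux/mutex.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct cfg { int value; };

    static DEFINE_MUTEX(cfg_lock);          /* serializes updaters */
    static struct cfg __rcu *active_cfg;    /* published pointer, read under RCU */

    /* Illustrative updater: swap in a new object, then retire the old one. */
    static int cfg_replace(int value)
    {
        struct cfg *newc, *oldc;

        newc = kmalloc(sizeof(*newc), GFP_KERNEL);
        if (!newc)
            return -ENOMEM;
        newc->value = value;

        mutex_lock(&cfg_lock);
        oldc = rcu_dereference_protected(active_cfg,
                                         lockdep_is_held(&cfg_lock));
        rcu_assign_pointer(active_cfg, newc);   /* publish the new copy */
        mutex_unlock(&cfg_lock);

        synchronize_rcu();                      /* wait for pre-existing readers */
        kfree(oldc);                            /* old copy can no longer be seen */
        return 0;
    }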
| @@ -3944,6 +4197,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; | |||
| 3944 | static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; | 4197 | static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; |
| 3945 | static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); | 4198 | static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); |
| 3946 | 4199 | ||
| 4200 | static unsigned long save_global_trampoline; | ||
| 4201 | static unsigned long save_global_flags; | ||
| 4202 | |||
| 3947 | static int __init set_graph_function(char *str) | 4203 | static int __init set_graph_function(char *str) |
| 3948 | { | 4204 | { |
| 3949 | strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); | 4205 | strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); |
| @@ -4013,6 +4269,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
| 4013 | struct seq_file *m = (struct seq_file *)file->private_data; | 4269 | struct seq_file *m = (struct seq_file *)file->private_data; |
| 4014 | struct ftrace_iterator *iter; | 4270 | struct ftrace_iterator *iter; |
| 4015 | struct ftrace_hash **orig_hash; | 4271 | struct ftrace_hash **orig_hash; |
| 4272 | struct ftrace_hash *old_hash; | ||
| 4016 | struct trace_parser *parser; | 4273 | struct trace_parser *parser; |
| 4017 | int filter_hash; | 4274 | int filter_hash; |
| 4018 | int ret; | 4275 | int ret; |
| @@ -4042,11 +4299,13 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
| 4042 | orig_hash = &iter->ops->func_hash->notrace_hash; | 4299 | orig_hash = &iter->ops->func_hash->notrace_hash; |
| 4043 | 4300 | ||
| 4044 | mutex_lock(&ftrace_lock); | 4301 | mutex_lock(&ftrace_lock); |
| 4302 | old_hash = *orig_hash; | ||
| 4045 | ret = ftrace_hash_move(iter->ops, filter_hash, | 4303 | ret = ftrace_hash_move(iter->ops, filter_hash, |
| 4046 | orig_hash, iter->hash); | 4304 | orig_hash, iter->hash); |
| 4047 | if (!ret) | 4305 | if (!ret) { |
| 4048 | ftrace_ops_update_code(iter->ops); | 4306 | ftrace_ops_update_code(iter->ops, old_hash); |
| 4049 | 4307 | free_ftrace_hash_rcu(old_hash); | |
| 4308 | } | ||
| 4050 | mutex_unlock(&ftrace_lock); | 4309 | mutex_unlock(&ftrace_lock); |
| 4051 | } | 4310 | } |
| 4052 | 4311 | ||
| @@ -4149,9 +4408,9 @@ static int g_show(struct seq_file *m, void *v) | |||
| 4149 | struct ftrace_graph_data *fgd = m->private; | 4408 | struct ftrace_graph_data *fgd = m->private; |
| 4150 | 4409 | ||
| 4151 | if (fgd->table == ftrace_graph_funcs) | 4410 | if (fgd->table == ftrace_graph_funcs) |
| 4152 | seq_printf(m, "#### all functions enabled ####\n"); | 4411 | seq_puts(m, "#### all functions enabled ####\n"); |
| 4153 | else | 4412 | else |
| 4154 | seq_printf(m, "#### no functions disabled ####\n"); | 4413 | seq_puts(m, "#### no functions disabled ####\n"); |
| 4155 | return 0; | 4414 | return 0; |
| 4156 | } | 4415 | } |
| 4157 | 4416 | ||
| @@ -4662,6 +4921,32 @@ void __init ftrace_init(void) | |||
| 4662 | ftrace_disabled = 1; | 4921 | ftrace_disabled = 1; |
| 4663 | } | 4922 | } |
| 4664 | 4923 | ||
| 4924 | /* Do nothing if arch does not support this */ | ||
| 4925 | void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) | ||
| 4926 | { | ||
| 4927 | } | ||
| 4928 | |||
| 4929 | static void ftrace_update_trampoline(struct ftrace_ops *ops) | ||
| 4930 | { | ||
| 4931 | |||
| 4932 | /* | ||
| 4933 | * Currently there's no safe way to free a trampoline when the kernel | ||
| 4934 | * is configured with PREEMPT. That is because a task could be preempted | ||
| 4935 | * when it jumped to the trampoline, it may be preempted for a long time | ||
| 4936 | * depending on the system load, and currently there's no way to know | ||
| 4937 | * when it will be off the trampoline. If the trampoline is freed | ||
| 4938 | * too early, when the task runs again, it will be executing on freed | ||
| 4939 | * memory and crash. | ||
| 4940 | */ | ||
| 4941 | #ifdef CONFIG_PREEMPT | ||
| 4942 | /* Currently, only non dynamic ops can have a trampoline */ | ||
| 4943 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC) | ||
| 4944 | return; | ||
| 4945 | #endif | ||
| 4946 | |||
| 4947 | arch_ftrace_update_trampoline(ops); | ||
| 4948 | } | ||
| 4949 | |||
| 4665 | #else | 4950 | #else |
| 4666 | 4951 | ||
| 4667 | static struct ftrace_ops global_ops = { | 4952 | static struct ftrace_ops global_ops = { |
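arch_ftrace_update_trampoline() follows the kernel's usual weak-hook convention: generic code supplies an empty `__weak` definition, and an architecture that can rewrite trampolines overrides it simply by providing a strong definition with the same signature in its own file. A generic sketch of the pattern (struct widget and arch_do_thing() are made-up names):

    struct widget;                          /* opaque to the generic code */

    /* Generic file: weak default, a no-op on architectures that do not care. */
    void __weak arch_do_thing(struct widget *w)
    {
    }

    /* arch/<arch>/ file: a strong definition with the same signature
     * overrides the weak one at link time. */
    void arch_do_thing(struct widget *w)
    {
        /* architecture-specific work goes here */
    }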
| @@ -4678,6 +4963,7 @@ core_initcall(ftrace_nodyn_init); | |||
| 4678 | 4963 | ||
| 4679 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 4964 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } |
| 4680 | static inline void ftrace_startup_enable(int command) { } | 4965 | static inline void ftrace_startup_enable(int command) { } |
| 4966 | static inline void ftrace_startup_all(int command) { } | ||
| 4681 | /* Keep as macros so we do not need to define the commands */ | 4967 | /* Keep as macros so we do not need to define the commands */ |
| 4682 | # define ftrace_startup(ops, command) \ | 4968 | # define ftrace_startup(ops, command) \ |
| 4683 | ({ \ | 4969 | ({ \ |
| @@ -4703,6 +4989,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) | |||
| 4703 | return 1; | 4989 | return 1; |
| 4704 | } | 4990 | } |
| 4705 | 4991 | ||
| 4992 | static void ftrace_update_trampoline(struct ftrace_ops *ops) | ||
| 4993 | { | ||
| 4994 | } | ||
| 4995 | |||
| 4706 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 4996 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
| 4707 | 4997 | ||
| 4708 | __init void ftrace_init_global_array_ops(struct trace_array *tr) | 4998 | __init void ftrace_init_global_array_ops(struct trace_array *tr) |
| @@ -4827,6 +5117,56 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) | |||
| 4827 | } | 5117 | } |
| 4828 | #endif | 5118 | #endif |
| 4829 | 5119 | ||
| 5120 | /* | ||
| 5121 | * If there's only one function registered but it does not support | ||
| 5122 | * recursion, this function will be called by the mcount trampoline. | ||
| 5123 | * This function will handle recursion protection. | ||
| 5124 | */ | ||
| 5125 | static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, | ||
| 5126 | struct ftrace_ops *op, struct pt_regs *regs) | ||
| 5127 | { | ||
| 5128 | int bit; | ||
| 5129 | |||
| 5130 | bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); | ||
| 5131 | if (bit < 0) | ||
| 5132 | return; | ||
| 5133 | |||
| 5134 | op->func(ip, parent_ip, op, regs); | ||
| 5135 | |||
| 5136 | trace_clear_recursion(bit); | ||
| 5137 | } | ||
| 5138 | |||
| 5139 | /** | ||
| 5140 | * ftrace_ops_get_func - get the function a trampoline should call | ||
| 5141 | * @ops: the ops to get the function for | ||
| 5142 | * | ||
| 5143 | * Normally the mcount trampoline will call the ops->func, but there | ||
| 5144 | * are times that it should not. For example, if the ops does not | ||
| 5145 | * have its own recursion protection, then it should call the | ||
| 5146 | * ftrace_ops_recurs_func() instead. | ||
| 5147 | * | ||
| 5148 | * Returns the function that the trampoline should call for @ops. | ||
| 5149 | */ | ||
| 5150 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | ||
| 5151 | { | ||
| 5152 | /* | ||
| 5153 | * If this is a dynamic ops or we force list func, | ||
| 5154 | * then it needs to call the list anyway. | ||
| 5155 | */ | ||
| 5156 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
| 5157 | return ftrace_ops_list_func; | ||
| 5158 | |||
| 5159 | /* | ||
| 5160 | * If the func handles its own recursion, call it directly. | ||
| 5161 | * Otherwise call the recursion protected function that | ||
| 5162 | * will call the ftrace ops function. | ||
| 5163 | */ | ||
| 5164 | if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) | ||
| 5165 | return ftrace_ops_recurs_func; | ||
| 5166 | |||
| 5167 | return ops->func; | ||
| 5168 | } | ||
| 5169 | |||
| 4830 | static void clear_ftrace_swapper(void) | 5170 | static void clear_ftrace_swapper(void) |
| 4831 | { | 5171 | { |
| 4832 | struct task_struct *p; | 5172 | struct task_struct *p; |
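ftrace_ops_get_func() lets a callback that is not FTRACE_OPS_FL_RECURSION_SAFE still be called straight from a trampoline by wrapping it in ftrace_ops_recurs_func(), which is essentially a per-context guard bit around the real function. A simplified stand-alone model of that idea (guarded_call() and the __thread flag are illustrative; the kernel keys the real guard off bits in current->trace_recursion and the interrupt context):

    #include <stdbool.h>

    typedef void (*cb_t)(unsigned long ip, unsigned long parent_ip);

    /* One flag per thread of execution; set while the callback runs. */
    static __thread bool in_callback;

    static void guarded_call(cb_t cb, unsigned long ip, unsigned long parent_ip)
    {
        if (in_callback)        /* the callback ended up tracing itself: bail out */
            return;

        in_callback = true;
        cb(ip, parent_ip);
        in_callback = false;
    }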
| @@ -4927,7 +5267,8 @@ static int ftrace_pid_add(int p) | |||
| 4927 | set_ftrace_pid_task(pid); | 5267 | set_ftrace_pid_task(pid); |
| 4928 | 5268 | ||
| 4929 | ftrace_update_pid_func(); | 5269 | ftrace_update_pid_func(); |
| 4930 | ftrace_startup_enable(0); | 5270 | |
| 5271 | ftrace_startup_all(0); | ||
| 4931 | 5272 | ||
| 4932 | mutex_unlock(&ftrace_lock); | 5273 | mutex_unlock(&ftrace_lock); |
| 4933 | return 0; | 5274 | return 0; |
| @@ -4956,7 +5297,7 @@ static void ftrace_pid_reset(void) | |||
| 4956 | } | 5297 | } |
| 4957 | 5298 | ||
| 4958 | ftrace_update_pid_func(); | 5299 | ftrace_update_pid_func(); |
| 4959 | ftrace_startup_enable(0); | 5300 | ftrace_startup_all(0); |
| 4960 | 5301 | ||
| 4961 | mutex_unlock(&ftrace_lock); | 5302 | mutex_unlock(&ftrace_lock); |
| 4962 | } | 5303 | } |
| @@ -4989,12 +5330,12 @@ static int fpid_show(struct seq_file *m, void *v) | |||
| 4989 | const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); | 5330 | const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); |
| 4990 | 5331 | ||
| 4991 | if (v == (void *)1) { | 5332 | if (v == (void *)1) { |
| 4992 | seq_printf(m, "no pid\n"); | 5333 | seq_puts(m, "no pid\n"); |
| 4993 | return 0; | 5334 | return 0; |
| 4994 | } | 5335 | } |
| 4995 | 5336 | ||
| 4996 | if (fpid->pid == ftrace_swapper_pid) | 5337 | if (fpid->pid == ftrace_swapper_pid) |
| 4997 | seq_printf(m, "swapper tasks\n"); | 5338 | seq_puts(m, "swapper tasks\n"); |
| 4998 | else | 5339 | else |
| 4999 | seq_printf(m, "%u\n", pid_vnr(fpid->pid)); | 5340 | seq_printf(m, "%u\n", pid_vnr(fpid->pid)); |
| 5000 | 5341 | ||
| @@ -5207,6 +5548,7 @@ static struct ftrace_ops graph_ops = { | |||
| 5207 | FTRACE_OPS_FL_STUB, | 5548 | FTRACE_OPS_FL_STUB, |
| 5208 | #ifdef FTRACE_GRAPH_TRAMP_ADDR | 5549 | #ifdef FTRACE_GRAPH_TRAMP_ADDR |
| 5209 | .trampoline = FTRACE_GRAPH_TRAMP_ADDR, | 5550 | .trampoline = FTRACE_GRAPH_TRAMP_ADDR, |
| 5551 | /* trampoline_size is only needed for dynamically allocated tramps */ | ||
| 5210 | #endif | 5552 | #endif |
| 5211 | ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) | 5553 | ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) |
| 5212 | }; | 5554 | }; |
| @@ -5436,7 +5778,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
| 5436 | update_function_graph_func(); | 5778 | update_function_graph_func(); |
| 5437 | 5779 | ||
| 5438 | ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); | 5780 | ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); |
| 5439 | |||
| 5440 | out: | 5781 | out: |
| 5441 | mutex_unlock(&ftrace_lock); | 5782 | mutex_unlock(&ftrace_lock); |
| 5442 | return ret; | 5783 | return ret; |
| @@ -5457,6 +5798,17 @@ void unregister_ftrace_graph(void) | |||
| 5457 | unregister_pm_notifier(&ftrace_suspend_notifier); | 5798 | unregister_pm_notifier(&ftrace_suspend_notifier); |
| 5458 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 5799 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
| 5459 | 5800 | ||
| 5801 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
| 5802 | /* | ||
| 5803 | * Function graph does not allocate the trampoline, but | ||
| 5804 | * other global_ops do. We need to reset the ALLOC_TRAMP flag | ||
| 5805 | * if one was used. | ||
| 5806 | */ | ||
| 5807 | global_ops.trampoline = save_global_trampoline; | ||
| 5808 | if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) | ||
| 5809 | global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; | ||
| 5810 | #endif | ||
| 5811 | |||
| 5460 | out: | 5812 | out: |
| 5461 | mutex_unlock(&ftrace_lock); | 5813 | mutex_unlock(&ftrace_lock); |
| 5462 | } | 5814 | } |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2d75c94ae87d..7a4104cb95cb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work); | |||
| 34 | */ | 34 | */ |
| 35 | int ring_buffer_print_entry_header(struct trace_seq *s) | 35 | int ring_buffer_print_entry_header(struct trace_seq *s) |
| 36 | { | 36 | { |
| 37 | int ret; | 37 | trace_seq_puts(s, "# compressed entry header\n"); |
| 38 | 38 | trace_seq_puts(s, "\ttype_len : 5 bits\n"); | |
| 39 | ret = trace_seq_puts(s, "# compressed entry header\n"); | 39 | trace_seq_puts(s, "\ttime_delta : 27 bits\n"); |
| 40 | ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); | 40 | trace_seq_puts(s, "\tarray : 32 bits\n"); |
| 41 | ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); | 41 | trace_seq_putc(s, '\n'); |
| 42 | ret = trace_seq_puts(s, "\tarray : 32 bits\n"); | 42 | trace_seq_printf(s, "\tpadding : type == %d\n", |
| 43 | ret = trace_seq_putc(s, '\n'); | 43 | RINGBUF_TYPE_PADDING); |
| 44 | ret = trace_seq_printf(s, "\tpadding : type == %d\n", | 44 | trace_seq_printf(s, "\ttime_extend : type == %d\n", |
| 45 | RINGBUF_TYPE_PADDING); | 45 | RINGBUF_TYPE_TIME_EXTEND); |
| 46 | ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", | 46 | trace_seq_printf(s, "\tdata max type_len == %d\n", |
| 47 | RINGBUF_TYPE_TIME_EXTEND); | 47 | RINGBUF_TYPE_DATA_TYPE_LEN_MAX); |
| 48 | ret = trace_seq_printf(s, "\tdata max type_len == %d\n", | ||
| 49 | RINGBUF_TYPE_DATA_TYPE_LEN_MAX); | ||
| 50 | 48 | ||
| 51 | return ret; | 49 | return !trace_seq_has_overflowed(s); |
| 52 | } | 50 | } |
| 53 | 51 | ||
| 54 | /* | 52 | /* |
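This hunk shows the new trace_seq contract used throughout the merge: output helpers no longer return a length that must be checked after every call; a write that does not fit marks the sequence as overflowed, later writes become no-ops, and the caller asks trace_seq_has_overflowed() once at the end. A minimal stand-alone buffer with the same contract (struct seqbuf and its helpers are illustrative, not the kernel's trace_seq):

    #include <stdarg.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct seqbuf {
        char buf[128];
        size_t len;
        bool overflowed;
    };

    static void seqbuf_printf(struct seqbuf *s, const char *fmt, ...)
    {
        va_list ap;
        int ret;

        if (s->overflowed)              /* overflow is sticky: ignore further writes */
            return;

        va_start(ap, fmt);
        ret = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
        va_end(ap);

        if (ret < 0 || (size_t)ret >= sizeof(s->buf) - s->len)
            s->overflowed = true;
        else
            s->len += ret;
    }

    static bool seqbuf_has_overflowed(const struct seqbuf *s)
    {
        return s->overflowed;
    }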
| @@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta) | |||
| 419 | int ring_buffer_print_page_header(struct trace_seq *s) | 417 | int ring_buffer_print_page_header(struct trace_seq *s) |
| 420 | { | 418 | { |
| 421 | struct buffer_data_page field; | 419 | struct buffer_data_page field; |
| 422 | int ret; | ||
| 423 | 420 | ||
| 424 | ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" | 421 | trace_seq_printf(s, "\tfield: u64 timestamp;\t" |
| 425 | "offset:0;\tsize:%u;\tsigned:%u;\n", | 422 | "offset:0;\tsize:%u;\tsigned:%u;\n", |
| 426 | (unsigned int)sizeof(field.time_stamp), | 423 | (unsigned int)sizeof(field.time_stamp), |
| 427 | (unsigned int)is_signed_type(u64)); | 424 | (unsigned int)is_signed_type(u64)); |
| 428 | |||
| 429 | ret = trace_seq_printf(s, "\tfield: local_t commit;\t" | ||
| 430 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 431 | (unsigned int)offsetof(typeof(field), commit), | ||
| 432 | (unsigned int)sizeof(field.commit), | ||
| 433 | (unsigned int)is_signed_type(long)); | ||
| 434 | |||
| 435 | ret = trace_seq_printf(s, "\tfield: int overwrite;\t" | ||
| 436 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 437 | (unsigned int)offsetof(typeof(field), commit), | ||
| 438 | 1, | ||
| 439 | (unsigned int)is_signed_type(long)); | ||
| 440 | |||
| 441 | ret = trace_seq_printf(s, "\tfield: char data;\t" | ||
| 442 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 443 | (unsigned int)offsetof(typeof(field), data), | ||
| 444 | (unsigned int)BUF_PAGE_SIZE, | ||
| 445 | (unsigned int)is_signed_type(char)); | ||
| 446 | 425 | ||
| 447 | return ret; | 426 | trace_seq_printf(s, "\tfield: local_t commit;\t" |
| 427 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 428 | (unsigned int)offsetof(typeof(field), commit), | ||
| 429 | (unsigned int)sizeof(field.commit), | ||
| 430 | (unsigned int)is_signed_type(long)); | ||
| 431 | |||
| 432 | trace_seq_printf(s, "\tfield: int overwrite;\t" | ||
| 433 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 434 | (unsigned int)offsetof(typeof(field), commit), | ||
| 435 | 1, | ||
| 436 | (unsigned int)is_signed_type(long)); | ||
| 437 | |||
| 438 | trace_seq_printf(s, "\tfield: char data;\t" | ||
| 439 | "offset:%u;\tsize:%u;\tsigned:%u;\n", | ||
| 440 | (unsigned int)offsetof(typeof(field), data), | ||
| 441 | (unsigned int)BUF_PAGE_SIZE, | ||
| 442 | (unsigned int)is_signed_type(char)); | ||
| 443 | |||
| 444 | return !trace_seq_has_overflowed(s); | ||
| 448 | } | 445 | } |
| 449 | 446 | ||
| 450 | struct rb_irq_work { | 447 | struct rb_irq_work { |
| @@ -538,16 +535,18 @@ static void rb_wake_up_waiters(struct irq_work *work) | |||
| 538 | * ring_buffer_wait - wait for input to the ring buffer | 535 | * ring_buffer_wait - wait for input to the ring buffer |
| 539 | * @buffer: buffer to wait on | 536 | * @buffer: buffer to wait on |
| 540 | * @cpu: the cpu buffer to wait on | 537 | * @cpu: the cpu buffer to wait on |
| 538 | * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS | ||
| 541 | * | 539 | * |
| 542 | * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon | 540 | * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon |
| 543 | * as data is added to any of the @buffer's cpu buffers. Otherwise | 541 | * as data is added to any of the @buffer's cpu buffers. Otherwise |
| 544 | * it will wait for data to be added to a specific cpu buffer. | 542 | * it will wait for data to be added to a specific cpu buffer. |
| 545 | */ | 543 | */ |
| 546 | int ring_buffer_wait(struct ring_buffer *buffer, int cpu) | 544 | int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) |
| 547 | { | 545 | { |
| 548 | struct ring_buffer_per_cpu *cpu_buffer; | 546 | struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); |
| 549 | DEFINE_WAIT(wait); | 547 | DEFINE_WAIT(wait); |
| 550 | struct rb_irq_work *work; | 548 | struct rb_irq_work *work; |
| 549 | int ret = 0; | ||
| 551 | 550 | ||
| 552 | /* | 551 | /* |
| 553 | * Depending on what the caller is waiting for, either any | 552 | * Depending on what the caller is waiting for, either any |
| @@ -564,36 +563,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu) | |||
| 564 | } | 563 | } |
| 565 | 564 | ||
| 566 | 565 | ||
| 567 | prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); | 566 | while (true) { |
| 567 | prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); | ||
| 568 | 568 | ||
| 569 | /* | 569 | /* |
| 570 | * The events can happen in critical sections where | 570 | * The events can happen in critical sections where |
| 571 | * checking a work queue can cause deadlocks. | 571 | * checking a work queue can cause deadlocks. |
| 572 | * After adding a task to the queue, this flag is set | 572 | * After adding a task to the queue, this flag is set |
| 573 | * only to notify events to try to wake up the queue | 573 | * only to notify events to try to wake up the queue |
| 574 | * using irq_work. | 574 | * using irq_work. |
| 575 | * | 575 | * |
| 576 | * We don't clear it even if the buffer is no longer | 576 | * We don't clear it even if the buffer is no longer |
| 577 | * empty. The flag only causes the next event to run | 577 | * empty. The flag only causes the next event to run |
| 578 | * irq_work to do the work queue wake up. The worse | 578 | * irq_work to do the work queue wake up. The worse |
| 579 | * that can happen if we race with !trace_empty() is that | 579 | * that can happen if we race with !trace_empty() is that |
| 580 | * an event will cause an irq_work to try to wake up | 580 | * an event will cause an irq_work to try to wake up |
| 581 | * an empty queue. | 581 | * an empty queue. |
| 582 | * | 582 | * |
| 583 | * There's no reason to protect this flag either, as | 583 | * There's no reason to protect this flag either, as |
| 584 | * the work queue and irq_work logic will do the necessary | 584 | * the work queue and irq_work logic will do the necessary |
| 585 | * synchronization for the wake ups. The only thing | 585 | * synchronization for the wake ups. The only thing |
| 586 | * that is necessary is that the wake up happens after | 586 | * that is necessary is that the wake up happens after |
| 587 | * a task has been queued. It's OK for spurious wake ups. | 587 | * a task has been queued. It's OK for spurious wake ups. |
| 588 | */ | 588 | */ |
| 589 | work->waiters_pending = true; | 589 | work->waiters_pending = true; |
| 590 | |||
| 591 | if (signal_pending(current)) { | ||
| 592 | ret = -EINTR; | ||
| 593 | break; | ||
| 594 | } | ||
| 595 | |||
| 596 | if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) | ||
| 597 | break; | ||
| 598 | |||
| 599 | if (cpu != RING_BUFFER_ALL_CPUS && | ||
| 600 | !ring_buffer_empty_cpu(buffer, cpu)) { | ||
| 601 | unsigned long flags; | ||
| 602 | bool pagebusy; | ||
| 603 | |||
| 604 | if (!full) | ||
| 605 | break; | ||
| 606 | |||
| 607 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | ||
| 608 | pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; | ||
| 609 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | ||
| 610 | |||
| 611 | if (!pagebusy) | ||
| 612 | break; | ||
| 613 | } | ||
| 590 | 614 | ||
| 591 | if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || | ||
| 592 | (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) | ||
| 593 | schedule(); | 615 | schedule(); |
| 616 | } | ||
| 594 | 617 | ||
| 595 | finish_wait(&work->waiters, &wait); | 618 | finish_wait(&work->waiters, &wait); |
| 596 | return 0; | 619 | |
| 620 | return ret; | ||
| 597 | } | 621 | } |
| 598 | 622 | ||
| 599 | /** | 623 | /** |
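The rewritten ring_buffer_wait() is the canonical kernel sleep/wake loop: prepare_to_wait() queues the task and sets its state before the condition is re-checked, so a wake-up racing with the check cannot be lost, and schedule() only runs when there is really nothing to read; the -EINTR return also absorbs the signal handling that callers such as wait_on_pipe() used to do themselves. A generic sketch of the loop (my_waitq, wait_for_cond() and cond are placeholders):

    #include <linux/sched.h>
    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_waitq);

    /* Sleep until cond() is true or a signal arrives. */
    static int wait_for_cond(bool (*cond)(void))
    {
        DEFINE_WAIT(wait);
        int ret = 0;

        while (true) {
            prepare_to_wait(&my_waitq, &wait, TASK_INTERRUPTIBLE);

            if (cond())             /* re-check after queueing ourselves */
                break;

            if (signal_pending(current)) {
                ret = -EINTR;
                break;
            }

            schedule();             /* really sleep; woken by wake_up(&my_waitq) */
        }

        finish_wait(&my_waitq, &wait);
        return ret;
    }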
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0434ff1b808e..3f9e328c30b5 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
| @@ -205,7 +205,6 @@ static void ring_buffer_consumer(void) | |||
| 205 | break; | 205 | break; |
| 206 | 206 | ||
| 207 | schedule(); | 207 | schedule(); |
| 208 | __set_current_state(TASK_RUNNING); | ||
| 209 | } | 208 | } |
| 210 | reader_finish = 0; | 209 | reader_finish = 0; |
| 211 | complete(&read_done); | 210 | complete(&read_done); |
| @@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg) | |||
| 379 | break; | 378 | break; |
| 380 | 379 | ||
| 381 | schedule(); | 380 | schedule(); |
| 382 | __set_current_state(TASK_RUNNING); | ||
| 383 | } | 381 | } |
| 384 | __set_current_state(TASK_RUNNING); | 382 | __set_current_state(TASK_RUNNING); |
| 385 | 383 | ||
| @@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg) | |||
| 407 | trace_printk("Sleeping for 10 secs\n"); | 405 | trace_printk("Sleeping for 10 secs\n"); |
| 408 | set_current_state(TASK_INTERRUPTIBLE); | 406 | set_current_state(TASK_INTERRUPTIBLE); |
| 409 | schedule_timeout(HZ * SLEEP_TIME); | 407 | schedule_timeout(HZ * SLEEP_TIME); |
| 410 | __set_current_state(TASK_RUNNING); | ||
| 411 | } | 408 | } |
| 412 | 409 | ||
| 413 | if (kill_test) | 410 | if (kill_test) |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a528392b1f4..2e767972e99c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running; | |||
| 63 | */ | 63 | */ |
| 64 | bool __read_mostly tracing_selftest_disabled; | 64 | bool __read_mostly tracing_selftest_disabled; |
| 65 | 65 | ||
| 66 | /* Pipe tracepoints to printk */ | ||
| 67 | struct trace_iterator *tracepoint_print_iter; | ||
| 68 | int tracepoint_printk; | ||
| 69 | |||
| 66 | /* For tracers that don't implement custom flags */ | 70 | /* For tracers that don't implement custom flags */ |
| 67 | static struct tracer_opt dummy_tracer_opt[] = { | 71 | static struct tracer_opt dummy_tracer_opt[] = { |
| 68 | { } | 72 | { } |
| @@ -155,10 +159,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); | |||
| 155 | 159 | ||
| 156 | static int __init stop_trace_on_warning(char *str) | 160 | static int __init stop_trace_on_warning(char *str) |
| 157 | { | 161 | { |
| 158 | __disable_trace_on_warning = 1; | 162 | if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) |
| 163 | __disable_trace_on_warning = 1; | ||
| 159 | return 1; | 164 | return 1; |
| 160 | } | 165 | } |
| 161 | __setup("traceoff_on_warning=", stop_trace_on_warning); | 166 | __setup("traceoff_on_warning", stop_trace_on_warning); |
| 162 | 167 | ||
| 163 | static int __init boot_alloc_snapshot(char *str) | 168 | static int __init boot_alloc_snapshot(char *str) |
| 164 | { | 169 | { |
| @@ -192,6 +197,13 @@ static int __init set_trace_boot_clock(char *str) | |||
| 192 | } | 197 | } |
| 193 | __setup("trace_clock=", set_trace_boot_clock); | 198 | __setup("trace_clock=", set_trace_boot_clock); |
| 194 | 199 | ||
| 200 | static int __init set_tracepoint_printk(char *str) | ||
| 201 | { | ||
| 202 | if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) | ||
| 203 | tracepoint_printk = 1; | ||
| 204 | return 1; | ||
| 205 | } | ||
| 206 | __setup("tp_printk", set_tracepoint_printk); | ||
| 195 | 207 | ||
| 196 | unsigned long long ns2usecs(cycle_t nsec) | 208 | unsigned long long ns2usecs(cycle_t nsec) |
| 197 | { | 209 | { |
| @@ -938,19 +950,20 @@ out: | |||
| 938 | return ret; | 950 | return ret; |
| 939 | } | 951 | } |
| 940 | 952 | ||
| 953 | /* TODO add a seq_buf_to_buffer() */ | ||
| 941 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | 954 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) |
| 942 | { | 955 | { |
| 943 | int len; | 956 | int len; |
| 944 | 957 | ||
| 945 | if (s->len <= s->readpos) | 958 | if (trace_seq_used(s) <= s->seq.readpos) |
| 946 | return -EBUSY; | 959 | return -EBUSY; |
| 947 | 960 | ||
| 948 | len = s->len - s->readpos; | 961 | len = trace_seq_used(s) - s->seq.readpos; |
| 949 | if (cnt > len) | 962 | if (cnt > len) |
| 950 | cnt = len; | 963 | cnt = len; |
| 951 | memcpy(buf, s->buffer + s->readpos, cnt); | 964 | memcpy(buf, s->buffer + s->seq.readpos, cnt); |
| 952 | 965 | ||
| 953 | s->readpos += cnt; | 966 | s->seq.readpos += cnt; |
| 954 | return cnt; | 967 | return cnt; |
| 955 | } | 968 | } |
| 956 | 969 | ||
| @@ -1076,13 +1089,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
| 1076 | } | 1089 | } |
| 1077 | #endif /* CONFIG_TRACER_MAX_TRACE */ | 1090 | #endif /* CONFIG_TRACER_MAX_TRACE */ |
| 1078 | 1091 | ||
| 1079 | static int wait_on_pipe(struct trace_iterator *iter) | 1092 | static int wait_on_pipe(struct trace_iterator *iter, bool full) |
| 1080 | { | 1093 | { |
| 1081 | /* Iterators are static, they should be filled or empty */ | 1094 | /* Iterators are static, they should be filled or empty */ |
| 1082 | if (trace_buffer_iter(iter, iter->cpu_file)) | 1095 | if (trace_buffer_iter(iter, iter->cpu_file)) |
| 1083 | return 0; | 1096 | return 0; |
| 1084 | 1097 | ||
| 1085 | return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); | 1098 | return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file, |
| 1099 | full); | ||
| 1086 | } | 1100 | } |
| 1087 | 1101 | ||
| 1088 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 1102 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
| @@ -2028,7 +2042,7 @@ void trace_printk_init_buffers(void) | |||
| 2028 | pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); | 2042 | pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); |
| 2029 | pr_warning("** **\n"); | 2043 | pr_warning("** **\n"); |
| 2030 | pr_warning("** This means that this is a DEBUG kernel and it is **\n"); | 2044 | pr_warning("** This means that this is a DEBUG kernel and it is **\n"); |
| 2031 | pr_warning("** unsafe for produciton use. **\n"); | 2045 | pr_warning("** unsafe for production use. **\n"); |
| 2032 | pr_warning("** **\n"); | 2046 | pr_warning("** **\n"); |
| 2033 | pr_warning("** If you see this message and you are not debugging **\n"); | 2047 | pr_warning("** If you see this message and you are not debugging **\n"); |
| 2034 | pr_warning("** the kernel, report this immediately to your vendor! **\n"); | 2048 | pr_warning("** the kernel, report this immediately to your vendor! **\n"); |
| @@ -2157,9 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, | |||
| 2157 | goto out; | 2171 | goto out; |
| 2158 | } | 2172 | } |
| 2159 | 2173 | ||
| 2160 | len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); | 2174 | len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); |
| 2161 | if (len > TRACE_BUF_SIZE) | ||
| 2162 | goto out; | ||
| 2163 | 2175 | ||
| 2164 | local_save_flags(flags); | 2176 | local_save_flags(flags); |
| 2165 | size = sizeof(*entry) + len + 1; | 2177 | size = sizeof(*entry) + len + 1; |
| @@ -2170,8 +2182,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, | |||
| 2170 | entry = ring_buffer_event_data(event); | 2182 | entry = ring_buffer_event_data(event); |
| 2171 | entry->ip = ip; | 2183 | entry->ip = ip; |
| 2172 | 2184 | ||
| 2173 | memcpy(&entry->buf, tbuffer, len); | 2185 | memcpy(&entry->buf, tbuffer, len + 1); |
| 2174 | entry->buf[len] = '\0'; | ||
| 2175 | if (!call_filter_check_discard(call, entry, buffer, event)) { | 2186 | if (!call_filter_check_discard(call, entry, buffer, event)) { |
| 2176 | __buffer_unlock_commit(buffer, event); | 2187 | __buffer_unlock_commit(buffer, event); |
| 2177 | ftrace_trace_stack(buffer, flags, 6, pc); | 2188 | ftrace_trace_stack(buffer, flags, 6, pc); |
| @@ -2508,14 +2519,14 @@ get_total_entries(struct trace_buffer *buf, | |||
| 2508 | 2519 | ||
| 2509 | static void print_lat_help_header(struct seq_file *m) | 2520 | static void print_lat_help_header(struct seq_file *m) |
| 2510 | { | 2521 | { |
| 2511 | seq_puts(m, "# _------=> CPU# \n"); | 2522 | seq_puts(m, "# _------=> CPU# \n" |
| 2512 | seq_puts(m, "# / _-----=> irqs-off \n"); | 2523 | "# / _-----=> irqs-off \n" |
| 2513 | seq_puts(m, "# | / _----=> need-resched \n"); | 2524 | "# | / _----=> need-resched \n" |
| 2514 | seq_puts(m, "# || / _---=> hardirq/softirq \n"); | 2525 | "# || / _---=> hardirq/softirq \n" |
| 2515 | seq_puts(m, "# ||| / _--=> preempt-depth \n"); | 2526 | "# ||| / _--=> preempt-depth \n" |
| 2516 | seq_puts(m, "# |||| / delay \n"); | 2527 | "# |||| / delay \n" |
| 2517 | seq_puts(m, "# cmd pid ||||| time | caller \n"); | 2528 | "# cmd pid ||||| time | caller \n" |
| 2518 | seq_puts(m, "# \\ / ||||| \\ | / \n"); | 2529 | "# \\ / ||||| \\ | / \n"); |
| 2519 | } | 2530 | } |
| 2520 | 2531 | ||
| 2521 | static void print_event_info(struct trace_buffer *buf, struct seq_file *m) | 2532 | static void print_event_info(struct trace_buffer *buf, struct seq_file *m) |
| @@ -2532,20 +2543,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) | |||
| 2532 | static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) | 2543 | static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) |
| 2533 | { | 2544 | { |
| 2534 | print_event_info(buf, m); | 2545 | print_event_info(buf, m); |
| 2535 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); | 2546 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" |
| 2536 | seq_puts(m, "# | | | | |\n"); | 2547 | "# | | | | |\n"); |
| 2537 | } | 2548 | } |
| 2538 | 2549 | ||
| 2539 | static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) | 2550 | static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) |
| 2540 | { | 2551 | { |
| 2541 | print_event_info(buf, m); | 2552 | print_event_info(buf, m); |
| 2542 | seq_puts(m, "# _-----=> irqs-off\n"); | 2553 | seq_puts(m, "# _-----=> irqs-off\n" |
| 2543 | seq_puts(m, "# / _----=> need-resched\n"); | 2554 | "# / _----=> need-resched\n" |
| 2544 | seq_puts(m, "# | / _---=> hardirq/softirq\n"); | 2555 | "# | / _---=> hardirq/softirq\n" |
| 2545 | seq_puts(m, "# || / _--=> preempt-depth\n"); | 2556 | "# || / _--=> preempt-depth\n" |
| 2546 | seq_puts(m, "# ||| / delay\n"); | 2557 | "# ||| / delay\n" |
| 2547 | seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); | 2558 | "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" |
| 2548 | seq_puts(m, "# | | | |||| | |\n"); | 2559 | "# | | | |||| | |\n"); |
| 2549 | } | 2560 | } |
| 2550 | 2561 | ||
| 2551 | void | 2562 | void |
| @@ -2648,24 +2659,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) | |||
| 2648 | event = ftrace_find_event(entry->type); | 2659 | event = ftrace_find_event(entry->type); |
| 2649 | 2660 | ||
| 2650 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { | 2661 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { |
| 2651 | if (iter->iter_flags & TRACE_FILE_LAT_FMT) { | 2662 | if (iter->iter_flags & TRACE_FILE_LAT_FMT) |
| 2652 | if (!trace_print_lat_context(iter)) | 2663 | trace_print_lat_context(iter); |
| 2653 | goto partial; | 2664 | else |
| 2654 | } else { | 2665 | trace_print_context(iter); |
| 2655 | if (!trace_print_context(iter)) | ||
| 2656 | goto partial; | ||
| 2657 | } | ||
| 2658 | } | 2666 | } |
| 2659 | 2667 | ||
| 2668 | if (trace_seq_has_overflowed(s)) | ||
| 2669 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2670 | |||
| 2660 | if (event) | 2671 | if (event) |
| 2661 | return event->funcs->trace(iter, sym_flags, event); | 2672 | return event->funcs->trace(iter, sym_flags, event); |
| 2662 | 2673 | ||
| 2663 | if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) | 2674 | trace_seq_printf(s, "Unknown type %d\n", entry->type); |
| 2664 | goto partial; | ||
| 2665 | 2675 | ||
| 2666 | return TRACE_TYPE_HANDLED; | 2676 | return trace_handle_return(s); |
| 2667 | partial: | ||
| 2668 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2669 | } | 2677 | } |
| 2670 | 2678 | ||
| 2671 | static enum print_line_t print_raw_fmt(struct trace_iterator *iter) | 2679 | static enum print_line_t print_raw_fmt(struct trace_iterator *iter) |
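trace_handle_return() is the helper these event printers now funnel through: after writing unconditionally, it folds the sticky overflow state back into the enum the callers expect. It is presumably little more than:

    /* Likely shape of the helper added to kernel/trace/trace.h by this series. */
    static inline enum print_line_t trace_handle_return(struct trace_seq *s)
    {
        return trace_seq_has_overflowed(s) ?
            TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
    }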
| @@ -2676,22 +2684,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) | |||
| 2676 | 2684 | ||
| 2677 | entry = iter->ent; | 2685 | entry = iter->ent; |
| 2678 | 2686 | ||
| 2679 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { | 2687 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) |
| 2680 | if (!trace_seq_printf(s, "%d %d %llu ", | 2688 | trace_seq_printf(s, "%d %d %llu ", |
| 2681 | entry->pid, iter->cpu, iter->ts)) | 2689 | entry->pid, iter->cpu, iter->ts); |
| 2682 | goto partial; | 2690 | |
| 2683 | } | 2691 | if (trace_seq_has_overflowed(s)) |
| 2692 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2684 | 2693 | ||
| 2685 | event = ftrace_find_event(entry->type); | 2694 | event = ftrace_find_event(entry->type); |
| 2686 | if (event) | 2695 | if (event) |
| 2687 | return event->funcs->raw(iter, 0, event); | 2696 | return event->funcs->raw(iter, 0, event); |
| 2688 | 2697 | ||
| 2689 | if (!trace_seq_printf(s, "%d ?\n", entry->type)) | 2698 | trace_seq_printf(s, "%d ?\n", entry->type); |
| 2690 | goto partial; | ||
| 2691 | 2699 | ||
| 2692 | return TRACE_TYPE_HANDLED; | 2700 | return trace_handle_return(s); |
| 2693 | partial: | ||
| 2694 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2695 | } | 2701 | } |
| 2696 | 2702 | ||
| 2697 | static enum print_line_t print_hex_fmt(struct trace_iterator *iter) | 2703 | static enum print_line_t print_hex_fmt(struct trace_iterator *iter) |
| @@ -2704,9 +2710,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) | |||
| 2704 | entry = iter->ent; | 2710 | entry = iter->ent; |
| 2705 | 2711 | ||
| 2706 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { | 2712 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { |
| 2707 | SEQ_PUT_HEX_FIELD_RET(s, entry->pid); | 2713 | SEQ_PUT_HEX_FIELD(s, entry->pid); |
| 2708 | SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); | 2714 | SEQ_PUT_HEX_FIELD(s, iter->cpu); |
| 2709 | SEQ_PUT_HEX_FIELD_RET(s, iter->ts); | 2715 | SEQ_PUT_HEX_FIELD(s, iter->ts); |
| 2716 | if (trace_seq_has_overflowed(s)) | ||
| 2717 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2710 | } | 2718 | } |
| 2711 | 2719 | ||
| 2712 | event = ftrace_find_event(entry->type); | 2720 | event = ftrace_find_event(entry->type); |
| @@ -2716,9 +2724,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) | |||
| 2716 | return ret; | 2724 | return ret; |
| 2717 | } | 2725 | } |
| 2718 | 2726 | ||
| 2719 | SEQ_PUT_FIELD_RET(s, newline); | 2727 | SEQ_PUT_FIELD(s, newline); |
| 2720 | 2728 | ||
| 2721 | return TRACE_TYPE_HANDLED; | 2729 | return trace_handle_return(s); |
| 2722 | } | 2730 | } |
| 2723 | 2731 | ||
| 2724 | static enum print_line_t print_bin_fmt(struct trace_iterator *iter) | 2732 | static enum print_line_t print_bin_fmt(struct trace_iterator *iter) |
| @@ -2730,9 +2738,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) | |||
| 2730 | entry = iter->ent; | 2738 | entry = iter->ent; |
| 2731 | 2739 | ||
| 2732 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { | 2740 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { |
| 2733 | SEQ_PUT_FIELD_RET(s, entry->pid); | 2741 | SEQ_PUT_FIELD(s, entry->pid); |
| 2734 | SEQ_PUT_FIELD_RET(s, iter->cpu); | 2742 | SEQ_PUT_FIELD(s, iter->cpu); |
| 2735 | SEQ_PUT_FIELD_RET(s, iter->ts); | 2743 | SEQ_PUT_FIELD(s, iter->ts); |
| 2744 | if (trace_seq_has_overflowed(s)) | ||
| 2745 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2736 | } | 2746 | } |
| 2737 | 2747 | ||
| 2738 | event = ftrace_find_event(entry->type); | 2748 | event = ftrace_find_event(entry->type); |
| @@ -2778,10 +2788,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) | |||
| 2778 | { | 2788 | { |
| 2779 | enum print_line_t ret; | 2789 | enum print_line_t ret; |
| 2780 | 2790 | ||
| 2781 | if (iter->lost_events && | 2791 | if (iter->lost_events) { |
| 2782 | !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", | 2792 | trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", |
| 2783 | iter->cpu, iter->lost_events)) | 2793 | iter->cpu, iter->lost_events); |
| 2784 | return TRACE_TYPE_PARTIAL_LINE; | 2794 | if (trace_seq_has_overflowed(&iter->seq)) |
| 2795 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2796 | } | ||
| 2785 | 2797 | ||
| 2786 | if (iter->trace && iter->trace->print_line) { | 2798 | if (iter->trace && iter->trace->print_line) { |
| 2787 | ret = iter->trace->print_line(iter); | 2799 | ret = iter->trace->print_line(iter); |
| @@ -2859,44 +2871,44 @@ static void test_ftrace_alive(struct seq_file *m) | |||
| 2859 | { | 2871 | { |
| 2860 | if (!ftrace_is_dead()) | 2872 | if (!ftrace_is_dead()) |
| 2861 | return; | 2873 | return; |
| 2862 | seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); | 2874 | seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n" |
| 2863 | seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); | 2875 | "# MAY BE MISSING FUNCTION EVENTS\n"); |
| 2864 | } | 2876 | } |
| 2865 | 2877 | ||
| 2866 | #ifdef CONFIG_TRACER_MAX_TRACE | 2878 | #ifdef CONFIG_TRACER_MAX_TRACE |
| 2867 | static void show_snapshot_main_help(struct seq_file *m) | 2879 | static void show_snapshot_main_help(struct seq_file *m) |
| 2868 | { | 2880 | { |
| 2869 | seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); | 2881 | seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" |
| 2870 | seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); | 2882 | "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" |
| 2871 | seq_printf(m, "# Takes a snapshot of the main buffer.\n"); | 2883 | "# Takes a snapshot of the main buffer.\n" |
| 2872 | seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); | 2884 | "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" |
| 2873 | seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); | 2885 | "# (Doesn't have to be '2' works with any number that\n" |
| 2874 | seq_printf(m, "# is not a '0' or '1')\n"); | 2886 | "# is not a '0' or '1')\n"); |
| 2875 | } | 2887 | } |
| 2876 | 2888 | ||
| 2877 | static void show_snapshot_percpu_help(struct seq_file *m) | 2889 | static void show_snapshot_percpu_help(struct seq_file *m) |
| 2878 | { | 2890 | { |
| 2879 | seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); | 2891 | seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); |
| 2880 | #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP | 2892 | #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP |
| 2881 | seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); | 2893 | seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" |
| 2882 | seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); | 2894 | "# Takes a snapshot of the main buffer for this cpu.\n"); |
| 2883 | #else | 2895 | #else |
| 2884 | seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); | 2896 | seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" |
| 2885 | seq_printf(m, "# Must use main snapshot file to allocate.\n"); | 2897 | "# Must use main snapshot file to allocate.\n"); |
| 2886 | #endif | 2898 | #endif |
| 2887 | seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); | 2899 | seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" |
| 2888 | seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); | 2900 | "# (Doesn't have to be '2' works with any number that\n" |
| 2889 | seq_printf(m, "# is not a '0' or '1')\n"); | 2901 | "# is not a '0' or '1')\n"); |
| 2890 | } | 2902 | } |
| 2891 | 2903 | ||
| 2892 | static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) | 2904 | static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) |
| 2893 | { | 2905 | { |
| 2894 | if (iter->tr->allocated_snapshot) | 2906 | if (iter->tr->allocated_snapshot) |
| 2895 | seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); | 2907 | seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); |
| 2896 | else | 2908 | else |
| 2897 | seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); | 2909 | seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); |
| 2898 | 2910 | ||
| 2899 | seq_printf(m, "# Snapshot commands:\n"); | 2911 | seq_puts(m, "# Snapshot commands:\n"); |
| 2900 | if (iter->cpu_file == RING_BUFFER_ALL_CPUS) | 2912 | if (iter->cpu_file == RING_BUFFER_ALL_CPUS) |
| 2901 | show_snapshot_main_help(m); | 2913 | show_snapshot_main_help(m); |
| 2902 | else | 2914 | else |
| @@ -3250,7 +3262,7 @@ static int t_show(struct seq_file *m, void *v) | |||
| 3250 | if (!t) | 3262 | if (!t) |
| 3251 | return 0; | 3263 | return 0; |
| 3252 | 3264 | ||
| 3253 | seq_printf(m, "%s", t->name); | 3265 | seq_puts(m, t->name); |
| 3254 | if (t->next) | 3266 | if (t->next) |
| 3255 | seq_putc(m, ' '); | 3267 | seq_putc(m, ' '); |
| 3256 | else | 3268 | else |
| @@ -4313,6 +4325,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
| 4313 | goto out; | 4325 | goto out; |
| 4314 | } | 4326 | } |
| 4315 | 4327 | ||
| 4328 | trace_seq_init(&iter->seq); | ||
| 4329 | |||
| 4316 | /* | 4330 | /* |
| 4317 | * We make a copy of the current tracer to avoid concurrent | 4331 | * We make a copy of the current tracer to avoid concurrent |
| 4318 | * changes on it while we are reading. | 4332 | * changes on it while we are reading. |
| @@ -4434,15 +4448,12 @@ static int tracing_wait_pipe(struct file *filp) | |||
| 4434 | 4448 | ||
| 4435 | mutex_unlock(&iter->mutex); | 4449 | mutex_unlock(&iter->mutex); |
| 4436 | 4450 | ||
| 4437 | ret = wait_on_pipe(iter); | 4451 | ret = wait_on_pipe(iter, false); |
| 4438 | 4452 | ||
| 4439 | mutex_lock(&iter->mutex); | 4453 | mutex_lock(&iter->mutex); |
| 4440 | 4454 | ||
| 4441 | if (ret) | 4455 | if (ret) |
| 4442 | return ret; | 4456 | return ret; |
| 4443 | |||
| 4444 | if (signal_pending(current)) | ||
| 4445 | return -EINTR; | ||
| 4446 | } | 4457 | } |
| 4447 | 4458 | ||
| 4448 | return 1; | 4459 | return 1; |
| @@ -4509,18 +4520,18 @@ waitagain: | |||
| 4509 | trace_access_lock(iter->cpu_file); | 4520 | trace_access_lock(iter->cpu_file); |
| 4510 | while (trace_find_next_entry_inc(iter) != NULL) { | 4521 | while (trace_find_next_entry_inc(iter) != NULL) { |
| 4511 | enum print_line_t ret; | 4522 | enum print_line_t ret; |
| 4512 | int len = iter->seq.len; | 4523 | int save_len = iter->seq.seq.len; |
| 4513 | 4524 | ||
| 4514 | ret = print_trace_line(iter); | 4525 | ret = print_trace_line(iter); |
| 4515 | if (ret == TRACE_TYPE_PARTIAL_LINE) { | 4526 | if (ret == TRACE_TYPE_PARTIAL_LINE) { |
| 4516 | /* don't print partial lines */ | 4527 | /* don't print partial lines */ |
| 4517 | iter->seq.len = len; | 4528 | iter->seq.seq.len = save_len; |
| 4518 | break; | 4529 | break; |
| 4519 | } | 4530 | } |
| 4520 | if (ret != TRACE_TYPE_NO_CONSUME) | 4531 | if (ret != TRACE_TYPE_NO_CONSUME) |
| 4521 | trace_consume(iter); | 4532 | trace_consume(iter); |
| 4522 | 4533 | ||
| 4523 | if (iter->seq.len >= cnt) | 4534 | if (trace_seq_used(&iter->seq) >= cnt) |
| 4524 | break; | 4535 | break; |
| 4525 | 4536 | ||
| 4526 | /* | 4537 | /* |
| @@ -4536,7 +4547,7 @@ waitagain: | |||
| 4536 | 4547 | ||
| 4537 | /* Now copy what we have to the user */ | 4548 | /* Now copy what we have to the user */ |
| 4538 | sret = trace_seq_to_user(&iter->seq, ubuf, cnt); | 4549 | sret = trace_seq_to_user(&iter->seq, ubuf, cnt); |
| 4539 | if (iter->seq.readpos >= iter->seq.len) | 4550 | if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) |
| 4540 | trace_seq_init(&iter->seq); | 4551 | trace_seq_init(&iter->seq); |
| 4541 | 4552 | ||
| 4542 | /* | 4553 | /* |
| @@ -4570,20 +4581,33 @@ static size_t | |||
| 4570 | tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) | 4581 | tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) |
| 4571 | { | 4582 | { |
| 4572 | size_t count; | 4583 | size_t count; |
| 4584 | int save_len; | ||
| 4573 | int ret; | 4585 | int ret; |
| 4574 | 4586 | ||
| 4575 | /* Seq buffer is page-sized, exactly what we need. */ | 4587 | /* Seq buffer is page-sized, exactly what we need. */ |
| 4576 | for (;;) { | 4588 | for (;;) { |
| 4577 | count = iter->seq.len; | 4589 | save_len = iter->seq.seq.len; |
| 4578 | ret = print_trace_line(iter); | 4590 | ret = print_trace_line(iter); |
| 4579 | count = iter->seq.len - count; | 4591 | |
| 4580 | if (rem < count) { | 4592 | if (trace_seq_has_overflowed(&iter->seq)) { |
| 4581 | rem = 0; | 4593 | iter->seq.seq.len = save_len; |
| 4582 | iter->seq.len -= count; | ||
| 4583 | break; | 4594 | break; |
| 4584 | } | 4595 | } |
| 4596 | |||
| 4597 | /* | ||
| 4598 | * This should not be hit, because it should only | ||
| 4599 | * be set if the iter->seq overflowed. But check it | ||
| 4600 | * anyway to be safe. | ||
| 4601 | */ | ||
| 4585 | if (ret == TRACE_TYPE_PARTIAL_LINE) { | 4602 | if (ret == TRACE_TYPE_PARTIAL_LINE) { |
| 4586 | iter->seq.len -= count; | 4603 | iter->seq.seq.len = save_len; |
| 4604 | break; | ||
| 4605 | } | ||
| 4606 | |||
| 4607 | count = trace_seq_used(&iter->seq) - save_len; | ||
| 4608 | if (rem < count) { | ||
| 4609 | rem = 0; | ||
| 4610 | iter->seq.seq.len = save_len; | ||
| 4587 | break; | 4611 | break; |
| 4588 | } | 4612 | } |
| 4589 | 4613 | ||
| @@ -4664,13 +4688,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
| 4664 | /* Copy the data into the page, so we can start over. */ | 4688 | /* Copy the data into the page, so we can start over. */ |
| 4665 | ret = trace_seq_to_buffer(&iter->seq, | 4689 | ret = trace_seq_to_buffer(&iter->seq, |
| 4666 | page_address(spd.pages[i]), | 4690 | page_address(spd.pages[i]), |
| 4667 | iter->seq.len); | 4691 | trace_seq_used(&iter->seq)); |
| 4668 | if (ret < 0) { | 4692 | if (ret < 0) { |
| 4669 | __free_page(spd.pages[i]); | 4693 | __free_page(spd.pages[i]); |
| 4670 | break; | 4694 | break; |
| 4671 | } | 4695 | } |
| 4672 | spd.partial[i].offset = 0; | 4696 | spd.partial[i].offset = 0; |
| 4673 | spd.partial[i].len = iter->seq.len; | 4697 | spd.partial[i].len = trace_seq_used(&iter->seq); |
| 4674 | 4698 | ||
| 4675 | trace_seq_init(&iter->seq); | 4699 | trace_seq_init(&iter->seq); |
| 4676 | } | 4700 | } |
| @@ -5372,16 +5396,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
| 5372 | goto out_unlock; | 5396 | goto out_unlock; |
| 5373 | } | 5397 | } |
| 5374 | mutex_unlock(&trace_types_lock); | 5398 | mutex_unlock(&trace_types_lock); |
| 5375 | ret = wait_on_pipe(iter); | 5399 | ret = wait_on_pipe(iter, false); |
| 5376 | mutex_lock(&trace_types_lock); | 5400 | mutex_lock(&trace_types_lock); |
| 5377 | if (ret) { | 5401 | if (ret) { |
| 5378 | size = ret; | 5402 | size = ret; |
| 5379 | goto out_unlock; | 5403 | goto out_unlock; |
| 5380 | } | 5404 | } |
| 5381 | if (signal_pending(current)) { | ||
| 5382 | size = -EINTR; | ||
| 5383 | goto out_unlock; | ||
| 5384 | } | ||
| 5385 | goto again; | 5405 | goto again; |
| 5386 | } | 5406 | } |
| 5387 | size = 0; | 5407 | size = 0; |
| @@ -5500,7 +5520,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 5500 | }; | 5520 | }; |
| 5501 | struct buffer_ref *ref; | 5521 | struct buffer_ref *ref; |
| 5502 | int entries, size, i; | 5522 | int entries, size, i; |
| 5503 | ssize_t ret; | 5523 | ssize_t ret = 0; |
| 5504 | 5524 | ||
| 5505 | mutex_lock(&trace_types_lock); | 5525 | mutex_lock(&trace_types_lock); |
| 5506 | 5526 | ||
| @@ -5538,13 +5558,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 5538 | int r; | 5558 | int r; |
| 5539 | 5559 | ||
| 5540 | ref = kzalloc(sizeof(*ref), GFP_KERNEL); | 5560 | ref = kzalloc(sizeof(*ref), GFP_KERNEL); |
| 5541 | if (!ref) | 5561 | if (!ref) { |
| 5562 | ret = -ENOMEM; | ||
| 5542 | break; | 5563 | break; |
| 5564 | } | ||
| 5543 | 5565 | ||
| 5544 | ref->ref = 1; | 5566 | ref->ref = 1; |
| 5545 | ref->buffer = iter->trace_buffer->buffer; | 5567 | ref->buffer = iter->trace_buffer->buffer; |
| 5546 | ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); | 5568 | ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); |
| 5547 | if (!ref->page) { | 5569 | if (!ref->page) { |
| 5570 | ret = -ENOMEM; | ||
| 5548 | kfree(ref); | 5571 | kfree(ref); |
| 5549 | break; | 5572 | break; |
| 5550 | } | 5573 | } |
| @@ -5582,19 +5605,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 5582 | 5605 | ||
| 5583 | /* did we read anything? */ | 5606 | /* did we read anything? */ |
| 5584 | if (!spd.nr_pages) { | 5607 | if (!spd.nr_pages) { |
| 5608 | if (ret) | ||
| 5609 | goto out; | ||
| 5610 | |||
| 5585 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { | 5611 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { |
| 5586 | ret = -EAGAIN; | 5612 | ret = -EAGAIN; |
| 5587 | goto out; | 5613 | goto out; |
| 5588 | } | 5614 | } |
| 5589 | mutex_unlock(&trace_types_lock); | 5615 | mutex_unlock(&trace_types_lock); |
| 5590 | ret = wait_on_pipe(iter); | 5616 | ret = wait_on_pipe(iter, true); |
| 5591 | mutex_lock(&trace_types_lock); | 5617 | mutex_lock(&trace_types_lock); |
| 5592 | if (ret) | 5618 | if (ret) |
| 5593 | goto out; | 5619 | goto out; |
| 5594 | if (signal_pending(current)) { | 5620 | |
| 5595 | ret = -EINTR; | ||
| 5596 | goto out; | ||
| 5597 | } | ||
| 5598 | goto again; | 5621 | goto again; |
| 5599 | } | 5622 | } |
| 5600 | 5623 | ||
| @@ -5671,7 +5694,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
| 5671 | cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); | 5694 | cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); |
| 5672 | trace_seq_printf(s, "read events: %ld\n", cnt); | 5695 | trace_seq_printf(s, "read events: %ld\n", cnt); |
| 5673 | 5696 | ||
| 5674 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | 5697 | count = simple_read_from_buffer(ubuf, count, ppos, |
| 5698 | s->buffer, trace_seq_used(s)); | ||
| 5675 | 5699 | ||
| 5676 | kfree(s); | 5700 | kfree(s); |
| 5677 | 5701 | ||
| @@ -5752,10 +5776,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip, | |||
| 5752 | 5776 | ||
| 5753 | seq_printf(m, "%ps:", (void *)ip); | 5777 | seq_printf(m, "%ps:", (void *)ip); |
| 5754 | 5778 | ||
| 5755 | seq_printf(m, "snapshot"); | 5779 | seq_puts(m, "snapshot"); |
| 5756 | 5780 | ||
| 5757 | if (count == -1) | 5781 | if (count == -1) |
| 5758 | seq_printf(m, ":unlimited\n"); | 5782 | seq_puts(m, ":unlimited\n"); |
| 5759 | else | 5783 | else |
| 5760 | seq_printf(m, ":count=%ld\n", count); | 5784 | seq_printf(m, ":count=%ld\n", count); |
| 5761 | 5785 | ||
| @@ -6420,7 +6444,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m | |||
| 6420 | int ret; | 6444 | int ret; |
| 6421 | 6445 | ||
| 6422 | /* Paranoid: Make sure the parent is the "instances" directory */ | 6446 | /* Paranoid: Make sure the parent is the "instances" directory */ |
| 6423 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); | 6447 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); |
| 6424 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | 6448 | if (WARN_ON_ONCE(parent != trace_instance_dir)) |
| 6425 | return -ENOENT; | 6449 | return -ENOENT; |
| 6426 | 6450 | ||
| @@ -6447,7 +6471,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry) | |||
| 6447 | int ret; | 6471 | int ret; |
| 6448 | 6472 | ||
| 6449 | /* Paranoid: Make sure the parent is the "instances" directory */ | 6473 | /* Paranoid: Make sure the parent is the "instances" directory */ |
| 6450 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); | 6474 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); |
| 6451 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | 6475 | if (WARN_ON_ONCE(parent != trace_instance_dir)) |
| 6452 | return -ENOENT; | 6476 | return -ENOENT; |
| 6453 | 6477 | ||
| @@ -6634,11 +6658,19 @@ void | |||
| 6634 | trace_printk_seq(struct trace_seq *s) | 6658 | trace_printk_seq(struct trace_seq *s) |
| 6635 | { | 6659 | { |
| 6636 | /* Probably should print a warning here. */ | 6660 | /* Probably should print a warning here. */ |
| 6637 | if (s->len >= TRACE_MAX_PRINT) | 6661 | if (s->seq.len >= TRACE_MAX_PRINT) |
| 6638 | s->len = TRACE_MAX_PRINT; | 6662 | s->seq.len = TRACE_MAX_PRINT; |
| 6663 | |||
| 6664 | /* | ||
| 6665 | * More paranoid code. Although the buffer size is set to | ||
| 6666 | * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just | ||
| 6667 | * an extra layer of protection. | ||
| 6668 | */ | ||
| 6669 | if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) | ||
| 6670 | s->seq.len = s->seq.size - 1; | ||
| 6639 | 6671 | ||
| 6640 | /* should be zero-terminated, but we are paranoid. */ | 6672 | /* should be zero-terminated, but we are paranoid. */ |
| 6641 | s->buffer[s->len] = 0; | 6673 | s->buffer[s->seq.len] = 0; |
| 6642 | 6674 | ||
| 6643 | printk(KERN_TRACE "%s", s->buffer); | 6675 | printk(KERN_TRACE "%s", s->buffer); |
| 6644 | 6676 | ||
| @@ -6877,6 +6909,19 @@ out: | |||
| 6877 | return ret; | 6909 | return ret; |
| 6878 | } | 6910 | } |
| 6879 | 6911 | ||
| 6912 | void __init trace_init(void) | ||
| 6913 | { | ||
| 6914 | if (tracepoint_printk) { | ||
| 6915 | tracepoint_print_iter = | ||
| 6916 | kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); | ||
| 6917 | if (WARN_ON(!tracepoint_print_iter)) | ||
| 6918 | tracepoint_printk = 0; | ||
| 6919 | } | ||
| 6920 | tracer_alloc_buffers(); | ||
| 6921 | init_ftrace_syscalls(); | ||
| 6922 | trace_event_init(); | ||
| 6923 | } | ||
| 6924 | |||
| 6880 | __init static int clear_boot_tracer(void) | 6925 | __init static int clear_boot_tracer(void) |
| 6881 | { | 6926 | { |
| 6882 | /* | 6927 | /* |
| @@ -6896,6 +6941,5 @@ __init static int clear_boot_tracer(void) | |||
| 6896 | return 0; | 6941 | return 0; |
| 6897 | } | 6942 | } |
| 6898 | 6943 | ||
| 6899 | early_initcall(tracer_alloc_buffers); | ||
| 6900 | fs_initcall(tracer_init_debugfs); | 6944 | fs_initcall(tracer_init_debugfs); |
| 6901 | late_initcall(clear_boot_tracer); | 6945 | late_initcall(clear_boot_tracer); |
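The trace.c hunks above all follow from one structural change: struct trace_seq now wraps an inner buffer descriptor (reached as `s->seq`, with its own `len` and `size`), and readers are expected to go through `trace_seq_used()` and `trace_seq_has_overflowed()` instead of poking `s->len` directly. A minimal sketch of the accessor and of the save/restore idiom the hunks use; the inner struct's exact type and clamping are assumptions, the identifiers otherwise come from the hunks themselves:

```c
/* Sketch only: approximates trace_seq_used() as used above. The inner
 * descriptor is assumed to expose ->len (bytes written) and ->size
 * (buffer capacity); the real helper may clamp differently. */
static inline int trace_seq_used_sketch(struct trace_seq *s)
{
	return s->seq.len < s->seq.size ? s->seq.len : s->seq.size;
}

/* The per-line pattern from tracing_read_pipe()/tracing_fill_pipe_page():
 * remember the current length, try to print one event, and roll back on
 * overflow so a partial line is never handed to user space. */
static void print_one_line_sketch(struct trace_iterator *iter)
{
	int save_len = iter->seq.seq.len;

	if (print_trace_line(iter) == TRACE_TYPE_PARTIAL_LINE ||
	    trace_seq_has_overflowed(&iter->seq))
		iter->seq.seq.len = save_len;	/* drop the partial line */
}
```

The same `trace_seq_used()` call replaces `s->len` at every `simple_read_from_buffer()` site later in this diff. Separately, `wait_on_pipe()` gains a second argument and the callers' `signal_pending()` checks disappear, presumably because the wait primitive now reports a pending signal through its return value.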
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 385391fb1d3b..8de48bac1ce2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/trace_seq.h> | 14 | #include <linux/trace_seq.h> |
| 15 | #include <linux/ftrace_event.h> | 15 | #include <linux/ftrace_event.h> |
| 16 | #include <linux/compiler.h> | 16 | #include <linux/compiler.h> |
| 17 | #include <linux/trace_seq.h> | ||
| 17 | 18 | ||
| 18 | #ifdef CONFIG_FTRACE_SYSCALLS | 19 | #ifdef CONFIG_FTRACE_SYSCALLS |
| 19 | #include <asm/unistd.h> /* For NR_SYSCALLS */ | 20 | #include <asm/unistd.h> /* For NR_SYSCALLS */ |
| @@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter); | |||
| 569 | 570 | ||
| 570 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); | 571 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); |
| 571 | 572 | ||
| 572 | void tracing_sched_switch_trace(struct trace_array *tr, | ||
| 573 | struct task_struct *prev, | ||
| 574 | struct task_struct *next, | ||
| 575 | unsigned long flags, int pc); | ||
| 576 | |||
| 577 | void tracing_sched_wakeup_trace(struct trace_array *tr, | ||
| 578 | struct task_struct *wakee, | ||
| 579 | struct task_struct *cur, | ||
| 580 | unsigned long flags, int pc); | ||
| 581 | void trace_function(struct trace_array *tr, | 573 | void trace_function(struct trace_array *tr, |
| 582 | unsigned long ip, | 574 | unsigned long ip, |
| 583 | unsigned long parent_ip, | 575 | unsigned long parent_ip, |
| @@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr); | |||
| 597 | 589 | ||
| 598 | void tracing_start_cmdline_record(void); | 590 | void tracing_start_cmdline_record(void); |
| 599 | void tracing_stop_cmdline_record(void); | 591 | void tracing_stop_cmdline_record(void); |
| 600 | void tracing_sched_switch_assign_trace(struct trace_array *tr); | ||
| 601 | void tracing_stop_sched_switch_record(void); | ||
| 602 | void tracing_start_sched_switch_record(void); | ||
| 603 | int register_tracer(struct tracer *type); | 592 | int register_tracer(struct tracer *type); |
| 604 | int is_tracing_stopped(void); | 593 | int is_tracing_stopped(void); |
| 605 | 594 | ||
| @@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); | |||
| 719 | 708 | ||
| 720 | extern unsigned long trace_flags; | 709 | extern unsigned long trace_flags; |
| 721 | 710 | ||
| 711 | extern char trace_find_mark(unsigned long long duration); | ||
| 712 | |||
| 722 | /* Standard output formatting function used for function return traces */ | 713 | /* Standard output formatting function used for function return traces */ |
| 723 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 714 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 724 | 715 | ||
| @@ -737,7 +728,7 @@ extern unsigned long trace_flags; | |||
| 737 | extern enum print_line_t | 728 | extern enum print_line_t |
| 738 | print_graph_function_flags(struct trace_iterator *iter, u32 flags); | 729 | print_graph_function_flags(struct trace_iterator *iter, u32 flags); |
| 739 | extern void print_graph_headers_flags(struct seq_file *s, u32 flags); | 730 | extern void print_graph_headers_flags(struct seq_file *s, u32 flags); |
| 740 | extern enum print_line_t | 731 | extern void |
| 741 | trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); | 732 | trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); |
| 742 | extern void graph_trace_open(struct trace_iterator *iter); | 733 | extern void graph_trace_open(struct trace_iterator *iter); |
| 743 | extern void graph_trace_close(struct trace_iterator *iter); | 734 | extern void graph_trace_close(struct trace_iterator *iter); |
| @@ -1310,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call, | |||
| 1310 | #define perf_ftrace_event_register NULL | 1301 | #define perf_ftrace_event_register NULL |
| 1311 | #endif | 1302 | #endif |
| 1312 | 1303 | ||
| 1304 | #ifdef CONFIG_FTRACE_SYSCALLS | ||
| 1305 | void init_ftrace_syscalls(void); | ||
| 1306 | #else | ||
| 1307 | static inline void init_ftrace_syscalls(void) { } | ||
| 1308 | #endif | ||
| 1309 | |||
| 1310 | #ifdef CONFIG_EVENT_TRACING | ||
| 1311 | void trace_event_init(void); | ||
| 1312 | #else | ||
| 1313 | static inline void __init trace_event_init(void) { } | ||
| 1314 | #endif | ||
| 1315 | |||
| 1316 | extern struct trace_iterator *tracepoint_print_iter; | ||
| 1317 | |||
| 1313 | #endif /* _LINUX_KERNEL_TRACE_H */ | 1318 | #endif /* _LINUX_KERNEL_TRACE_H */ |
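trace.h now declares `trace_find_mark()`, which `print_graph_duration()` (further down, in trace_functions_graph.c) uses in place of its open-coded `'!'`/`'+'` overhead markers. A hypothetical sketch of what such a helper could look like, reusing only the two thresholds the removed open-coded checks tested; the real table lives elsewhere in this series and may carry more levels:

```c
/* Hypothetical sketch of a duration-to-mark helper; the thresholds are
 * taken from the checks removed from print_graph_duration() below. The
 * real trace_find_mark() may use a larger table of marks. */
char trace_find_mark_sketch(unsigned long long duration)
{
	if (duration > 100000ULL)	/* what the old code flagged with '!' */
		return '!';
	if (duration > 10000ULL)	/* what the old code flagged with '+' */
		return '+';
	return ' ';			/* no overhead marker */
}
```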
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 697fb9bac8f0..7d6e2afde669 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
| @@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, | |||
| 151 | 151 | ||
| 152 | trace_assign_type(field, iter->ent); | 152 | trace_assign_type(field, iter->ent); |
| 153 | 153 | ||
| 154 | if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", | 154 | trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", |
| 155 | field->correct ? " ok " : " MISS ", | 155 | field->correct ? " ok " : " MISS ", |
| 156 | field->func, | 156 | field->func, |
| 157 | field->file, | 157 | field->file, |
| 158 | field->line)) | 158 | field->line); |
| 159 | return TRACE_TYPE_PARTIAL_LINE; | 159 | |
| 160 | 160 | return trace_handle_return(&iter->seq); | |
| 161 | return TRACE_TYPE_HANDLED; | ||
| 162 | } | 161 | } |
| 163 | 162 | ||
| 164 | static void branch_print_header(struct seq_file *s) | 163 | static void branch_print_header(struct seq_file *s) |
| 165 | { | 164 | { |
| 166 | seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" | 165 | seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" |
| 167 | " FUNC:FILE:LINE\n"); | 166 | " FUNC:FILE:LINE\n" |
| 168 | seq_puts(s, "# | | | | | " | 167 | "# | | | | | " |
| 169 | " |\n"); | 168 | " |\n"); |
| 170 | } | 169 | } |
| 171 | 170 | ||
| 172 | static struct trace_event_functions trace_branch_funcs = { | 171 | static struct trace_event_functions trace_branch_funcs = { |
| @@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[]; | |||
| 233 | 232 | ||
| 234 | static int annotated_branch_stat_headers(struct seq_file *m) | 233 | static int annotated_branch_stat_headers(struct seq_file *m) |
| 235 | { | 234 | { |
| 236 | seq_printf(m, " correct incorrect %% "); | 235 | seq_puts(m, " correct incorrect % " |
| 237 | seq_printf(m, " Function " | 236 | " Function " |
| 238 | " File Line\n" | 237 | " File Line\n" |
| 239 | " ------- --------- - " | 238 | " ------- --------- - " |
| 240 | " -------- " | 239 | " -------- " |
| 241 | " ---- ----\n"); | 240 | " ---- ----\n"); |
| 242 | return 0; | 241 | return 0; |
| 243 | } | 242 | } |
| 244 | 243 | ||
| @@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v) | |||
| 274 | 273 | ||
| 275 | seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); | 274 | seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); |
| 276 | if (percent < 0) | 275 | if (percent < 0) |
| 277 | seq_printf(m, " X "); | 276 | seq_puts(m, " X "); |
| 278 | else | 277 | else |
| 279 | seq_printf(m, "%3ld ", percent); | 278 | seq_printf(m, "%3ld ", percent); |
| 280 | seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); | 279 | seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); |
| @@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[]; | |||
| 362 | 361 | ||
| 363 | static int all_branch_stat_headers(struct seq_file *m) | 362 | static int all_branch_stat_headers(struct seq_file *m) |
| 364 | { | 363 | { |
| 365 | seq_printf(m, " miss hit %% "); | 364 | seq_puts(m, " miss hit % " |
| 366 | seq_printf(m, " Function " | 365 | " Function " |
| 367 | " File Line\n" | 366 | " File Line\n" |
| 368 | " ------- --------- - " | 367 | " ------- --------- - " |
| 369 | " -------- " | 368 | " -------- " |
| 370 | " ---- ----\n"); | 369 | " ---- ----\n"); |
| 371 | return 0; | 370 | return 0; |
| 372 | } | 371 | } |
| 373 | 372 | ||
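`trace_branch_print()` above now prints unconditionally and ends with `trace_handle_return(&iter->seq)`. Based on how it is used here, and on the `trace_seq_has_overflowed()` helper seen in the trace.c hunks, the helper is presumably equivalent to the sketch below; treat the exact definition as an assumption:

```c
/* Sketch: collapse the old per-call TRACE_TYPE_PARTIAL_LINE checks into a
 * single overflow test at the end of the output routine. */
static inline enum print_line_t trace_handle_return(struct trace_seq *s)
{
	return trace_seq_has_overflowed(s) ?
		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
}
```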
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ef06ce7e9cf8..366a78a3e61e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, | |||
| 212 | } | 212 | } |
| 213 | EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); | 213 | EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); |
| 214 | 214 | ||
| 215 | static DEFINE_SPINLOCK(tracepoint_iter_lock); | ||
| 216 | |||
| 217 | static void output_printk(struct ftrace_event_buffer *fbuffer) | ||
| 218 | { | ||
| 219 | struct ftrace_event_call *event_call; | ||
| 220 | struct trace_event *event; | ||
| 221 | unsigned long flags; | ||
| 222 | struct trace_iterator *iter = tracepoint_print_iter; | ||
| 223 | |||
| 224 | if (!iter) | ||
| 225 | return; | ||
| 226 | |||
| 227 | event_call = fbuffer->ftrace_file->event_call; | ||
| 228 | if (!event_call || !event_call->event.funcs || | ||
| 229 | !event_call->event.funcs->trace) | ||
| 230 | return; | ||
| 231 | |||
| 232 | event = &fbuffer->ftrace_file->event_call->event; | ||
| 233 | |||
| 234 | spin_lock_irqsave(&tracepoint_iter_lock, flags); | ||
| 235 | trace_seq_init(&iter->seq); | ||
| 236 | iter->ent = fbuffer->entry; | ||
| 237 | event_call->event.funcs->trace(iter, 0, event); | ||
| 238 | trace_seq_putc(&iter->seq, 0); | ||
| 239 | printk("%s", iter->seq.buffer); | ||
| 240 | |||
| 241 | spin_unlock_irqrestore(&tracepoint_iter_lock, flags); | ||
| 242 | } | ||
| 243 | |||
| 215 | void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) | 244 | void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) |
| 216 | { | 245 | { |
| 246 | if (tracepoint_printk) | ||
| 247 | output_printk(fbuffer); | ||
| 248 | |||
| 217 | event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, | 249 | event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, |
| 218 | fbuffer->event, fbuffer->entry, | 250 | fbuffer->event, fbuffer->entry, |
| 219 | fbuffer->flags, fbuffer->pc); | 251 | fbuffer->flags, fbuffer->pc); |
| @@ -461,7 +493,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) | |||
| 461 | 493 | ||
| 462 | if (dir) { | 494 | if (dir) { |
| 463 | spin_lock(&dir->d_lock); /* probably unneeded */ | 495 | spin_lock(&dir->d_lock); /* probably unneeded */ |
| 464 | list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { | 496 | list_for_each_entry(child, &dir->d_subdirs, d_child) { |
| 465 | if (child->d_inode) /* probably unneeded */ | 497 | if (child->d_inode) /* probably unneeded */ |
| 466 | child->d_inode->i_private = NULL; | 498 | child->d_inode->i_private = NULL; |
| 467 | } | 499 | } |
| @@ -918,7 +950,7 @@ static int f_show(struct seq_file *m, void *v) | |||
| 918 | case FORMAT_HEADER: | 950 | case FORMAT_HEADER: |
| 919 | seq_printf(m, "name: %s\n", ftrace_event_name(call)); | 951 | seq_printf(m, "name: %s\n", ftrace_event_name(call)); |
| 920 | seq_printf(m, "ID: %d\n", call->event.type); | 952 | seq_printf(m, "ID: %d\n", call->event.type); |
| 921 | seq_printf(m, "format:\n"); | 953 | seq_puts(m, "format:\n"); |
| 922 | return 0; | 954 | return 0; |
| 923 | 955 | ||
| 924 | case FORMAT_FIELD_SEPERATOR: | 956 | case FORMAT_FIELD_SEPERATOR: |
| @@ -1044,7 +1076,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
| 1044 | mutex_unlock(&event_mutex); | 1076 | mutex_unlock(&event_mutex); |
| 1045 | 1077 | ||
| 1046 | if (file) | 1078 | if (file) |
| 1047 | r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); | 1079 | r = simple_read_from_buffer(ubuf, cnt, ppos, |
| 1080 | s->buffer, trace_seq_used(s)); | ||
| 1048 | 1081 | ||
| 1049 | kfree(s); | 1082 | kfree(s); |
| 1050 | 1083 | ||
| @@ -1210,7 +1243,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
| 1210 | trace_seq_init(s); | 1243 | trace_seq_init(s); |
| 1211 | 1244 | ||
| 1212 | print_subsystem_event_filter(system, s); | 1245 | print_subsystem_event_filter(system, s); |
| 1213 | r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); | 1246 | r = simple_read_from_buffer(ubuf, cnt, ppos, |
| 1247 | s->buffer, trace_seq_used(s)); | ||
| 1214 | 1248 | ||
| 1215 | kfree(s); | 1249 | kfree(s); |
| 1216 | 1250 | ||
| @@ -1265,7 +1299,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) | |||
| 1265 | trace_seq_init(s); | 1299 | trace_seq_init(s); |
| 1266 | 1300 | ||
| 1267 | func(s); | 1301 | func(s); |
| 1268 | r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); | 1302 | r = simple_read_from_buffer(ubuf, cnt, ppos, |
| 1303 | s->buffer, trace_seq_used(s)); | ||
| 1269 | 1304 | ||
| 1270 | kfree(s); | 1305 | kfree(s); |
| 1271 | 1306 | ||
| @@ -1988,7 +2023,7 @@ event_enable_print(struct seq_file *m, unsigned long ip, | |||
| 1988 | ftrace_event_name(data->file->event_call)); | 2023 | ftrace_event_name(data->file->event_call)); |
| 1989 | 2024 | ||
| 1990 | if (data->count == -1) | 2025 | if (data->count == -1) |
| 1991 | seq_printf(m, ":unlimited\n"); | 2026 | seq_puts(m, ":unlimited\n"); |
| 1992 | else | 2027 | else |
| 1993 | seq_printf(m, ":count=%ld\n", data->count); | 2028 | seq_printf(m, ":count=%ld\n", data->count); |
| 1994 | 2029 | ||
| @@ -2477,8 +2512,14 @@ static __init int event_trace_init(void) | |||
| 2477 | #endif | 2512 | #endif |
| 2478 | return 0; | 2513 | return 0; |
| 2479 | } | 2514 | } |
| 2480 | early_initcall(event_trace_memsetup); | 2515 | |
| 2481 | core_initcall(event_trace_enable); | 2516 | void __init trace_event_init(void) |
| 2517 | { | ||
| 2518 | event_trace_memsetup(); | ||
| 2519 | init_ftrace_syscalls(); | ||
| 2520 | event_trace_enable(); | ||
| 2521 | } | ||
| 2522 | |||
| 2482 | fs_initcall(event_trace_init); | 2523 | fs_initcall(event_trace_init); |
| 2483 | 2524 | ||
| 2484 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 2525 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
| @@ -2513,8 +2554,11 @@ static __init int event_test_thread(void *unused) | |||
| 2513 | kfree(test_malloc); | 2554 | kfree(test_malloc); |
| 2514 | 2555 | ||
| 2515 | set_current_state(TASK_INTERRUPTIBLE); | 2556 | set_current_state(TASK_INTERRUPTIBLE); |
| 2516 | while (!kthread_should_stop()) | 2557 | while (!kthread_should_stop()) { |
| 2517 | schedule(); | 2558 | schedule(); |
| 2559 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 2560 | } | ||
| 2561 | __set_current_state(TASK_RUNNING); | ||
| 2518 | 2562 | ||
| 2519 | return 0; | 2563 | return 0; |
| 2520 | } | 2564 | } |
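The `event_test_thread()` hunk above is a correctness fix, not a cleanup: `schedule()` returns with the task back in TASK_RUNNING, so unless the state is re-armed inside the loop the next iteration never sleeps and the thread spins until `kthread_should_stop()` becomes true. The canonical shape of such a wait loop, as a sketch with the reasoning spelled out in comments:

```c
/* Sketch of the idiom the fix above restores. */
static int wait_until_stopped(void *unused)
{
	set_current_state(TASK_INTERRUPTIBLE);	/* arm before the first check */
	while (!kthread_should_stop()) {
		schedule();			/* sleeps only while INTERRUPTIBLE */
		/* schedule() returned with us in TASK_RUNNING: re-arm, or
		 * the next schedule() returns immediately and we busy-loop. */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);	/* leave in a sane state */
	return 0;
}
```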
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7a8c1528e141..ced69da0ff55 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
| @@ -45,6 +45,7 @@ enum filter_op_ids | |||
| 45 | OP_GT, | 45 | OP_GT, |
| 46 | OP_GE, | 46 | OP_GE, |
| 47 | OP_BAND, | 47 | OP_BAND, |
| 48 | OP_NOT, | ||
| 48 | OP_NONE, | 49 | OP_NONE, |
| 49 | OP_OPEN_PAREN, | 50 | OP_OPEN_PAREN, |
| 50 | }; | 51 | }; |
| @@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = { | |||
| 67 | { OP_GT, ">", 5 }, | 68 | { OP_GT, ">", 5 }, |
| 68 | { OP_GE, ">=", 5 }, | 69 | { OP_GE, ">=", 5 }, |
| 69 | { OP_BAND, "&", 6 }, | 70 | { OP_BAND, "&", 6 }, |
| 71 | { OP_NOT, "!", 6 }, | ||
| 70 | { OP_NONE, "OP_NONE", 0 }, | 72 | { OP_NONE, "OP_NONE", 0 }, |
| 71 | { OP_OPEN_PAREN, "(", 0 }, | 73 | { OP_OPEN_PAREN, "(", 0 }, |
| 72 | }; | 74 | }; |
| @@ -85,6 +87,7 @@ enum { | |||
| 85 | FILT_ERR_MISSING_FIELD, | 87 | FILT_ERR_MISSING_FIELD, |
| 86 | FILT_ERR_INVALID_FILTER, | 88 | FILT_ERR_INVALID_FILTER, |
| 87 | FILT_ERR_IP_FIELD_ONLY, | 89 | FILT_ERR_IP_FIELD_ONLY, |
| 90 | FILT_ERR_ILLEGAL_NOT_OP, | ||
| 88 | }; | 91 | }; |
| 89 | 92 | ||
| 90 | static char *err_text[] = { | 93 | static char *err_text[] = { |
| @@ -101,6 +104,7 @@ static char *err_text[] = { | |||
| 101 | "Missing field name and/or value", | 104 | "Missing field name and/or value", |
| 102 | "Meaningless filter expression", | 105 | "Meaningless filter expression", |
| 103 | "Only 'ip' field is supported for function trace", | 106 | "Only 'ip' field is supported for function trace", |
| 107 | "Illegal use of '!'", | ||
| 104 | }; | 108 | }; |
| 105 | 109 | ||
| 106 | struct opstack_op { | 110 | struct opstack_op { |
| @@ -139,6 +143,7 @@ struct pred_stack { | |||
| 139 | int index; | 143 | int index; |
| 140 | }; | 144 | }; |
| 141 | 145 | ||
| 146 | /* If the predicate's "not" flag is set, invert the match result */ | ||
| 142 | #define DEFINE_COMPARISON_PRED(type) \ | 147 | #define DEFINE_COMPARISON_PRED(type) \ |
| 143 | static int filter_pred_##type(struct filter_pred *pred, void *event) \ | 148 | static int filter_pred_##type(struct filter_pred *pred, void *event) \ |
| 144 | { \ | 149 | { \ |
| @@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ | |||
| 166 | break; \ | 171 | break; \ |
| 167 | } \ | 172 | } \ |
| 168 | \ | 173 | \ |
| 169 | return match; \ | 174 | return !!match == !pred->not; \ |
| 170 | } | 175 | } |
| 171 | 176 | ||
| 172 | #define DEFINE_EQUALITY_PRED(size) \ | 177 | #define DEFINE_EQUALITY_PRED(size) \ |
| @@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds, | |||
| 484 | if (!WARN_ON_ONCE(!pred->fn)) | 489 | if (!WARN_ON_ONCE(!pred->fn)) |
| 485 | match = pred->fn(pred, rec); | 490 | match = pred->fn(pred, rec); |
| 486 | if (!!match == type) | 491 | if (!!match == type) |
| 487 | return match; | 492 | break; |
| 488 | } | 493 | } |
| 489 | return match; | 494 | /* If the "not" flag is set, invert the match result */ |
| 495 | return !!match == !op->not; | ||
| 490 | } | 496 | } |
| 491 | 497 | ||
| 492 | struct filter_match_preds_data { | 498 | struct filter_match_preds_data { |
| @@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter, | |||
| 735 | * then this op can be folded. | 741 | * then this op can be folded. |
| 736 | */ | 742 | */ |
| 737 | if (left->index & FILTER_PRED_FOLD && | 743 | if (left->index & FILTER_PRED_FOLD && |
| 738 | (left->op == dest->op || | 744 | ((left->op == dest->op && !left->not) || |
| 739 | left->left == FILTER_PRED_INVALID) && | 745 | left->left == FILTER_PRED_INVALID) && |
| 740 | right->index & FILTER_PRED_FOLD && | 746 | right->index & FILTER_PRED_FOLD && |
| 741 | (right->op == dest->op || | 747 | ((right->op == dest->op && !right->not) || |
| 742 | right->left == FILTER_PRED_INVALID)) | 748 | right->left == FILTER_PRED_INVALID)) |
| 743 | dest->index |= FILTER_PRED_FOLD; | 749 | dest->index |= FILTER_PRED_FOLD; |
| 744 | 750 | ||
| @@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps, | |||
| 1028 | } | 1034 | } |
| 1029 | 1035 | ||
| 1030 | if (pred->op == OP_NE) | 1036 | if (pred->op == OP_NE) |
| 1031 | pred->not = 1; | 1037 | pred->not ^= 1; |
| 1032 | 1038 | ||
| 1033 | pred->fn = fn; | 1039 | pred->fn = fn; |
| 1034 | return 0; | 1040 | return 0; |
| @@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call, | |||
| 1590 | continue; | 1596 | continue; |
| 1591 | } | 1597 | } |
| 1592 | 1598 | ||
| 1599 | if (elt->op == OP_NOT) { | ||
| 1600 | if (!n_preds || operand1 || operand2) { | ||
| 1601 | parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0); | ||
| 1602 | err = -EINVAL; | ||
| 1603 | goto fail; | ||
| 1604 | } | ||
| 1605 | if (!dry_run) | ||
| 1606 | filter->preds[n_preds - 1].not ^= 1; | ||
| 1607 | continue; | ||
| 1608 | } | ||
| 1609 | |||
| 1593 | if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { | 1610 | if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { |
| 1594 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 1611 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
| 1595 | err = -ENOSPC; | 1612 | err = -ENOSPC; |
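The filter changes above teach the parser a standalone `!` operator (OP_NOT) that flips the `not` flag of the preceding predicate or sub-expression, and `process_ops()` plus the comparison predicates then apply it with the `!!match == !not` test. A sketch of what that expression computes; the helper name is mine, the logic is taken from the hunks:

```c
/* Sketch: "!!match == !not" is a boolean XOR of the raw match result with
 * the predicate's not flag -- it inverts the outcome exactly when the
 * predicate was negated with '!'. */
static int apply_not_sketch(int match, int not)
{
	return !!match == !not;
	/* not == 0: returns 1 iff match was non-zero  (normal predicate)
	 * not == 1: returns 1 iff match was zero      (negated predicate) */
}
```

With the parser change, an expression such as `!(field == 1 && other == 2)` should now be accepted and evaluated as the negation of the whole group (the field names here are placeholders, not real event fields).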
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 4747b476a030..8712df9decb4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c | |||
| @@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m, | |||
| 373 | { | 373 | { |
| 374 | long count = (long)data; | 374 | long count = (long)data; |
| 375 | 375 | ||
| 376 | seq_printf(m, "%s", name); | 376 | seq_puts(m, name); |
| 377 | 377 | ||
| 378 | if (count == -1) | 378 | if (count == -1) |
| 379 | seq_puts(m, ":unlimited"); | 379 | seq_puts(m, ":unlimited"); |
| @@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m, | |||
| 383 | if (filter_str) | 383 | if (filter_str) |
| 384 | seq_printf(m, " if %s\n", filter_str); | 384 | seq_printf(m, " if %s\n", filter_str); |
| 385 | else | 385 | else |
| 386 | seq_puts(m, "\n"); | 386 | seq_putc(m, '\n'); |
| 387 | 387 | ||
| 388 | return 0; | 388 | return 0; |
| 389 | } | 389 | } |
| @@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, | |||
| 1105 | if (data->filter_str) | 1105 | if (data->filter_str) |
| 1106 | seq_printf(m, " if %s\n", data->filter_str); | 1106 | seq_printf(m, " if %s\n", data->filter_str); |
| 1107 | else | 1107 | else |
| 1108 | seq_puts(m, "\n"); | 1108 | seq_putc(m, '\n'); |
| 1109 | 1109 | ||
| 1110 | return 0; | 1110 | return 0; |
| 1111 | } | 1111 | } |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 57f0ec962d2c..fcd41a166405 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
| @@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data = | |||
| 261 | }; | 261 | }; |
| 262 | 262 | ||
| 263 | #ifdef CONFIG_DYNAMIC_FTRACE | 263 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 264 | static int update_count(void **data) | 264 | static void update_traceon_count(void **data, bool on) |
| 265 | { | 265 | { |
| 266 | unsigned long *count = (long *)data; | 266 | long *count = (long *)data; |
| 267 | long old_count = *count; | ||
| 267 | 268 | ||
| 268 | if (!*count) | 269 | /* |
| 269 | return 0; | 270 | * Tracing gets disabled (or enabled) once per count. |
| 271 | * This function can be called at the same time on multiple CPUs. | ||
| 272 | * It is fine if both disable (or enable) tracing, as disabling | ||
| 273 | * (or enabling) the second time doesn't do anything as the | ||
| 274 | * state of the tracer is already disabled (or enabled). | ||
| 275 | * What needs to be synchronized in this case is that the count | ||
| 276 | * only gets decremented once, even if the tracer is disabled | ||
| 277 | * (or enabled) twice, as the second one is really a nop. | ||
| 278 | * | ||
| 279 | * The memory barriers guarantee that we only decrement the | ||
| 280 | * counter once. First the count is read to a local variable | ||
| 281 | * and a read barrier is used to make sure that it is loaded | ||
| 282 | * before checking if the tracer is in the state we want. | ||
| 283 | * If the tracer is not in the state we want, then the count | ||
| 284 | * is guaranteed to be the old count. | ||
| 285 | * | ||
| 286 | * Next the tracer is set to the state we want (disabled or enabled) | ||
| 287 | * then a write memory barrier is used to make sure that | ||
| 288 | * the new state is visible before changing the counter by | ||
| 289 | * one minus the old counter. This guarantees that another CPU | ||
| 290 | * executing this code will see the new state before seeing | ||
| 291 | * the new counter value, and would not do anything if the new | ||
| 292 | * counter is seen. | ||
| 293 | * | ||
| 294 | * Note, there is no synchronization between this and a user | ||
| 295 | * setting the tracing_on file. But we currently don't care | ||
| 296 | * about that. | ||
| 297 | */ | ||
| 298 | if (!old_count) | ||
| 299 | return; | ||
| 270 | 300 | ||
| 271 | if (*count != -1) | 301 | /* Make sure we see count before checking tracing state */ |
| 272 | (*count)--; | 302 | smp_rmb(); |
| 273 | 303 | ||
| 274 | return 1; | 304 | if (on == !!tracing_is_on()) |
| 305 | return; | ||
| 306 | |||
| 307 | if (on) | ||
| 308 | tracing_on(); | ||
| 309 | else | ||
| 310 | tracing_off(); | ||
| 311 | |||
| 312 | /* unlimited? */ | ||
| 313 | if (old_count == -1) | ||
| 314 | return; | ||
| 315 | |||
| 316 | /* Make sure tracing state is visible before updating count */ | ||
| 317 | smp_wmb(); | ||
| 318 | |||
| 319 | *count = old_count - 1; | ||
| 275 | } | 320 | } |
| 276 | 321 | ||
| 277 | static void | 322 | static void |
| 278 | ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) | 323 | ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) |
| 279 | { | 324 | { |
| 280 | if (tracing_is_on()) | 325 | update_traceon_count(data, 1); |
| 281 | return; | ||
| 282 | |||
| 283 | if (update_count(data)) | ||
| 284 | tracing_on(); | ||
| 285 | } | 326 | } |
| 286 | 327 | ||
| 287 | static void | 328 | static void |
| 288 | ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) | 329 | ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) |
| 289 | { | 330 | { |
| 290 | if (!tracing_is_on()) | 331 | update_traceon_count(data, 0); |
| 291 | return; | ||
| 292 | |||
| 293 | if (update_count(data)) | ||
| 294 | tracing_off(); | ||
| 295 | } | 332 | } |
| 296 | 333 | ||
| 297 | static void | 334 | static void |
| @@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) | |||
| 330 | static void | 367 | static void |
| 331 | ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) | 368 | ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) |
| 332 | { | 369 | { |
| 333 | if (!tracing_is_on()) | 370 | long *count = (long *)data; |
| 334 | return; | 371 | long old_count; |
| 372 | long new_count; | ||
| 335 | 373 | ||
| 336 | if (update_count(data)) | 374 | /* |
| 337 | trace_dump_stack(STACK_SKIP); | 375 | * Stack traces should only execute the number of times the |
| 376 | * user specified in the counter. | ||
| 377 | */ | ||
| 378 | do { | ||
| 379 | |||
| 380 | if (!tracing_is_on()) | ||
| 381 | return; | ||
| 382 | |||
| 383 | old_count = *count; | ||
| 384 | |||
| 385 | if (!old_count) | ||
| 386 | return; | ||
| 387 | |||
| 388 | /* unlimited? */ | ||
| 389 | if (old_count == -1) { | ||
| 390 | trace_dump_stack(STACK_SKIP); | ||
| 391 | return; | ||
| 392 | } | ||
| 393 | |||
| 394 | new_count = old_count - 1; | ||
| 395 | new_count = cmpxchg(count, old_count, new_count); | ||
| 396 | if (new_count == old_count) | ||
| 397 | trace_dump_stack(STACK_SKIP); | ||
| 398 | |||
| 399 | } while (new_count != old_count); | ||
| 400 | } | ||
| 401 | |||
| 402 | static int update_count(void **data) | ||
| 403 | { | ||
| 404 | unsigned long *count = (long *)data; | ||
| 405 | |||
| 406 | if (!*count) | ||
| 407 | return 0; | ||
| 408 | |||
| 409 | if (*count != -1) | ||
| 410 | (*count)--; | ||
| 411 | |||
| 412 | return 1; | ||
| 338 | } | 413 | } |
| 339 | 414 | ||
| 340 | static void | 415 | static void |
| @@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m, | |||
| 361 | seq_printf(m, "%ps:%s", (void *)ip, name); | 436 | seq_printf(m, "%ps:%s", (void *)ip, name); |
| 362 | 437 | ||
| 363 | if (count == -1) | 438 | if (count == -1) |
| 364 | seq_printf(m, ":unlimited\n"); | 439 | seq_puts(m, ":unlimited\n"); |
| 365 | else | 440 | else |
| 366 | seq_printf(m, ":count=%ld\n", count); | 441 | seq_printf(m, ":count=%ld\n", count); |
| 367 | 442 | ||
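`ftrace_stacktrace_count()` above replaces the old racy `update_count()` path with a `cmpxchg()` loop so that, when several CPUs hit the probe concurrently, each remaining count produces exactly one stack dump. The same idea as a self-contained sketch; the helper name is mine and the hunk open-codes this logic rather than calling a helper:

```c
/* Sketch: consume one unit from *count at most once per caller, without
 * locks. Returns true when the caller is allowed to act. */
static bool consume_one_sketch(long *count)
{
	long old, seen;

	do {
		old = *count;
		if (!old)		/* budget exhausted: do nothing */
			return false;
		if (old == -1)		/* -1 means unlimited: never decrement */
			return true;
		/* only the CPU whose cmpxchg() succeeds gets to act */
		seen = cmpxchg(count, old, old - 1);
	} while (seen != old);

	return true;
}
```

`update_traceon_count()` solves the analogous race for the traceon/traceoff probes with the `smp_rmb()`/`smp_wmb()` pairing documented in its comment rather than with `cmpxchg()`, because there it is the tracing state change itself that must be ordered against the single decrement.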
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f0a0c982cde3..ba476009e5de 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -107,7 +107,7 @@ enum { | |||
| 107 | FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, | 107 | FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, |
| 108 | }; | 108 | }; |
| 109 | 109 | ||
| 110 | static enum print_line_t | 110 | static void |
| 111 | print_graph_duration(unsigned long long duration, struct trace_seq *s, | 111 | print_graph_duration(unsigned long long duration, struct trace_seq *s, |
| 112 | u32 flags); | 112 | u32 flags); |
| 113 | 113 | ||
| @@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr) | |||
| 483 | 483 | ||
| 484 | static int max_bytes_for_cpu; | 484 | static int max_bytes_for_cpu; |
| 485 | 485 | ||
| 486 | static enum print_line_t | 486 | static void print_graph_cpu(struct trace_seq *s, int cpu) |
| 487 | print_graph_cpu(struct trace_seq *s, int cpu) | ||
| 488 | { | 487 | { |
| 489 | int ret; | ||
| 490 | |||
| 491 | /* | 488 | /* |
| 492 | * Start with a space character - to make it stand out | 489 | * Start with a space character - to make it stand out |
| 493 | * to the right a bit when trace output is pasted into | 490 | * to the right a bit when trace output is pasted into |
| 494 | * email: | 491 | * email: |
| 495 | */ | 492 | */ |
| 496 | ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); | 493 | trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); |
| 497 | if (!ret) | ||
| 498 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 499 | |||
| 500 | return TRACE_TYPE_HANDLED; | ||
| 501 | } | 494 | } |
| 502 | 495 | ||
| 503 | #define TRACE_GRAPH_PROCINFO_LENGTH 14 | 496 | #define TRACE_GRAPH_PROCINFO_LENGTH 14 |
| 504 | 497 | ||
| 505 | static enum print_line_t | 498 | static void print_graph_proc(struct trace_seq *s, pid_t pid) |
| 506 | print_graph_proc(struct trace_seq *s, pid_t pid) | ||
| 507 | { | 499 | { |
| 508 | char comm[TASK_COMM_LEN]; | 500 | char comm[TASK_COMM_LEN]; |
| 509 | /* sign + log10(MAX_INT) + '\0' */ | 501 | /* sign + log10(MAX_INT) + '\0' */ |
| 510 | char pid_str[11]; | 502 | char pid_str[11]; |
| 511 | int spaces = 0; | 503 | int spaces = 0; |
| 512 | int ret; | ||
| 513 | int len; | 504 | int len; |
| 514 | int i; | 505 | int i; |
| 515 | 506 | ||
| @@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid) | |||
| 524 | spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; | 515 | spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; |
| 525 | 516 | ||
| 526 | /* First spaces to align center */ | 517 | /* First spaces to align center */ |
| 527 | for (i = 0; i < spaces / 2; i++) { | 518 | for (i = 0; i < spaces / 2; i++) |
| 528 | ret = trace_seq_putc(s, ' '); | 519 | trace_seq_putc(s, ' '); |
| 529 | if (!ret) | ||
| 530 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 531 | } | ||
| 532 | 520 | ||
| 533 | ret = trace_seq_printf(s, "%s-%s", comm, pid_str); | 521 | trace_seq_printf(s, "%s-%s", comm, pid_str); |
| 534 | if (!ret) | ||
| 535 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 536 | 522 | ||
| 537 | /* Last spaces to align center */ | 523 | /* Last spaces to align center */ |
| 538 | for (i = 0; i < spaces - (spaces / 2); i++) { | 524 | for (i = 0; i < spaces - (spaces / 2); i++) |
| 539 | ret = trace_seq_putc(s, ' '); | 525 | trace_seq_putc(s, ' '); |
| 540 | if (!ret) | ||
| 541 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 542 | } | ||
| 543 | return TRACE_TYPE_HANDLED; | ||
| 544 | } | 526 | } |
| 545 | 527 | ||
| 546 | 528 | ||
| 547 | static enum print_line_t | 529 | static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) |
| 548 | print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | ||
| 549 | { | 530 | { |
| 550 | if (!trace_seq_putc(s, ' ')) | 531 | trace_seq_putc(s, ' '); |
| 551 | return 0; | 532 | trace_print_lat_fmt(s, entry); |
| 552 | |||
| 553 | return trace_print_lat_fmt(s, entry); | ||
| 554 | } | 533 | } |
| 555 | 534 | ||
| 556 | /* If the pid changed since the last trace, output this event */ | 535 | /* If the pid changed since the last trace, output this event */ |
| 557 | static enum print_line_t | 536 | static void |
| 558 | verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) | 537 | verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) |
| 559 | { | 538 | { |
| 560 | pid_t prev_pid; | 539 | pid_t prev_pid; |
| 561 | pid_t *last_pid; | 540 | pid_t *last_pid; |
| 562 | int ret; | ||
| 563 | 541 | ||
| 564 | if (!data) | 542 | if (!data) |
| 565 | return TRACE_TYPE_HANDLED; | 543 | return; |
| 566 | 544 | ||
| 567 | last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); | 545 | last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); |
| 568 | 546 | ||
| 569 | if (*last_pid == pid) | 547 | if (*last_pid == pid) |
| 570 | return TRACE_TYPE_HANDLED; | 548 | return; |
| 571 | 549 | ||
| 572 | prev_pid = *last_pid; | 550 | prev_pid = *last_pid; |
| 573 | *last_pid = pid; | 551 | *last_pid = pid; |
| 574 | 552 | ||
| 575 | if (prev_pid == -1) | 553 | if (prev_pid == -1) |
| 576 | return TRACE_TYPE_HANDLED; | 554 | return; |
| 577 | /* | 555 | /* |
| 578 | * Context-switch trace line: | 556 | * Context-switch trace line: |
| 579 | 557 | ||
| @@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) | |||
| 582 | ------------------------------------------ | 560 | ------------------------------------------ |
| 583 | 561 | ||
| 584 | */ | 562 | */ |
| 585 | ret = trace_seq_puts(s, | 563 | trace_seq_puts(s, " ------------------------------------------\n"); |
| 586 | " ------------------------------------------\n"); | 564 | print_graph_cpu(s, cpu); |
| 587 | if (!ret) | 565 | print_graph_proc(s, prev_pid); |
| 588 | return TRACE_TYPE_PARTIAL_LINE; | 566 | trace_seq_puts(s, " => "); |
| 589 | 567 | print_graph_proc(s, pid); | |
| 590 | ret = print_graph_cpu(s, cpu); | 568 | trace_seq_puts(s, "\n ------------------------------------------\n\n"); |
| 591 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 592 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 593 | |||
| 594 | ret = print_graph_proc(s, prev_pid); | ||
| 595 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 596 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 597 | |||
| 598 | ret = trace_seq_puts(s, " => "); | ||
| 599 | if (!ret) | ||
| 600 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 601 | |||
| 602 | ret = print_graph_proc(s, pid); | ||
| 603 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 604 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 605 | |||
| 606 | ret = trace_seq_puts(s, | ||
| 607 | "\n ------------------------------------------\n\n"); | ||
| 608 | if (!ret) | ||
| 609 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 610 | |||
| 611 | return TRACE_TYPE_HANDLED; | ||
| 612 | } | 569 | } |
| 613 | 570 | ||
| 614 | static struct ftrace_graph_ret_entry * | 571 | static struct ftrace_graph_ret_entry * |
| @@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter, | |||
| 682 | return next; | 639 | return next; |
| 683 | } | 640 | } |
| 684 | 641 | ||
| 685 | static int print_graph_abs_time(u64 t, struct trace_seq *s) | 642 | static void print_graph_abs_time(u64 t, struct trace_seq *s) |
| 686 | { | 643 | { |
| 687 | unsigned long usecs_rem; | 644 | unsigned long usecs_rem; |
| 688 | 645 | ||
| 689 | usecs_rem = do_div(t, NSEC_PER_SEC); | 646 | usecs_rem = do_div(t, NSEC_PER_SEC); |
| 690 | usecs_rem /= 1000; | 647 | usecs_rem /= 1000; |
| 691 | 648 | ||
| 692 | return trace_seq_printf(s, "%5lu.%06lu | ", | 649 | trace_seq_printf(s, "%5lu.%06lu | ", |
| 693 | (unsigned long)t, usecs_rem); | 650 | (unsigned long)t, usecs_rem); |
| 694 | } | 651 | } |
| 695 | 652 | ||
| 696 | static enum print_line_t | 653 | static void |
| 697 | print_graph_irq(struct trace_iterator *iter, unsigned long addr, | 654 | print_graph_irq(struct trace_iterator *iter, unsigned long addr, |
| 698 | enum trace_type type, int cpu, pid_t pid, u32 flags) | 655 | enum trace_type type, int cpu, pid_t pid, u32 flags) |
| 699 | { | 656 | { |
| 700 | int ret; | ||
| 701 | struct trace_seq *s = &iter->seq; | 657 | struct trace_seq *s = &iter->seq; |
| 658 | struct trace_entry *ent = iter->ent; | ||
| 702 | 659 | ||
| 703 | if (addr < (unsigned long)__irqentry_text_start || | 660 | if (addr < (unsigned long)__irqentry_text_start || |
| 704 | addr >= (unsigned long)__irqentry_text_end) | 661 | addr >= (unsigned long)__irqentry_text_end) |
| 705 | return TRACE_TYPE_UNHANDLED; | 662 | return; |
| 706 | 663 | ||
| 707 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { | 664 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { |
| 708 | /* Absolute time */ | 665 | /* Absolute time */ |
| 709 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { | 666 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) |
| 710 | ret = print_graph_abs_time(iter->ts, s); | 667 | print_graph_abs_time(iter->ts, s); |
| 711 | if (!ret) | ||
| 712 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 713 | } | ||
| 714 | 668 | ||
| 715 | /* Cpu */ | 669 | /* Cpu */ |
| 716 | if (flags & TRACE_GRAPH_PRINT_CPU) { | 670 | if (flags & TRACE_GRAPH_PRINT_CPU) |
| 717 | ret = print_graph_cpu(s, cpu); | 671 | print_graph_cpu(s, cpu); |
| 718 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 719 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 720 | } | ||
| 721 | 672 | ||
| 722 | /* Proc */ | 673 | /* Proc */ |
| 723 | if (flags & TRACE_GRAPH_PRINT_PROC) { | 674 | if (flags & TRACE_GRAPH_PRINT_PROC) { |
| 724 | ret = print_graph_proc(s, pid); | 675 | print_graph_proc(s, pid); |
| 725 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 676 | trace_seq_puts(s, " | "); |
| 726 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 727 | ret = trace_seq_puts(s, " | "); | ||
| 728 | if (!ret) | ||
| 729 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 730 | } | 677 | } |
| 678 | |||
| 679 | /* Latency format */ | ||
| 680 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
| 681 | print_graph_lat_fmt(s, ent); | ||
| 731 | } | 682 | } |
| 732 | 683 | ||
| 733 | /* No overhead */ | 684 | /* No overhead */ |
| 734 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); | 685 | print_graph_duration(0, s, flags | FLAGS_FILL_START); |
| 735 | if (ret != TRACE_TYPE_HANDLED) | ||
| 736 | return ret; | ||
| 737 | 686 | ||
| 738 | if (type == TRACE_GRAPH_ENT) | 687 | if (type == TRACE_GRAPH_ENT) |
| 739 | ret = trace_seq_puts(s, "==========>"); | 688 | trace_seq_puts(s, "==========>"); |
| 740 | else | 689 | else |
| 741 | ret = trace_seq_puts(s, "<=========="); | 690 | trace_seq_puts(s, "<=========="); |
| 742 | |||
| 743 | if (!ret) | ||
| 744 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 745 | |||
| 746 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); | ||
| 747 | if (ret != TRACE_TYPE_HANDLED) | ||
| 748 | return ret; | ||
| 749 | |||
| 750 | ret = trace_seq_putc(s, '\n'); | ||
| 751 | 691 | ||
| 752 | if (!ret) | 692 | print_graph_duration(0, s, flags | FLAGS_FILL_END); |
| 753 | return TRACE_TYPE_PARTIAL_LINE; | 693 | trace_seq_putc(s, '\n'); |
| 754 | return TRACE_TYPE_HANDLED; | ||
| 755 | } | 694 | } |
| 756 | 695 | ||
| 757 | enum print_line_t | 696 | void |
| 758 | trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | 697 | trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) |
| 759 | { | 698 | { |
| 760 | unsigned long nsecs_rem = do_div(duration, 1000); | 699 | unsigned long nsecs_rem = do_div(duration, 1000); |
| 761 | /* log10(ULONG_MAX) + '\0' */ | 700 | /* log10(ULONG_MAX) + '\0' */ |
| 762 | char msecs_str[21]; | 701 | char usecs_str[21]; |
| 763 | char nsecs_str[5]; | 702 | char nsecs_str[5]; |
| 764 | int ret, len; | 703 | int len; |
| 765 | int i; | 704 | int i; |
| 766 | 705 | ||
| 767 | sprintf(msecs_str, "%lu", (unsigned long) duration); | 706 | sprintf(usecs_str, "%lu", (unsigned long) duration); |
| 768 | 707 | ||
| 769 | /* Print msecs */ | 708 | /* Print msecs */ |
| 770 | ret = trace_seq_printf(s, "%s", msecs_str); | 709 | trace_seq_printf(s, "%s", usecs_str); |
| 771 | if (!ret) | ||
| 772 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 773 | 710 | ||
| 774 | len = strlen(msecs_str); | 711 | len = strlen(usecs_str); |
| 775 | 712 | ||
| 776 | /* Print nsecs (we don't want to exceed 7 numbers) */ | 713 | /* Print nsecs (we don't want to exceed 7 numbers) */ |
| 777 | if (len < 7) { | 714 | if (len < 7) { |
| 778 | size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); | 715 | size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); |
| 779 | 716 | ||
| 780 | snprintf(nsecs_str, slen, "%03lu", nsecs_rem); | 717 | snprintf(nsecs_str, slen, "%03lu", nsecs_rem); |
| 781 | ret = trace_seq_printf(s, ".%s", nsecs_str); | 718 | trace_seq_printf(s, ".%s", nsecs_str); |
| 782 | if (!ret) | ||
| 783 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 784 | len += strlen(nsecs_str); | 719 | len += strlen(nsecs_str); |
| 785 | } | 720 | } |
| 786 | 721 | ||
| 787 | ret = trace_seq_puts(s, " us "); | 722 | trace_seq_puts(s, " us "); |
| 788 | if (!ret) | ||
| 789 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 790 | 723 | ||
| 791 | /* Print remaining spaces to fit the row's width */ | 724 | /* Print remaining spaces to fit the row's width */ |
| 792 | for (i = len; i < 7; i++) { | 725 | for (i = len; i < 7; i++) |
| 793 | ret = trace_seq_putc(s, ' '); | 726 | trace_seq_putc(s, ' '); |
| 794 | if (!ret) | ||
| 795 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 796 | } | ||
| 797 | return TRACE_TYPE_HANDLED; | ||
| 798 | } | 727 | } |
| 799 | 728 | ||
| 800 | static enum print_line_t | 729 | static void |
| 801 | print_graph_duration(unsigned long long duration, struct trace_seq *s, | 730 | print_graph_duration(unsigned long long duration, struct trace_seq *s, |
| 802 | u32 flags) | 731 | u32 flags) |
| 803 | { | 732 | { |
| 804 | int ret = -1; | ||
| 805 | |||
| 806 | if (!(flags & TRACE_GRAPH_PRINT_DURATION) || | 733 | if (!(flags & TRACE_GRAPH_PRINT_DURATION) || |
| 807 | !(trace_flags & TRACE_ITER_CONTEXT_INFO)) | 734 | !(trace_flags & TRACE_ITER_CONTEXT_INFO)) |
| 808 | return TRACE_TYPE_HANDLED; | 735 | return; |
| 809 | 736 | ||
| 810 | /* No real data, just filling the column with spaces */ | 737 | /* No real data, just filling the column with spaces */ |
| 811 | switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { | 738 | switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { |
| 812 | case FLAGS_FILL_FULL: | 739 | case FLAGS_FILL_FULL: |
| 813 | ret = trace_seq_puts(s, " | "); | 740 | trace_seq_puts(s, " | "); |
| 814 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 741 | return; |
| 815 | case FLAGS_FILL_START: | 742 | case FLAGS_FILL_START: |
| 816 | ret = trace_seq_puts(s, " "); | 743 | trace_seq_puts(s, " "); |
| 817 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 744 | return; |
| 818 | case FLAGS_FILL_END: | 745 | case FLAGS_FILL_END: |
| 819 | ret = trace_seq_puts(s, " |"); | 746 | trace_seq_puts(s, " |"); |
| 820 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 747 | return; |
| 821 | } | 748 | } |
| 822 | 749 | ||
| 823 | /* Signal an overhead of time execution to the output */ | 750 | /* Signal an overhead of time execution to the output */ |
| 824 | if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { | 751 | if (flags & TRACE_GRAPH_PRINT_OVERHEAD) |
| 825 | /* Duration exceeded 100 msecs */ | 752 | trace_seq_printf(s, "%c ", trace_find_mark(duration)); |
| 826 | if (duration > 100000ULL) | 753 | else |
| 827 | ret = trace_seq_puts(s, "! "); | 754 | trace_seq_puts(s, " "); |
| 828 | /* Duration exceeded 10 msecs */ | ||
| 829 | else if (duration > 10000ULL) | ||
| 830 | ret = trace_seq_puts(s, "+ "); | ||
| 831 | } | ||
| 832 | |||
| 833 | /* | ||
| 834 | * The -1 means we either did not exceed the duration thresholds | ||
| 835 | * or we don't want to print out the overhead. Either way we need | ||
| 836 | * to fill out the space. | ||
| 837 | */ | ||
| 838 | if (ret == -1) | ||
| 839 | ret = trace_seq_puts(s, " "); | ||
| 840 | |||
| 841 | /* Catch any failure that happened above */ | ||
| 842 | if (!ret) | ||
| 843 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 844 | |||
| 845 | ret = trace_print_graph_duration(duration, s); | ||
| 846 | if (ret != TRACE_TYPE_HANDLED) | ||
| 847 | return ret; | ||
| 848 | |||
| 849 | ret = trace_seq_puts(s, "| "); | ||
| 850 | if (!ret) | ||
| 851 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 852 | 755 | ||
| 853 | return TRACE_TYPE_HANDLED; | 756 | trace_print_graph_duration(duration, s); |
| 757 | trace_seq_puts(s, "| "); | ||
| 854 | } | 758 | } |
| 855 | 759 | ||
| 856 | /* Case of a leaf function on its call entry */ | 760 | /* Case of a leaf function on its call entry */ |
| @@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
| 864 | struct ftrace_graph_ret *graph_ret; | 768 | struct ftrace_graph_ret *graph_ret; |
| 865 | struct ftrace_graph_ent *call; | 769 | struct ftrace_graph_ent *call; |
| 866 | unsigned long long duration; | 770 | unsigned long long duration; |
| 867 | int ret; | ||
| 868 | int i; | 771 | int i; |
| 869 | 772 | ||
| 870 | graph_ret = &ret_entry->ret; | 773 | graph_ret = &ret_entry->ret; |
| @@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
| 890 | } | 793 | } |
| 891 | 794 | ||
| 892 | /* Overhead and duration */ | 795 | /* Overhead and duration */ |
| 893 | ret = print_graph_duration(duration, s, flags); | 796 | print_graph_duration(duration, s, flags); |
| 894 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 895 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 896 | 797 | ||
| 897 | /* Function */ | 798 | /* Function */ |
| 898 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { | 799 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) |
| 899 | ret = trace_seq_putc(s, ' '); | 800 | trace_seq_putc(s, ' '); |
| 900 | if (!ret) | ||
| 901 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 902 | } | ||
| 903 | 801 | ||
| 904 | ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); | 802 | trace_seq_printf(s, "%ps();\n", (void *)call->func); |
| 905 | if (!ret) | ||
| 906 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 907 | 803 | ||
| 908 | return TRACE_TYPE_HANDLED; | 804 | return trace_handle_return(s); |
| 909 | } | 805 | } |
| 910 | 806 | ||
| 911 | static enum print_line_t | 807 | static enum print_line_t |
| @@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
| 915 | { | 811 | { |
| 916 | struct ftrace_graph_ent *call = &entry->graph_ent; | 812 | struct ftrace_graph_ent *call = &entry->graph_ent; |
| 917 | struct fgraph_data *data = iter->private; | 813 | struct fgraph_data *data = iter->private; |
| 918 | int ret; | ||
| 919 | int i; | 814 | int i; |
| 920 | 815 | ||
| 921 | if (data) { | 816 | if (data) { |
| @@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
| 931 | } | 826 | } |
| 932 | 827 | ||
| 933 | /* No time */ | 828 | /* No time */ |
| 934 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); | 829 | print_graph_duration(0, s, flags | FLAGS_FILL_FULL); |
| 935 | if (ret != TRACE_TYPE_HANDLED) | ||
| 936 | return ret; | ||
| 937 | 830 | ||
| 938 | /* Function */ | 831 | /* Function */ |
| 939 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { | 832 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) |
| 940 | ret = trace_seq_putc(s, ' '); | 833 | trace_seq_putc(s, ' '); |
| 941 | if (!ret) | 834 | |
| 942 | return TRACE_TYPE_PARTIAL_LINE; | 835 | trace_seq_printf(s, "%ps() {\n", (void *)call->func); |
| 943 | } | ||
| 944 | 836 | ||
| 945 | ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); | 837 | if (trace_seq_has_overflowed(s)) |
| 946 | if (!ret) | ||
| 947 | return TRACE_TYPE_PARTIAL_LINE; | 838 | return TRACE_TYPE_PARTIAL_LINE; |
| 948 | 839 | ||
| 949 | /* | 840 | /* |
| @@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
| 953 | return TRACE_TYPE_NO_CONSUME; | 844 | return TRACE_TYPE_NO_CONSUME; |
| 954 | } | 845 | } |
| 955 | 846 | ||
| 956 | static enum print_line_t | 847 | static void |
| 957 | print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, | 848 | print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, |
| 958 | int type, unsigned long addr, u32 flags) | 849 | int type, unsigned long addr, u32 flags) |
| 959 | { | 850 | { |
| 960 | struct fgraph_data *data = iter->private; | 851 | struct fgraph_data *data = iter->private; |
| 961 | struct trace_entry *ent = iter->ent; | 852 | struct trace_entry *ent = iter->ent; |
| 962 | int cpu = iter->cpu; | 853 | int cpu = iter->cpu; |
| 963 | int ret; | ||
| 964 | 854 | ||
| 965 | /* Pid */ | 855 | /* Pid */ |
| 966 | if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) | 856 | verif_pid(s, ent->pid, cpu, data); |
| 967 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 968 | 857 | ||
| 969 | if (type) { | 858 | if (type) |
| 970 | /* Interrupt */ | 859 | /* Interrupt */ |
| 971 | ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); | 860 | print_graph_irq(iter, addr, type, cpu, ent->pid, flags); |
| 972 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 973 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 974 | } | ||
| 975 | 861 | ||
| 976 | if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) | 862 | if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) |
| 977 | return 0; | 863 | return; |
| 978 | 864 | ||
| 979 | /* Absolute time */ | 865 | /* Absolute time */ |
| 980 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { | 866 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) |
| 981 | ret = print_graph_abs_time(iter->ts, s); | 867 | print_graph_abs_time(iter->ts, s); |
| 982 | if (!ret) | ||
| 983 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 984 | } | ||
| 985 | 868 | ||
| 986 | /* Cpu */ | 869 | /* Cpu */ |
| 987 | if (flags & TRACE_GRAPH_PRINT_CPU) { | 870 | if (flags & TRACE_GRAPH_PRINT_CPU) |
| 988 | ret = print_graph_cpu(s, cpu); | 871 | print_graph_cpu(s, cpu); |
| 989 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 990 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 991 | } | ||
| 992 | 872 | ||
| 993 | /* Proc */ | 873 | /* Proc */ |
| 994 | if (flags & TRACE_GRAPH_PRINT_PROC) { | 874 | if (flags & TRACE_GRAPH_PRINT_PROC) { |
| 995 | ret = print_graph_proc(s, ent->pid); | 875 | print_graph_proc(s, ent->pid); |
| 996 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 876 | trace_seq_puts(s, " | "); |
| 997 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 998 | |||
| 999 | ret = trace_seq_puts(s, " | "); | ||
| 1000 | if (!ret) | ||
| 1001 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1002 | } | 877 | } |
| 1003 | 878 | ||
| 1004 | /* Latency format */ | 879 | /* Latency format */ |
| 1005 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | 880 | if (trace_flags & TRACE_ITER_LATENCY_FMT) |
| 1006 | ret = print_graph_lat_fmt(s, ent); | 881 | print_graph_lat_fmt(s, ent); |
| 1007 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 1008 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1009 | } | ||
| 1010 | 882 | ||
| 1011 | return 0; | 883 | return; |
| 1012 | } | 884 | } |
| 1013 | 885 | ||
| 1014 | /* | 886 | /* |
| @@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, | |||
| 1126 | if (check_irq_entry(iter, flags, call->func, call->depth)) | 998 | if (check_irq_entry(iter, flags, call->func, call->depth)) |
| 1127 | return TRACE_TYPE_HANDLED; | 999 | return TRACE_TYPE_HANDLED; |
| 1128 | 1000 | ||
| 1129 | if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) | 1001 | print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); |
| 1130 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1131 | 1002 | ||
| 1132 | leaf_ret = get_return_for_leaf(iter, field); | 1003 | leaf_ret = get_return_for_leaf(iter, field); |
| 1133 | if (leaf_ret) | 1004 | if (leaf_ret) |
| @@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
| 1160 | pid_t pid = ent->pid; | 1031 | pid_t pid = ent->pid; |
| 1161 | int cpu = iter->cpu; | 1032 | int cpu = iter->cpu; |
| 1162 | int func_match = 1; | 1033 | int func_match = 1; |
| 1163 | int ret; | ||
| 1164 | int i; | 1034 | int i; |
| 1165 | 1035 | ||
| 1166 | if (check_irq_return(iter, flags, trace->depth)) | 1036 | if (check_irq_return(iter, flags, trace->depth)) |
| @@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
| 1186 | } | 1056 | } |
| 1187 | } | 1057 | } |
| 1188 | 1058 | ||
| 1189 | if (print_graph_prologue(iter, s, 0, 0, flags)) | 1059 | print_graph_prologue(iter, s, 0, 0, flags); |
| 1190 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1191 | 1060 | ||
| 1192 | /* Overhead and duration */ | 1061 | /* Overhead and duration */ |
| 1193 | ret = print_graph_duration(duration, s, flags); | 1062 | print_graph_duration(duration, s, flags); |
| 1194 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 1195 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1196 | 1063 | ||
| 1197 | /* Closing brace */ | 1064 | /* Closing brace */ |
| 1198 | for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { | 1065 | for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) |
| 1199 | ret = trace_seq_putc(s, ' '); | 1066 | trace_seq_putc(s, ' '); |
| 1200 | if (!ret) | ||
| 1201 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1202 | } | ||
| 1203 | 1067 | ||
| 1204 | /* | 1068 | /* |
| 1205 | * If the return function does not have a matching entry, | 1069 | * If the return function does not have a matching entry, |
| @@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
| 1208 | * belongs to, write out the function name. Always do | 1072 | * belongs to, write out the function name. Always do |
| 1209 | * that if the funcgraph-tail option is enabled. | 1073 | * that if the funcgraph-tail option is enabled. |
| 1210 | */ | 1074 | */ |
| 1211 | if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { | 1075 | if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) |
| 1212 | ret = trace_seq_puts(s, "}\n"); | 1076 | trace_seq_puts(s, "}\n"); |
| 1213 | if (!ret) | 1077 | else |
| 1214 | return TRACE_TYPE_PARTIAL_LINE; | 1078 | trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); |
| 1215 | } else { | ||
| 1216 | ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); | ||
| 1217 | if (!ret) | ||
| 1218 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1219 | } | ||
| 1220 | 1079 | ||
| 1221 | /* Overrun */ | 1080 | /* Overrun */ |
| 1222 | if (flags & TRACE_GRAPH_PRINT_OVERRUN) { | 1081 | if (flags & TRACE_GRAPH_PRINT_OVERRUN) |
| 1223 | ret = trace_seq_printf(s, " (Overruns: %lu)\n", | 1082 | trace_seq_printf(s, " (Overruns: %lu)\n", |
| 1224 | trace->overrun); | 1083 | trace->overrun); |
| 1225 | if (!ret) | ||
| 1226 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1227 | } | ||
| 1228 | 1084 | ||
| 1229 | ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, | 1085 | print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, |
| 1230 | cpu, pid, flags); | 1086 | cpu, pid, flags); |
| 1231 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 1232 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1233 | 1087 | ||
| 1234 | return TRACE_TYPE_HANDLED; | 1088 | return trace_handle_return(s); |
| 1235 | } | 1089 | } |
| 1236 | 1090 | ||
| 1237 | static enum print_line_t | 1091 | static enum print_line_t |
| @@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
| 1248 | if (data) | 1102 | if (data) |
| 1249 | depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; | 1103 | depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; |
| 1250 | 1104 | ||
| 1251 | if (print_graph_prologue(iter, s, 0, 0, flags)) | 1105 | print_graph_prologue(iter, s, 0, 0, flags); |
| 1252 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1253 | 1106 | ||
| 1254 | /* No time */ | 1107 | /* No time */ |
| 1255 | ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); | 1108 | print_graph_duration(0, s, flags | FLAGS_FILL_FULL); |
| 1256 | if (ret != TRACE_TYPE_HANDLED) | ||
| 1257 | return ret; | ||
| 1258 | 1109 | ||
| 1259 | /* Indentation */ | 1110 | /* Indentation */ |
| 1260 | if (depth > 0) | 1111 | if (depth > 0) |
| 1261 | for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { | 1112 | for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) |
| 1262 | ret = trace_seq_putc(s, ' '); | 1113 | trace_seq_putc(s, ' '); |
| 1263 | if (!ret) | ||
| 1264 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1265 | } | ||
| 1266 | 1114 | ||
| 1267 | /* The comment */ | 1115 | /* The comment */ |
| 1268 | ret = trace_seq_puts(s, "/* "); | 1116 | trace_seq_puts(s, "/* "); |
| 1269 | if (!ret) | ||
| 1270 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1271 | 1117 | ||
| 1272 | switch (iter->ent->type) { | 1118 | switch (iter->ent->type) { |
| 1273 | case TRACE_BPRINT: | 1119 | case TRACE_BPRINT: |
| @@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
| 1290 | return ret; | 1136 | return ret; |
| 1291 | } | 1137 | } |
| 1292 | 1138 | ||
| 1139 | if (trace_seq_has_overflowed(s)) | ||
| 1140 | goto out; | ||
| 1141 | |||
| 1293 | /* Strip ending newline */ | 1142 | /* Strip ending newline */ |
| 1294 | if (s->buffer[s->len - 1] == '\n') { | 1143 | if (s->buffer[s->seq.len - 1] == '\n') { |
| 1295 | s->buffer[s->len - 1] = '\0'; | 1144 | s->buffer[s->seq.len - 1] = '\0'; |
| 1296 | s->len--; | 1145 | s->seq.len--; |
| 1297 | } | 1146 | } |
| 1298 | 1147 | ||
| 1299 | ret = trace_seq_puts(s, " */\n"); | 1148 | trace_seq_puts(s, " */\n"); |
| 1300 | if (!ret) | 1149 | out: |
| 1301 | return TRACE_TYPE_PARTIAL_LINE; | 1150 | return trace_handle_return(s); |
| 1302 | |||
| 1303 | return TRACE_TYPE_HANDLED; | ||
| 1304 | } | 1151 | } |
| 1305 | 1152 | ||
| 1306 | 1153 | ||
| @@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) | |||
| 1407 | print_lat_header(s, flags); | 1254 | print_lat_header(s, flags); |
| 1408 | 1255 | ||
| 1409 | /* 1st line */ | 1256 | /* 1st line */ |
| 1410 | seq_printf(s, "#"); | 1257 | seq_putc(s, '#'); |
| 1411 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) | 1258 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) |
| 1412 | seq_printf(s, " TIME "); | 1259 | seq_puts(s, " TIME "); |
| 1413 | if (flags & TRACE_GRAPH_PRINT_CPU) | 1260 | if (flags & TRACE_GRAPH_PRINT_CPU) |
| 1414 | seq_printf(s, " CPU"); | 1261 | seq_puts(s, " CPU"); |
| 1415 | if (flags & TRACE_GRAPH_PRINT_PROC) | 1262 | if (flags & TRACE_GRAPH_PRINT_PROC) |
| 1416 | seq_printf(s, " TASK/PID "); | 1263 | seq_puts(s, " TASK/PID "); |
| 1417 | if (lat) | 1264 | if (lat) |
| 1418 | seq_printf(s, "||||"); | 1265 | seq_puts(s, "||||"); |
| 1419 | if (flags & TRACE_GRAPH_PRINT_DURATION) | 1266 | if (flags & TRACE_GRAPH_PRINT_DURATION) |
| 1420 | seq_printf(s, " DURATION "); | 1267 | seq_puts(s, " DURATION "); |
| 1421 | seq_printf(s, " FUNCTION CALLS\n"); | 1268 | seq_puts(s, " FUNCTION CALLS\n"); |
| 1422 | 1269 | ||
| 1423 | /* 2nd line */ | 1270 | /* 2nd line */ |
| 1424 | seq_printf(s, "#"); | 1271 | seq_putc(s, '#'); |
| 1425 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) | 1272 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) |
| 1426 | seq_printf(s, " | "); | 1273 | seq_puts(s, " | "); |
| 1427 | if (flags & TRACE_GRAPH_PRINT_CPU) | 1274 | if (flags & TRACE_GRAPH_PRINT_CPU) |
| 1428 | seq_printf(s, " | "); | 1275 | seq_puts(s, " | "); |
| 1429 | if (flags & TRACE_GRAPH_PRINT_PROC) | 1276 | if (flags & TRACE_GRAPH_PRINT_PROC) |
| 1430 | seq_printf(s, " | | "); | 1277 | seq_puts(s, " | | "); |
| 1431 | if (lat) | 1278 | if (lat) |
| 1432 | seq_printf(s, "||||"); | 1279 | seq_puts(s, "||||"); |
| 1433 | if (flags & TRACE_GRAPH_PRINT_DURATION) | 1280 | if (flags & TRACE_GRAPH_PRINT_DURATION) |
| 1434 | seq_printf(s, " | | "); | 1281 | seq_puts(s, " | | "); |
| 1435 | seq_printf(s, " | | | |\n"); | 1282 | seq_puts(s, " | | | |\n"); |
| 1436 | } | 1283 | } |
| 1437 | 1284 | ||
| 1438 | static void print_graph_headers(struct seq_file *s) | 1285 | static void print_graph_headers(struct seq_file *s) |
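The hunks above all follow one pattern: the trace_seq_*() writers no longer report success per call, so the graph-tracer helpers become void and the caller asks the sequence once, at the end, whether it overflowed. A minimal sketch of that shape, assuming the trace_seq helpers and trace_handle_return() used throughout this patch are in scope; the *_example names are hypothetical:

	static void print_field_example(struct trace_seq *s, unsigned long ip)
	{
		/* each write may mark the seq as overflowed; once it is,
		 * further writes are silently dropped, so no per-call check */
		trace_seq_printf(s, "%ps", (void *)ip);
		trace_seq_puts(s, " | ");
	}

	static enum print_line_t print_line_example(struct trace_seq *s,
						    unsigned long ip)
	{
		print_field_example(s, ip);
		trace_seq_putc(s, '\n');

		/* TRACE_TYPE_PARTIAL_LINE if the seq overflowed, else HANDLED */
		return trace_handle_return(s);
	}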
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index bd90e1b06088..3ccf5c2c1320 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c | |||
| @@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) | |||
| 20 | { | 20 | { |
| 21 | /* use static because iter can be a bit big for the stack */ | 21 | /* use static because iter can be a bit big for the stack */ |
| 22 | static struct trace_iterator iter; | 22 | static struct trace_iterator iter; |
| 23 | static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; | ||
| 23 | unsigned int old_userobj; | 24 | unsigned int old_userobj; |
| 24 | int cnt = 0, cpu; | 25 | int cnt = 0, cpu; |
| 25 | 26 | ||
| 26 | trace_init_global_iter(&iter); | 27 | trace_init_global_iter(&iter); |
| 28 | iter.buffer_iter = buffer_iter; | ||
| 27 | 29 | ||
| 28 | for_each_tracing_cpu(cpu) { | 30 | for_each_tracing_cpu(cpu) { |
| 29 | atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); | 31 | atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); |
| @@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) | |||
| 57 | ring_buffer_read_start(iter.buffer_iter[cpu_file]); | 59 | ring_buffer_read_start(iter.buffer_iter[cpu_file]); |
| 58 | tracing_iter_reset(&iter, cpu_file); | 60 | tracing_iter_reset(&iter, cpu_file); |
| 59 | } | 61 | } |
| 60 | if (!trace_empty(&iter)) | 62 | |
| 61 | trace_find_next_entry_inc(&iter); | 63 | while (trace_find_next_entry_inc(&iter)) { |
| 62 | while (!trace_empty(&iter)) { | ||
| 63 | if (!cnt) | 64 | if (!cnt) |
| 64 | kdb_printf("---------------------------------\n"); | 65 | kdb_printf("---------------------------------\n"); |
| 65 | cnt++; | 66 | cnt++; |
| 66 | 67 | ||
| 67 | if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) | 68 | if (!skip_lines) { |
| 68 | print_trace_line(&iter); | 69 | print_trace_line(&iter); |
| 69 | if (!skip_lines) | ||
| 70 | trace_printk_seq(&iter.seq); | 70 | trace_printk_seq(&iter.seq); |
| 71 | else | 71 | } else { |
| 72 | skip_lines--; | 72 | skip_lines--; |
| 73 | } | ||
| 74 | |||
| 73 | if (KDB_FLAG(CMD_INTERRUPT)) | 75 | if (KDB_FLAG(CMD_INTERRUPT)) |
| 74 | goto out; | 76 | goto out; |
| 75 | } | 77 | } |
| @@ -86,9 +88,12 @@ out: | |||
| 86 | atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); | 88 | atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); |
| 87 | } | 89 | } |
| 88 | 90 | ||
| 89 | for_each_tracing_cpu(cpu) | 91 | for_each_tracing_cpu(cpu) { |
| 90 | if (iter.buffer_iter[cpu]) | 92 | if (iter.buffer_iter[cpu]) { |
| 91 | ring_buffer_read_finish(iter.buffer_iter[cpu]); | 93 | ring_buffer_read_finish(iter.buffer_iter[cpu]); |
| 94 | iter.buffer_iter[cpu] = NULL; | ||
| 95 | } | ||
| 96 | } | ||
| 92 | } | 97 | } |
| 93 | 98 | ||
| 94 | /* | 99 | /* |
| @@ -127,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv) | |||
| 127 | 132 | ||
| 128 | static __init int kdb_ftrace_register(void) | 133 | static __init int kdb_ftrace_register(void) |
| 129 | { | 134 | { |
| 130 | kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", | 135 | kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", |
| 131 | "Dump ftrace log", 0, KDB_REPEAT_NONE); | 136 | "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); |
| 132 | return 0; | 137 | return 0; |
| 133 | } | 138 | } |
| 134 | 139 | ||
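For reference, the reworked kdb dump loop above reads as follows once the context lines are put back together; this is only a restatement of the hunk, with comments on what each call does (trace_find_next_entry_inc() is assumed to return NULL when every per-CPU buffer is exhausted):

	while (trace_find_next_entry_inc(&iter)) {
		if (!cnt)
			kdb_printf("---------------------------------\n");
		cnt++;

		if (!skip_lines) {
			print_trace_line(&iter);	/* format the entry into iter.seq */
			trace_printk_seq(&iter.seq);	/* print the line and reset the seq */
		} else {
			skip_lines--;			/* consume the [skip_#lines] argument */
		}

		if (KDB_FLAG(CMD_INTERRUPT))
			goto out;
	}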
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 282f6e4e5539..5edb518be345 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v) | |||
| 826 | struct trace_kprobe *tk = v; | 826 | struct trace_kprobe *tk = v; |
| 827 | int i; | 827 | int i; |
| 828 | 828 | ||
| 829 | seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); | 829 | seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p'); |
| 830 | seq_printf(m, ":%s/%s", tk->tp.call.class->system, | 830 | seq_printf(m, ":%s/%s", tk->tp.call.class->system, |
| 831 | ftrace_event_name(&tk->tp.call)); | 831 | ftrace_event_name(&tk->tp.call)); |
| 832 | 832 | ||
| @@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v) | |||
| 840 | 840 | ||
| 841 | for (i = 0; i < tk->tp.nr_args; i++) | 841 | for (i = 0; i < tk->tp.nr_args; i++) |
| 842 | seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); | 842 | seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); |
| 843 | seq_printf(m, "\n"); | 843 | seq_putc(m, '\n'); |
| 844 | 844 | ||
| 845 | return 0; | 845 | return 0; |
| 846 | } | 846 | } |
| @@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags, | |||
| 1024 | field = (struct kprobe_trace_entry_head *)iter->ent; | 1024 | field = (struct kprobe_trace_entry_head *)iter->ent; |
| 1025 | tp = container_of(event, struct trace_probe, call.event); | 1025 | tp = container_of(event, struct trace_probe, call.event); |
| 1026 | 1026 | ||
| 1027 | if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) | 1027 | trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); |
| 1028 | goto partial; | ||
| 1029 | 1028 | ||
| 1030 | if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) | 1029 | if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) |
| 1031 | goto partial; | 1030 | goto out; |
| 1032 | 1031 | ||
| 1033 | if (!trace_seq_puts(s, ")")) | 1032 | trace_seq_putc(s, ')'); |
| 1034 | goto partial; | ||
| 1035 | 1033 | ||
| 1036 | data = (u8 *)&field[1]; | 1034 | data = (u8 *)&field[1]; |
| 1037 | for (i = 0; i < tp->nr_args; i++) | 1035 | for (i = 0; i < tp->nr_args; i++) |
| 1038 | if (!tp->args[i].type->print(s, tp->args[i].name, | 1036 | if (!tp->args[i].type->print(s, tp->args[i].name, |
| 1039 | data + tp->args[i].offset, field)) | 1037 | data + tp->args[i].offset, field)) |
| 1040 | goto partial; | 1038 | goto out; |
| 1041 | |||
| 1042 | if (!trace_seq_puts(s, "\n")) | ||
| 1043 | goto partial; | ||
| 1044 | 1039 | ||
| 1045 | return TRACE_TYPE_HANDLED; | 1040 | trace_seq_putc(s, '\n'); |
| 1046 | partial: | 1041 | out: |
| 1047 | return TRACE_TYPE_PARTIAL_LINE; | 1042 | return trace_handle_return(s); |
| 1048 | } | 1043 | } |
| 1049 | 1044 | ||
| 1050 | static enum print_line_t | 1045 | static enum print_line_t |
| @@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, | |||
| 1060 | field = (struct kretprobe_trace_entry_head *)iter->ent; | 1055 | field = (struct kretprobe_trace_entry_head *)iter->ent; |
| 1061 | tp = container_of(event, struct trace_probe, call.event); | 1056 | tp = container_of(event, struct trace_probe, call.event); |
| 1062 | 1057 | ||
| 1063 | if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) | 1058 | trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); |
| 1064 | goto partial; | ||
| 1065 | 1059 | ||
| 1066 | if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) | 1060 | if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) |
| 1067 | goto partial; | 1061 | goto out; |
| 1068 | 1062 | ||
| 1069 | if (!trace_seq_puts(s, " <- ")) | 1063 | trace_seq_puts(s, " <- "); |
| 1070 | goto partial; | ||
| 1071 | 1064 | ||
| 1072 | if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) | 1065 | if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) |
| 1073 | goto partial; | 1066 | goto out; |
| 1074 | 1067 | ||
| 1075 | if (!trace_seq_puts(s, ")")) | 1068 | trace_seq_putc(s, ')'); |
| 1076 | goto partial; | ||
| 1077 | 1069 | ||
| 1078 | data = (u8 *)&field[1]; | 1070 | data = (u8 *)&field[1]; |
| 1079 | for (i = 0; i < tp->nr_args; i++) | 1071 | for (i = 0; i < tp->nr_args; i++) |
| 1080 | if (!tp->args[i].type->print(s, tp->args[i].name, | 1072 | if (!tp->args[i].type->print(s, tp->args[i].name, |
| 1081 | data + tp->args[i].offset, field)) | 1073 | data + tp->args[i].offset, field)) |
| 1082 | goto partial; | 1074 | goto out; |
| 1083 | 1075 | ||
| 1084 | if (!trace_seq_puts(s, "\n")) | 1076 | trace_seq_putc(s, '\n'); |
| 1085 | goto partial; | ||
| 1086 | 1077 | ||
| 1087 | return TRACE_TYPE_HANDLED; | 1078 | out: |
| 1088 | partial: | 1079 | return trace_handle_return(s); |
| 1089 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1090 | } | 1080 | } |
| 1091 | 1081 | ||
| 1092 | 1082 | ||
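Note the mixed style the kprobe printers above end up with: plain trace_seq_*() writes are fire-and-forget, while helpers such as seq_print_ip_sym() and the per-argument print() callbacks still return 0 on overflow, so the function can jump to a single exit label and let trace_handle_return() classify the line. A sketch with simplified, hypothetical names:

	static enum print_line_t print_probe_example(struct trace_seq *s,
						     const char *name,
						     unsigned long ip, int flags)
	{
		/* plain writes no longer report failure */
		trace_seq_printf(s, "%s: (", name);

		/* helpers that can still fail return 0 once the seq overflowed */
		if (!seq_print_ip_sym(s, ip, flags))
			goto out;

		trace_seq_putc(s, ')');
		trace_seq_putc(s, '\n');
	out:
		return trace_handle_return(s);
	}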
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0abd9b863474..7a9ba62e9fef 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c | |||
| @@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr) | |||
| 59 | mmio_reset_data(tr); | 59 | mmio_reset_data(tr); |
| 60 | } | 60 | } |
| 61 | 61 | ||
| 62 | static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) | 62 | static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) |
| 63 | { | 63 | { |
| 64 | int ret = 0; | ||
| 65 | int i; | 64 | int i; |
| 66 | resource_size_t start, end; | 65 | resource_size_t start, end; |
| 67 | const struct pci_driver *drv = pci_dev_driver(dev); | 66 | const struct pci_driver *drv = pci_dev_driver(dev); |
| 68 | 67 | ||
| 69 | /* XXX: incomplete checks for trace_seq_printf() return value */ | 68 | trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", |
| 70 | ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", | 69 | dev->bus->number, dev->devfn, |
| 71 | dev->bus->number, dev->devfn, | 70 | dev->vendor, dev->device, dev->irq); |
| 72 | dev->vendor, dev->device, dev->irq); | ||
| 73 | /* | 71 | /* |
| 74 | * XXX: is pci_resource_to_user() appropriate, since we are | 72 | * XXX: is pci_resource_to_user() appropriate, since we are |
| 75 | * supposed to interpret the __ioremap() phys_addr argument based on | 73 | * supposed to interpret the __ioremap() phys_addr argument based on |
| @@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) | |||
| 77 | */ | 75 | */ |
| 78 | for (i = 0; i < 7; i++) { | 76 | for (i = 0; i < 7; i++) { |
| 79 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); | 77 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); |
| 80 | ret += trace_seq_printf(s, " %llx", | 78 | trace_seq_printf(s, " %llx", |
| 81 | (unsigned long long)(start | | 79 | (unsigned long long)(start | |
| 82 | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); | 80 | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); |
| 83 | } | 81 | } |
| 84 | for (i = 0; i < 7; i++) { | 82 | for (i = 0; i < 7; i++) { |
| 85 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); | 83 | pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); |
| 86 | ret += trace_seq_printf(s, " %llx", | 84 | trace_seq_printf(s, " %llx", |
| 87 | dev->resource[i].start < dev->resource[i].end ? | 85 | dev->resource[i].start < dev->resource[i].end ? |
| 88 | (unsigned long long)(end - start) + 1 : 0); | 86 | (unsigned long long)(end - start) + 1 : 0); |
| 89 | } | 87 | } |
| 90 | if (drv) | 88 | if (drv) |
| 91 | ret += trace_seq_printf(s, " %s\n", drv->name); | 89 | trace_seq_printf(s, " %s\n", drv->name); |
| 92 | else | 90 | else |
| 93 | ret += trace_seq_puts(s, " \n"); | 91 | trace_seq_puts(s, " \n"); |
| 94 | return ret; | ||
| 95 | } | 92 | } |
| 96 | 93 | ||
| 97 | static void destroy_header_iter(struct header_iter *hiter) | 94 | static void destroy_header_iter(struct header_iter *hiter) |
| @@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) | |||
| 179 | unsigned long long t = ns2usecs(iter->ts); | 176 | unsigned long long t = ns2usecs(iter->ts); |
| 180 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 177 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); |
| 181 | unsigned secs = (unsigned long)t; | 178 | unsigned secs = (unsigned long)t; |
| 182 | int ret = 1; | ||
| 183 | 179 | ||
| 184 | trace_assign_type(field, entry); | 180 | trace_assign_type(field, entry); |
| 185 | rw = &field->rw; | 181 | rw = &field->rw; |
| 186 | 182 | ||
| 187 | switch (rw->opcode) { | 183 | switch (rw->opcode) { |
| 188 | case MMIO_READ: | 184 | case MMIO_READ: |
| 189 | ret = trace_seq_printf(s, | 185 | trace_seq_printf(s, |
| 190 | "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", | 186 | "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", |
| 191 | rw->width, secs, usec_rem, rw->map_id, | 187 | rw->width, secs, usec_rem, rw->map_id, |
| 192 | (unsigned long long)rw->phys, | 188 | (unsigned long long)rw->phys, |
| 193 | rw->value, rw->pc, 0); | 189 | rw->value, rw->pc, 0); |
| 194 | break; | 190 | break; |
| 195 | case MMIO_WRITE: | 191 | case MMIO_WRITE: |
| 196 | ret = trace_seq_printf(s, | 192 | trace_seq_printf(s, |
| 197 | "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", | 193 | "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", |
| 198 | rw->width, secs, usec_rem, rw->map_id, | 194 | rw->width, secs, usec_rem, rw->map_id, |
| 199 | (unsigned long long)rw->phys, | 195 | (unsigned long long)rw->phys, |
| 200 | rw->value, rw->pc, 0); | 196 | rw->value, rw->pc, 0); |
| 201 | break; | 197 | break; |
| 202 | case MMIO_UNKNOWN_OP: | 198 | case MMIO_UNKNOWN_OP: |
| 203 | ret = trace_seq_printf(s, | 199 | trace_seq_printf(s, |
| 204 | "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," | 200 | "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," |
| 205 | "%02lx 0x%lx %d\n", | 201 | "%02lx 0x%lx %d\n", |
| 206 | secs, usec_rem, rw->map_id, | 202 | secs, usec_rem, rw->map_id, |
| @@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) | |||
| 209 | (rw->value >> 0) & 0xff, rw->pc, 0); | 205 | (rw->value >> 0) & 0xff, rw->pc, 0); |
| 210 | break; | 206 | break; |
| 211 | default: | 207 | default: |
| 212 | ret = trace_seq_puts(s, "rw what?\n"); | 208 | trace_seq_puts(s, "rw what?\n"); |
| 213 | break; | 209 | break; |
| 214 | } | 210 | } |
| 215 | if (ret) | 211 | |
| 216 | return TRACE_TYPE_HANDLED; | 212 | return trace_handle_return(s); |
| 217 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 218 | } | 213 | } |
| 219 | 214 | ||
| 220 | static enum print_line_t mmio_print_map(struct trace_iterator *iter) | 215 | static enum print_line_t mmio_print_map(struct trace_iterator *iter) |
| @@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) | |||
| 226 | unsigned long long t = ns2usecs(iter->ts); | 221 | unsigned long long t = ns2usecs(iter->ts); |
| 227 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 222 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); |
| 228 | unsigned secs = (unsigned long)t; | 223 | unsigned secs = (unsigned long)t; |
| 229 | int ret; | ||
| 230 | 224 | ||
| 231 | trace_assign_type(field, entry); | 225 | trace_assign_type(field, entry); |
| 232 | m = &field->map; | 226 | m = &field->map; |
| 233 | 227 | ||
| 234 | switch (m->opcode) { | 228 | switch (m->opcode) { |
| 235 | case MMIO_PROBE: | 229 | case MMIO_PROBE: |
| 236 | ret = trace_seq_printf(s, | 230 | trace_seq_printf(s, |
| 237 | "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", | 231 | "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", |
| 238 | secs, usec_rem, m->map_id, | 232 | secs, usec_rem, m->map_id, |
| 239 | (unsigned long long)m->phys, m->virt, m->len, | 233 | (unsigned long long)m->phys, m->virt, m->len, |
| 240 | 0UL, 0); | 234 | 0UL, 0); |
| 241 | break; | 235 | break; |
| 242 | case MMIO_UNPROBE: | 236 | case MMIO_UNPROBE: |
| 243 | ret = trace_seq_printf(s, | 237 | trace_seq_printf(s, |
| 244 | "UNMAP %u.%06lu %d 0x%lx %d\n", | 238 | "UNMAP %u.%06lu %d 0x%lx %d\n", |
| 245 | secs, usec_rem, m->map_id, 0UL, 0); | 239 | secs, usec_rem, m->map_id, 0UL, 0); |
| 246 | break; | 240 | break; |
| 247 | default: | 241 | default: |
| 248 | ret = trace_seq_puts(s, "map what?\n"); | 242 | trace_seq_puts(s, "map what?\n"); |
| 249 | break; | 243 | break; |
| 250 | } | 244 | } |
| 251 | if (ret) | 245 | |
| 252 | return TRACE_TYPE_HANDLED; | 246 | return trace_handle_return(s); |
| 253 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 254 | } | 247 | } |
| 255 | 248 | ||
| 256 | static enum print_line_t mmio_print_mark(struct trace_iterator *iter) | 249 | static enum print_line_t mmio_print_mark(struct trace_iterator *iter) |
| @@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) | |||
| 262 | unsigned long long t = ns2usecs(iter->ts); | 255 | unsigned long long t = ns2usecs(iter->ts); |
| 263 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 256 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); |
| 264 | unsigned secs = (unsigned long)t; | 257 | unsigned secs = (unsigned long)t; |
| 265 | int ret; | ||
| 266 | 258 | ||
| 267 | /* The trailing newline must be in the message. */ | 259 | /* The trailing newline must be in the message. */ |
| 268 | ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); | 260 | trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); |
| 269 | if (!ret) | ||
| 270 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 271 | 261 | ||
| 272 | return TRACE_TYPE_HANDLED; | 262 | return trace_handle_return(s); |
| 273 | } | 263 | } |
| 274 | 264 | ||
| 275 | static enum print_line_t mmio_print_line(struct trace_iterator *iter) | 265 | static enum print_line_t mmio_print_line(struct trace_iterator *iter) |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c6977d5a9b12..b77b9a697619 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) | |||
| 25 | struct trace_seq *s = &iter->seq; | 25 | struct trace_seq *s = &iter->seq; |
| 26 | struct trace_entry *entry = iter->ent; | 26 | struct trace_entry *entry = iter->ent; |
| 27 | struct bputs_entry *field; | 27 | struct bputs_entry *field; |
| 28 | int ret; | ||
| 29 | 28 | ||
| 30 | trace_assign_type(field, entry); | 29 | trace_assign_type(field, entry); |
| 31 | 30 | ||
| 32 | ret = trace_seq_puts(s, field->str); | 31 | trace_seq_puts(s, field->str); |
| 33 | if (!ret) | ||
| 34 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 35 | 32 | ||
| 36 | return TRACE_TYPE_HANDLED; | 33 | return trace_handle_return(s); |
| 37 | } | 34 | } |
| 38 | 35 | ||
| 39 | enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) | 36 | enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) |
| @@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) | |||
| 41 | struct trace_seq *s = &iter->seq; | 38 | struct trace_seq *s = &iter->seq; |
| 42 | struct trace_entry *entry = iter->ent; | 39 | struct trace_entry *entry = iter->ent; |
| 43 | struct bprint_entry *field; | 40 | struct bprint_entry *field; |
| 44 | int ret; | ||
| 45 | 41 | ||
| 46 | trace_assign_type(field, entry); | 42 | trace_assign_type(field, entry); |
| 47 | 43 | ||
| 48 | ret = trace_seq_bprintf(s, field->fmt, field->buf); | 44 | trace_seq_bprintf(s, field->fmt, field->buf); |
| 49 | if (!ret) | ||
| 50 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 51 | 45 | ||
| 52 | return TRACE_TYPE_HANDLED; | 46 | return trace_handle_return(s); |
| 53 | } | 47 | } |
| 54 | 48 | ||
| 55 | enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) | 49 | enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) |
| @@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) | |||
| 57 | struct trace_seq *s = &iter->seq; | 51 | struct trace_seq *s = &iter->seq; |
| 58 | struct trace_entry *entry = iter->ent; | 52 | struct trace_entry *entry = iter->ent; |
| 59 | struct print_entry *field; | 53 | struct print_entry *field; |
| 60 | int ret; | ||
| 61 | 54 | ||
| 62 | trace_assign_type(field, entry); | 55 | trace_assign_type(field, entry); |
| 63 | 56 | ||
| 64 | ret = trace_seq_puts(s, field->buf); | 57 | trace_seq_puts(s, field->buf); |
| 65 | if (!ret) | ||
| 66 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 67 | 58 | ||
| 68 | return TRACE_TYPE_HANDLED; | 59 | return trace_handle_return(s); |
| 69 | } | 60 | } |
| 70 | 61 | ||
| 71 | const char * | 62 | const char * |
| @@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, | |||
| 124 | 115 | ||
| 125 | if (ret == (const char *)(trace_seq_buffer_ptr(p))) | 116 | if (ret == (const char *)(trace_seq_buffer_ptr(p))) |
| 126 | trace_seq_printf(p, "0x%lx", val); | 117 | trace_seq_printf(p, "0x%lx", val); |
| 127 | 118 | ||
| 128 | trace_seq_putc(p, 0); | 119 | trace_seq_putc(p, 0); |
| 129 | 120 | ||
| 130 | return ret; | 121 | return ret; |
| @@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, | |||
| 193 | struct trace_seq *s = &iter->seq; | 184 | struct trace_seq *s = &iter->seq; |
| 194 | struct trace_seq *p = &iter->tmp_seq; | 185 | struct trace_seq *p = &iter->tmp_seq; |
| 195 | struct trace_entry *entry; | 186 | struct trace_entry *entry; |
| 196 | int ret; | ||
| 197 | 187 | ||
| 198 | event = container_of(trace_event, struct ftrace_event_call, event); | 188 | event = container_of(trace_event, struct ftrace_event_call, event); |
| 199 | entry = iter->ent; | 189 | entry = iter->ent; |
| @@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, | |||
| 204 | } | 194 | } |
| 205 | 195 | ||
| 206 | trace_seq_init(p); | 196 | trace_seq_init(p); |
| 207 | ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); | 197 | trace_seq_printf(s, "%s: ", ftrace_event_name(event)); |
| 208 | if (!ret) | ||
| 209 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 210 | 198 | ||
| 211 | return 0; | 199 | return trace_handle_return(s); |
| 212 | } | 200 | } |
| 213 | EXPORT_SYMBOL(ftrace_raw_output_prep); | 201 | EXPORT_SYMBOL(ftrace_raw_output_prep); |
| 214 | 202 | ||
| @@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name, | |||
| 216 | char *fmt, va_list ap) | 204 | char *fmt, va_list ap) |
| 217 | { | 205 | { |
| 218 | struct trace_seq *s = &iter->seq; | 206 | struct trace_seq *s = &iter->seq; |
| 219 | int ret; | ||
| 220 | |||
| 221 | ret = trace_seq_printf(s, "%s: ", name); | ||
| 222 | if (!ret) | ||
| 223 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 224 | |||
| 225 | ret = trace_seq_vprintf(s, fmt, ap); | ||
| 226 | 207 | ||
| 227 | if (!ret) | 208 | trace_seq_printf(s, "%s: ", name); |
| 228 | return TRACE_TYPE_PARTIAL_LINE; | 209 | trace_seq_vprintf(s, fmt, ap); |
| 229 | 210 | ||
| 230 | return TRACE_TYPE_HANDLED; | 211 | return trace_handle_return(s); |
| 231 | } | 212 | } |
| 232 | 213 | ||
| 233 | int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) | 214 | int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) |
| @@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name) | |||
| 260 | } | 241 | } |
| 261 | #endif /* CONFIG_KRETPROBES */ | 242 | #endif /* CONFIG_KRETPROBES */ |
| 262 | 243 | ||
| 263 | static int | 244 | static void |
| 264 | seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) | 245 | seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) |
| 265 | { | 246 | { |
| 266 | #ifdef CONFIG_KALLSYMS | 247 | #ifdef CONFIG_KALLSYMS |
| @@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) | |||
| 271 | 252 | ||
| 272 | name = kretprobed(str); | 253 | name = kretprobed(str); |
| 273 | 254 | ||
| 274 | return trace_seq_printf(s, fmt, name); | 255 | trace_seq_printf(s, fmt, name); |
| 275 | #endif | 256 | #endif |
| 276 | return 1; | ||
| 277 | } | 257 | } |
| 278 | 258 | ||
| 279 | static int | 259 | static void |
| 280 | seq_print_sym_offset(struct trace_seq *s, const char *fmt, | 260 | seq_print_sym_offset(struct trace_seq *s, const char *fmt, |
| 281 | unsigned long address) | 261 | unsigned long address) |
| 282 | { | 262 | { |
| @@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, | |||
| 287 | sprint_symbol(str, address); | 267 | sprint_symbol(str, address); |
| 288 | name = kretprobed(str); | 268 | name = kretprobed(str); |
| 289 | 269 | ||
| 290 | return trace_seq_printf(s, fmt, name); | 270 | trace_seq_printf(s, fmt, name); |
| 291 | #endif | 271 | #endif |
| 292 | return 1; | ||
| 293 | } | 272 | } |
| 294 | 273 | ||
| 295 | #ifndef CONFIG_64BIT | 274 | #ifndef CONFIG_64BIT |
| @@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, | |||
| 320 | if (file) { | 299 | if (file) { |
| 321 | ret = trace_seq_path(s, &file->f_path); | 300 | ret = trace_seq_path(s, &file->f_path); |
| 322 | if (ret) | 301 | if (ret) |
| 323 | ret = trace_seq_printf(s, "[+0x%lx]", | 302 | trace_seq_printf(s, "[+0x%lx]", |
| 324 | ip - vmstart); | 303 | ip - vmstart); |
| 325 | } | 304 | } |
| 326 | up_read(&mm->mmap_sem); | 305 | up_read(&mm->mmap_sem); |
| 327 | } | 306 | } |
| 328 | if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) | 307 | if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) |
| 329 | ret = trace_seq_printf(s, " <" IP_FMT ">", ip); | 308 | trace_seq_printf(s, " <" IP_FMT ">", ip); |
| 330 | return ret; | 309 | return !trace_seq_has_overflowed(s); |
| 331 | } | 310 | } |
| 332 | 311 | ||
| 333 | int | 312 | int |
| @@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, | |||
| 335 | unsigned long sym_flags) | 314 | unsigned long sym_flags) |
| 336 | { | 315 | { |
| 337 | struct mm_struct *mm = NULL; | 316 | struct mm_struct *mm = NULL; |
| 338 | int ret = 1; | ||
| 339 | unsigned int i; | 317 | unsigned int i; |
| 340 | 318 | ||
| 341 | if (trace_flags & TRACE_ITER_SYM_USEROBJ) { | 319 | if (trace_flags & TRACE_ITER_SYM_USEROBJ) { |
| @@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, | |||
| 354 | for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { | 332 | for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { |
| 355 | unsigned long ip = entry->caller[i]; | 333 | unsigned long ip = entry->caller[i]; |
| 356 | 334 | ||
| 357 | if (ip == ULONG_MAX || !ret) | 335 | if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) |
| 358 | break; | 336 | break; |
| 359 | if (ret) | 337 | |
| 360 | ret = trace_seq_puts(s, " => "); | 338 | trace_seq_puts(s, " => "); |
| 339 | |||
| 361 | if (!ip) { | 340 | if (!ip) { |
| 362 | if (ret) | 341 | trace_seq_puts(s, "??"); |
| 363 | ret = trace_seq_puts(s, "??"); | 342 | trace_seq_putc(s, '\n'); |
| 364 | if (ret) | ||
| 365 | ret = trace_seq_putc(s, '\n'); | ||
| 366 | continue; | 343 | continue; |
| 367 | } | 344 | } |
| 368 | if (!ret) | 345 | |
| 369 | break; | 346 | seq_print_user_ip(s, mm, ip, sym_flags); |
| 370 | if (ret) | 347 | trace_seq_putc(s, '\n'); |
| 371 | ret = seq_print_user_ip(s, mm, ip, sym_flags); | ||
| 372 | ret = trace_seq_putc(s, '\n'); | ||
| 373 | } | 348 | } |
| 374 | 349 | ||
| 375 | if (mm) | 350 | if (mm) |
| 376 | mmput(mm); | 351 | mmput(mm); |
| 377 | return ret; | 352 | |
| 353 | return !trace_seq_has_overflowed(s); | ||
| 378 | } | 354 | } |
| 379 | 355 | ||
| 380 | int | 356 | int |
| 381 | seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) | 357 | seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) |
| 382 | { | 358 | { |
| 383 | int ret; | 359 | if (!ip) { |
| 384 | 360 | trace_seq_putc(s, '0'); | |
| 385 | if (!ip) | 361 | goto out; |
| 386 | return trace_seq_putc(s, '0'); | 362 | } |
| 387 | 363 | ||
| 388 | if (sym_flags & TRACE_ITER_SYM_OFFSET) | 364 | if (sym_flags & TRACE_ITER_SYM_OFFSET) |
| 389 | ret = seq_print_sym_offset(s, "%s", ip); | 365 | seq_print_sym_offset(s, "%s", ip); |
| 390 | else | 366 | else |
| 391 | ret = seq_print_sym_short(s, "%s", ip); | 367 | seq_print_sym_short(s, "%s", ip); |
| 392 | |||
| 393 | if (!ret) | ||
| 394 | return 0; | ||
| 395 | 368 | ||
| 396 | if (sym_flags & TRACE_ITER_SYM_ADDR) | 369 | if (sym_flags & TRACE_ITER_SYM_ADDR) |
| 397 | ret = trace_seq_printf(s, " <" IP_FMT ">", ip); | 370 | trace_seq_printf(s, " <" IP_FMT ">", ip); |
| 398 | return ret; | 371 | |
| 372 | out: | ||
| 373 | return !trace_seq_has_overflowed(s); | ||
| 399 | } | 374 | } |
| 400 | 375 | ||
| 401 | /** | 376 | /** |
| @@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |||
| 413 | char irqs_off; | 388 | char irqs_off; |
| 414 | int hardirq; | 389 | int hardirq; |
| 415 | int softirq; | 390 | int softirq; |
| 416 | int ret; | ||
| 417 | 391 | ||
| 418 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; | 392 | hardirq = entry->flags & TRACE_FLAG_HARDIRQ; |
| 419 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; | 393 | softirq = entry->flags & TRACE_FLAG_SOFTIRQ; |
| @@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |||
| 445 | softirq ? 's' : | 419 | softirq ? 's' : |
| 446 | '.'; | 420 | '.'; |
| 447 | 421 | ||
| 448 | if (!trace_seq_printf(s, "%c%c%c", | 422 | trace_seq_printf(s, "%c%c%c", |
| 449 | irqs_off, need_resched, hardsoft_irq)) | 423 | irqs_off, need_resched, hardsoft_irq); |
| 450 | return 0; | ||
| 451 | 424 | ||
| 452 | if (entry->preempt_count) | 425 | if (entry->preempt_count) |
| 453 | ret = trace_seq_printf(s, "%x", entry->preempt_count); | 426 | trace_seq_printf(s, "%x", entry->preempt_count); |
| 454 | else | 427 | else |
| 455 | ret = trace_seq_putc(s, '.'); | 428 | trace_seq_putc(s, '.'); |
| 456 | 429 | ||
| 457 | return ret; | 430 | return !trace_seq_has_overflowed(s); |
| 458 | } | 431 | } |
| 459 | 432 | ||
| 460 | static int | 433 | static int |
| @@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) | |||
| 464 | 437 | ||
| 465 | trace_find_cmdline(entry->pid, comm); | 438 | trace_find_cmdline(entry->pid, comm); |
| 466 | 439 | ||
| 467 | if (!trace_seq_printf(s, "%8.8s-%-5d %3d", | 440 | trace_seq_printf(s, "%8.8s-%-5d %3d", |
| 468 | comm, entry->pid, cpu)) | 441 | comm, entry->pid, cpu); |
| 469 | return 0; | ||
| 470 | 442 | ||
| 471 | return trace_print_lat_fmt(s, entry); | 443 | return trace_print_lat_fmt(s, entry); |
| 472 | } | 444 | } |
| 473 | 445 | ||
| 474 | static unsigned long preempt_mark_thresh_us = 100; | 446 | #undef MARK |
| 447 | #define MARK(v, s) {.val = v, .sym = s} | ||
| 448 | /* trace overhead mark */ | ||
| 449 | static const struct trace_mark { | ||
| 450 | unsigned long long val; /* unit: nsec */ | ||
| 451 | char sym; | ||
| 452 | } mark[] = { | ||
| 453 | MARK(1000000000ULL , '$'), /* 1 sec */ | ||
| 454 | MARK(1000000ULL , '#'), /* 1000 usecs */ | ||
| 455 | MARK(100000ULL , '!'), /* 100 usecs */ | ||
| 456 | MARK(10000ULL , '+'), /* 10 usecs */ | ||
| 457 | }; | ||
| 458 | #undef MARK | ||
| 459 | |||
| 460 | char trace_find_mark(unsigned long long d) | ||
| 461 | { | ||
| 462 | int i; | ||
| 463 | int size = ARRAY_SIZE(mark); | ||
| 464 | |||
| 465 | for (i = 0; i < size; i++) { | ||
| 466 | if (d >= mark[i].val) | ||
| 467 | break; | ||
| 468 | } | ||
| 469 | |||
| 470 | return (i == size) ? ' ' : mark[i].sym; | ||
| 471 | } | ||
| 475 | 472 | ||
| 476 | static int | 473 | static int |
| 477 | lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) | 474 | lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) |
| @@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) | |||
| 493 | unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); | 490 | unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); |
| 494 | unsigned long rel_msec = (unsigned long)rel_ts; | 491 | unsigned long rel_msec = (unsigned long)rel_ts; |
| 495 | 492 | ||
| 496 | return trace_seq_printf( | 493 | trace_seq_printf( |
| 497 | s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", | 494 | s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", |
| 498 | ns2usecs(iter->ts), | 495 | ns2usecs(iter->ts), |
| 499 | abs_msec, abs_usec, | 496 | abs_msec, abs_usec, |
| 500 | rel_msec, rel_usec); | 497 | rel_msec, rel_usec); |
| 498 | |||
| 501 | } else if (verbose && !in_ns) { | 499 | } else if (verbose && !in_ns) { |
| 502 | return trace_seq_printf( | 500 | trace_seq_printf( |
| 503 | s, "[%016llx] %lld (+%lld): ", | 501 | s, "[%016llx] %lld (+%lld): ", |
| 504 | iter->ts, abs_ts, rel_ts); | 502 | iter->ts, abs_ts, rel_ts); |
| 503 | |||
| 505 | } else if (!verbose && in_ns) { | 504 | } else if (!verbose && in_ns) { |
| 506 | return trace_seq_printf( | 505 | trace_seq_printf( |
| 507 | s, " %4lldus%c: ", | 506 | s, " %4lldus%c: ", |
| 508 | abs_ts, | 507 | abs_ts, |
| 509 | rel_ts > preempt_mark_thresh_us ? '!' : | 508 | trace_find_mark(rel_ts * NSEC_PER_USEC)); |
| 510 | rel_ts > 1 ? '+' : ' '); | 509 | |
| 511 | } else { /* !verbose && !in_ns */ | 510 | } else { /* !verbose && !in_ns */ |
| 512 | return trace_seq_printf(s, " %4lld: ", abs_ts); | 511 | trace_seq_printf(s, " %4lld: ", abs_ts); |
| 513 | } | 512 | } |
| 513 | |||
| 514 | return !trace_seq_has_overflowed(s); | ||
| 514 | } | 515 | } |
| 515 | 516 | ||
| 516 | int trace_print_context(struct trace_iterator *iter) | 517 | int trace_print_context(struct trace_iterator *iter) |
| @@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter) | |||
| 520 | unsigned long long t; | 521 | unsigned long long t; |
| 521 | unsigned long secs, usec_rem; | 522 | unsigned long secs, usec_rem; |
| 522 | char comm[TASK_COMM_LEN]; | 523 | char comm[TASK_COMM_LEN]; |
| 523 | int ret; | ||
| 524 | 524 | ||
| 525 | trace_find_cmdline(entry->pid, comm); | 525 | trace_find_cmdline(entry->pid, comm); |
| 526 | 526 | ||
| 527 | ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", | 527 | trace_seq_printf(s, "%16s-%-5d [%03d] ", |
| 528 | comm, entry->pid, iter->cpu); | 528 | comm, entry->pid, iter->cpu); |
| 529 | if (!ret) | ||
| 530 | return 0; | ||
| 531 | 529 | ||
| 532 | if (trace_flags & TRACE_ITER_IRQ_INFO) { | 530 | if (trace_flags & TRACE_ITER_IRQ_INFO) |
| 533 | ret = trace_print_lat_fmt(s, entry); | 531 | trace_print_lat_fmt(s, entry); |
| 534 | if (!ret) | ||
| 535 | return 0; | ||
| 536 | } | ||
| 537 | 532 | ||
| 538 | if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { | 533 | if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { |
| 539 | t = ns2usecs(iter->ts); | 534 | t = ns2usecs(iter->ts); |
| 540 | usec_rem = do_div(t, USEC_PER_SEC); | 535 | usec_rem = do_div(t, USEC_PER_SEC); |
| 541 | secs = (unsigned long)t; | 536 | secs = (unsigned long)t; |
| 542 | return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); | 537 | trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); |
| 543 | } else | 538 | } else |
| 544 | return trace_seq_printf(s, " %12llu: ", iter->ts); | 539 | trace_seq_printf(s, " %12llu: ", iter->ts); |
| 540 | |||
| 541 | return !trace_seq_has_overflowed(s); | ||
| 545 | } | 542 | } |
| 546 | 543 | ||
| 547 | int trace_print_lat_context(struct trace_iterator *iter) | 544 | int trace_print_lat_context(struct trace_iterator *iter) |
| 548 | { | 545 | { |
| 549 | u64 next_ts; | 546 | u64 next_ts; |
| 550 | int ret; | ||
| 551 | /* trace_find_next_entry will reset ent_size */ | 547 | /* trace_find_next_entry will reset ent_size */ |
| 552 | int ent_size = iter->ent_size; | 548 | int ent_size = iter->ent_size; |
| 553 | struct trace_seq *s = &iter->seq; | 549 | struct trace_seq *s = &iter->seq; |
| @@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter) | |||
| 567 | 563 | ||
| 568 | trace_find_cmdline(entry->pid, comm); | 564 | trace_find_cmdline(entry->pid, comm); |
| 569 | 565 | ||
| 570 | ret = trace_seq_printf( | 566 | trace_seq_printf( |
| 571 | s, "%16s %5d %3d %d %08x %08lx ", | 567 | s, "%16s %5d %3d %d %08x %08lx ", |
| 572 | comm, entry->pid, iter->cpu, entry->flags, | 568 | comm, entry->pid, iter->cpu, entry->flags, |
| 573 | entry->preempt_count, iter->idx); | 569 | entry->preempt_count, iter->idx); |
| 574 | } else { | 570 | } else { |
| 575 | ret = lat_print_generic(s, entry, iter->cpu); | 571 | lat_print_generic(s, entry, iter->cpu); |
| 576 | } | 572 | } |
| 577 | 573 | ||
| 578 | if (ret) | 574 | lat_print_timestamp(iter, next_ts); |
| 579 | ret = lat_print_timestamp(iter, next_ts); | ||
| 580 | 575 | ||
| 581 | return ret; | 576 | return !trace_seq_has_overflowed(s); |
| 582 | } | 577 | } |
| 583 | 578 | ||
| 584 | static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; | 579 | static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; |
| @@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event) | |||
| 692 | goto out; | 687 | goto out; |
| 693 | 688 | ||
| 694 | } else { | 689 | } else { |
| 695 | 690 | ||
| 696 | event->type = next_event_type++; | 691 | event->type = next_event_type++; |
| 697 | list = &ftrace_event_list; | 692 | list = &ftrace_event_list; |
| 698 | } | 693 | } |
| @@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); | |||
| 764 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, | 759 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, |
| 765 | struct trace_event *event) | 760 | struct trace_event *event) |
| 766 | { | 761 | { |
| 767 | if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) | 762 | trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type); |
| 768 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 769 | 763 | ||
| 770 | return TRACE_TYPE_HANDLED; | 764 | return trace_handle_return(&iter->seq); |
| 771 | } | 765 | } |
| 772 | 766 | ||
| 773 | /* TRACE_FN */ | 767 | /* TRACE_FN */ |
| @@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, | |||
| 779 | 773 | ||
| 780 | trace_assign_type(field, iter->ent); | 774 | trace_assign_type(field, iter->ent); |
| 781 | 775 | ||
| 782 | if (!seq_print_ip_sym(s, field->ip, flags)) | 776 | seq_print_ip_sym(s, field->ip, flags); |
| 783 | goto partial; | ||
| 784 | 777 | ||
| 785 | if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { | 778 | if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { |
| 786 | if (!trace_seq_puts(s, " <-")) | 779 | trace_seq_puts(s, " <-"); |
| 787 | goto partial; | 780 | seq_print_ip_sym(s, field->parent_ip, flags); |
| 788 | if (!seq_print_ip_sym(s, | ||
| 789 | field->parent_ip, | ||
| 790 | flags)) | ||
| 791 | goto partial; | ||
| 792 | } | 781 | } |
| 793 | if (!trace_seq_putc(s, '\n')) | ||
| 794 | goto partial; | ||
| 795 | 782 | ||
| 796 | return TRACE_TYPE_HANDLED; | 783 | trace_seq_putc(s, '\n'); |
| 797 | 784 | ||
| 798 | partial: | 785 | return trace_handle_return(s); |
| 799 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 800 | } | 786 | } |
| 801 | 787 | ||
| 802 | static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, | 788 | static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, |
| @@ -806,12 +792,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, | |||
| 806 | 792 | ||
| 807 | trace_assign_type(field, iter->ent); | 793 | trace_assign_type(field, iter->ent); |
| 808 | 794 | ||
| 809 | if (!trace_seq_printf(&iter->seq, "%lx %lx\n", | 795 | trace_seq_printf(&iter->seq, "%lx %lx\n", |
| 810 | field->ip, | 796 | field->ip, |
| 811 | field->parent_ip)) | 797 | field->parent_ip); |
| 812 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 813 | 798 | ||
| 814 | return TRACE_TYPE_HANDLED; | 799 | return trace_handle_return(&iter->seq); |
| 815 | } | 800 | } |
| 816 | 801 | ||
| 817 | static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, | 802 | static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, |
| @@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, | |||
| 822 | 807 | ||
| 823 | trace_assign_type(field, iter->ent); | 808 | trace_assign_type(field, iter->ent); |
| 824 | 809 | ||
| 825 | SEQ_PUT_HEX_FIELD_RET(s, field->ip); | 810 | SEQ_PUT_HEX_FIELD(s, field->ip); |
| 826 | SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); | 811 | SEQ_PUT_HEX_FIELD(s, field->parent_ip); |
| 827 | 812 | ||
| 828 | return TRACE_TYPE_HANDLED; | 813 | return trace_handle_return(s); |
| 829 | } | 814 | } |
| 830 | 815 | ||
| 831 | static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, | 816 | static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, |
| @@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, | |||
| 836 | 821 | ||
| 837 | trace_assign_type(field, iter->ent); | 822 | trace_assign_type(field, iter->ent); |
| 838 | 823 | ||
| 839 | SEQ_PUT_FIELD_RET(s, field->ip); | 824 | SEQ_PUT_FIELD(s, field->ip); |
| 840 | SEQ_PUT_FIELD_RET(s, field->parent_ip); | 825 | SEQ_PUT_FIELD(s, field->parent_ip); |
| 841 | 826 | ||
| 842 | return TRACE_TYPE_HANDLED; | 827 | return trace_handle_return(s); |
| 843 | } | 828 | } |
| 844 | 829 | ||
| 845 | static struct trace_event_functions trace_fn_funcs = { | 830 | static struct trace_event_functions trace_fn_funcs = { |
| @@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, | |||
| 868 | T = task_state_char(field->next_state); | 853 | T = task_state_char(field->next_state); |
| 869 | S = task_state_char(field->prev_state); | 854 | S = task_state_char(field->prev_state); |
| 870 | trace_find_cmdline(field->next_pid, comm); | 855 | trace_find_cmdline(field->next_pid, comm); |
| 871 | if (!trace_seq_printf(&iter->seq, | 856 | trace_seq_printf(&iter->seq, |
| 872 | " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", | 857 | " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", |
| 873 | field->prev_pid, | 858 | field->prev_pid, |
| 874 | field->prev_prio, | 859 | field->prev_prio, |
| 875 | S, delim, | 860 | S, delim, |
| 876 | field->next_cpu, | 861 | field->next_cpu, |
| 877 | field->next_pid, | 862 | field->next_pid, |
| 878 | field->next_prio, | 863 | field->next_prio, |
| 879 | T, comm)) | 864 | T, comm); |
| 880 | return TRACE_TYPE_PARTIAL_LINE; | 865 | |
| 881 | 866 | return trace_handle_return(&iter->seq); | |
| 882 | return TRACE_TYPE_HANDLED; | ||
| 883 | } | 867 | } |
| 884 | 868 | ||
| 885 | static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, | 869 | static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, |
| @@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) | |||
| 904 | if (!S) | 888 | if (!S) |
| 905 | S = task_state_char(field->prev_state); | 889 | S = task_state_char(field->prev_state); |
| 906 | T = task_state_char(field->next_state); | 890 | T = task_state_char(field->next_state); |
| 907 | if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", | 891 | trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", |
| 908 | field->prev_pid, | 892 | field->prev_pid, |
| 909 | field->prev_prio, | 893 | field->prev_prio, |
| 910 | S, | 894 | S, |
| 911 | field->next_cpu, | 895 | field->next_cpu, |
| 912 | field->next_pid, | 896 | field->next_pid, |
| 913 | field->next_prio, | 897 | field->next_prio, |
| 914 | T)) | 898 | T); |
| 915 | return TRACE_TYPE_PARTIAL_LINE; | 899 | |
| 916 | 900 | return trace_handle_return(&iter->seq); | |
| 917 | return TRACE_TYPE_HANDLED; | ||
| 918 | } | 901 | } |
| 919 | 902 | ||
| 920 | static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, | 903 | static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, |
| @@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) | |||
| 942 | S = task_state_char(field->prev_state); | 925 | S = task_state_char(field->prev_state); |
| 943 | T = task_state_char(field->next_state); | 926 | T = task_state_char(field->next_state); |
| 944 | 927 | ||
| 945 | SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); | 928 | SEQ_PUT_HEX_FIELD(s, field->prev_pid); |
| 946 | SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); | 929 | SEQ_PUT_HEX_FIELD(s, field->prev_prio); |
| 947 | SEQ_PUT_HEX_FIELD_RET(s, S); | 930 | SEQ_PUT_HEX_FIELD(s, S); |
| 948 | SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); | 931 | SEQ_PUT_HEX_FIELD(s, field->next_cpu); |
| 949 | SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); | 932 | SEQ_PUT_HEX_FIELD(s, field->next_pid); |
| 950 | SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); | 933 | SEQ_PUT_HEX_FIELD(s, field->next_prio); |
| 951 | SEQ_PUT_HEX_FIELD_RET(s, T); | 934 | SEQ_PUT_HEX_FIELD(s, T); |
| 952 | 935 | ||
| 953 | return TRACE_TYPE_HANDLED; | 936 | return trace_handle_return(s); |
| 954 | } | 937 | } |
| 955 | 938 | ||
| 956 | static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, | 939 | static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, |
| @@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, | |||
| 973 | 956 | ||
| 974 | trace_assign_type(field, iter->ent); | 957 | trace_assign_type(field, iter->ent); |
| 975 | 958 | ||
| 976 | SEQ_PUT_FIELD_RET(s, field->prev_pid); | 959 | SEQ_PUT_FIELD(s, field->prev_pid); |
| 977 | SEQ_PUT_FIELD_RET(s, field->prev_prio); | 960 | SEQ_PUT_FIELD(s, field->prev_prio); |
| 978 | SEQ_PUT_FIELD_RET(s, field->prev_state); | 961 | SEQ_PUT_FIELD(s, field->prev_state); |
| 979 | SEQ_PUT_FIELD_RET(s, field->next_pid); | 962 | SEQ_PUT_FIELD(s, field->next_cpu); |
| 980 | SEQ_PUT_FIELD_RET(s, field->next_prio); | 963 | SEQ_PUT_FIELD(s, field->next_pid); |
| 981 | SEQ_PUT_FIELD_RET(s, field->next_state); | 964 | SEQ_PUT_FIELD(s, field->next_prio); |
| 965 | SEQ_PUT_FIELD(s, field->next_state); | ||
| 982 | 966 | ||
| 983 | return TRACE_TYPE_HANDLED; | 967 | return trace_handle_return(s); |
| 984 | } | 968 | } |
| 985 | 969 | ||
| 986 | static struct trace_event_functions trace_ctx_funcs = { | 970 | static struct trace_event_functions trace_ctx_funcs = { |
| @@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, | |||
| 1020 | trace_assign_type(field, iter->ent); | 1004 | trace_assign_type(field, iter->ent); |
| 1021 | end = (unsigned long *)((long)iter->ent + iter->ent_size); | 1005 | end = (unsigned long *)((long)iter->ent + iter->ent_size); |
| 1022 | 1006 | ||
| 1023 | if (!trace_seq_puts(s, "<stack trace>\n")) | 1007 | trace_seq_puts(s, "<stack trace>\n"); |
| 1024 | goto partial; | ||
| 1025 | 1008 | ||
| 1026 | for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { | 1009 | for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { |
| 1027 | if (!trace_seq_puts(s, " => ")) | ||
| 1028 | goto partial; | ||
| 1029 | 1010 | ||
| 1030 | if (!seq_print_ip_sym(s, *p, flags)) | 1011 | if (trace_seq_has_overflowed(s)) |
| 1031 | goto partial; | 1012 | break; |
| 1032 | if (!trace_seq_putc(s, '\n')) | ||
| 1033 | goto partial; | ||
| 1034 | } | ||
| 1035 | 1013 | ||
| 1036 | return TRACE_TYPE_HANDLED; | 1014 | trace_seq_puts(s, " => "); |
| 1015 | seq_print_ip_sym(s, *p, flags); | ||
| 1016 | trace_seq_putc(s, '\n'); | ||
| 1017 | } | ||
| 1037 | 1018 | ||
| 1038 | partial: | 1019 | return trace_handle_return(s); |
| 1039 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1040 | } | 1020 | } |
| 1041 | 1021 | ||
| 1042 | static struct trace_event_functions trace_stack_funcs = { | 1022 | static struct trace_event_functions trace_stack_funcs = { |
| @@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, | |||
| 1057 | 1037 | ||
| 1058 | trace_assign_type(field, iter->ent); | 1038 | trace_assign_type(field, iter->ent); |
| 1059 | 1039 | ||
| 1060 | if (!trace_seq_puts(s, "<user stack trace>\n")) | 1040 | trace_seq_puts(s, "<user stack trace>\n"); |
| 1061 | goto partial; | 1041 | seq_print_userip_objs(field, s, flags); |
| 1062 | |||
| 1063 | if (!seq_print_userip_objs(field, s, flags)) | ||
| 1064 | goto partial; | ||
| 1065 | |||
| 1066 | return TRACE_TYPE_HANDLED; | ||
| 1067 | 1042 | ||
| 1068 | partial: | 1043 | return trace_handle_return(s); |
| 1069 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1070 | } | 1044 | } |
| 1071 | 1045 | ||
| 1072 | static struct trace_event_functions trace_user_stack_funcs = { | 1046 | static struct trace_event_functions trace_user_stack_funcs = { |
| @@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags, | |||
| 1089 | 1063 | ||
| 1090 | trace_assign_type(field, entry); | 1064 | trace_assign_type(field, entry); |
| 1091 | 1065 | ||
| 1092 | if (!seq_print_ip_sym(s, field->ip, flags)) | 1066 | seq_print_ip_sym(s, field->ip, flags); |
| 1093 | goto partial; | 1067 | trace_seq_puts(s, ": "); |
| 1068 | trace_seq_puts(s, field->str); | ||
| 1094 | 1069 | ||
| 1095 | if (!trace_seq_puts(s, ": ")) | 1070 | return trace_handle_return(s); |
| 1096 | goto partial; | ||
| 1097 | |||
| 1098 | if (!trace_seq_puts(s, field->str)) | ||
| 1099 | goto partial; | ||
| 1100 | |||
| 1101 | return TRACE_TYPE_HANDLED; | ||
| 1102 | |||
| 1103 | partial: | ||
| 1104 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1105 | } | 1071 | } |
| 1106 | 1072 | ||
| 1107 | 1073 | ||
| @@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags, | |||
| 1114 | 1080 | ||
| 1115 | trace_assign_type(field, iter->ent); | 1081 | trace_assign_type(field, iter->ent); |
| 1116 | 1082 | ||
| 1117 | if (!trace_seq_printf(s, ": %lx : ", field->ip)) | 1083 | trace_seq_printf(s, ": %lx : ", field->ip); |
| 1118 | goto partial; | 1084 | trace_seq_puts(s, field->str); |
| 1119 | |||
| 1120 | if (!trace_seq_puts(s, field->str)) | ||
| 1121 | goto partial; | ||
| 1122 | 1085 | ||
| 1123 | return TRACE_TYPE_HANDLED; | 1086 | return trace_handle_return(s); |
| 1124 | |||
| 1125 | partial: | ||
| 1126 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1127 | } | 1087 | } |
| 1128 | 1088 | ||
| 1129 | static struct trace_event_functions trace_bputs_funcs = { | 1089 | static struct trace_event_functions trace_bputs_funcs = { |
| @@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags, | |||
| 1147 | 1107 | ||
| 1148 | trace_assign_type(field, entry); | 1108 | trace_assign_type(field, entry); |
| 1149 | 1109 | ||
| 1150 | if (!seq_print_ip_sym(s, field->ip, flags)) | 1110 | seq_print_ip_sym(s, field->ip, flags); |
| 1151 | goto partial; | 1111 | trace_seq_puts(s, ": "); |
| 1152 | 1112 | trace_seq_bprintf(s, field->fmt, field->buf); | |
| 1153 | if (!trace_seq_puts(s, ": ")) | ||
| 1154 | goto partial; | ||
| 1155 | |||
| 1156 | if (!trace_seq_bprintf(s, field->fmt, field->buf)) | ||
| 1157 | goto partial; | ||
| 1158 | 1113 | ||
| 1159 | return TRACE_TYPE_HANDLED; | 1114 | return trace_handle_return(s); |
| 1160 | |||
| 1161 | partial: | ||
| 1162 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1163 | } | 1115 | } |
| 1164 | 1116 | ||
| 1165 | 1117 | ||
| @@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags, | |||
| 1172 | 1124 | ||
| 1173 | trace_assign_type(field, iter->ent); | 1125 | trace_assign_type(field, iter->ent); |
| 1174 | 1126 | ||
| 1175 | if (!trace_seq_printf(s, ": %lx : ", field->ip)) | 1127 | trace_seq_printf(s, ": %lx : ", field->ip); |
| 1176 | goto partial; | 1128 | trace_seq_bprintf(s, field->fmt, field->buf); |
| 1177 | |||
| 1178 | if (!trace_seq_bprintf(s, field->fmt, field->buf)) | ||
| 1179 | goto partial; | ||
| 1180 | 1129 | ||
| 1181 | return TRACE_TYPE_HANDLED; | 1130 | return trace_handle_return(s); |
| 1182 | |||
| 1183 | partial: | ||
| 1184 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1185 | } | 1131 | } |
| 1186 | 1132 | ||
| 1187 | static struct trace_event_functions trace_bprint_funcs = { | 1133 | static struct trace_event_functions trace_bprint_funcs = { |
| @@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, | |||
| 1203 | 1149 | ||
| 1204 | trace_assign_type(field, iter->ent); | 1150 | trace_assign_type(field, iter->ent); |
| 1205 | 1151 | ||
| 1206 | if (!seq_print_ip_sym(s, field->ip, flags)) | 1152 | seq_print_ip_sym(s, field->ip, flags); |
| 1207 | goto partial; | 1153 | trace_seq_printf(s, ": %s", field->buf); |
| 1208 | |||
| 1209 | if (!trace_seq_printf(s, ": %s", field->buf)) | ||
| 1210 | goto partial; | ||
| 1211 | 1154 | ||
| 1212 | return TRACE_TYPE_HANDLED; | 1155 | return trace_handle_return(s); |
| 1213 | |||
| 1214 | partial: | ||
| 1215 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1216 | } | 1156 | } |
| 1217 | 1157 | ||
| 1218 | static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, | 1158 | static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, |
| @@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, | |||
| 1222 | 1162 | ||
| 1223 | trace_assign_type(field, iter->ent); | 1163 | trace_assign_type(field, iter->ent); |
| 1224 | 1164 | ||
| 1225 | if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) | 1165 | trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf); |
| 1226 | goto partial; | ||
| 1227 | |||
| 1228 | return TRACE_TYPE_HANDLED; | ||
| 1229 | 1166 | ||
| 1230 | partial: | 1167 | return trace_handle_return(&iter->seq); |
| 1231 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1232 | } | 1168 | } |
| 1233 | 1169 | ||
| 1234 | static struct trace_event_functions trace_print_funcs = { | 1170 | static struct trace_event_functions trace_print_funcs = { |
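The trace_output.c hunks above all apply the same conversion: instead of checking the return value of every trace_seq_printf()/trace_seq_puts() call and bailing out with TRACE_TYPE_PARTIAL_LINE, each printer now issues its writes unconditionally and lets a single trace_handle_return() at the end report whether the sequence buffer overflowed. The sketch below shows the resulting shape of a printer; trace_handle_return() itself is not part of this diff, so the inline shown for it is an assumption inferred from how the hunks use it.

```c
/*
 * Illustrative sketch only -- not a function from this patch. The
 * trace_handle_return() body is assumed from its usage above: it maps
 * "buffer overflowed" to TRACE_TYPE_PARTIAL_LINE and everything else
 * to TRACE_TYPE_HANDLED.
 */
static inline enum print_line_t trace_handle_return(struct trace_seq *s)
{
	return trace_seq_has_overflowed(s) ?
		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
}

static enum print_line_t example_event_print(struct trace_iterator *iter)
{
	struct trace_seq *s = &iter->seq;

	/* No per-call error checks: a failed write just latches s->full. */
	trace_seq_puts(s, "example: ");
	trace_seq_printf(s, "cpu=%d\n", iter->cpu);

	return trace_handle_return(s);
}
```

The same shape recurs in every printer in this file, which is why the partial: labels and their goto chains disappear wholesale.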
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 80b25b585a70..8ef2c40efb3c 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h | |||
| @@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); | |||
| 35 | extern int __unregister_ftrace_event(struct trace_event *event); | 35 | extern int __unregister_ftrace_event(struct trace_event *event); |
| 36 | extern struct rw_semaphore trace_event_sem; | 36 | extern struct rw_semaphore trace_event_sem; |
| 37 | 37 | ||
| 38 | #define SEQ_PUT_FIELD_RET(s, x) \ | 38 | #define SEQ_PUT_FIELD(s, x) \ |
| 39 | do { \ | 39 | trace_seq_putmem(s, &(x), sizeof(x)) |
| 40 | if (!trace_seq_putmem(s, &(x), sizeof(x))) \ | 40 | |
| 41 | return TRACE_TYPE_PARTIAL_LINE; \ | 41 | #define SEQ_PUT_HEX_FIELD(s, x) \ |
| 42 | } while (0) | 42 | trace_seq_putmem_hex(s, &(x), sizeof(x)) |
| 43 | |||
| 44 | #define SEQ_PUT_HEX_FIELD_RET(s, x) \ | ||
| 45 | do { \ | ||
| 46 | if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ | ||
| 47 | return TRACE_TYPE_PARTIAL_LINE; \ | ||
| 48 | } while (0) | ||
| 49 | 43 | ||
| 50 | #endif | 44 | #endif |
| 51 | 45 | ||
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2900817ba65c..c4e70b6bd7fa 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
| @@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v) | |||
| 305 | seq_puts(m, "\\t"); | 305 | seq_puts(m, "\\t"); |
| 306 | break; | 306 | break; |
| 307 | case '\\': | 307 | case '\\': |
| 308 | seq_puts(m, "\\"); | 308 | seq_putc(m, '\\'); |
| 309 | break; | 309 | break; |
| 310 | case '"': | 310 | case '"': |
| 311 | seq_puts(m, "\\\""); | 311 | seq_puts(m, "\\\""); |
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d4b9fc22cd27..b983b2fd2ca1 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
| @@ -40,7 +40,8 @@ const char *reserved_field_names[] = { | |||
| 40 | int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ | 40 | int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ |
| 41 | void *data, void *ent) \ | 41 | void *data, void *ent) \ |
| 42 | { \ | 42 | { \ |
| 43 | return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ | 43 | trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ |
| 44 | return !trace_seq_has_overflowed(s); \ | ||
| 44 | } \ | 45 | } \ |
| 45 | const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ | 46 | const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ |
| 46 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); | 47 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); |
| @@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, | |||
| 61 | int len = *(u32 *)data >> 16; | 62 | int len = *(u32 *)data >> 16; |
| 62 | 63 | ||
| 63 | if (!len) | 64 | if (!len) |
| 64 | return trace_seq_printf(s, " %s=(fault)", name); | 65 | trace_seq_printf(s, " %s=(fault)", name); |
| 65 | else | 66 | else |
| 66 | return trace_seq_printf(s, " %s=\"%s\"", name, | 67 | trace_seq_printf(s, " %s=\"%s\"", name, |
| 67 | (const char *)get_loc_data(data, ent)); | 68 | (const char *)get_loc_data(data, ent)); |
| 69 | return !trace_seq_has_overflowed(s); | ||
| 68 | } | 70 | } |
| 69 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); | 71 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); |
| 70 | 72 | ||
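trace_probe.c keeps int-returning print callbacks because their callers (for example the per-argument loop in print_uprobe_event() further down) still branch on the result, so the macro-generated functions now derive that result from the seq state rather than from trace_seq_printf(). A hedged sketch of what one generated callback looks like after the change, with illustrative names and format string:

```c
/*
 * Sketch of what PRINT_TYPE_FUNC_NAME(u32) roughly expands to after
 * this change (the name and format string here are illustrative):
 * the write is unconditional and "success" means the buffer did not
 * overflow.
 */
static int print_type_u32_sketch(struct trace_seq *s, const char *name,
				 void *data, void *ent)
{
	trace_seq_printf(s, " %s=%u", name, *(u32 *)data);

	return !trace_seq_has_overflowed(s);
}
```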
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3f34dc9b40f3..2e293beb186e 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
| @@ -14,122 +14,26 @@ | |||
| 14 | 14 | ||
| 15 | #include "trace.h" | 15 | #include "trace.h" |
| 16 | 16 | ||
| 17 | static struct trace_array *ctx_trace; | ||
| 18 | static int __read_mostly tracer_enabled; | ||
| 19 | static int sched_ref; | 17 | static int sched_ref; |
| 20 | static DEFINE_MUTEX(sched_register_mutex); | 18 | static DEFINE_MUTEX(sched_register_mutex); |
| 21 | static int sched_stopped; | ||
| 22 | |||
| 23 | |||
| 24 | void | ||
| 25 | tracing_sched_switch_trace(struct trace_array *tr, | ||
| 26 | struct task_struct *prev, | ||
| 27 | struct task_struct *next, | ||
| 28 | unsigned long flags, int pc) | ||
| 29 | { | ||
| 30 | struct ftrace_event_call *call = &event_context_switch; | ||
| 31 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
| 32 | struct ring_buffer_event *event; | ||
| 33 | struct ctx_switch_entry *entry; | ||
| 34 | |||
| 35 | event = trace_buffer_lock_reserve(buffer, TRACE_CTX, | ||
| 36 | sizeof(*entry), flags, pc); | ||
| 37 | if (!event) | ||
| 38 | return; | ||
| 39 | entry = ring_buffer_event_data(event); | ||
| 40 | entry->prev_pid = prev->pid; | ||
| 41 | entry->prev_prio = prev->prio; | ||
| 42 | entry->prev_state = prev->state; | ||
| 43 | entry->next_pid = next->pid; | ||
| 44 | entry->next_prio = next->prio; | ||
| 45 | entry->next_state = next->state; | ||
| 46 | entry->next_cpu = task_cpu(next); | ||
| 47 | |||
| 48 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
| 49 | trace_buffer_unlock_commit(buffer, event, flags, pc); | ||
| 50 | } | ||
| 51 | 19 | ||
| 52 | static void | 20 | static void |
| 53 | probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) | 21 | probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) |
| 54 | { | 22 | { |
| 55 | struct trace_array_cpu *data; | ||
| 56 | unsigned long flags; | ||
| 57 | int cpu; | ||
| 58 | int pc; | ||
| 59 | |||
| 60 | if (unlikely(!sched_ref)) | 23 | if (unlikely(!sched_ref)) |
| 61 | return; | 24 | return; |
| 62 | 25 | ||
| 63 | tracing_record_cmdline(prev); | 26 | tracing_record_cmdline(prev); |
| 64 | tracing_record_cmdline(next); | 27 | tracing_record_cmdline(next); |
| 65 | |||
| 66 | if (!tracer_enabled || sched_stopped) | ||
| 67 | return; | ||
| 68 | |||
| 69 | pc = preempt_count(); | ||
| 70 | local_irq_save(flags); | ||
| 71 | cpu = raw_smp_processor_id(); | ||
| 72 | data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); | ||
| 73 | |||
| 74 | if (likely(!atomic_read(&data->disabled))) | ||
| 75 | tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); | ||
| 76 | |||
| 77 | local_irq_restore(flags); | ||
| 78 | } | ||
| 79 | |||
| 80 | void | ||
| 81 | tracing_sched_wakeup_trace(struct trace_array *tr, | ||
| 82 | struct task_struct *wakee, | ||
| 83 | struct task_struct *curr, | ||
| 84 | unsigned long flags, int pc) | ||
| 85 | { | ||
| 86 | struct ftrace_event_call *call = &event_wakeup; | ||
| 87 | struct ring_buffer_event *event; | ||
| 88 | struct ctx_switch_entry *entry; | ||
| 89 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
| 90 | |||
| 91 | event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, | ||
| 92 | sizeof(*entry), flags, pc); | ||
| 93 | if (!event) | ||
| 94 | return; | ||
| 95 | entry = ring_buffer_event_data(event); | ||
| 96 | entry->prev_pid = curr->pid; | ||
| 97 | entry->prev_prio = curr->prio; | ||
| 98 | entry->prev_state = curr->state; | ||
| 99 | entry->next_pid = wakee->pid; | ||
| 100 | entry->next_prio = wakee->prio; | ||
| 101 | entry->next_state = wakee->state; | ||
| 102 | entry->next_cpu = task_cpu(wakee); | ||
| 103 | |||
| 104 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
| 105 | trace_buffer_unlock_commit(buffer, event, flags, pc); | ||
| 106 | } | 28 | } |
| 107 | 29 | ||
| 108 | static void | 30 | static void |
| 109 | probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) | 31 | probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) |
| 110 | { | 32 | { |
| 111 | struct trace_array_cpu *data; | ||
| 112 | unsigned long flags; | ||
| 113 | int cpu, pc; | ||
| 114 | |||
| 115 | if (unlikely(!sched_ref)) | 33 | if (unlikely(!sched_ref)) |
| 116 | return; | 34 | return; |
| 117 | 35 | ||
| 118 | tracing_record_cmdline(current); | 36 | tracing_record_cmdline(current); |
| 119 | |||
| 120 | if (!tracer_enabled || sched_stopped) | ||
| 121 | return; | ||
| 122 | |||
| 123 | pc = preempt_count(); | ||
| 124 | local_irq_save(flags); | ||
| 125 | cpu = raw_smp_processor_id(); | ||
| 126 | data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); | ||
| 127 | |||
| 128 | if (likely(!atomic_read(&data->disabled))) | ||
| 129 | tracing_sched_wakeup_trace(ctx_trace, wakee, current, | ||
| 130 | flags, pc); | ||
| 131 | |||
| 132 | local_irq_restore(flags); | ||
| 133 | } | 37 | } |
| 134 | 38 | ||
| 135 | static int tracing_sched_register(void) | 39 | static int tracing_sched_register(void) |
| @@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void) | |||
| 197 | { | 101 | { |
| 198 | tracing_stop_sched_switch(); | 102 | tracing_stop_sched_switch(); |
| 199 | } | 103 | } |
| 200 | |||
| 201 | /** | ||
| 202 | * tracing_start_sched_switch_record - start tracing context switches | ||
| 203 | * | ||
| 204 | * Turns on context switch tracing for a tracer. | ||
| 205 | */ | ||
| 206 | void tracing_start_sched_switch_record(void) | ||
| 207 | { | ||
| 208 | if (unlikely(!ctx_trace)) { | ||
| 209 | WARN_ON(1); | ||
| 210 | return; | ||
| 211 | } | ||
| 212 | |||
| 213 | tracing_start_sched_switch(); | ||
| 214 | |||
| 215 | mutex_lock(&sched_register_mutex); | ||
| 216 | tracer_enabled++; | ||
| 217 | mutex_unlock(&sched_register_mutex); | ||
| 218 | } | ||
| 219 | |||
| 220 | /** | ||
| 221 | * tracing_stop_sched_switch_record - start tracing context switches | ||
| 222 | * | ||
| 223 | * Turns off context switch tracing for a tracer. | ||
| 224 | */ | ||
| 225 | void tracing_stop_sched_switch_record(void) | ||
| 226 | { | ||
| 227 | mutex_lock(&sched_register_mutex); | ||
| 228 | tracer_enabled--; | ||
| 229 | WARN_ON(tracer_enabled < 0); | ||
| 230 | mutex_unlock(&sched_register_mutex); | ||
| 231 | |||
| 232 | tracing_stop_sched_switch(); | ||
| 233 | } | ||
| 234 | |||
| 235 | /** | ||
| 236 | * tracing_sched_switch_assign_trace - assign a trace array for ctx switch | ||
| 237 | * @tr: trace array pointer to assign | ||
| 238 | * | ||
| 239 | * Some tracers might want to record the context switches in their | ||
| 240 | * trace. This function lets those tracers assign the trace array | ||
| 241 | * to use. | ||
| 242 | */ | ||
| 243 | void tracing_sched_switch_assign_trace(struct trace_array *tr) | ||
| 244 | { | ||
| 245 | ctx_trace = tr; | ||
| 246 | } | ||
| 247 | |||
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 19bd8928ce94..8fb84b362816 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
| @@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) | |||
| 365 | wakeup_current_cpu = cpu; | 365 | wakeup_current_cpu = cpu; |
| 366 | } | 366 | } |
| 367 | 367 | ||
| 368 | static void | ||
| 369 | tracing_sched_switch_trace(struct trace_array *tr, | ||
| 370 | struct task_struct *prev, | ||
| 371 | struct task_struct *next, | ||
| 372 | unsigned long flags, int pc) | ||
| 373 | { | ||
| 374 | struct ftrace_event_call *call = &event_context_switch; | ||
| 375 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
| 376 | struct ring_buffer_event *event; | ||
| 377 | struct ctx_switch_entry *entry; | ||
| 378 | |||
| 379 | event = trace_buffer_lock_reserve(buffer, TRACE_CTX, | ||
| 380 | sizeof(*entry), flags, pc); | ||
| 381 | if (!event) | ||
| 382 | return; | ||
| 383 | entry = ring_buffer_event_data(event); | ||
| 384 | entry->prev_pid = prev->pid; | ||
| 385 | entry->prev_prio = prev->prio; | ||
| 386 | entry->prev_state = prev->state; | ||
| 387 | entry->next_pid = next->pid; | ||
| 388 | entry->next_prio = next->prio; | ||
| 389 | entry->next_state = next->state; | ||
| 390 | entry->next_cpu = task_cpu(next); | ||
| 391 | |||
| 392 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
| 393 | trace_buffer_unlock_commit(buffer, event, flags, pc); | ||
| 394 | } | ||
| 395 | |||
| 396 | static void | ||
| 397 | tracing_sched_wakeup_trace(struct trace_array *tr, | ||
| 398 | struct task_struct *wakee, | ||
| 399 | struct task_struct *curr, | ||
| 400 | unsigned long flags, int pc) | ||
| 401 | { | ||
| 402 | struct ftrace_event_call *call = &event_wakeup; | ||
| 403 | struct ring_buffer_event *event; | ||
| 404 | struct ctx_switch_entry *entry; | ||
| 405 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
| 406 | |||
| 407 | event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, | ||
| 408 | sizeof(*entry), flags, pc); | ||
| 409 | if (!event) | ||
| 410 | return; | ||
| 411 | entry = ring_buffer_event_data(event); | ||
| 412 | entry->prev_pid = curr->pid; | ||
| 413 | entry->prev_prio = curr->prio; | ||
| 414 | entry->prev_state = curr->state; | ||
| 415 | entry->next_pid = wakee->pid; | ||
| 416 | entry->next_prio = wakee->prio; | ||
| 417 | entry->next_state = wakee->state; | ||
| 418 | entry->next_cpu = task_cpu(wakee); | ||
| 419 | |||
| 420 | if (!call_filter_check_discard(call, entry, buffer, event)) | ||
| 421 | trace_buffer_unlock_commit(buffer, event, flags, pc); | ||
| 422 | } | ||
| 423 | |||
| 368 | static void notrace | 424 | static void notrace |
| 369 | probe_wakeup_sched_switch(void *ignore, | 425 | probe_wakeup_sched_switch(void *ignore, |
| 370 | struct task_struct *prev, struct task_struct *next) | 426 | struct task_struct *prev, struct task_struct *next) |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5ef60499dc8e..b0f86ea77881 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
| @@ -382,6 +382,8 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
| 382 | 382 | ||
| 383 | /* check the trace buffer */ | 383 | /* check the trace buffer */ |
| 384 | ret = trace_test_buffer(&tr->trace_buffer, &count); | 384 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
| 385 | |||
| 386 | ftrace_enabled = 1; | ||
| 385 | tracing_start(); | 387 | tracing_start(); |
| 386 | 388 | ||
| 387 | /* we should only have one item */ | 389 | /* we should only have one item */ |
| @@ -679,6 +681,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | |||
| 679 | 681 | ||
| 680 | /* check the trace buffer */ | 682 | /* check the trace buffer */ |
| 681 | ret = trace_test_buffer(&tr->trace_buffer, &count); | 683 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
| 684 | |||
| 685 | ftrace_enabled = 1; | ||
| 682 | trace->reset(tr); | 686 | trace->reset(tr); |
| 683 | tracing_start(); | 687 | tracing_start(); |
| 684 | 688 | ||
| @@ -1025,6 +1029,12 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) | |||
| 1025 | #endif | 1029 | #endif |
| 1026 | 1030 | ||
| 1027 | #ifdef CONFIG_SCHED_TRACER | 1031 | #ifdef CONFIG_SCHED_TRACER |
| 1032 | |||
| 1033 | struct wakeup_test_data { | ||
| 1034 | struct completion is_ready; | ||
| 1035 | int go; | ||
| 1036 | }; | ||
| 1037 | |||
| 1028 | static int trace_wakeup_test_thread(void *data) | 1038 | static int trace_wakeup_test_thread(void *data) |
| 1029 | { | 1039 | { |
| 1030 | /* Make this a -deadline thread */ | 1040 | /* Make this a -deadline thread */ |
| @@ -1034,51 +1044,56 @@ static int trace_wakeup_test_thread(void *data) | |||
| 1034 | .sched_deadline = 10000000ULL, | 1044 | .sched_deadline = 10000000ULL, |
| 1035 | .sched_period = 10000000ULL | 1045 | .sched_period = 10000000ULL |
| 1036 | }; | 1046 | }; |
| 1037 | struct completion *x = data; | 1047 | struct wakeup_test_data *x = data; |
| 1038 | 1048 | ||
| 1039 | sched_setattr(current, &attr); | 1049 | sched_setattr(current, &attr); |
| 1040 | 1050 | ||
| 1041 | /* Make it know we have a new prio */ | 1051 | /* Make it know we have a new prio */ |
| 1042 | complete(x); | 1052 | complete(&x->is_ready); |
| 1043 | 1053 | ||
| 1044 | /* now go to sleep and let the test wake us up */ | 1054 | /* now go to sleep and let the test wake us up */ |
| 1045 | set_current_state(TASK_INTERRUPTIBLE); | 1055 | set_current_state(TASK_INTERRUPTIBLE); |
| 1046 | schedule(); | 1056 | while (!x->go) { |
| 1057 | schedule(); | ||
| 1058 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 1059 | } | ||
| 1047 | 1060 | ||
| 1048 | complete(x); | 1061 | complete(&x->is_ready); |
| 1062 | |||
| 1063 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 1049 | 1064 | ||
| 1050 | /* we are awake, now wait to disappear */ | 1065 | /* we are awake, now wait to disappear */ |
| 1051 | while (!kthread_should_stop()) { | 1066 | while (!kthread_should_stop()) { |
| 1052 | /* | 1067 | schedule(); |
| 1053 | * This will likely be the system top priority | 1068 | set_current_state(TASK_INTERRUPTIBLE); |
| 1054 | * task, do short sleeps to let others run. | ||
| 1055 | */ | ||
| 1056 | msleep(100); | ||
| 1057 | } | 1069 | } |
| 1058 | 1070 | ||
| 1071 | __set_current_state(TASK_RUNNING); | ||
| 1072 | |||
| 1059 | return 0; | 1073 | return 0; |
| 1060 | } | 1074 | } |
| 1061 | |||
| 1062 | int | 1075 | int |
| 1063 | trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | 1076 | trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) |
| 1064 | { | 1077 | { |
| 1065 | unsigned long save_max = tr->max_latency; | 1078 | unsigned long save_max = tr->max_latency; |
| 1066 | struct task_struct *p; | 1079 | struct task_struct *p; |
| 1067 | struct completion is_ready; | 1080 | struct wakeup_test_data data; |
| 1068 | unsigned long count; | 1081 | unsigned long count; |
| 1069 | int ret; | 1082 | int ret; |
| 1070 | 1083 | ||
| 1071 | init_completion(&is_ready); | 1084 | memset(&data, 0, sizeof(data)); |
| 1085 | |||
| 1086 | init_completion(&data.is_ready); | ||
| 1072 | 1087 | ||
| 1073 | /* create a -deadline thread */ | 1088 | /* create a -deadline thread */ |
| 1074 | p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); | 1089 | p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test"); |
| 1075 | if (IS_ERR(p)) { | 1090 | if (IS_ERR(p)) { |
| 1076 | printk(KERN_CONT "Failed to create ftrace wakeup test thread "); | 1091 | printk(KERN_CONT "Failed to create ftrace wakeup test thread "); |
| 1077 | return -1; | 1092 | return -1; |
| 1078 | } | 1093 | } |
| 1079 | 1094 | ||
| 1080 | /* make sure the thread is running at -deadline policy */ | 1095 | /* make sure the thread is running at -deadline policy */ |
| 1081 | wait_for_completion(&is_ready); | 1096 | wait_for_completion(&data.is_ready); |
| 1082 | 1097 | ||
| 1083 | /* start the tracing */ | 1098 | /* start the tracing */ |
| 1084 | ret = tracer_init(trace, tr); | 1099 | ret = tracer_init(trace, tr); |
| @@ -1099,18 +1114,20 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
| 1099 | msleep(100); | 1114 | msleep(100); |
| 1100 | } | 1115 | } |
| 1101 | 1116 | ||
| 1102 | init_completion(&is_ready); | 1117 | init_completion(&data.is_ready); |
| 1118 | |||
| 1119 | data.go = 1; | ||
| 1120 | /* memory barrier is in the wake_up_process() */ | ||
| 1103 | 1121 | ||
| 1104 | wake_up_process(p); | 1122 | wake_up_process(p); |
| 1105 | 1123 | ||
| 1106 | /* Wait for the task to wake up */ | 1124 | /* Wait for the task to wake up */ |
| 1107 | wait_for_completion(&is_ready); | 1125 | wait_for_completion(&data.is_ready); |
| 1108 | 1126 | ||
| 1109 | /* stop the tracing. */ | 1127 | /* stop the tracing. */ |
| 1110 | tracing_stop(); | 1128 | tracing_stop(); |
| 1111 | /* check both trace buffers */ | 1129 | /* check both trace buffers */ |
| 1112 | ret = trace_test_buffer(&tr->trace_buffer, NULL); | 1130 | ret = trace_test_buffer(&tr->trace_buffer, NULL); |
| 1113 | printk("ret = %d\n", ret); | ||
| 1114 | if (!ret) | 1131 | if (!ret) |
| 1115 | ret = trace_test_buffer(&tr->max_buffer, &count); | 1132 | ret = trace_test_buffer(&tr->max_buffer, &count); |
| 1116 | 1133 | ||
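The trace_selftest.c changes replace the bare completion handed to the wakeup test thread with a small wakeup_test_data carrying both the completion and a go flag, so the -deadline thread only reports "woken" for the wake-up the test actually issued and simply goes back to sleep on any spurious wakeup (the msleep() polling is also dropped in favour of plain schedule() loops). Below is a condensed sketch of that wait discipline, under the assumption that the test side sets data->go before calling wake_up_process(), as the hunk's barrier comment indicates.

```c
/*
 * Condensed sketch of the new wait pattern -- not the full selftest
 * thread. The writer is assumed to do:  data->go = 1; wake_up_process(p);
 * relying on the barrier in wake_up_process() to order the flag store.
 */
static int wakeup_wait_sketch(void *arg)
{
	struct wakeup_test_data *x = arg;

	set_current_state(TASK_INTERRUPTIBLE);
	while (!x->go) {
		/* Spurious wakeup: just re-arm and sleep again. */
		schedule();
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);

	/* Only now tell the tester that the intended wakeup happened. */
	complete(&x->is_ready);
	return 0;
}
```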
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 1f24ed99dca2..f8b45d8792f9 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c | |||
| @@ -27,10 +27,19 @@ | |||
| 27 | #include <linux/trace_seq.h> | 27 | #include <linux/trace_seq.h> |
| 28 | 28 | ||
| 29 | /* How much buffer is left on the trace_seq? */ | 29 | /* How much buffer is left on the trace_seq? */ |
| 30 | #define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) | 30 | #define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq) |
| 31 | 31 | ||
| 32 | /* How much buffer is written? */ | 32 | /* How much buffer is written? */ |
| 33 | #define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) | 33 | #define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq) |
| 34 | |||
| 35 | /* | ||
| 36 | * trace_seq should work with being initialized with 0s. | ||
| 37 | */ | ||
| 38 | static inline void __trace_seq_init(struct trace_seq *s) | ||
| 39 | { | ||
| 40 | if (unlikely(!s->seq.size)) | ||
| 41 | trace_seq_init(s); | ||
| 42 | } | ||
| 34 | 43 | ||
| 35 | /** | 44 | /** |
| 36 | * trace_print_seq - move the contents of trace_seq into a seq_file | 45 | * trace_print_seq - move the contents of trace_seq into a seq_file |
| @@ -43,10 +52,11 @@ | |||
| 43 | */ | 52 | */ |
| 44 | int trace_print_seq(struct seq_file *m, struct trace_seq *s) | 53 | int trace_print_seq(struct seq_file *m, struct trace_seq *s) |
| 45 | { | 54 | { |
| 46 | unsigned int len = TRACE_SEQ_BUF_USED(s); | ||
| 47 | int ret; | 55 | int ret; |
| 48 | 56 | ||
| 49 | ret = seq_write(m, s->buffer, len); | 57 | __trace_seq_init(s); |
| 58 | |||
| 59 | ret = seq_buf_print_seq(m, &s->seq); | ||
| 50 | 60 | ||
| 51 | /* | 61 | /* |
| 52 | * Only reset this buffer if we successfully wrote to the | 62 | * Only reset this buffer if we successfully wrote to the |
| @@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) | |||
| 69 | * trace_seq_printf() is used to store strings into a special | 79 | * trace_seq_printf() is used to store strings into a special |
| 70 | * buffer (@s). Then the output may be either used by | 80 | * buffer (@s). Then the output may be either used by |
| 71 | * the sequencer or pulled into another buffer. | 81 | * the sequencer or pulled into another buffer. |
| 72 | * | ||
| 73 | * Returns 1 if we successfully written all the contents to | ||
| 74 | * the buffer. | ||
| 75 | * Returns 0 if we the length to write is bigger than the | ||
| 76 | * reserved buffer space. In this case, nothing gets written. | ||
| 77 | */ | 82 | */ |
| 78 | int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) | 83 | void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) |
| 79 | { | 84 | { |
| 80 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | 85 | unsigned int save_len = s->seq.len; |
| 81 | va_list ap; | 86 | va_list ap; |
| 82 | int ret; | ||
| 83 | 87 | ||
| 84 | if (s->full || !len) | 88 | if (s->full) |
| 85 | return 0; | 89 | return; |
| 90 | |||
| 91 | __trace_seq_init(s); | ||
| 86 | 92 | ||
| 87 | va_start(ap, fmt); | 93 | va_start(ap, fmt); |
| 88 | ret = vsnprintf(s->buffer + s->len, len, fmt, ap); | 94 | seq_buf_vprintf(&s->seq, fmt, ap); |
| 89 | va_end(ap); | 95 | va_end(ap); |
| 90 | 96 | ||
| 91 | /* If we can't write it all, don't bother writing anything */ | 97 | /* If we can't write it all, don't bother writing anything */ |
| 92 | if (ret >= len) { | 98 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { |
| 99 | s->seq.len = save_len; | ||
| 93 | s->full = 1; | 100 | s->full = 1; |
| 94 | return 0; | ||
| 95 | } | 101 | } |
| 96 | |||
| 97 | s->len += ret; | ||
| 98 | |||
| 99 | return 1; | ||
| 100 | } | 102 | } |
| 101 | EXPORT_SYMBOL_GPL(trace_seq_printf); | 103 | EXPORT_SYMBOL_GPL(trace_seq_printf); |
| 102 | 104 | ||
| @@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf); | |||
| 107 | * @nmaskbits: The number of bits that are valid in @maskp | 109 | * @nmaskbits: The number of bits that are valid in @maskp |
| 108 | * | 110 | * |
| 109 | * Writes a ASCII representation of a bitmask string into @s. | 111 | * Writes a ASCII representation of a bitmask string into @s. |
| 110 | * | ||
| 111 | * Returns 1 if we successfully written all the contents to | ||
| 112 | * the buffer. | ||
| 113 | * Returns 0 if we the length to write is bigger than the | ||
| 114 | * reserved buffer space. In this case, nothing gets written. | ||
| 115 | */ | 112 | */ |
| 116 | int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, | 113 | void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, |
| 117 | int nmaskbits) | 114 | int nmaskbits) |
| 118 | { | 115 | { |
| 119 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | 116 | unsigned int save_len = s->seq.len; |
| 120 | int ret; | ||
| 121 | 117 | ||
| 122 | if (s->full || !len) | 118 | if (s->full) |
| 123 | return 0; | 119 | return; |
| 124 | 120 | ||
| 125 | ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); | 121 | __trace_seq_init(s); |
| 126 | s->len += ret; | ||
| 127 | 122 | ||
| 128 | return 1; | 123 | seq_buf_bitmask(&s->seq, maskp, nmaskbits); |
| 124 | |||
| 125 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { | ||
| 126 | s->seq.len = save_len; | ||
| 127 | s->full = 1; | ||
| 128 | } | ||
| 129 | } | 129 | } |
| 130 | EXPORT_SYMBOL_GPL(trace_seq_bitmask); | 130 | EXPORT_SYMBOL_GPL(trace_seq_bitmask); |
| 131 | 131 | ||
| @@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); | |||
| 139 | * trace_seq_printf is used to store strings into a special | 139 | * trace_seq_printf is used to store strings into a special |
| 140 | * buffer (@s). Then the output may be either used by | 140 | * buffer (@s). Then the output may be either used by |
| 141 | * the sequencer or pulled into another buffer. | 141 | * the sequencer or pulled into another buffer. |
| 142 | * | ||
| 143 | * Returns how much it wrote to the buffer. | ||
| 144 | */ | 142 | */ |
| 145 | int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) | 143 | void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) |
| 146 | { | 144 | { |
| 147 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | 145 | unsigned int save_len = s->seq.len; |
| 148 | int ret; | ||
| 149 | 146 | ||
| 150 | if (s->full || !len) | 147 | if (s->full) |
| 151 | return 0; | 148 | return; |
| 152 | 149 | ||
| 153 | ret = vsnprintf(s->buffer + s->len, len, fmt, args); | 150 | __trace_seq_init(s); |
| 151 | |||
| 152 | seq_buf_vprintf(&s->seq, fmt, args); | ||
| 154 | 153 | ||
| 155 | /* If we can't write it all, don't bother writing anything */ | 154 | /* If we can't write it all, don't bother writing anything */ |
| 156 | if (ret >= len) { | 155 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { |
| 156 | s->seq.len = save_len; | ||
| 157 | s->full = 1; | 157 | s->full = 1; |
| 158 | return 0; | ||
| 159 | } | 158 | } |
| 160 | |||
| 161 | s->len += ret; | ||
| 162 | |||
| 163 | return len; | ||
| 164 | } | 159 | } |
| 165 | EXPORT_SYMBOL_GPL(trace_seq_vprintf); | 160 | EXPORT_SYMBOL_GPL(trace_seq_vprintf); |
| 166 | 161 | ||
| @@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf); | |||
| 178 | * | 173 | * |
| 179 | * This function will take the format and the binary array and finish | 174 | * This function will take the format and the binary array and finish |
| 180 | * the conversion into the ASCII string within the buffer. | 175 | * the conversion into the ASCII string within the buffer. |
| 181 | * | ||
| 182 | * Returns how much it wrote to the buffer. | ||
| 183 | */ | 176 | */ |
| 184 | int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) | 177 | void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) |
| 185 | { | 178 | { |
| 186 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | 179 | unsigned int save_len = s->seq.len; |
| 187 | int ret; | ||
| 188 | 180 | ||
| 189 | if (s->full || !len) | 181 | if (s->full) |
| 190 | return 0; | 182 | return; |
| 183 | |||
| 184 | __trace_seq_init(s); | ||
| 191 | 185 | ||
| 192 | ret = bstr_printf(s->buffer + s->len, len, fmt, binary); | 186 | seq_buf_bprintf(&s->seq, fmt, binary); |
| 193 | 187 | ||
| 194 | /* If we can't write it all, don't bother writing anything */ | 188 | /* If we can't write it all, don't bother writing anything */ |
| 195 | if (ret >= len) { | 189 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { |
| 190 | s->seq.len = save_len; | ||
| 196 | s->full = 1; | 191 | s->full = 1; |
| 197 | return 0; | 192 | return; |
| 198 | } | 193 | } |
| 199 | |||
| 200 | s->len += ret; | ||
| 201 | |||
| 202 | return len; | ||
| 203 | } | 194 | } |
| 204 | EXPORT_SYMBOL_GPL(trace_seq_bprintf); | 195 | EXPORT_SYMBOL_GPL(trace_seq_bprintf); |
| 205 | 196 | ||
| @@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf); | |||
| 212 | * copy to user routines. This function records a simple string | 203 | * copy to user routines. This function records a simple string |
| 213 | * into a special buffer (@s) for later retrieval by a sequencer | 204 | * into a special buffer (@s) for later retrieval by a sequencer |
| 214 | * or other mechanism. | 205 | * or other mechanism. |
| 215 | * | ||
| 216 | * Returns how much it wrote to the buffer. | ||
| 217 | */ | 206 | */ |
| 218 | int trace_seq_puts(struct trace_seq *s, const char *str) | 207 | void trace_seq_puts(struct trace_seq *s, const char *str) |
| 219 | { | 208 | { |
| 220 | unsigned int len = strlen(str); | 209 | unsigned int len = strlen(str); |
| 221 | 210 | ||
| 222 | if (s->full) | 211 | if (s->full) |
| 223 | return 0; | 212 | return; |
| 213 | |||
| 214 | __trace_seq_init(s); | ||
| 224 | 215 | ||
| 225 | if (len > TRACE_SEQ_BUF_LEFT(s)) { | 216 | if (len > TRACE_SEQ_BUF_LEFT(s)) { |
| 226 | s->full = 1; | 217 | s->full = 1; |
| 227 | return 0; | 218 | return; |
| 228 | } | 219 | } |
| 229 | 220 | ||
| 230 | memcpy(s->buffer + s->len, str, len); | 221 | seq_buf_putmem(&s->seq, str, len); |
| 231 | s->len += len; | ||
| 232 | |||
| 233 | return len; | ||
| 234 | } | 222 | } |
| 235 | EXPORT_SYMBOL_GPL(trace_seq_puts); | 223 | EXPORT_SYMBOL_GPL(trace_seq_puts); |
| 236 | 224 | ||
| @@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts); | |||
| 243 | * copy to user routines. This function records a simple charater | 231 | * copy to user routines. This function records a simple charater |
| 244 | * into a special buffer (@s) for later retrieval by a sequencer | 232 | * into a special buffer (@s) for later retrieval by a sequencer |
| 245 | * or other mechanism. | 233 | * or other mechanism. |
| 246 | * | ||
| 247 | * Returns how much it wrote to the buffer. | ||
| 248 | */ | 234 | */ |
| 249 | int trace_seq_putc(struct trace_seq *s, unsigned char c) | 235 | void trace_seq_putc(struct trace_seq *s, unsigned char c) |
| 250 | { | 236 | { |
| 251 | if (s->full) | 237 | if (s->full) |
| 252 | return 0; | 238 | return; |
| 239 | |||
| 240 | __trace_seq_init(s); | ||
| 253 | 241 | ||
| 254 | if (TRACE_SEQ_BUF_LEFT(s) < 1) { | 242 | if (TRACE_SEQ_BUF_LEFT(s) < 1) { |
| 255 | s->full = 1; | 243 | s->full = 1; |
| 256 | return 0; | 244 | return; |
| 257 | } | 245 | } |
| 258 | 246 | ||
| 259 | s->buffer[s->len++] = c; | 247 | seq_buf_putc(&s->seq, c); |
| 260 | |||
| 261 | return 1; | ||
| 262 | } | 248 | } |
| 263 | EXPORT_SYMBOL_GPL(trace_seq_putc); | 249 | EXPORT_SYMBOL_GPL(trace_seq_putc); |
| 264 | 250 | ||
| @@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc); | |||
| 271 | * There may be cases where raw memory needs to be written into the | 257 | * There may be cases where raw memory needs to be written into the |
| 272 | * buffer and a strcpy() would not work. Using this function allows | 258 | * buffer and a strcpy() would not work. Using this function allows |
| 273 | * for such cases. | 259 | * for such cases. |
| 274 | * | ||
| 275 | * Returns how much it wrote to the buffer. | ||
| 276 | */ | 260 | */ |
| 277 | int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) | 261 | void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) |
| 278 | { | 262 | { |
| 279 | if (s->full) | 263 | if (s->full) |
| 280 | return 0; | 264 | return; |
| 265 | |||
| 266 | __trace_seq_init(s); | ||
| 281 | 267 | ||
| 282 | if (len > TRACE_SEQ_BUF_LEFT(s)) { | 268 | if (len > TRACE_SEQ_BUF_LEFT(s)) { |
| 283 | s->full = 1; | 269 | s->full = 1; |
| 284 | return 0; | 270 | return; |
| 285 | } | 271 | } |
| 286 | 272 | ||
| 287 | memcpy(s->buffer + s->len, mem, len); | 273 | seq_buf_putmem(&s->seq, mem, len); |
| 288 | s->len += len; | ||
| 289 | |||
| 290 | return len; | ||
| 291 | } | 274 | } |
| 292 | EXPORT_SYMBOL_GPL(trace_seq_putmem); | 275 | EXPORT_SYMBOL_GPL(trace_seq_putmem); |
| 293 | 276 | ||
| 294 | #define MAX_MEMHEX_BYTES 8U | ||
| 295 | #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) | ||
| 296 | |||
| 297 | /** | 277 | /** |
| 298 | * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex | 278 | * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex |
| 299 | * @s: trace sequence descriptor | 279 | * @s: trace sequence descriptor |
| @@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem); | |||
| 303 | * This is similar to trace_seq_putmem() except instead of just copying the | 283 | * This is similar to trace_seq_putmem() except instead of just copying the |
| 304 | * raw memory into the buffer it writes its ASCII representation of it | 284 | * raw memory into the buffer it writes its ASCII representation of it |
| 305 | * in hex characters. | 285 | * in hex characters. |
| 306 | * | ||
| 307 | * Returns how much it wrote to the buffer. | ||
| 308 | */ | 286 | */ |
| 309 | int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, | 287 | void trace_seq_putmem_hex(struct trace_seq *s, const void *mem, |
| 310 | unsigned int len) | 288 | unsigned int len) |
| 311 | { | 289 | { |
| 312 | unsigned char hex[HEX_CHARS]; | 290 | unsigned int save_len = s->seq.len; |
| 313 | const unsigned char *data = mem; | ||
| 314 | unsigned int start_len; | ||
| 315 | int i, j; | ||
| 316 | int cnt = 0; | ||
| 317 | 291 | ||
| 318 | if (s->full) | 292 | if (s->full) |
| 319 | return 0; | 293 | return; |
| 320 | 294 | ||
| 321 | while (len) { | 295 | __trace_seq_init(s); |
| 322 | start_len = min(len, HEX_CHARS - 1); | 296 | |
| 323 | #ifdef __BIG_ENDIAN | 297 | /* Each byte is represented by two chars */ |
| 324 | for (i = 0, j = 0; i < start_len; i++) { | 298 | if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) { |
| 325 | #else | 299 | s->full = 1; |
| 326 | for (i = start_len-1, j = 0; i >= 0; i--) { | 300 | return; |
| 327 | #endif | 301 | } |
| 328 | hex[j++] = hex_asc_hi(data[i]); | 302 | |
| 329 | hex[j++] = hex_asc_lo(data[i]); | 303 | /* The added spaces can still cause an overflow */ |
| 330 | } | 304 | seq_buf_putmem_hex(&s->seq, mem, len); |
| 331 | if (WARN_ON_ONCE(j == 0 || j/2 > len)) | 305 | |
| 332 | break; | 306 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { |
| 333 | 307 | s->seq.len = save_len; | |
| 334 | /* j increments twice per loop */ | 308 | s->full = 1; |
| 335 | len -= j / 2; | 309 | return; |
| 336 | hex[j++] = ' '; | ||
| 337 | |||
| 338 | cnt += trace_seq_putmem(s, hex, j); | ||
| 339 | } | 310 | } |
| 340 | return cnt; | ||
| 341 | } | 311 | } |
| 342 | EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); | 312 | EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); |
| 343 | 313 | ||
| @@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); | |||
| 355 | */ | 325 | */ |
| 356 | int trace_seq_path(struct trace_seq *s, const struct path *path) | 326 | int trace_seq_path(struct trace_seq *s, const struct path *path) |
| 357 | { | 327 | { |
| 358 | unsigned char *p; | 328 | unsigned int save_len = s->seq.len; |
| 359 | 329 | ||
| 360 | if (s->full) | 330 | if (s->full) |
| 361 | return 0; | 331 | return 0; |
| 362 | 332 | ||
| 333 | __trace_seq_init(s); | ||
| 334 | |||
| 363 | if (TRACE_SEQ_BUF_LEFT(s) < 1) { | 335 | if (TRACE_SEQ_BUF_LEFT(s) < 1) { |
| 364 | s->full = 1; | 336 | s->full = 1; |
| 365 | return 0; | 337 | return 0; |
| 366 | } | 338 | } |
| 367 | 339 | ||
| 368 | p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); | 340 | seq_buf_path(&s->seq, path, "\n"); |
| 369 | if (!IS_ERR(p)) { | 341 | |
| 370 | p = mangle_path(s->buffer + s->len, p, "\n"); | 342 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { |
| 371 | if (p) { | 343 | s->seq.len = save_len; |
| 372 | s->len = p - s->buffer; | 344 | s->full = 1; |
| 373 | return 1; | 345 | return 0; |
| 374 | } | ||
| 375 | } else { | ||
| 376 | s->buffer[s->len++] = '?'; | ||
| 377 | return 1; | ||
| 378 | } | 346 | } |
| 379 | 347 | ||
| 380 | s->full = 1; | 348 | return 1; |
| 381 | return 0; | ||
| 382 | } | 349 | } |
| 383 | EXPORT_SYMBOL_GPL(trace_seq_path); | 350 | EXPORT_SYMBOL_GPL(trace_seq_path); |
| 384 | 351 | ||
| @@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path); | |||
| 404 | */ | 371 | */ |
| 405 | int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) | 372 | int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) |
| 406 | { | 373 | { |
| 407 | int len; | 374 | __trace_seq_init(s); |
| 408 | int ret; | 375 | return seq_buf_to_user(&s->seq, ubuf, cnt); |
| 409 | |||
| 410 | if (!cnt) | ||
| 411 | return 0; | ||
| 412 | |||
| 413 | if (s->len <= s->readpos) | ||
| 414 | return -EBUSY; | ||
| 415 | |||
| 416 | len = s->len - s->readpos; | ||
| 417 | if (cnt > len) | ||
| 418 | cnt = len; | ||
| 419 | ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); | ||
| 420 | if (ret == cnt) | ||
| 421 | return -EFAULT; | ||
| 422 | |||
| 423 | cnt -= ret; | ||
| 424 | |||
| 425 | s->readpos += cnt; | ||
| 426 | return cnt; | ||
| 427 | } | 376 | } |
| 428 | EXPORT_SYMBOL_GPL(trace_seq_to_user); | 377 | EXPORT_SYMBOL_GPL(trace_seq_to_user); |
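trace_seq.c is rewritten on top of the generic seq_buf layer: each helper lazily initializes a zeroed trace_seq via __trace_seq_init(), saves s->seq.len before delegating to the matching seq_buf_*() call, and rolls the length back (while latching s->full) if the write overflowed, so a partial write never lands in the output. For orientation, the reworked structure is roughly the following; it comes from include/linux/trace_seq.h of the same series rather than from this diff, so treat the exact layout as an assumption.

```c
/*
 * Approximate shape of struct trace_seq after the seq_buf rework;
 * the field layout is an assumption, but the division of labour is
 * what the hunks above rely on: seq_buf owns the length and overflow
 * state, trace_seq only adds the page-sized storage and the sticky
 * ->full flag.
 */
struct trace_seq {
	unsigned char	buffer[PAGE_SIZE];
	struct seq_buf	seq;
	int		full;
};

static inline void trace_seq_init(struct trace_seq *s)
{
	seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
	s->full = 0;
}
```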
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8a4e5cb66a4c..16eddb308c33 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -13,7 +13,6 @@ | |||
| 13 | #include <linux/sysctl.h> | 13 | #include <linux/sysctl.h> |
| 14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
| 15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
| 16 | #include <linux/magic.h> | ||
| 17 | 16 | ||
| 18 | #include <asm/setup.h> | 17 | #include <asm/setup.h> |
| 19 | 18 | ||
| @@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
| 171 | i++; | 170 | i++; |
| 172 | } | 171 | } |
| 173 | 172 | ||
| 174 | if ((current != &init_task && | 173 | if (task_stack_end_corrupted(current)) { |
| 175 | *(end_of_stack(current)) != STACK_END_MAGIC)) { | ||
| 176 | print_max_stack(); | 174 | print_max_stack(); |
| 177 | BUG(); | 175 | BUG(); |
| 178 | } | 176 | } |
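The trace_stack.c hunk swaps the open-coded end-of-stack magic check for the task_stack_end_corrupted() helper; the explicit current != &init_task exception can go because, in the same kernel cycle, the init task's stack end is given the magic value as well (that rationale is inferred from the surrounding series, not shown in this diff). The helper is roughly the check that was removed:

```c
/*
 * Rough definition of the helper used above (include/linux/sched.h of
 * that era), quoted for context -- exact wording may differ:
 */
#define task_stack_end_corrupted(task) \
		(*(end_of_stack(task)) != STACK_END_MAGIC)
```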
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 759d5e004517..c6ee36fcbf90 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, | |||
| 114 | struct trace_entry *ent = iter->ent; | 114 | struct trace_entry *ent = iter->ent; |
| 115 | struct syscall_trace_enter *trace; | 115 | struct syscall_trace_enter *trace; |
| 116 | struct syscall_metadata *entry; | 116 | struct syscall_metadata *entry; |
| 117 | int i, ret, syscall; | 117 | int i, syscall; |
| 118 | 118 | ||
| 119 | trace = (typeof(trace))ent; | 119 | trace = (typeof(trace))ent; |
| 120 | syscall = trace->nr; | 120 | syscall = trace->nr; |
| @@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags, | |||
| 128 | goto end; | 128 | goto end; |
| 129 | } | 129 | } |
| 130 | 130 | ||
| 131 | ret = trace_seq_printf(s, "%s(", entry->name); | 131 | trace_seq_printf(s, "%s(", entry->name); |
| 132 | if (!ret) | ||
| 133 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 134 | 132 | ||
| 135 | for (i = 0; i < entry->nb_args; i++) { | 133 | for (i = 0; i < entry->nb_args; i++) { |
| 134 | |||
| 135 | if (trace_seq_has_overflowed(s)) | ||
| 136 | goto end; | ||
| 137 | |||
| 136 | /* parameter types */ | 138 | /* parameter types */ |
| 137 | if (trace_flags & TRACE_ITER_VERBOSE) { | 139 | if (trace_flags & TRACE_ITER_VERBOSE) |
| 138 | ret = trace_seq_printf(s, "%s ", entry->types[i]); | 140 | trace_seq_printf(s, "%s ", entry->types[i]); |
| 139 | if (!ret) | 141 | |
| 140 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 141 | } | ||
| 142 | /* parameter values */ | 142 | /* parameter values */ |
| 143 | ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], | 143 | trace_seq_printf(s, "%s: %lx%s", entry->args[i], |
| 144 | trace->args[i], | 144 | trace->args[i], |
| 145 | i == entry->nb_args - 1 ? "" : ", "); | 145 | i == entry->nb_args - 1 ? "" : ", "); |
| 146 | if (!ret) | ||
| 147 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 148 | } | 146 | } |
| 149 | 147 | ||
| 150 | ret = trace_seq_putc(s, ')'); | 148 | trace_seq_putc(s, ')'); |
| 151 | if (!ret) | ||
| 152 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 153 | |||
| 154 | end: | 149 | end: |
| 155 | ret = trace_seq_putc(s, '\n'); | 150 | trace_seq_putc(s, '\n'); |
| 156 | if (!ret) | ||
| 157 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 158 | 151 | ||
| 159 | return TRACE_TYPE_HANDLED; | 152 | return trace_handle_return(s); |
| 160 | } | 153 | } |
| 161 | 154 | ||
| 162 | static enum print_line_t | 155 | static enum print_line_t |
| @@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags, | |||
| 168 | struct syscall_trace_exit *trace; | 161 | struct syscall_trace_exit *trace; |
| 169 | int syscall; | 162 | int syscall; |
| 170 | struct syscall_metadata *entry; | 163 | struct syscall_metadata *entry; |
| 171 | int ret; | ||
| 172 | 164 | ||
| 173 | trace = (typeof(trace))ent; | 165 | trace = (typeof(trace))ent; |
| 174 | syscall = trace->nr; | 166 | syscall = trace->nr; |
| @@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags, | |||
| 176 | 168 | ||
| 177 | if (!entry) { | 169 | if (!entry) { |
| 178 | trace_seq_putc(s, '\n'); | 170 | trace_seq_putc(s, '\n'); |
| 179 | return TRACE_TYPE_HANDLED; | 171 | goto out; |
| 180 | } | 172 | } |
| 181 | 173 | ||
| 182 | if (entry->exit_event->event.type != ent->type) { | 174 | if (entry->exit_event->event.type != ent->type) { |
| @@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags, | |||
| 184 | return TRACE_TYPE_UNHANDLED; | 176 | return TRACE_TYPE_UNHANDLED; |
| 185 | } | 177 | } |
| 186 | 178 | ||
| 187 | ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, | 179 | trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, |
| 188 | trace->ret); | 180 | trace->ret); |
| 189 | if (!ret) | ||
| 190 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 191 | 181 | ||
| 192 | return TRACE_TYPE_HANDLED; | 182 | out: |
| 183 | return trace_handle_return(s); | ||
| 193 | } | 184 | } |
| 194 | 185 | ||
| 195 | extern char *__bad_type_size(void); | 186 | extern char *__bad_type_size(void); |
| @@ -313,7 +304,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
| 313 | int size; | 304 | int size; |
| 314 | 305 | ||
| 315 | syscall_nr = trace_get_syscall_nr(current, regs); | 306 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 316 | if (syscall_nr < 0) | 307 | if (syscall_nr < 0 || syscall_nr >= NR_syscalls) |
| 317 | return; | 308 | return; |
| 318 | 309 | ||
| 319 | /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ | 310 | /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ |
| @@ -360,7 +351,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | |||
| 360 | int syscall_nr; | 351 | int syscall_nr; |
| 361 | 352 | ||
| 362 | syscall_nr = trace_get_syscall_nr(current, regs); | 353 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 363 | if (syscall_nr < 0) | 354 | if (syscall_nr < 0 || syscall_nr >= NR_syscalls) |
| 364 | return; | 355 | return; |
| 365 | 356 | ||
| 366 | /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ | 357 | /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ |
| @@ -425,7 +416,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, | |||
| 425 | return; | 416 | return; |
| 426 | mutex_lock(&syscall_trace_lock); | 417 | mutex_lock(&syscall_trace_lock); |
| 427 | tr->sys_refcount_enter--; | 418 | tr->sys_refcount_enter--; |
| 428 | rcu_assign_pointer(tr->enter_syscall_files[num], NULL); | 419 | RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL); |
| 429 | if (!tr->sys_refcount_enter) | 420 | if (!tr->sys_refcount_enter) |
| 430 | unregister_trace_sys_enter(ftrace_syscall_enter, tr); | 421 | unregister_trace_sys_enter(ftrace_syscall_enter, tr); |
| 431 | mutex_unlock(&syscall_trace_lock); | 422 | mutex_unlock(&syscall_trace_lock); |
| @@ -463,7 +454,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, | |||
| 463 | return; | 454 | return; |
| 464 | mutex_lock(&syscall_trace_lock); | 455 | mutex_lock(&syscall_trace_lock); |
| 465 | tr->sys_refcount_exit--; | 456 | tr->sys_refcount_exit--; |
| 466 | rcu_assign_pointer(tr->exit_syscall_files[num], NULL); | 457 | RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL); |
| 467 | if (!tr->sys_refcount_exit) | 458 | if (!tr->sys_refcount_exit) |
| 468 | unregister_trace_sys_exit(ftrace_syscall_exit, tr); | 459 | unregister_trace_sys_exit(ftrace_syscall_exit, tr); |
| 469 | mutex_unlock(&syscall_trace_lock); | 460 | mutex_unlock(&syscall_trace_lock); |
| @@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) | |||
| 523 | return (unsigned long)sys_call_table[nr]; | 514 | return (unsigned long)sys_call_table[nr]; |
| 524 | } | 515 | } |
| 525 | 516 | ||
| 526 | static int __init init_ftrace_syscalls(void) | 517 | void __init init_ftrace_syscalls(void) |
| 527 | { | 518 | { |
| 528 | struct syscall_metadata *meta; | 519 | struct syscall_metadata *meta; |
| 529 | unsigned long addr; | 520 | unsigned long addr; |
| @@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void) | |||
| 533 | GFP_KERNEL); | 524 | GFP_KERNEL); |
| 534 | if (!syscalls_metadata) { | 525 | if (!syscalls_metadata) { |
| 535 | WARN_ON(1); | 526 | WARN_ON(1); |
| 536 | return -ENOMEM; | 527 | return; |
| 537 | } | 528 | } |
| 538 | 529 | ||
| 539 | for (i = 0; i < NR_syscalls; i++) { | 530 | for (i = 0; i < NR_syscalls; i++) { |
| @@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void) | |||
| 545 | meta->syscall_nr = i; | 536 | meta->syscall_nr = i; |
| 546 | syscalls_metadata[i] = meta; | 537 | syscalls_metadata[i] = meta; |
| 547 | } | 538 | } |
| 548 | |||
| 549 | return 0; | ||
| 550 | } | 539 | } |
| 551 | early_initcall(init_ftrace_syscalls); | ||
| 552 | 540 | ||
| 553 | #ifdef CONFIG_PERF_EVENTS | 541 | #ifdef CONFIG_PERF_EVENTS |
| 554 | 542 | ||
| @@ -567,7 +555,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
| 567 | int size; | 555 | int size; |
| 568 | 556 | ||
| 569 | syscall_nr = trace_get_syscall_nr(current, regs); | 557 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 570 | if (syscall_nr < 0) | 558 | if (syscall_nr < 0 || syscall_nr >= NR_syscalls) |
| 571 | return; | 559 | return; |
| 572 | if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) | 560 | if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) |
| 573 | return; | 561 | return; |
| @@ -641,7 +629,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
| 641 | int size; | 629 | int size; |
| 642 | 630 | ||
| 643 | syscall_nr = trace_get_syscall_nr(current, regs); | 631 | syscall_nr = trace_get_syscall_nr(current, regs); |
| 644 | if (syscall_nr < 0) | 632 | if (syscall_nr < 0 || syscall_nr >= NR_syscalls) |
| 645 | return; | 633 | return; |
| 646 | if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) | 634 | if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) |
| 647 | return; | 635 | return; |
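Two themes run through the trace_syscalls.c hunks: every probe now rejects syscall_nr values at or above NR_syscalls before using them (the number comes from the tracepoint and ends up as an array index, so the upper bound matters as much as the existing < 0 check), and the teardown paths store NULL with RCU_INIT_POINTER() instead of rcu_assign_pointer(), since publishing NULL needs no memory barrier. A small sketch of the guarded lookup follows, using an illustrative helper name that is not part of this patch.

```c
/*
 * Illustrative helper, not from this patch: the bounds check mirrors
 * the ones added to ftrace_syscall_enter()/exit() and the perf probes
 * above, and guards the syscalls_metadata[] index.
 */
static struct syscall_metadata *syscall_nr_to_meta_checked(int syscall_nr)
{
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return NULL;

	return syscalls_metadata ? syscalls_metadata[syscall_nr] : NULL;
}
```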
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 33ff6a24b802..8520acc34b18 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -552,8 +552,7 @@ error: | |||
| 552 | return ret; | 552 | return ret; |
| 553 | 553 | ||
| 554 | fail_address_parse: | 554 | fail_address_parse: |
| 555 | if (inode) | 555 | iput(inode); |
| 556 | iput(inode); | ||
| 557 | 556 | ||
| 558 | pr_info("Failed to parse address or file.\n"); | 557 | pr_info("Failed to parse address or file.\n"); |
| 559 | 558 | ||
| @@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v) | |||
| 606 | for (i = 0; i < tu->tp.nr_args; i++) | 605 | for (i = 0; i < tu->tp.nr_args; i++) |
| 607 | seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); | 606 | seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); |
| 608 | 607 | ||
| 609 | seq_printf(m, "\n"); | 608 | seq_putc(m, '\n'); |
| 610 | return 0; | 609 | return 0; |
| 611 | } | 610 | } |
| 612 | 611 | ||
| @@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e | |||
| 852 | tu = container_of(event, struct trace_uprobe, tp.call.event); | 851 | tu = container_of(event, struct trace_uprobe, tp.call.event); |
| 853 | 852 | ||
| 854 | if (is_ret_probe(tu)) { | 853 | if (is_ret_probe(tu)) { |
| 855 | if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", | 854 | trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", |
| 856 | ftrace_event_name(&tu->tp.call), | 855 | ftrace_event_name(&tu->tp.call), |
| 857 | entry->vaddr[1], entry->vaddr[0])) | 856 | entry->vaddr[1], entry->vaddr[0]); |
| 858 | goto partial; | ||
| 859 | data = DATAOF_TRACE_ENTRY(entry, true); | 857 | data = DATAOF_TRACE_ENTRY(entry, true); |
| 860 | } else { | 858 | } else { |
| 861 | if (!trace_seq_printf(s, "%s: (0x%lx)", | 859 | trace_seq_printf(s, "%s: (0x%lx)", |
| 862 | ftrace_event_name(&tu->tp.call), | 860 | ftrace_event_name(&tu->tp.call), |
| 863 | entry->vaddr[0])) | 861 | entry->vaddr[0]); |
| 864 | goto partial; | ||
| 865 | data = DATAOF_TRACE_ENTRY(entry, false); | 862 | data = DATAOF_TRACE_ENTRY(entry, false); |
| 866 | } | 863 | } |
| 867 | 864 | ||
| @@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e | |||
| 869 | struct probe_arg *parg = &tu->tp.args[i]; | 866 | struct probe_arg *parg = &tu->tp.args[i]; |
| 870 | 867 | ||
| 871 | if (!parg->type->print(s, parg->name, data + parg->offset, entry)) | 868 | if (!parg->type->print(s, parg->name, data + parg->offset, entry)) |
| 872 | goto partial; | 869 | goto out; |
| 873 | } | 870 | } |
| 874 | 871 | ||
| 875 | if (trace_seq_puts(s, "\n")) | 872 | trace_seq_putc(s, '\n'); |
| 876 | return TRACE_TYPE_HANDLED; | ||
| 877 | 873 | ||
| 878 | partial: | 874 | out: |
| 879 | return TRACE_TYPE_PARTIAL_LINE; | 875 | return trace_handle_return(s); |
| 880 | } | 876 | } |
| 881 | 877 | ||
| 882 | typedef bool (*filter_func_t)(struct uprobe_consumer *self, | 878 | typedef bool (*filter_func_t)(struct uprobe_consumer *self, |
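The print-callback rework in trace_uprobe.c above tracks this cycle's trace_seq API change: `trace_seq_printf()` and friends no longer return a value to check, and a print handler finishes with `trace_handle_return()`, which yields TRACE_TYPE_PARTIAL_LINE only if the seq buffer overflowed. A hedged sketch of the resulting callback shape, using a made-up event type rather than the uprobe structures:

```c
#include <linux/ftrace_event.h>
#include <linux/trace_seq.h>

/* Illustrative payload; a real event defines this through its class. */
struct demo_trace_entry {
	struct trace_entry	ent;
	unsigned long		addr;
};

static enum print_line_t
demo_event_print(struct trace_iterator *iter, int flags,
		 struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct demo_trace_entry *entry = (void *)iter->ent;

	/* No per-call return checks; overflow state latches in *s. */
	trace_seq_printf(s, "demo: (0x%lx)", entry->addr);
	trace_seq_putc(s, '\n');

	/* TRACE_TYPE_HANDLED unless the buffer filled up. */
	return trace_handle_return(s);
}
```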
diff --git a/kernel/uid16.c b/kernel/uid16.c index 602e5bbbceff..d58cc4d8f0d1 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
| @@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) | |||
| 176 | struct group_info *group_info; | 176 | struct group_info *group_info; |
| 177 | int retval; | 177 | int retval; |
| 178 | 178 | ||
| 179 | if (!ns_capable(current_user_ns(), CAP_SETGID)) | 179 | if (!may_setgroups()) |
| 180 | return -EPERM; | 180 | return -EPERM; |
| 181 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 181 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
| 182 | return -EINVAL; | 182 | return -EINVAL; |
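The setgroups16() change above swaps a bare `ns_capable(current_user_ns(), CAP_SETGID)` test for `may_setgroups()`, which in this series also consults the new per-namespace setgroups policy (see `userns_may_setgroups()` in the user_namespace.c diff further down). Roughly, and as a sketch rather than the exact upstream helper body:

```c
#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/user_namespace.h>

/* Sketch of the intent behind may_setgroups(). */
static bool may_setgroups_sketch(void)
{
	struct user_namespace *user_ns = current_user_ns();

	/*
	 * CAP_SETGID in the caller's user namespace is no longer
	 * sufficient on its own; the namespace must also still allow
	 * setgroups() (gid_map written, setgroups not set to "deny").
	 */
	return ns_capable(user_ns, CAP_SETGID) &&
	       userns_may_setgroups(user_ns);
}
```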
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 394f70b17162..9586b670a5b2 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c | |||
| @@ -14,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); | |||
| 14 | void user_return_notifier_register(struct user_return_notifier *urn) | 14 | void user_return_notifier_register(struct user_return_notifier *urn) |
| 15 | { | 15 | { |
| 16 | set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); | 16 | set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); |
| 17 | hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); | 17 | hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list)); |
| 18 | } | 18 | } |
| 19 | EXPORT_SYMBOL_GPL(user_return_notifier_register); | 19 | EXPORT_SYMBOL_GPL(user_return_notifier_register); |
| 20 | 20 | ||
| @@ -25,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); | |||
| 25 | void user_return_notifier_unregister(struct user_return_notifier *urn) | 25 | void user_return_notifier_unregister(struct user_return_notifier *urn) |
| 26 | { | 26 | { |
| 27 | hlist_del(&urn->link); | 27 | hlist_del(&urn->link); |
| 28 | if (hlist_empty(&__get_cpu_var(return_notifier_list))) | 28 | if (hlist_empty(this_cpu_ptr(&return_notifier_list))) |
| 29 | clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); | 29 | clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); |
| 30 | } | 30 | } |
| 31 | EXPORT_SYMBOL_GPL(user_return_notifier_unregister); | 31 | EXPORT_SYMBOL_GPL(user_return_notifier_unregister); |
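The `__get_cpu_var()` accessor is being retired tree-wide in this cycle in favour of the this_cpu/raw_cpu operations; `this_cpu_ptr(&var)` returns the address of the current CPU's instance and is the drop-in replacement used above. A minimal sketch of the conversion with a hypothetical per-CPU list:

```c
#include <linux/list.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct hlist_head, pending_items);

static void add_pending(struct hlist_node *n)
{
	/* Was: hlist_add_head(n, &__get_cpu_var(pending_items)); */
	hlist_add_head(n, this_cpu_ptr(&pending_items));
}

static bool pending_empty(void)
{
	return hlist_empty(this_cpu_ptr(&pending_items));
}
```

As with the old accessor, callers are expected to be pinned to a CPU (preemption disabled or per-CPU context) for the result to stay meaningful.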
diff --git a/kernel/user.c b/kernel/user.c index 4efa39350e44..b069ccbfb0b0 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -50,7 +50,11 @@ struct user_namespace init_user_ns = { | |||
| 50 | .count = ATOMIC_INIT(3), | 50 | .count = ATOMIC_INIT(3), |
| 51 | .owner = GLOBAL_ROOT_UID, | 51 | .owner = GLOBAL_ROOT_UID, |
| 52 | .group = GLOBAL_ROOT_GID, | 52 | .group = GLOBAL_ROOT_GID, |
| 53 | .proc_inum = PROC_USER_INIT_INO, | 53 | .ns.inum = PROC_USER_INIT_INO, |
| 54 | #ifdef CONFIG_USER_NS | ||
| 55 | .ns.ops = &userns_operations, | ||
| 56 | #endif | ||
| 57 | .flags = USERNS_INIT_FLAGS, | ||
| 54 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 58 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
| 55 | .persistent_keyring_register_sem = | 59 | .persistent_keyring_register_sem = |
| 56 | __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), | 60 | __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), |
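init_user_ns above now fills in the embedded `struct ns_common` (`.ns.inum`, and `.ns.ops` under CONFIG_USER_NS) plus the new `flags` word. The common part is what the generic proc/ns code passes around; each namespace type recovers itself with `container_of()`, as the `to_user_ns()` and `to_uts_ns()` helpers added later in this diff do. A sketch of that embedding with an illustrative namespace type:

```c
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/ns_common.h>

/* Illustrative namespace embedding the generic ns_common part. */
struct demo_namespace {
	atomic_t	count;
	unsigned long	flags;
	struct ns_common ns;		/* what the generic code sees */
};

static inline struct demo_namespace *to_demo_ns(struct ns_common *ns)
{
	/* Generic code hands back ns_common; recover the real type. */
	return container_of(ns, struct demo_namespace, ns);
}
```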
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index aa312b0dc3ec..4109f8320684 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | #include <linux/fs_struct.h> | 24 | #include <linux/fs_struct.h> |
| 25 | 25 | ||
| 26 | static struct kmem_cache *user_ns_cachep __read_mostly; | 26 | static struct kmem_cache *user_ns_cachep __read_mostly; |
| 27 | static DEFINE_MUTEX(userns_state_mutex); | ||
| 27 | 28 | ||
| 28 | static bool new_idmap_permitted(const struct file *file, | 29 | static bool new_idmap_permitted(const struct file *file, |
| 29 | struct user_namespace *ns, int cap_setid, | 30 | struct user_namespace *ns, int cap_setid, |
| @@ -86,11 +87,12 @@ int create_user_ns(struct cred *new) | |||
| 86 | if (!ns) | 87 | if (!ns) |
| 87 | return -ENOMEM; | 88 | return -ENOMEM; |
| 88 | 89 | ||
| 89 | ret = proc_alloc_inum(&ns->proc_inum); | 90 | ret = ns_alloc_inum(&ns->ns); |
| 90 | if (ret) { | 91 | if (ret) { |
| 91 | kmem_cache_free(user_ns_cachep, ns); | 92 | kmem_cache_free(user_ns_cachep, ns); |
| 92 | return ret; | 93 | return ret; |
| 93 | } | 94 | } |
| 95 | ns->ns.ops = &userns_operations; | ||
| 94 | 96 | ||
| 95 | atomic_set(&ns->count, 1); | 97 | atomic_set(&ns->count, 1); |
| 96 | /* Leave the new->user_ns reference with the new user namespace. */ | 98 | /* Leave the new->user_ns reference with the new user namespace. */ |
| @@ -99,6 +101,11 @@ int create_user_ns(struct cred *new) | |||
| 99 | ns->owner = owner; | 101 | ns->owner = owner; |
| 100 | ns->group = group; | 102 | ns->group = group; |
| 101 | 103 | ||
| 104 | /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ | ||
| 105 | mutex_lock(&userns_state_mutex); | ||
| 106 | ns->flags = parent_ns->flags; | ||
| 107 | mutex_unlock(&userns_state_mutex); | ||
| 108 | |||
| 102 | set_cred_user_ns(new, ns); | 109 | set_cred_user_ns(new, ns); |
| 103 | 110 | ||
| 104 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 111 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
| @@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns) | |||
| 136 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 143 | #ifdef CONFIG_PERSISTENT_KEYRINGS |
| 137 | key_put(ns->persistent_keyring_register); | 144 | key_put(ns->persistent_keyring_register); |
| 138 | #endif | 145 | #endif |
| 139 | proc_free_inum(ns->proc_inum); | 146 | ns_free_inum(&ns->ns); |
| 140 | kmem_cache_free(user_ns_cachep, ns); | 147 | kmem_cache_free(user_ns_cachep, ns); |
| 141 | ns = parent; | 148 | ns = parent; |
| 142 | } while (atomic_dec_and_test(&parent->count)); | 149 | } while (atomic_dec_and_test(&parent->count)); |
| @@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map, | |||
| 583 | return false; | 590 | return false; |
| 584 | } | 591 | } |
| 585 | 592 | ||
| 586 | |||
| 587 | static DEFINE_MUTEX(id_map_mutex); | ||
| 588 | |||
| 589 | static ssize_t map_write(struct file *file, const char __user *buf, | 593 | static ssize_t map_write(struct file *file, const char __user *buf, |
| 590 | size_t count, loff_t *ppos, | 594 | size_t count, loff_t *ppos, |
| 591 | int cap_setid, | 595 | int cap_setid, |
| @@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 602 | ssize_t ret = -EINVAL; | 606 | ssize_t ret = -EINVAL; |
| 603 | 607 | ||
| 604 | /* | 608 | /* |
| 605 | * The id_map_mutex serializes all writes to any given map. | 609 | * The userns_state_mutex serializes all writes to any given map. |
| 606 | * | 610 | * |
| 607 | * Any map is only ever written once. | 611 | * Any map is only ever written once. |
| 608 | * | 612 | * |
| @@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 620 | * order and smp_rmb() is guaranteed that we don't have crazy | 624 | * order and smp_rmb() is guaranteed that we don't have crazy |
| 621 | * architectures returning stale data. | 625 | * architectures returning stale data. |
| 622 | */ | 626 | */ |
| 623 | mutex_lock(&id_map_mutex); | 627 | mutex_lock(&userns_state_mutex); |
| 624 | 628 | ||
| 625 | ret = -EPERM; | 629 | ret = -EPERM; |
| 626 | /* Only allow one successful write to the map */ | 630 | /* Only allow one successful write to the map */ |
| @@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 640 | if (!page) | 644 | if (!page) |
| 641 | goto out; | 645 | goto out; |
| 642 | 646 | ||
| 643 | /* Only allow <= page size writes at the beginning of the file */ | 647 | /* Only allow < page size writes at the beginning of the file */ |
| 644 | ret = -EINVAL; | 648 | ret = -EINVAL; |
| 645 | if ((*ppos != 0) || (count >= PAGE_SIZE)) | 649 | if ((*ppos != 0) || (count >= PAGE_SIZE)) |
| 646 | goto out; | 650 | goto out; |
| @@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 750 | *ppos = count; | 754 | *ppos = count; |
| 751 | ret = count; | 755 | ret = count; |
| 752 | out: | 756 | out: |
| 753 | mutex_unlock(&id_map_mutex); | 757 | mutex_unlock(&userns_state_mutex); |
| 754 | if (page) | 758 | if (page) |
| 755 | free_page(page); | 759 | free_page(page); |
| 756 | return ret; | 760 | return ret; |
| @@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file, | |||
| 812 | struct user_namespace *ns, int cap_setid, | 816 | struct user_namespace *ns, int cap_setid, |
| 813 | struct uid_gid_map *new_map) | 817 | struct uid_gid_map *new_map) |
| 814 | { | 818 | { |
| 815 | /* Allow mapping to your own filesystem ids */ | 819 | const struct cred *cred = file->f_cred; |
| 816 | if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { | 820 | /* Don't allow mappings that would allow anything that wouldn't |
| 821 | * be allowed without the establishment of unprivileged mappings. | ||
| 822 | */ | ||
| 823 | if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && | ||
| 824 | uid_eq(ns->owner, cred->euid)) { | ||
| 817 | u32 id = new_map->extent[0].lower_first; | 825 | u32 id = new_map->extent[0].lower_first; |
| 818 | if (cap_setid == CAP_SETUID) { | 826 | if (cap_setid == CAP_SETUID) { |
| 819 | kuid_t uid = make_kuid(ns->parent, id); | 827 | kuid_t uid = make_kuid(ns->parent, id); |
| 820 | if (uid_eq(uid, file->f_cred->fsuid)) | 828 | if (uid_eq(uid, cred->euid)) |
| 821 | return true; | 829 | return true; |
| 822 | } else if (cap_setid == CAP_SETGID) { | 830 | } else if (cap_setid == CAP_SETGID) { |
| 823 | kgid_t gid = make_kgid(ns->parent, id); | 831 | kgid_t gid = make_kgid(ns->parent, id); |
| 824 | if (gid_eq(gid, file->f_cred->fsgid)) | 832 | if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && |
| 833 | gid_eq(gid, cred->egid)) | ||
| 825 | return true; | 834 | return true; |
| 826 | } | 835 | } |
| 827 | } | 836 | } |
| @@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file, | |||
| 841 | return false; | 850 | return false; |
| 842 | } | 851 | } |
| 843 | 852 | ||
| 844 | static void *userns_get(struct task_struct *task) | 853 | int proc_setgroups_show(struct seq_file *seq, void *v) |
| 854 | { | ||
| 855 | struct user_namespace *ns = seq->private; | ||
| 856 | unsigned long userns_flags = ACCESS_ONCE(ns->flags); | ||
| 857 | |||
| 858 | seq_printf(seq, "%s\n", | ||
| 859 | (userns_flags & USERNS_SETGROUPS_ALLOWED) ? | ||
| 860 | "allow" : "deny"); | ||
| 861 | return 0; | ||
| 862 | } | ||
| 863 | |||
| 864 | ssize_t proc_setgroups_write(struct file *file, const char __user *buf, | ||
| 865 | size_t count, loff_t *ppos) | ||
| 866 | { | ||
| 867 | struct seq_file *seq = file->private_data; | ||
| 868 | struct user_namespace *ns = seq->private; | ||
| 869 | char kbuf[8], *pos; | ||
| 870 | bool setgroups_allowed; | ||
| 871 | ssize_t ret; | ||
| 872 | |||
| 873 | /* Only allow a very narrow range of strings to be written */ | ||
| 874 | ret = -EINVAL; | ||
| 875 | if ((*ppos != 0) || (count >= sizeof(kbuf))) | ||
| 876 | goto out; | ||
| 877 | |||
| 878 | /* What was written? */ | ||
| 879 | ret = -EFAULT; | ||
| 880 | if (copy_from_user(kbuf, buf, count)) | ||
| 881 | goto out; | ||
| 882 | kbuf[count] = '\0'; | ||
| 883 | pos = kbuf; | ||
| 884 | |||
| 885 | /* What is being requested? */ | ||
| 886 | ret = -EINVAL; | ||
| 887 | if (strncmp(pos, "allow", 5) == 0) { | ||
| 888 | pos += 5; | ||
| 889 | setgroups_allowed = true; | ||
| 890 | } | ||
| 891 | else if (strncmp(pos, "deny", 4) == 0) { | ||
| 892 | pos += 4; | ||
| 893 | setgroups_allowed = false; | ||
| 894 | } | ||
| 895 | else | ||
| 896 | goto out; | ||
| 897 | |||
| 898 | /* Verify there is not trailing junk on the line */ | ||
| 899 | pos = skip_spaces(pos); | ||
| 900 | if (*pos != '\0') | ||
| 901 | goto out; | ||
| 902 | |||
| 903 | ret = -EPERM; | ||
| 904 | mutex_lock(&userns_state_mutex); | ||
| 905 | if (setgroups_allowed) { | ||
| 906 | /* Enabling setgroups after setgroups has been disabled | ||
| 907 | * is not allowed. | ||
| 908 | */ | ||
| 909 | if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) | ||
| 910 | goto out_unlock; | ||
| 911 | } else { | ||
| 912 | /* Permanently disabling setgroups after setgroups has | ||
| 913 | * been enabled by writing the gid_map is not allowed. | ||
| 914 | */ | ||
| 915 | if (ns->gid_map.nr_extents != 0) | ||
| 916 | goto out_unlock; | ||
| 917 | ns->flags &= ~USERNS_SETGROUPS_ALLOWED; | ||
| 918 | } | ||
| 919 | mutex_unlock(&userns_state_mutex); | ||
| 920 | |||
| 921 | /* Report a successful write */ | ||
| 922 | *ppos = count; | ||
| 923 | ret = count; | ||
| 924 | out: | ||
| 925 | return ret; | ||
| 926 | out_unlock: | ||
| 927 | mutex_unlock(&userns_state_mutex); | ||
| 928 | goto out; | ||
| 929 | } | ||
| 930 | |||
| 931 | bool userns_may_setgroups(const struct user_namespace *ns) | ||
| 932 | { | ||
| 933 | bool allowed; | ||
| 934 | |||
| 935 | mutex_lock(&userns_state_mutex); | ||
| 936 | /* It is not safe to use setgroups until a gid mapping in | ||
| 937 | * the user namespace has been established. | ||
| 938 | */ | ||
| 939 | allowed = ns->gid_map.nr_extents != 0; | ||
| 940 | /* Is setgroups allowed? */ | ||
| 941 | allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); | ||
| 942 | mutex_unlock(&userns_state_mutex); | ||
| 943 | |||
| 944 | return allowed; | ||
| 945 | } | ||
| 946 | |||
| 947 | static inline struct user_namespace *to_user_ns(struct ns_common *ns) | ||
| 948 | { | ||
| 949 | return container_of(ns, struct user_namespace, ns); | ||
| 950 | } | ||
| 951 | |||
| 952 | static struct ns_common *userns_get(struct task_struct *task) | ||
| 845 | { | 953 | { |
| 846 | struct user_namespace *user_ns; | 954 | struct user_namespace *user_ns; |
| 847 | 955 | ||
| @@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task) | |||
| 849 | user_ns = get_user_ns(__task_cred(task)->user_ns); | 957 | user_ns = get_user_ns(__task_cred(task)->user_ns); |
| 850 | rcu_read_unlock(); | 958 | rcu_read_unlock(); |
| 851 | 959 | ||
| 852 | return user_ns; | 960 | return user_ns ? &user_ns->ns : NULL; |
| 853 | } | 961 | } |
| 854 | 962 | ||
| 855 | static void userns_put(void *ns) | 963 | static void userns_put(struct ns_common *ns) |
| 856 | { | 964 | { |
| 857 | put_user_ns(ns); | 965 | put_user_ns(to_user_ns(ns)); |
| 858 | } | 966 | } |
| 859 | 967 | ||
| 860 | static int userns_install(struct nsproxy *nsproxy, void *ns) | 968 | static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) |
| 861 | { | 969 | { |
| 862 | struct user_namespace *user_ns = ns; | 970 | struct user_namespace *user_ns = to_user_ns(ns); |
| 863 | struct cred *cred; | 971 | struct cred *cred; |
| 864 | 972 | ||
| 865 | /* Don't allow gaining capabilities by reentering | 973 | /* Don't allow gaining capabilities by reentering |
| @@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) | |||
| 888 | return commit_creds(cred); | 996 | return commit_creds(cred); |
| 889 | } | 997 | } |
| 890 | 998 | ||
| 891 | static unsigned int userns_inum(void *ns) | ||
| 892 | { | ||
| 893 | struct user_namespace *user_ns = ns; | ||
| 894 | return user_ns->proc_inum; | ||
| 895 | } | ||
| 896 | |||
| 897 | const struct proc_ns_operations userns_operations = { | 999 | const struct proc_ns_operations userns_operations = { |
| 898 | .name = "user", | 1000 | .name = "user", |
| 899 | .type = CLONE_NEWUSER, | 1001 | .type = CLONE_NEWUSER, |
| 900 | .get = userns_get, | 1002 | .get = userns_get, |
| 901 | .put = userns_put, | 1003 | .put = userns_put, |
| 902 | .install = userns_install, | 1004 | .install = userns_install, |
| 903 | .inum = userns_inum, | ||
| 904 | }; | 1005 | }; |
| 905 | 1006 | ||
| 906 | static __init int user_namespaces_init(void) | 1007 | static __init int user_namespaces_init(void) |
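The new `/proc/<pid>/setgroups` handlers and the tightened `new_idmap_permitted()` above are the kernel side of this cycle's unprivileged user-namespace hardening: without CAP_SETGID, a gid mapping is only accepted after setgroups has been disabled for the namespace. A userspace sketch of the resulting sequence (error handling trimmed; this program is illustrative and not part of the patch):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	char map[64];
	uid_t uid = getuid();
	gid_t gid = getgid();

	/* Create a new user namespace as an unprivileged process. */
	if (unshare(CLONE_NEWUSER) < 0) {
		perror("unshare");
		return 1;
	}

	/* Must come first, or the unprivileged gid_map write is refused. */
	write_file("/proc/self/setgroups", "deny");

	snprintf(map, sizeof(map), "0 %u 1", uid);
	write_file("/proc/self/uid_map", map);

	snprintf(map, sizeof(map), "0 %u 1", gid);
	write_file("/proc/self/gid_map", map);

	execlp("id", "id", (char *)NULL);
	return 1;
}
```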
diff --git a/kernel/utsname.c b/kernel/utsname.c index 883aaaa7de8a..831ea7108232 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
| @@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, | |||
| 42 | if (!ns) | 42 | if (!ns) |
| 43 | return ERR_PTR(-ENOMEM); | 43 | return ERR_PTR(-ENOMEM); |
| 44 | 44 | ||
| 45 | err = proc_alloc_inum(&ns->proc_inum); | 45 | err = ns_alloc_inum(&ns->ns); |
| 46 | if (err) { | 46 | if (err) { |
| 47 | kfree(ns); | 47 | kfree(ns); |
| 48 | return ERR_PTR(err); | 48 | return ERR_PTR(err); |
| 49 | } | 49 | } |
| 50 | 50 | ||
| 51 | ns->ns.ops = &utsns_operations; | ||
| 52 | |||
| 51 | down_read(&uts_sem); | 53 | down_read(&uts_sem); |
| 52 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 54 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
| 53 | ns->user_ns = get_user_ns(user_ns); | 55 | ns->user_ns = get_user_ns(user_ns); |
| @@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref) | |||
| 84 | 86 | ||
| 85 | ns = container_of(kref, struct uts_namespace, kref); | 87 | ns = container_of(kref, struct uts_namespace, kref); |
| 86 | put_user_ns(ns->user_ns); | 88 | put_user_ns(ns->user_ns); |
| 87 | proc_free_inum(ns->proc_inum); | 89 | ns_free_inum(&ns->ns); |
| 88 | kfree(ns); | 90 | kfree(ns); |
| 89 | } | 91 | } |
| 90 | 92 | ||
| 91 | static void *utsns_get(struct task_struct *task) | 93 | static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) |
| 94 | { | ||
| 95 | return container_of(ns, struct uts_namespace, ns); | ||
| 96 | } | ||
| 97 | |||
| 98 | static struct ns_common *utsns_get(struct task_struct *task) | ||
| 92 | { | 99 | { |
| 93 | struct uts_namespace *ns = NULL; | 100 | struct uts_namespace *ns = NULL; |
| 94 | struct nsproxy *nsproxy; | 101 | struct nsproxy *nsproxy; |
| @@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task) | |||
| 101 | } | 108 | } |
| 102 | task_unlock(task); | 109 | task_unlock(task); |
| 103 | 110 | ||
| 104 | return ns; | 111 | return ns ? &ns->ns : NULL; |
| 105 | } | 112 | } |
| 106 | 113 | ||
| 107 | static void utsns_put(void *ns) | 114 | static void utsns_put(struct ns_common *ns) |
| 108 | { | 115 | { |
| 109 | put_uts_ns(ns); | 116 | put_uts_ns(to_uts_ns(ns)); |
| 110 | } | 117 | } |
| 111 | 118 | ||
| 112 | static int utsns_install(struct nsproxy *nsproxy, void *new) | 119 | static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) |
| 113 | { | 120 | { |
| 114 | struct uts_namespace *ns = new; | 121 | struct uts_namespace *ns = to_uts_ns(new); |
| 115 | 122 | ||
| 116 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || | 123 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || |
| 117 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) | 124 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
| @@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) | |||
| 123 | return 0; | 130 | return 0; |
| 124 | } | 131 | } |
| 125 | 132 | ||
| 126 | static unsigned int utsns_inum(void *vp) | ||
| 127 | { | ||
| 128 | struct uts_namespace *ns = vp; | ||
| 129 | |||
| 130 | return ns->proc_inum; | ||
| 131 | } | ||
| 132 | |||
| 133 | const struct proc_ns_operations utsns_operations = { | 133 | const struct proc_ns_operations utsns_operations = { |
| 134 | .name = "uts", | 134 | .name = "uts", |
| 135 | .type = CLONE_NEWUTS, | 135 | .type = CLONE_NEWUTS, |
| 136 | .get = utsns_get, | 136 | .get = utsns_get, |
| 137 | .put = utsns_put, | 137 | .put = utsns_put, |
| 138 | .install = utsns_install, | 138 | .install = utsns_install, |
| 139 | .inum = utsns_inum, | ||
| 140 | }; | 139 | }; |
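Both converted namespaces now share the same inode-number lifecycle: `ns_alloc_inum(&ns->ns)` at creation, paired with setting `ns->ns.ops`, and `ns_free_inum(&ns->ns)` at teardown, replacing the proc_-prefixed helpers that took a bare integer. A condensed sketch of that pairing with a hypothetical namespace type and ops table:

```c
#include <linux/err.h>
#include <linux/ns_common.h>
#include <linux/proc_ns.h>
#include <linux/slab.h>

struct demo_ns {
	struct ns_common ns;
	/* payload would follow here */
};

extern const struct proc_ns_operations demo_ns_operations;	/* illustrative */

static struct demo_ns *demo_ns_create(void)
{
	struct demo_ns *d = kzalloc(sizeof(*d), GFP_KERNEL);
	int err;

	if (!d)
		return ERR_PTR(-ENOMEM);

	err = ns_alloc_inum(&d->ns);	/* assigns d->ns.inum */
	if (err) {
		kfree(d);
		return ERR_PTR(err);
	}
	d->ns.ops = &demo_ns_operations;
	return d;
}

static void demo_ns_destroy(struct demo_ns *d)
{
	ns_free_inum(&d->ns);		/* release the inode number */
	kfree(d);
}
```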
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a8d6914030fe..70bf11815f84 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -15,11 +15,6 @@ | |||
| 15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
| 16 | #include <linux/nmi.h> | 16 | #include <linux/nmi.h> |
| 17 | #include <linux/init.h> | 17 | #include <linux/init.h> |
| 18 | #include <linux/delay.h> | ||
| 19 | #include <linux/freezer.h> | ||
| 20 | #include <linux/kthread.h> | ||
| 21 | #include <linux/lockdep.h> | ||
| 22 | #include <linux/notifier.h> | ||
| 23 | #include <linux/module.h> | 18 | #include <linux/module.h> |
| 24 | #include <linux/sysctl.h> | 19 | #include <linux/sysctl.h> |
| 25 | #include <linux/smpboot.h> | 20 | #include <linux/smpboot.h> |
| @@ -47,6 +42,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync); | |||
| 47 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); | 42 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); |
| 48 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | 43 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); |
| 49 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); | 44 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); |
| 45 | static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); | ||
| 50 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 46 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 51 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | 47 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); |
| 52 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | 48 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); |
| @@ -63,6 +59,25 @@ static unsigned long soft_lockup_nmi_warn; | |||
| 63 | static int hardlockup_panic = | 59 | static int hardlockup_panic = |
| 64 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | 60 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; |
| 65 | 61 | ||
| 62 | static bool hardlockup_detector_enabled = true; | ||
| 63 | /* | ||
| 64 | * We may not want to enable hard lockup detection by default in all cases, | ||
| 65 | * for example when running the kernel as a guest on a hypervisor. In these | ||
| 66 | * cases this function can be called to disable hard lockup detection. This | ||
| 67 | * function should only be executed once by the boot processor before the | ||
| 68 | * kernel command line parameters are parsed, because otherwise it is not | ||
| 69 | * possible to override this in hardlockup_panic_setup(). | ||
| 70 | */ | ||
| 71 | void watchdog_enable_hardlockup_detector(bool val) | ||
| 72 | { | ||
| 73 | hardlockup_detector_enabled = val; | ||
| 74 | } | ||
| 75 | |||
| 76 | bool watchdog_hardlockup_detector_is_enabled(void) | ||
| 77 | { | ||
| 78 | return hardlockup_detector_enabled; | ||
| 79 | } | ||
| 80 | |||
| 66 | static int __init hardlockup_panic_setup(char *str) | 81 | static int __init hardlockup_panic_setup(char *str) |
| 67 | { | 82 | { |
| 68 | if (!strncmp(str, "panic", 5)) | 83 | if (!strncmp(str, "panic", 5)) |
| @@ -71,6 +86,14 @@ static int __init hardlockup_panic_setup(char *str) | |||
| 71 | hardlockup_panic = 0; | 86 | hardlockup_panic = 0; |
| 72 | else if (!strncmp(str, "0", 1)) | 87 | else if (!strncmp(str, "0", 1)) |
| 73 | watchdog_user_enabled = 0; | 88 | watchdog_user_enabled = 0; |
| 89 | else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { | ||
| 90 | /* | ||
| 91 | * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option) | ||
| 92 | * has the same effect. | ||
| 93 | */ | ||
| 94 | watchdog_user_enabled = 1; | ||
| 95 | watchdog_enable_hardlockup_detector(true); | ||
| 96 | } | ||
| 74 | return 1; | 97 | return 1; |
| 75 | } | 98 | } |
| 76 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 99 | __setup("nmi_watchdog=", hardlockup_panic_setup); |
| @@ -185,7 +208,7 @@ void touch_nmi_watchdog(void) | |||
| 185 | * case we shouldn't have to worry about the watchdog | 208 | * case we shouldn't have to worry about the watchdog |
| 186 | * going off. | 209 | * going off. |
| 187 | */ | 210 | */ |
| 188 | __raw_get_cpu_var(watchdog_nmi_touch) = true; | 211 | raw_cpu_write(watchdog_nmi_touch, true); |
| 189 | touch_softlockup_watchdog(); | 212 | touch_softlockup_watchdog(); |
| 190 | } | 213 | } |
| 191 | EXPORT_SYMBOL(touch_nmi_watchdog); | 214 | EXPORT_SYMBOL(touch_nmi_watchdog); |
| @@ -194,8 +217,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog); | |||
| 194 | 217 | ||
| 195 | void touch_softlockup_watchdog_sync(void) | 218 | void touch_softlockup_watchdog_sync(void) |
| 196 | { | 219 | { |
| 197 | __raw_get_cpu_var(softlockup_touch_sync) = true; | 220 | __this_cpu_write(softlockup_touch_sync, true); |
| 198 | __raw_get_cpu_var(watchdog_touch_ts) = 0; | 221 | __this_cpu_write(watchdog_touch_ts, 0); |
| 199 | } | 222 | } |
| 200 | 223 | ||
| 201 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 224 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| @@ -333,8 +356,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
| 333 | return HRTIMER_RESTART; | 356 | return HRTIMER_RESTART; |
| 334 | 357 | ||
| 335 | /* only warn once */ | 358 | /* only warn once */ |
| 336 | if (__this_cpu_read(soft_watchdog_warn) == true) | 359 | if (__this_cpu_read(soft_watchdog_warn) == true) { |
| 360 | /* | ||
| 361 | * When multiple processes are causing softlockups the | ||
| 362 | * softlockup detector only warns on the first one | ||
| 363 | * because the code relies on a full quiet cycle to | ||
| 364 | * re-arm. The second process prevents the quiet cycle | ||
| 365 | * and never gets reported. Use task pointers to detect | ||
| 366 | * this. | ||
| 367 | */ | ||
| 368 | if (__this_cpu_read(softlockup_task_ptr_saved) != | ||
| 369 | current) { | ||
| 370 | __this_cpu_write(soft_watchdog_warn, false); | ||
| 371 | __touch_watchdog(); | ||
| 372 | } | ||
| 337 | return HRTIMER_RESTART; | 373 | return HRTIMER_RESTART; |
| 374 | } | ||
| 338 | 375 | ||
| 339 | if (softlockup_all_cpu_backtrace) { | 376 | if (softlockup_all_cpu_backtrace) { |
| 340 | /* Prevent multiple soft-lockup reports if one cpu is already | 377 | /* Prevent multiple soft-lockup reports if one cpu is already |
| @@ -350,6 +387,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
| 350 | pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | 387 | pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", |
| 351 | smp_processor_id(), duration, | 388 | smp_processor_id(), duration, |
| 352 | current->comm, task_pid_nr(current)); | 389 | current->comm, task_pid_nr(current)); |
| 390 | __this_cpu_write(softlockup_task_ptr_saved, current); | ||
| 353 | print_modules(); | 391 | print_modules(); |
| 354 | print_irqtrace_events(current); | 392 | print_irqtrace_events(current); |
| 355 | if (regs) | 393 | if (regs) |
| @@ -387,7 +425,7 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) | |||
| 387 | 425 | ||
| 388 | static void watchdog_enable(unsigned int cpu) | 426 | static void watchdog_enable(unsigned int cpu) |
| 389 | { | 427 | { |
| 390 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 428 | struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); |
| 391 | 429 | ||
| 392 | /* kick off the timer for the hardlockup detector */ | 430 | /* kick off the timer for the hardlockup detector */ |
| 393 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 431 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| @@ -407,7 +445,7 @@ static void watchdog_enable(unsigned int cpu) | |||
| 407 | 445 | ||
| 408 | static void watchdog_disable(unsigned int cpu) | 446 | static void watchdog_disable(unsigned int cpu) |
| 409 | { | 447 | { |
| 410 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 448 | struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); |
| 411 | 449 | ||
| 412 | watchdog_set_prio(SCHED_NORMAL, 0); | 450 | watchdog_set_prio(SCHED_NORMAL, 0); |
| 413 | hrtimer_cancel(hrtimer); | 451 | hrtimer_cancel(hrtimer); |
| @@ -454,6 +492,15 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
| 454 | struct perf_event_attr *wd_attr; | 492 | struct perf_event_attr *wd_attr; |
| 455 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 493 | struct perf_event *event = per_cpu(watchdog_ev, cpu); |
| 456 | 494 | ||
| 495 | /* | ||
| 496 | * Some kernels need to default hard lockup detection to | ||
| 497 | * 'disabled', for example a guest on a hypervisor. | ||
| 498 | */ | ||
| 499 | if (!watchdog_hardlockup_detector_is_enabled()) { | ||
| 500 | event = ERR_PTR(-ENOENT); | ||
| 501 | goto handle_err; | ||
| 502 | } | ||
| 503 | |||
| 457 | /* is it already setup and enabled? */ | 504 | /* is it already setup and enabled? */ |
| 458 | if (event && event->state > PERF_EVENT_STATE_OFF) | 505 | if (event && event->state > PERF_EVENT_STATE_OFF) |
| 459 | goto out; | 506 | goto out; |
| @@ -468,6 +515,7 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
| 468 | /* Try to register using hardware perf events */ | 515 | /* Try to register using hardware perf events */ |
| 469 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 516 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); |
| 470 | 517 | ||
| 518 | handle_err: | ||
| 471 | /* save cpu0 error for future comparision */ | 519 | /* save cpu0 error for future comparision */ |
| 472 | if (cpu == 0 && IS_ERR(event)) | 520 | if (cpu == 0 && IS_ERR(event)) |
| 473 | cpu0_err = PTR_ERR(event); | 521 | cpu0_err = PTR_ERR(event); |
| @@ -514,7 +562,10 @@ static void watchdog_nmi_disable(unsigned int cpu) | |||
| 514 | /* should be in cleanup, but blocks oprofile */ | 562 | /* should be in cleanup, but blocks oprofile */ |
| 515 | perf_event_release_kernel(event); | 563 | perf_event_release_kernel(event); |
| 516 | } | 564 | } |
| 517 | return; | 565 | if (cpu == 0) { |
| 566 | /* watchdog_nmi_enable() expects this to be zero initially. */ | ||
| 567 | cpu0_err = 0; | ||
| 568 | } | ||
| 518 | } | 569 | } |
| 519 | #else | 570 | #else |
| 520 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } | 571 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } |
| @@ -534,7 +585,7 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
| 534 | 585 | ||
| 535 | static void restart_watchdog_hrtimer(void *info) | 586 | static void restart_watchdog_hrtimer(void *info) |
| 536 | { | 587 | { |
| 537 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 588 | struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); |
| 538 | int ret; | 589 | int ret; |
| 539 | 590 | ||
| 540 | /* | 591 | /* |
| @@ -610,11 +661,13 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
| 610 | void __user *buffer, size_t *lenp, loff_t *ppos) | 661 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 611 | { | 662 | { |
| 612 | int err, old_thresh, old_enabled; | 663 | int err, old_thresh, old_enabled; |
| 664 | bool old_hardlockup; | ||
| 613 | static DEFINE_MUTEX(watchdog_proc_mutex); | 665 | static DEFINE_MUTEX(watchdog_proc_mutex); |
| 614 | 666 | ||
| 615 | mutex_lock(&watchdog_proc_mutex); | 667 | mutex_lock(&watchdog_proc_mutex); |
| 616 | old_thresh = ACCESS_ONCE(watchdog_thresh); | 668 | old_thresh = ACCESS_ONCE(watchdog_thresh); |
| 617 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); | 669 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); |
| 670 | old_hardlockup = watchdog_hardlockup_detector_is_enabled(); | ||
| 618 | 671 | ||
| 619 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 672 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 620 | if (err || !write) | 673 | if (err || !write) |
| @@ -626,15 +679,22 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
| 626 | * disabled. The 'watchdog_running' variable check in | 679 | * disabled. The 'watchdog_running' variable check in |
| 627 | * watchdog_*_all_cpus() function takes care of this. | 680 | * watchdog_*_all_cpus() function takes care of this. |
| 628 | */ | 681 | */ |
| 629 | if (watchdog_user_enabled && watchdog_thresh) | 682 | if (watchdog_user_enabled && watchdog_thresh) { |
| 683 | /* | ||
| 684 | * Prevent a change in watchdog_thresh accidentally overriding | ||
| 685 | * the enablement of the hardlockup detector. | ||
| 686 | */ | ||
| 687 | if (watchdog_user_enabled != old_enabled) | ||
| 688 | watchdog_enable_hardlockup_detector(true); | ||
| 630 | err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); | 689 | err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); |
| 631 | else | 690 | } else |
| 632 | watchdog_disable_all_cpus(); | 691 | watchdog_disable_all_cpus(); |
| 633 | 692 | ||
| 634 | /* Restore old values on failure */ | 693 | /* Restore old values on failure */ |
| 635 | if (err) { | 694 | if (err) { |
| 636 | watchdog_thresh = old_thresh; | 695 | watchdog_thresh = old_thresh; |
| 637 | watchdog_user_enabled = old_enabled; | 696 | watchdog_user_enabled = old_enabled; |
| 697 | watchdog_enable_hardlockup_detector(old_hardlockup); | ||
| 638 | } | 698 | } |
| 639 | out: | 699 | out: |
| 640 | mutex_unlock(&watchdog_proc_mutex); | 700 | mutex_unlock(&watchdog_proc_mutex); |
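Taken together, the watchdog.c changes let a platform default the perf-based hard lockup detector off (virtual PMUs in guests are a typical reason) while `nmi_watchdog=1` on the command line and the sysctl path can still re-enable it. A hedged sketch of how an early guest-detection hook might use the new toggle; the init function name is made up, and the declaration is assumed to live in <linux/nmi.h> next to the other watchdog hooks:

```c
#include <linux/init.h>
#include <linux/nmi.h>

/*
 * Sketch: a hypothetical guest-detection hook, run on the boot CPU
 * before the kernel command line is parsed, as watchdog.c requires.
 */
static void __init demo_guest_early_init(void)
{
	/* Emulated PMUs make the NMI watchdog unreliable; default off. */
	watchdog_enable_hardlockup_detector(false);
}
```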
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5dbe22aa3efd..6202b08f1933 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
| 1804 | struct worker_pool *pool = (void *)__pool; | 1804 | struct worker_pool *pool = (void *)__pool; |
| 1805 | struct work_struct *work; | 1805 | struct work_struct *work; |
| 1806 | 1806 | ||
| 1807 | spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ | 1807 | spin_lock_irq(&pool->lock); |
| 1808 | spin_lock(&pool->lock); | 1808 | spin_lock(&wq_mayday_lock); /* for wq->maydays */ |
| 1809 | 1809 | ||
| 1810 | if (need_to_create_worker(pool)) { | 1810 | if (need_to_create_worker(pool)) { |
| 1811 | /* | 1811 | /* |
| @@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
| 1818 | send_mayday(work); | 1818 | send_mayday(work); |
| 1819 | } | 1819 | } |
| 1820 | 1820 | ||
| 1821 | spin_unlock(&pool->lock); | 1821 | spin_unlock(&wq_mayday_lock); |
| 1822 | spin_unlock_irq(&wq_mayday_lock); | 1822 | spin_unlock_irq(&pool->lock); |
| 1823 | 1823 | ||
| 1824 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); | 1824 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); |
| 1825 | } | 1825 | } |
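The hunk above inverts the nesting in pool_mayday_timeout() so that `pool->lock` is the outer, IRQ-disabling lock and `wq_mayday_lock` nests inside it; the rescuer hunk further down depends on that order when it re-queues a pwq onto the mayday list while already holding `pool->lock`. The general shape of the pattern, with illustrative locks rather than the workqueue ones:

```c
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(outer_lock);	/* e.g. per-pool state */
static DEFINE_SPINLOCK(inner_lock);	/* e.g. a global list */

static void demo_nested_locking(void)
{
	/* Disable interrupts only at the outermost level ... */
	spin_lock_irq(&outer_lock);
	/* ... and nest the inner lock without touching IRQ state. */
	spin_lock(&inner_lock);

	/* critical section covered by both locks */

	spin_unlock(&inner_lock);
	spin_unlock_irq(&outer_lock);
}
```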
| @@ -2043,9 +2043,10 @@ __acquires(&pool->lock) | |||
| 2043 | * kernels, where a requeueing work item waiting for something to | 2043 | * kernels, where a requeueing work item waiting for something to |
| 2044 | * happen could deadlock with stop_machine as such work item could | 2044 | * happen could deadlock with stop_machine as such work item could |
| 2045 | * indefinitely requeue itself while all other CPUs are trapped in | 2045 | * indefinitely requeue itself while all other CPUs are trapped in |
| 2046 | * stop_machine. | 2046 | * stop_machine. At the same time, report a quiescent RCU state so |
| 2047 | * the same condition doesn't freeze RCU. | ||
| 2047 | */ | 2048 | */ |
| 2048 | cond_resched(); | 2049 | cond_resched_rcu_qs(); |
| 2049 | 2050 | ||
| 2050 | spin_lock_irq(&pool->lock); | 2051 | spin_lock_irq(&pool->lock); |
| 2051 | 2052 | ||
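In the hunk above, the worker loop's `cond_resched()` becomes `cond_resched_rcu_qs()` so that a CPU stuck processing an endless stream of work items still reports an RCU quiescent state instead of stalling grace periods. The same idiom in a generic long-running kernel loop (the loop itself is illustrative):

```c
#include <linux/rcupdate.h>
#include <linux/sched.h>

static void demo_long_running_loop(int iterations)
{
	int i;

	for (i = 0; i < iterations; i++) {
		/* ... one unit of work ... */

		/*
		 * Like cond_resched(), but additionally tells RCU this
		 * CPU passed through a quiescent state.
		 */
		cond_resched_rcu_qs();
	}
}
```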
| @@ -2247,12 +2248,30 @@ repeat: | |||
| 2247 | * Slurp in all works issued via this workqueue and | 2248 | * Slurp in all works issued via this workqueue and |
| 2248 | * process'em. | 2249 | * process'em. |
| 2249 | */ | 2250 | */ |
| 2250 | WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); | 2251 | WARN_ON_ONCE(!list_empty(scheduled)); |
| 2251 | list_for_each_entry_safe(work, n, &pool->worklist, entry) | 2252 | list_for_each_entry_safe(work, n, &pool->worklist, entry) |
| 2252 | if (get_work_pwq(work) == pwq) | 2253 | if (get_work_pwq(work) == pwq) |
| 2253 | move_linked_works(work, scheduled, &n); | 2254 | move_linked_works(work, scheduled, &n); |
| 2254 | 2255 | ||
| 2255 | process_scheduled_works(rescuer); | 2256 | if (!list_empty(scheduled)) { |
| 2257 | process_scheduled_works(rescuer); | ||
| 2258 | |||
| 2259 | /* | ||
| 2260 | * The above execution of rescued work items could | ||
| 2261 | * have created more to rescue through | ||
| 2262 | * pwq_activate_first_delayed() or chained | ||
| 2263 | * queueing. Let's put @pwq back on mayday list so | ||
| 2264 | * that such back-to-back work items, which may be | ||
| 2265 | * being used to relieve memory pressure, don't | ||
| 2266 | * incur MAYDAY_INTERVAL delay inbetween. | ||
| 2267 | */ | ||
| 2268 | if (need_to_create_worker(pool)) { | ||
| 2269 | spin_lock(&wq_mayday_lock); | ||
| 2270 | get_pwq(pwq); | ||
| 2271 | list_move_tail(&pwq->mayday_node, &wq->maydays); | ||
| 2272 | spin_unlock(&wq_mayday_lock); | ||
| 2273 | } | ||
| 2274 | } | ||
| 2256 | 2275 | ||
| 2257 | /* | 2276 | /* |
| 2258 | * Put the reference grabbed by send_mayday(). @pool won't | 2277 | * Put the reference grabbed by send_mayday(). @pool won't |
