Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 23
-rw-r--r--  kernel/audit_tree.c | 16
-rw-r--r--  kernel/auditfilter.c | 23
-rw-r--r--  kernel/auditsc.c | 56
-rw-r--r--  kernel/bpf/Makefile | 2
-rw-r--r--  kernel/bpf/arraymap.c | 156
-rw-r--r--  kernel/bpf/core.c | 2
-rw-r--r--  kernel/bpf/hashtab.c | 367
-rw-r--r--  kernel/bpf/helpers.c | 89
-rw-r--r--  kernel/bpf/syscall.c | 6
-rw-r--r--  kernel/bpf/test_stub.c | 56
-rw-r--r--  kernel/bpf/verifier.c | 171
-rw-r--r--  kernel/cgroup.c | 177
-rw-r--r--  kernel/cpu.c | 19
-rw-r--r--  kernel/cpuset.c | 185
-rw-r--r--  kernel/debug/debug_core.c | 52
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 37
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 269
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 3
-rw-r--r--  kernel/events/core.c | 63
-rw-r--r--  kernel/events/uprobes.c | 8
-rw-r--r--  kernel/exit.c | 281
-rw-r--r--  kernel/extable.c | 7
-rw-r--r--  kernel/fork.c | 9
-rw-r--r--  kernel/gcov/Kconfig | 5
-rw-r--r--  kernel/groups.c | 11
-rw-r--r--  kernel/irq/Kconfig | 15
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/chip.c | 130
-rw-r--r--  kernel/irq/generic-chip.c | 36
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/irqdesc.c | 52
-rw-r--r--  kernel/irq/irqdomain.c | 567
-rw-r--r--  kernel/irq/manage.c | 2
-rw-r--r--  kernel/irq/msi.c | 330
-rw-r--r--  kernel/irq/proc.c | 22
-rw-r--r--  kernel/irq_work.c | 4
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 43
-rw-r--r--  kernel/kprobes.c | 22
-rw-r--r--  kernel/locking/mutex-debug.c | 2
-rw-r--r--  kernel/locking/mutex.c | 8
-rw-r--r--  kernel/module.c | 281
-rw-r--r--  kernel/nsproxy.c | 10
-rw-r--r--  kernel/panic.c | 13
-rw-r--r--  kernel/params.c | 100
-rw-r--r--  kernel/pid.c | 7
-rw-r--r--  kernel/pid_namespace.c | 57
-rw-r--r--  kernel/power/Kconfig | 21
-rw-r--r--  kernel/power/hibernate.c | 14
-rw-r--r--  kernel/power/power.h | 3
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/swap.c | 43
-rw-r--r--  kernel/printk/printk.c | 96
-rw-r--r--  kernel/ptrace.c | 23
-rw-r--r--  kernel/range.c | 10
-rw-r--r--  kernel/rcu/Makefile | 2
-rw-r--r--  kernel/rcu/rcu.h | 2
-rw-r--r--  kernel/rcu/rcutorture.c | 1
-rw-r--r--  kernel/rcu/tiny.c | 6
-rw-r--r--  kernel/rcu/tree.c | 97
-rw-r--r--  kernel/rcu/tree.h | 22
-rw-r--r--  kernel/rcu/tree_plugin.h | 111
-rw-r--r--  kernel/rcu/update.c | 89
-rw-r--r--  kernel/res_counter.c | 211
-rw-r--r--  kernel/sched/completion.c | 5
-rw-r--r--  kernel/sched/core.c | 262
-rw-r--r--  kernel/sched/cpudeadline.h | 3
-rw-r--r--  kernel/sched/cpupri.h | 3
-rw-r--r--  kernel/sched/deadline.c | 124
-rw-r--r--  kernel/sched/debug.c | 11
-rw-r--r--  kernel/sched/fair.c | 360
-rw-r--r--  kernel/sched/rt.c | 17
-rw-r--r--  kernel/sched/sched.h | 43
-rw-r--r--  kernel/sched/wait.c | 66
-rw-r--r--  kernel/signal.c | 46
-rw-r--r--  kernel/smpboot.c | 15
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/stacktrace.c | 32
-rw-r--r--  kernel/sys.c | 16
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 16
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/taskstats.c | 2
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/ntp.c | 7
-rw-r--r--  kernel/time/test_udelay.c (renamed from kernel/time/udelay_test.c) | 0
-rw-r--r--  kernel/time/tick-sched.c | 6
-rw-r--r--  kernel/time/time.c | 25
-rw-r--r--  kernel/time/timekeeping.c | 127
-rw-r--r--  kernel/time/timer.c | 3
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 151
-rw-r--r--  kernel/trace/ftrace.c | 383
-rw-r--r--  kernel/trace/ring_buffer.c | 75
-rw-r--r--  kernel/trace/trace.c | 254
-rw-r--r--  kernel/trace/trace.h | 31
-rw-r--r--  kernel/trace/trace_branch.c | 47
-rw-r--r--  kernel/trace/trace_event_perf.c | 4
-rw-r--r--  kernel/trace/trace_events.c | 126
-rw-r--r--  kernel/trace/trace_events_filter.c | 29
-rw-r--r--  kernel/trace/trace_events_trigger.c | 6
-rw-r--r--  kernel/trace/trace_functions.c | 119
-rw-r--r--  kernel/trace/trace_functions_graph.c | 423
-rw-r--r--  kernel/trace/trace_kdb.c | 25
-rw-r--r--  kernel/trace/trace_kprobe.c | 50
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 52
-rw-r--r--  kernel/trace/trace_output.c | 446
-rw-r--r--  kernel/trace/trace_output.h | 16
-rw-r--r--  kernel/trace/trace_printk.c | 2
-rw-r--r--  kernel/trace/trace_probe.c | 10
-rw-r--r--  kernel/trace/trace_sched_switch.c | 144
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 56
-rw-r--r--  kernel/trace/trace_seq.c | 253
-rw-r--r--  kernel/trace/trace_syscalls.c | 58
-rw-r--r--  kernel/trace/trace_uprobe.c | 30
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/user.c | 6
-rw-r--r--  kernel/user_namespace.c | 153
-rw-r--r--  kernel/utsname.c | 31
-rw-r--r--  kernel/workqueue.c | 55
124 files changed, 5767 insertions(+), 3039 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 17ea6d4a9a24..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o
57obj-$(CONFIG_USER_NS) += user_namespace.o 57obj-$(CONFIG_USER_NS) += user_namespace.o
58obj-$(CONFIG_PID_NS) += pid_namespace.o 58obj-$(CONFIG_PID_NS) += pid_namespace.o
59obj-$(CONFIG_IKCONFIG) += configs.o 59obj-$(CONFIG_IKCONFIG) += configs.o
60obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
61obj-$(CONFIG_SMP) += stop_machine.o 60obj-$(CONFIG_SMP) += stop_machine.o
62obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 61obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
63obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 62obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
diff --git a/kernel/audit.c b/kernel/audit.c
index cebb11db4d34..72ab759a0b43 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
429 * This function doesn't consume an skb as might be expected since it has to 429 * This function doesn't consume an skb as might be expected since it has to
430 * copy it anyways. 430 * copy it anyways.
431 */ 431 */
432static void kauditd_send_multicast_skb(struct sk_buff *skb) 432static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
433{ 433{
434 struct sk_buff *copy; 434 struct sk_buff *copy;
435 struct audit_net *aunet = net_generic(&init_net, audit_net_id); 435 struct audit_net *aunet = net_generic(&init_net, audit_net_id);
@@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb)
448 * no reason for new multicast clients to continue with this 448 * no reason for new multicast clients to continue with this
449 * non-compliance. 449 * non-compliance.
450 */ 450 */
451 copy = skb_copy(skb, GFP_KERNEL); 451 copy = skb_copy(skb, gfp_mask);
452 if (!copy) 452 if (!copy)
453 return; 453 return;
454 454
455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); 455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
456} 456}
457 457
458/* 458/*
@@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy)
499 set_freezable(); 499 set_freezable();
500 while (!kthread_should_stop()) { 500 while (!kthread_should_stop()) {
501 struct sk_buff *skb; 501 struct sk_buff *skb;
502 DECLARE_WAITQUEUE(wait, current);
503 502
504 flush_hold_queue(); 503 flush_hold_queue();
505 504
@@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy)
514 audit_printk_skb(skb); 513 audit_printk_skb(skb);
515 continue; 514 continue;
516 } 515 }
517 set_current_state(TASK_INTERRUPTIBLE);
518 add_wait_queue(&kauditd_wait, &wait);
519 516
520 if (!skb_queue_len(&audit_skb_queue)) { 517 wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
521 try_to_freeze();
522 schedule();
523 }
524
525 __set_current_state(TASK_RUNNING);
526 remove_wait_queue(&kauditd_wait, &wait);
527 } 518 }
528 return 0; 519 return 0;
529} 520}
@@ -842,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
842 s.backlog_limit = audit_backlog_limit; 833 s.backlog_limit = audit_backlog_limit;
843 s.lost = atomic_read(&audit_lost); 834 s.lost = atomic_read(&audit_lost);
844 s.backlog = skb_queue_len(&audit_skb_queue); 835 s.backlog = skb_queue_len(&audit_skb_queue);
845 s.version = AUDIT_VERSION_LATEST; 836 s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
846 s.backlog_wait_time = audit_backlog_wait_time; 837 s.backlog_wait_time = audit_backlog_wait_time;
847 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); 838 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
848 break; 839 break;
@@ -1109,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb)
1109} 1100}
1110 1101
1111/* Run custom bind function on netlink socket group connect or bind requests. */ 1102/* Run custom bind function on netlink socket group connect or bind requests. */
1112static int audit_bind(int group) 1103static int audit_bind(struct net *net, int group)
1113{ 1104{
1114 if (!capable(CAP_AUDIT_READ)) 1105 if (!capable(CAP_AUDIT_READ))
1115 return -EPERM; 1106 return -EPERM;
@@ -1949,7 +1940,7 @@ void audit_log_end(struct audit_buffer *ab)
1949 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1940 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1950 1941
1951 nlh->nlmsg_len = ab->skb->len; 1942 nlh->nlmsg_len = ab->skb->len;
1952 kauditd_send_multicast_skb(ab->skb); 1943 kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
1953 1944
1954 /* 1945 /*
1955 * The original kaudit unicast socket sends up messages with 1946 * The original kaudit unicast socket sends up messages with
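[Editor's note, not part of the patch] The multicast path touched above is the one a read-only log watcher exercises: audit_log_end() now hands the record's own gfp_mask down to kauditd_send_multicast_skb(), and audit_bind() refuses the group join without CAP_AUDIT_READ. As a rough, hedged illustration of that receive side, a minimal userspace listener might look like the sketch below; it assumes only NETLINK_AUDIT, AUDIT_NLGRP_READLOG and the standard netlink macros from the uapi headers, and everything else (buffer size, error handling) is made up for the example.

/* Hypothetical read-only audit log listener; needs CAP_AUDIT_READ,
 * otherwise the kernel-side audit_bind() returns -EPERM on the join.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/audit.h>

int main(void)
{
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	unsigned int grp = AUDIT_NLGRP_READLOG;
	char buf[8192];
	int fd, len;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	/* join the read-log multicast group; this is what audit_bind() gates */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
		       &grp, sizeof(grp)) < 0)
		return 1;

	/* each multicast copy sent by kauditd_send_multicast_skb() arrives
	 * as one netlink message whose payload is the text of the record
	 */
	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len))
			printf("type=%u %.*s\n", nlh->nlmsg_type,
			       (int)(nlh->nlmsg_len - NLMSG_HDRLEN),
			       (char *)NLMSG_DATA(nlh));
	}
	close(fd);
	return 0;
}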
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 80f29e015570..2e0c97427b33 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk)
174 struct fsnotify_mark *entry = &chunk->mark; 174 struct fsnotify_mark *entry = &chunk->mark;
175 struct list_head *list; 175 struct list_head *list;
176 176
177 if (!entry->i.inode) 177 if (!entry->inode)
178 return; 178 return;
179 list = chunk_hash(entry->i.inode); 179 list = chunk_hash(entry->inode);
180 list_add_rcu(&chunk->hash, list); 180 list_add_rcu(&chunk->hash, list);
181} 181}
182 182
@@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
188 188
189 list_for_each_entry_rcu(p, list, hash) { 189 list_for_each_entry_rcu(p, list, hash) {
190 /* mark.inode may have gone NULL, but who cares? */ 190 /* mark.inode may have gone NULL, but who cares? */
191 if (p->mark.i.inode == inode) { 191 if (p->mark.inode == inode) {
192 atomic_long_inc(&p->refs); 192 atomic_long_inc(&p->refs);
193 return p; 193 return p;
194 } 194 }
@@ -231,7 +231,7 @@ static void untag_chunk(struct node *p)
231 new = alloc_chunk(size); 231 new = alloc_chunk(size);
232 232
233 spin_lock(&entry->lock); 233 spin_lock(&entry->lock);
234 if (chunk->dead || !entry->i.inode) { 234 if (chunk->dead || !entry->inode) {
235 spin_unlock(&entry->lock); 235 spin_unlock(&entry->lock);
236 if (new) 236 if (new)
237 free_chunk(new); 237 free_chunk(new);
@@ -258,7 +258,7 @@ static void untag_chunk(struct node *p)
258 goto Fallback; 258 goto Fallback;
259 259
260 fsnotify_duplicate_mark(&new->mark, entry); 260 fsnotify_duplicate_mark(&new->mark, entry);
261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
262 fsnotify_put_mark(&new->mark); 262 fsnotify_put_mark(&new->mark);
263 goto Fallback; 263 goto Fallback;
264 } 264 }
@@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
386 chunk_entry = &chunk->mark; 386 chunk_entry = &chunk->mark;
387 387
388 spin_lock(&old_entry->lock); 388 spin_lock(&old_entry->lock);
389 if (!old_entry->i.inode) { 389 if (!old_entry->inode) {
390 /* old_entry is being shot, lets just lie */ 390 /* old_entry is being shot, lets just lie */
391 spin_unlock(&old_entry->lock); 391 spin_unlock(&old_entry->lock);
392 fsnotify_put_mark(old_entry); 392 fsnotify_put_mark(old_entry);
@@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
395 } 395 }
396 396
397 fsnotify_duplicate_mark(chunk_entry, old_entry); 397 fsnotify_duplicate_mark(chunk_entry, old_entry);
398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { 398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
399 spin_unlock(&old_entry->lock); 399 spin_unlock(&old_entry->lock);
400 fsnotify_put_mark(chunk_entry); 400 fsnotify_put_mark(chunk_entry);
401 fsnotify_put_mark(old_entry); 401 fsnotify_put_mark(old_entry);
@@ -611,7 +611,7 @@ void audit_trim_trees(void)
611 list_for_each_entry(node, &tree->chunks, list) { 611 list_for_each_entry(node, &tree->chunks, list) {
612 struct audit_chunk *chunk = find_chunk(node); 612 struct audit_chunk *chunk = find_chunk(node);
613 /* this could be NULL if the watch is dying else where... */ 613 /* this could be NULL if the watch is dying else where... */
614 struct inode *inode = chunk->mark.i.inode; 614 struct inode *inode = chunk->mark.inode;
615 node->index |= 1U<<31; 615 node->index |= 1U<<31;
616 if (iterate_mounts(compare_root, inode, root_mnt)) 616 if (iterate_mounts(compare_root, inode, root_mnt))
617 node->index &= ~(1U<<31); 617 node->index &= ~(1U<<31);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 3598e13f2a65..4f68a326d92e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -442,19 +442,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
442 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { 442 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
443 f->type = AUDIT_LOGINUID_SET; 443 f->type = AUDIT_LOGINUID_SET;
444 f->val = 0; 444 f->val = 0;
445 } 445 entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
446
447 if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
448 struct pid *pid;
449 rcu_read_lock();
450 pid = find_vpid(f->val);
451 if (!pid) {
452 rcu_read_unlock();
453 err = -ESRCH;
454 goto exit_free;
455 }
456 f->val = pid_nr(pid);
457 rcu_read_unlock();
458 } 446 }
459 447
460 err = audit_field_valid(entry, f); 448 err = audit_field_valid(entry, f);
@@ -630,6 +618,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
630 data->buflen += data->values[i] = 618 data->buflen += data->values[i] =
631 audit_pack_string(&bufp, krule->filterkey); 619 audit_pack_string(&bufp, krule->filterkey);
632 break; 620 break;
621 case AUDIT_LOGINUID_SET:
622 if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
623 data->fields[i] = AUDIT_LOGINUID;
624 data->values[i] = AUDIT_UID_UNSET;
625 break;
626 }
627 /* fallthrough if set */
633 default: 628 default:
634 data->values[i] = f->val; 629 data->values[i] = f->val;
635 } 630 }
@@ -646,6 +641,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
646 int i; 641 int i;
647 642
648 if (a->flags != b->flags || 643 if (a->flags != b->flags ||
644 a->pflags != b->pflags ||
649 a->listnr != b->listnr || 645 a->listnr != b->listnr ||
650 a->action != b->action || 646 a->action != b->action ||
651 a->field_count != b->field_count) 647 a->field_count != b->field_count)
@@ -764,6 +760,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
764 new = &entry->rule; 760 new = &entry->rule;
765 new->vers_ops = old->vers_ops; 761 new->vers_ops = old->vers_ops;
766 new->flags = old->flags; 762 new->flags = old->flags;
763 new->pflags = old->pflags;
767 new->listnr = old->listnr; 764 new->listnr = old->listnr;
768 new->action = old->action; 765 new->action = old->action;
769 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 766 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e420a0c41b5f..072566dd0caf 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -72,6 +72,8 @@
72#include <linux/fs_struct.h> 72#include <linux/fs_struct.h>
73#include <linux/compat.h> 73#include <linux/compat.h>
74#include <linux/ctype.h> 74#include <linux/ctype.h>
75#include <linux/string.h>
76#include <uapi/linux/limits.h>
75 77
76#include "audit.h" 78#include "audit.h"
77 79
@@ -1861,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1861 } 1863 }
1862 1864
1863 list_for_each_entry_reverse(n, &context->names_list, list) { 1865 list_for_each_entry_reverse(n, &context->names_list, list) {
1864 /* does the name pointer match? */ 1866 if (!n->name || strcmp(n->name->name, name->name))
1865 if (!n->name || n->name->name != name->name)
1866 continue; 1867 continue;
1867 1868
1868 /* match the correct record type */ 1869 /* match the correct record type */
@@ -1877,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1877 } 1878 }
1878 1879
1879out_alloc: 1880out_alloc:
1880 /* unable to find the name from a previous getname(). Allocate a new 1881 /* unable to find an entry with both a matching name and type */
1881 * anonymous entry. 1882 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
1882 */
1883 n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
1884 if (!n) 1883 if (!n)
1885 return; 1884 return;
1885 /* unfortunately, while we may have a path name to record with the
1886 * inode, we can't always rely on the string lasting until the end of
1887 * the syscall so we need to create our own copy, it may fail due to
1888 * memory allocation issues, but we do our best */
1889 if (name) {
1890 /* we can't use getname_kernel() due to size limits */
1891 size_t len = strlen(name->name) + 1;
1892 struct filename *new = __getname();
1893
1894 if (unlikely(!new))
1895 goto out;
1896
1897 if (len <= (PATH_MAX - sizeof(*new))) {
1898 new->name = (char *)(new) + sizeof(*new);
1899 new->separate = false;
1900 } else if (len <= PATH_MAX) {
1901 /* this looks odd, but is due to final_putname() */
1902 struct filename *new2;
1903
1904 new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
1905 if (unlikely(!new2)) {
1906 __putname(new);
1907 goto out;
1908 }
1909 new2->name = (char *)new;
1910 new2->separate = true;
1911 new = new2;
1912 } else {
1913 /* we should never get here, but let's be safe */
1914 __putname(new);
1915 goto out;
1916 }
1917 strlcpy((char *)new->name, name->name, len);
1918 new->uptr = NULL;
1919 new->aname = n;
1920 n->name = new;
1921 n->name_put = true;
1922 }
1886out: 1923out:
1887 if (parent) { 1924 if (parent) {
1888 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; 1925 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1897,6 +1934,11 @@ out:
1897 audit_copy_inode(n, dentry, inode); 1934 audit_copy_inode(n, dentry, inode);
1898} 1935}
1899 1936
1937void __audit_file(const struct file *file)
1938{
1939 __audit_inode(NULL, file->f_path.dentry, 0);
1940}
1941
1900/** 1942/**
1901 * __audit_inode_child - collect inode info for created/removed objects 1943 * __audit_inode_child - collect inode info for created/removed objects
1902 * @parent: inode of dentry parent 1944 * @parent: inode of dentry parent
@@ -2373,7 +2415,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2373 ax->d.next = context->aux; 2415 ax->d.next = context->aux;
2374 context->aux = (void *)ax; 2416 context->aux = (void *)ax;
2375 2417
2376 dentry = dget(bprm->file->f_dentry); 2418 dentry = dget(bprm->file->f_path.dentry);
2377 get_vfs_caps_from_disk(dentry, &vcaps); 2419 get_vfs_caps_from_disk(dentry, &vcaps);
2378 dput(dentry); 2420 dput(dentry);
2379 2421
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6ae7df..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
1obj-y := core.o 1obj-y := core.o
2obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o 2obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
3ifdef CONFIG_TEST_BPF 3ifdef CONFIG_TEST_BPF
4obj-$(CONFIG_BPF_SYSCALL) += test_stub.o 4obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
5endif 5endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..9eb4d8a7cd87
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/err.h>
14#include <linux/vmalloc.h>
15#include <linux/slab.h>
16#include <linux/mm.h>
17
18struct bpf_array {
19 struct bpf_map map;
20 u32 elem_size;
21 char value[0] __aligned(8);
22};
23
24/* Called from syscall */
25static struct bpf_map *array_map_alloc(union bpf_attr *attr)
26{
27 struct bpf_array *array;
28 u32 elem_size, array_size;
29
30 /* check sanity of attributes */
31 if (attr->max_entries == 0 || attr->key_size != 4 ||
32 attr->value_size == 0)
33 return ERR_PTR(-EINVAL);
34
35 elem_size = round_up(attr->value_size, 8);
36
37 /* check round_up into zero and u32 overflow */
38 if (elem_size == 0 ||
39 attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
40 return ERR_PTR(-ENOMEM);
41
42 array_size = sizeof(*array) + attr->max_entries * elem_size;
43
44 /* allocate all map elements and zero-initialize them */
45 array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
46 if (!array) {
47 array = vzalloc(array_size);
48 if (!array)
49 return ERR_PTR(-ENOMEM);
50 }
51
52 /* copy mandatory map attributes */
53 array->map.key_size = attr->key_size;
54 array->map.value_size = attr->value_size;
55 array->map.max_entries = attr->max_entries;
56
57 array->elem_size = elem_size;
58
59 return &array->map;
60}
61
62/* Called from syscall or from eBPF program */
63static void *array_map_lookup_elem(struct bpf_map *map, void *key)
64{
65 struct bpf_array *array = container_of(map, struct bpf_array, map);
66 u32 index = *(u32 *)key;
67
68 if (index >= array->map.max_entries)
69 return NULL;
70
71 return array->value + array->elem_size * index;
72}
73
74/* Called from syscall */
75static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
76{
77 struct bpf_array *array = container_of(map, struct bpf_array, map);
78 u32 index = *(u32 *)key;
79 u32 *next = (u32 *)next_key;
80
81 if (index >= array->map.max_entries) {
82 *next = 0;
83 return 0;
84 }
85
86 if (index == array->map.max_entries - 1)
87 return -ENOENT;
88
89 *next = index + 1;
90 return 0;
91}
92
93/* Called from syscall or from eBPF program */
94static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
95 u64 map_flags)
96{
97 struct bpf_array *array = container_of(map, struct bpf_array, map);
98 u32 index = *(u32 *)key;
99
100 if (map_flags > BPF_EXIST)
101 /* unknown flags */
102 return -EINVAL;
103
104 if (index >= array->map.max_entries)
105 /* all elements were pre-allocated, cannot insert a new one */
106 return -E2BIG;
107
108 if (map_flags == BPF_NOEXIST)
109 /* all elements already exist */
110 return -EEXIST;
111
112 memcpy(array->value + array->elem_size * index, value, array->elem_size);
113 return 0;
114}
115
116/* Called from syscall or from eBPF program */
117static int array_map_delete_elem(struct bpf_map *map, void *key)
118{
119 return -EINVAL;
120}
121
122/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
123static void array_map_free(struct bpf_map *map)
124{
125 struct bpf_array *array = container_of(map, struct bpf_array, map);
126
127 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
128 * so the programs (can be more than one that used this map) were
129 * disconnected from events. Wait for outstanding programs to complete
130 * and free the array
131 */
132 synchronize_rcu();
133
134 kvfree(array);
135}
136
137static struct bpf_map_ops array_ops = {
138 .map_alloc = array_map_alloc,
139 .map_free = array_map_free,
140 .map_get_next_key = array_map_get_next_key,
141 .map_lookup_elem = array_map_lookup_elem,
142 .map_update_elem = array_map_update_elem,
143 .map_delete_elem = array_map_delete_elem,
144};
145
146static struct bpf_map_type_list tl = {
147 .ops = &array_ops,
148 .type = BPF_MAP_TYPE_ARRAY,
149};
150
151static int __init register_array_map(void)
152{
153 bpf_register_map_type(&tl);
154 return 0;
155}
156late_initcall(register_array_map);
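[Editor's note, not part of the patch] For orientation: userspace drives the array map above through the raw bpf(2) syscall, since no helper library exists at this point in the series. The sketch below is a minimal example under that assumption; sys_bpf() is a local wrapper invented here, and the union bpf_attr fields used (map_type, key_size, value_size, max_entries, map_fd, key, value, flags) are the ones this series defines in <linux/bpf.h>.

/* Rough userspace sketch of exercising BPF_MAP_TYPE_ARRAY; assumes a
 * kernel with this series, its <linux/bpf.h>, and __NR_bpf wired up
 * for the running architecture.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
	union bpf_attr attr;
	__u32 key = 3;
	__u64 val = 42, out = 0;
	int map_fd;

	/* array maps require key_size == 4; all slots are preallocated */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(val);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
	if (map_fd < 0)
		return 1;

	/* per array_map_update_elem(): BPF_NOEXIST always fails (every
	 * slot already "exists"); BPF_ANY or BPF_EXIST overwrite in place
	 */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&val;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr))
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&out;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr))
		return 1;

	printf("slot %u = %llu\n", key, (unsigned long long)out);
	close(map_fd);
	return 0;
}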
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d6594e457a25..a64e7a207d2b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
163 163
164void bpf_jit_binary_free(struct bpf_binary_header *hdr) 164void bpf_jit_binary_free(struct bpf_binary_header *hdr)
165{ 165{
166 module_free(NULL, hdr); 166 module_memfree(hdr);
167} 167}
168#endif /* CONFIG_BPF_JIT */ 168#endif /* CONFIG_BPF_JIT */
169 169
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..b3ba43674310
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/jhash.h>
14#include <linux/filter.h>
15#include <linux/vmalloc.h>
16
17struct bpf_htab {
18 struct bpf_map map;
19 struct hlist_head *buckets;
20 spinlock_t lock;
21 u32 count; /* number of elements in this hashtable */
22 u32 n_buckets; /* number of hash buckets */
23 u32 elem_size; /* size of each element in bytes */
24};
25
26/* each htab element is struct htab_elem + key + value */
27struct htab_elem {
28 struct hlist_node hash_node;
29 struct rcu_head rcu;
30 u32 hash;
31 char key[0] __aligned(8);
32};
33
34/* Called from syscall */
35static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
36{
37 struct bpf_htab *htab;
38 int err, i;
39
40 htab = kzalloc(sizeof(*htab), GFP_USER);
41 if (!htab)
42 return ERR_PTR(-ENOMEM);
43
44 /* mandatory map attributes */
45 htab->map.key_size = attr->key_size;
46 htab->map.value_size = attr->value_size;
47 htab->map.max_entries = attr->max_entries;
48
49 /* check sanity of attributes.
50 * value_size == 0 may be allowed in the future to use map as a set
51 */
52 err = -EINVAL;
53 if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
54 htab->map.value_size == 0)
55 goto free_htab;
56
57 /* hash table size must be power of 2 */
58 htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
59
60 err = -E2BIG;
61 if (htab->map.key_size > MAX_BPF_STACK)
62 /* eBPF programs initialize keys on stack, so they cannot be
63 * larger than max stack size
64 */
65 goto free_htab;
66
67 err = -ENOMEM;
68 /* prevent zero size kmalloc and check for u32 overflow */
69 if (htab->n_buckets == 0 ||
70 htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
71 goto free_htab;
72
73 htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
74 GFP_USER | __GFP_NOWARN);
75
76 if (!htab->buckets) {
77 htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
78 if (!htab->buckets)
79 goto free_htab;
80 }
81
82 for (i = 0; i < htab->n_buckets; i++)
83 INIT_HLIST_HEAD(&htab->buckets[i]);
84
85 spin_lock_init(&htab->lock);
86 htab->count = 0;
87
88 htab->elem_size = sizeof(struct htab_elem) +
89 round_up(htab->map.key_size, 8) +
90 htab->map.value_size;
91 return &htab->map;
92
93free_htab:
94 kfree(htab);
95 return ERR_PTR(err);
96}
97
98static inline u32 htab_map_hash(const void *key, u32 key_len)
99{
100 return jhash(key, key_len, 0);
101}
102
103static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
104{
105 return &htab->buckets[hash & (htab->n_buckets - 1)];
106}
107
108static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
109 void *key, u32 key_size)
110{
111 struct htab_elem *l;
112
113 hlist_for_each_entry_rcu(l, head, hash_node)
114 if (l->hash == hash && !memcmp(&l->key, key, key_size))
115 return l;
116
117 return NULL;
118}
119
120/* Called from syscall or from eBPF program */
121static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
122{
123 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
124 struct hlist_head *head;
125 struct htab_elem *l;
126 u32 hash, key_size;
127
128 /* Must be called with rcu_read_lock. */
129 WARN_ON_ONCE(!rcu_read_lock_held());
130
131 key_size = map->key_size;
132
133 hash = htab_map_hash(key, key_size);
134
135 head = select_bucket(htab, hash);
136
137 l = lookup_elem_raw(head, hash, key, key_size);
138
139 if (l)
140 return l->key + round_up(map->key_size, 8);
141
142 return NULL;
143}
144
145/* Called from syscall */
146static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
147{
148 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
149 struct hlist_head *head;
150 struct htab_elem *l, *next_l;
151 u32 hash, key_size;
152 int i;
153
154 WARN_ON_ONCE(!rcu_read_lock_held());
155
156 key_size = map->key_size;
157
158 hash = htab_map_hash(key, key_size);
159
160 head = select_bucket(htab, hash);
161
162 /* lookup the key */
163 l = lookup_elem_raw(head, hash, key, key_size);
164
165 if (!l) {
166 i = 0;
167 goto find_first_elem;
168 }
169
170 /* key was found, get next key in the same bucket */
171 next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
172 struct htab_elem, hash_node);
173
174 if (next_l) {
175 /* if next elem in this hash list is non-zero, just return it */
176 memcpy(next_key, next_l->key, key_size);
177 return 0;
178 }
179
180 /* no more elements in this hash list, go to the next bucket */
181 i = hash & (htab->n_buckets - 1);
182 i++;
183
184find_first_elem:
185 /* iterate over buckets */
186 for (; i < htab->n_buckets; i++) {
187 head = select_bucket(htab, i);
188
189 /* pick first element in the bucket */
190 next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
191 struct htab_elem, hash_node);
192 if (next_l) {
193 /* if it's not empty, just return it */
194 memcpy(next_key, next_l->key, key_size);
195 return 0;
196 }
197 }
198
199 /* itereated over all buckets and all elements */
200 return -ENOENT;
201}
202
203/* Called from syscall or from eBPF program */
204static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
205 u64 map_flags)
206{
207 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
208 struct htab_elem *l_new, *l_old;
209 struct hlist_head *head;
210 unsigned long flags;
211 u32 key_size;
212 int ret;
213
214 if (map_flags > BPF_EXIST)
215 /* unknown flags */
216 return -EINVAL;
217
218 WARN_ON_ONCE(!rcu_read_lock_held());
219
220 /* allocate new element outside of lock */
221 l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
222 if (!l_new)
223 return -ENOMEM;
224
225 key_size = map->key_size;
226
227 memcpy(l_new->key, key, key_size);
228 memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
229
230 l_new->hash = htab_map_hash(l_new->key, key_size);
231
232 /* bpf_map_update_elem() can be called in_irq() */
233 spin_lock_irqsave(&htab->lock, flags);
234
235 head = select_bucket(htab, l_new->hash);
236
237 l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
238
239 if (!l_old && unlikely(htab->count >= map->max_entries)) {
240 /* if elem with this 'key' doesn't exist and we've reached
241 * max_entries limit, fail insertion of new elem
242 */
243 ret = -E2BIG;
244 goto err;
245 }
246
247 if (l_old && map_flags == BPF_NOEXIST) {
248 /* elem already exists */
249 ret = -EEXIST;
250 goto err;
251 }
252
253 if (!l_old && map_flags == BPF_EXIST) {
254 /* elem doesn't exist, cannot update it */
255 ret = -ENOENT;
256 goto err;
257 }
258
259 /* add new element to the head of the list, so that concurrent
260 * search will find it before old elem
261 */
262 hlist_add_head_rcu(&l_new->hash_node, head);
263 if (l_old) {
264 hlist_del_rcu(&l_old->hash_node);
265 kfree_rcu(l_old, rcu);
266 } else {
267 htab->count++;
268 }
269 spin_unlock_irqrestore(&htab->lock, flags);
270
271 return 0;
272err:
273 spin_unlock_irqrestore(&htab->lock, flags);
274 kfree(l_new);
275 return ret;
276}
277
278/* Called from syscall or from eBPF program */
279static int htab_map_delete_elem(struct bpf_map *map, void *key)
280{
281 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
282 struct hlist_head *head;
283 struct htab_elem *l;
284 unsigned long flags;
285 u32 hash, key_size;
286 int ret = -ENOENT;
287
288 WARN_ON_ONCE(!rcu_read_lock_held());
289
290 key_size = map->key_size;
291
292 hash = htab_map_hash(key, key_size);
293
294 spin_lock_irqsave(&htab->lock, flags);
295
296 head = select_bucket(htab, hash);
297
298 l = lookup_elem_raw(head, hash, key, key_size);
299
300 if (l) {
301 hlist_del_rcu(&l->hash_node);
302 htab->count--;
303 kfree_rcu(l, rcu);
304 ret = 0;
305 }
306
307 spin_unlock_irqrestore(&htab->lock, flags);
308 return ret;
309}
310
311static void delete_all_elements(struct bpf_htab *htab)
312{
313 int i;
314
315 for (i = 0; i < htab->n_buckets; i++) {
316 struct hlist_head *head = select_bucket(htab, i);
317 struct hlist_node *n;
318 struct htab_elem *l;
319
320 hlist_for_each_entry_safe(l, n, head, hash_node) {
321 hlist_del_rcu(&l->hash_node);
322 htab->count--;
323 kfree(l);
324 }
325 }
326}
327
328/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
329static void htab_map_free(struct bpf_map *map)
330{
331 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
332
333 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
334 * so the programs (can be more than one that used this map) were
335 * disconnected from events. Wait for outstanding critical sections in
336 * these programs to complete
337 */
338 synchronize_rcu();
339
340 /* some of kfree_rcu() callbacks for elements of this map may not have
341 * executed. It's ok. Proceed to free residual elements and map itself
342 */
343 delete_all_elements(htab);
344 kvfree(htab->buckets);
345 kfree(htab);
346}
347
348static struct bpf_map_ops htab_ops = {
349 .map_alloc = htab_map_alloc,
350 .map_free = htab_map_free,
351 .map_get_next_key = htab_map_get_next_key,
352 .map_lookup_elem = htab_map_lookup_elem,
353 .map_update_elem = htab_map_update_elem,
354 .map_delete_elem = htab_map_delete_elem,
355};
356
357static struct bpf_map_type_list tl = {
358 .ops = &htab_ops,
359 .type = BPF_MAP_TYPE_HASH,
360};
361
362static int __init register_htab_map(void)
363{
364 bpf_register_map_type(&tl);
365 return 0;
366}
367late_initcall(register_htab_map);
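[Editor's note, not part of the patch] One operation worth illustrating for the hash map is key iteration: htab_map_get_next_key() above returns the first key when the search key is absent and -ENOENT once the last bucket is exhausted. The following hedged sketch walks a BPF_MAP_TYPE_HASH map with 4-byte keys and 8-byte values from userspace; dump_hash_map() and the sys_bpf() wrapper are names invented for the example, and map_fd is assumed to be a valid descriptor obtained as in the previous sketch.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

static void dump_hash_map(int map_fd)
{
	__u32 key = -1;		/* any key not in the map yields the first key */
	__u32 next_key;
	__u64 value;
	union bpf_attr attr;

	for (;;) {
		memset(&attr, 0, sizeof(attr));
		attr.map_fd = map_fd;
		attr.key = (__u64)(unsigned long)&key;
		attr.next_key = (__u64)(unsigned long)&next_key;
		if (sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr)) {
			if (errno != ENOENT)
				perror("get_next_key");
			break;	/* ENOENT: walked past the last bucket */
		}

		memset(&attr, 0, sizeof(attr));
		attr.map_fd = map_fd;
		attr.key = (__u64)(unsigned long)&next_key;
		attr.value = (__u64)(unsigned long)&value;
		if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr) == 0)
			printf("key %u -> %llu\n", next_key,
			       (unsigned long long)value);

		key = next_key;
	}
}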
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/rcupdate.h>
14
15/* If kernel subsystem is allowing eBPF programs to call this function,
16 * inside its own verifier_ops->get_func_proto() callback it should return
17 * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
18 *
19 * Different map implementations will rely on rcu in map methods
20 * lookup/update/delete, therefore eBPF programs must run under rcu lock
21 * if program is allowed to access maps, so check rcu_read_lock_held in
22 * all three functions.
23 */
24static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
25{
26 /* verifier checked that R1 contains a valid pointer to bpf_map
27 * and R2 points to a program stack and map->key_size bytes were
28 * initialized
29 */
30 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
31 void *key = (void *) (unsigned long) r2;
32 void *value;
33
34 WARN_ON_ONCE(!rcu_read_lock_held());
35
36 value = map->ops->map_lookup_elem(map, key);
37
38 /* lookup() returns either pointer to element value or NULL
39 * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
40 */
41 return (unsigned long) value;
42}
43
44struct bpf_func_proto bpf_map_lookup_elem_proto = {
45 .func = bpf_map_lookup_elem,
46 .gpl_only = false,
47 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
48 .arg1_type = ARG_CONST_MAP_PTR,
49 .arg2_type = ARG_PTR_TO_MAP_KEY,
50};
51
52static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
53{
54 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
55 void *key = (void *) (unsigned long) r2;
56 void *value = (void *) (unsigned long) r3;
57
58 WARN_ON_ONCE(!rcu_read_lock_held());
59
60 return map->ops->map_update_elem(map, key, value, r4);
61}
62
63struct bpf_func_proto bpf_map_update_elem_proto = {
64 .func = bpf_map_update_elem,
65 .gpl_only = false,
66 .ret_type = RET_INTEGER,
67 .arg1_type = ARG_CONST_MAP_PTR,
68 .arg2_type = ARG_PTR_TO_MAP_KEY,
69 .arg3_type = ARG_PTR_TO_MAP_VALUE,
70 .arg4_type = ARG_ANYTHING,
71};
72
73static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
74{
75 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
76 void *key = (void *) (unsigned long) r2;
77
78 WARN_ON_ONCE(!rcu_read_lock_held());
79
80 return map->ops->map_delete_elem(map, key);
81}
82
83struct bpf_func_proto bpf_map_delete_elem_proto = {
84 .func = bpf_map_delete_elem,
85 .gpl_only = false,
86 .ret_type = RET_INTEGER,
87 .arg1_type = ARG_CONST_MAP_PTR,
88 .arg2_type = ARG_PTR_TO_MAP_KEY,
89};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c16032..088ac0b1b106 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr)
169 if (copy_from_user(key, ukey, map->key_size) != 0) 169 if (copy_from_user(key, ukey, map->key_size) != 0)
170 goto free_key; 170 goto free_key;
171 171
172 err = -ESRCH; 172 err = -ENOENT;
173 rcu_read_lock(); 173 rcu_read_lock();
174 value = map->ops->map_lookup_elem(map, key); 174 value = map->ops->map_lookup_elem(map, key);
175 if (!value) 175 if (!value)
@@ -190,7 +190,7 @@ err_put:
190 return err; 190 return err;
191} 191}
192 192
193#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value 193#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
194 194
195static int map_update_elem(union bpf_attr *attr) 195static int map_update_elem(union bpf_attr *attr)
196{ 196{
@@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr)
231 * therefore all map accessors rely on this fact, so do the same here 231 * therefore all map accessors rely on this fact, so do the same here
232 */ 232 */
233 rcu_read_lock(); 233 rcu_read_lock();
234 err = map->ops->map_update_elem(map, key, value); 234 err = map->ops->map_update_elem(map, key, value, attr->flags);
235 rcu_read_unlock(); 235 rcu_read_unlock();
236 236
237free_value: 237free_value:
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff4003e..0ceae1e6e8b5 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
@@ -18,26 +18,18 @@ struct bpf_context {
18 u64 arg2; 18 u64 arg2;
19}; 19};
20 20
21static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
22{
23 return 0;
24}
25
26static struct bpf_func_proto test_funcs[] = {
27 [BPF_FUNC_unspec] = {
28 .func = test_func,
29 .gpl_only = true,
30 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
31 .arg1_type = ARG_CONST_MAP_PTR,
32 .arg2_type = ARG_PTR_TO_MAP_KEY,
33 },
34};
35
36static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) 21static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
37{ 22{
38 if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs)) 23 switch (func_id) {
24 case BPF_FUNC_map_lookup_elem:
25 return &bpf_map_lookup_elem_proto;
26 case BPF_FUNC_map_update_elem:
27 return &bpf_map_update_elem_proto;
28 case BPF_FUNC_map_delete_elem:
29 return &bpf_map_delete_elem_proto;
30 default:
39 return NULL; 31 return NULL;
40 return &test_funcs[func_id]; 32 }
41} 33}
42 34
43static const struct bpf_context_access { 35static const struct bpf_context_access {
@@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = {
78 .type = BPF_PROG_TYPE_UNSPEC, 70 .type = BPF_PROG_TYPE_UNSPEC,
79}; 71};
80 72
81static struct bpf_map *test_map_alloc(union bpf_attr *attr)
82{
83 struct bpf_map *map;
84
85 map = kzalloc(sizeof(*map), GFP_USER);
86 if (!map)
87 return ERR_PTR(-ENOMEM);
88
89 map->key_size = attr->key_size;
90 map->value_size = attr->value_size;
91 map->max_entries = attr->max_entries;
92 return map;
93}
94
95static void test_map_free(struct bpf_map *map)
96{
97 kfree(map);
98}
99
100static struct bpf_map_ops test_map_ops = {
101 .map_alloc = test_map_alloc,
102 .map_free = test_map_free,
103};
104
105static struct bpf_map_type_list tl_map = {
106 .ops = &test_map_ops,
107 .type = BPF_MAP_TYPE_UNSPEC,
108};
109
110static int __init register_test_ops(void) 73static int __init register_test_ops(void)
111{ 74{
112 bpf_register_map_type(&tl_map);
113 bpf_register_prog_type(&tl_prog); 75 bpf_register_prog_type(&tl_prog);
114 return 0; 76 return 0;
115} 77}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9f81818f2941..a28e09c7825d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,22 +153,19 @@ struct reg_state {
153 153
154enum bpf_stack_slot_type { 154enum bpf_stack_slot_type {
155 STACK_INVALID, /* nothing was stored in this stack slot */ 155 STACK_INVALID, /* nothing was stored in this stack slot */
156 STACK_SPILL, /* 1st byte of register spilled into stack */ 156 STACK_SPILL, /* register spilled into stack */
157 STACK_SPILL_PART, /* other 7 bytes of register spill */
158 STACK_MISC /* BPF program wrote some data into this slot */ 157 STACK_MISC /* BPF program wrote some data into this slot */
159}; 158};
160 159
161struct bpf_stack_slot { 160#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
162 enum bpf_stack_slot_type stype;
163 struct reg_state reg_st;
164};
165 161
166/* state of the program: 162/* state of the program:
167 * type of all registers and stack info 163 * type of all registers and stack info
168 */ 164 */
169struct verifier_state { 165struct verifier_state {
170 struct reg_state regs[MAX_BPF_REG]; 166 struct reg_state regs[MAX_BPF_REG];
171 struct bpf_stack_slot stack[MAX_BPF_STACK]; 167 u8 stack_slot_type[MAX_BPF_STACK];
168 struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
172}; 169};
173 170
174/* linked list of verifier states used to prune search */ 171/* linked list of verifier states used to prune search */
@@ -259,10 +256,10 @@ static void print_verifier_state(struct verifier_env *env)
259 env->cur_state.regs[i].map_ptr->key_size, 256 env->cur_state.regs[i].map_ptr->key_size,
260 env->cur_state.regs[i].map_ptr->value_size); 257 env->cur_state.regs[i].map_ptr->value_size);
261 } 258 }
262 for (i = 0; i < MAX_BPF_STACK; i++) { 259 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
263 if (env->cur_state.stack[i].stype == STACK_SPILL) 260 if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
264 verbose(" fp%d=%s", -MAX_BPF_STACK + i, 261 verbose(" fp%d=%s", -MAX_BPF_STACK + i,
265 reg_type_str[env->cur_state.stack[i].reg_st.type]); 262 reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
266 } 263 }
267 verbose("\n"); 264 verbose("\n");
268} 265}
@@ -539,8 +536,10 @@ static int bpf_size_to_bytes(int bpf_size)
539static int check_stack_write(struct verifier_state *state, int off, int size, 536static int check_stack_write(struct verifier_state *state, int off, int size,
540 int value_regno) 537 int value_regno)
541{ 538{
542 struct bpf_stack_slot *slot;
543 int i; 539 int i;
540 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
541 * so it's aligned access and [off, off + size) are within stack limits
542 */
544 543
545 if (value_regno >= 0 && 544 if (value_regno >= 0 &&
546 (state->regs[value_regno].type == PTR_TO_MAP_VALUE || 545 (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
@@ -548,30 +547,24 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
548 state->regs[value_regno].type == PTR_TO_CTX)) { 547 state->regs[value_regno].type == PTR_TO_CTX)) {
549 548
550 /* register containing pointer is being spilled into stack */ 549 /* register containing pointer is being spilled into stack */
551 if (size != 8) { 550 if (size != BPF_REG_SIZE) {
552 verbose("invalid size of register spill\n"); 551 verbose("invalid size of register spill\n");
553 return -EACCES; 552 return -EACCES;
554 } 553 }
555 554
556 slot = &state->stack[MAX_BPF_STACK + off];
557 slot->stype = STACK_SPILL;
558 /* save register state */ 555 /* save register state */
559 slot->reg_st = state->regs[value_regno]; 556 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
560 for (i = 1; i < 8; i++) { 557 state->regs[value_regno];
561 slot = &state->stack[MAX_BPF_STACK + off + i];
562 slot->stype = STACK_SPILL_PART;
563 slot->reg_st.type = UNKNOWN_VALUE;
564 slot->reg_st.map_ptr = NULL;
565 }
566 } else {
567 558
559 for (i = 0; i < BPF_REG_SIZE; i++)
560 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
561 } else {
568 /* regular write of data into stack */ 562 /* regular write of data into stack */
569 for (i = 0; i < size; i++) { 563 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
570 slot = &state->stack[MAX_BPF_STACK + off + i]; 564 (struct reg_state) {};
571 slot->stype = STACK_MISC; 565
572 slot->reg_st.type = UNKNOWN_VALUE; 566 for (i = 0; i < size; i++)
573 slot->reg_st.map_ptr = NULL; 567 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
574 }
575 } 568 }
576 return 0; 569 return 0;
577} 570}
@@ -579,19 +572,18 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
579static int check_stack_read(struct verifier_state *state, int off, int size, 572static int check_stack_read(struct verifier_state *state, int off, int size,
580 int value_regno) 573 int value_regno)
581{ 574{
575 u8 *slot_type;
582 int i; 576 int i;
583 struct bpf_stack_slot *slot;
584 577
585 slot = &state->stack[MAX_BPF_STACK + off]; 578 slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
586 579
587 if (slot->stype == STACK_SPILL) { 580 if (slot_type[0] == STACK_SPILL) {
588 if (size != 8) { 581 if (size != BPF_REG_SIZE) {
589 verbose("invalid size of register spill\n"); 582 verbose("invalid size of register spill\n");
590 return -EACCES; 583 return -EACCES;
591 } 584 }
592 for (i = 1; i < 8; i++) { 585 for (i = 1; i < BPF_REG_SIZE; i++) {
593 if (state->stack[MAX_BPF_STACK + off + i].stype != 586 if (slot_type[i] != STACK_SPILL) {
594 STACK_SPILL_PART) {
595 verbose("corrupted spill memory\n"); 587 verbose("corrupted spill memory\n");
596 return -EACCES; 588 return -EACCES;
597 } 589 }
@@ -599,12 +591,12 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
599 591
600 if (value_regno >= 0) 592 if (value_regno >= 0)
601 /* restore register state from stack */ 593 /* restore register state from stack */
602 state->regs[value_regno] = slot->reg_st; 594 state->regs[value_regno] =
595 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
603 return 0; 596 return 0;
604 } else { 597 } else {
605 for (i = 0; i < size; i++) { 598 for (i = 0; i < size; i++) {
606 if (state->stack[MAX_BPF_STACK + off + i].stype != 599 if (slot_type[i] != STACK_MISC) {
607 STACK_MISC) {
608 verbose("invalid read from stack off %d+%d size %d\n", 600 verbose("invalid read from stack off %d+%d size %d\n",
609 off, i, size); 601 off, i, size);
610 return -EACCES; 602 return -EACCES;
@@ -747,7 +739,7 @@ static int check_stack_boundary(struct verifier_env *env,
747 } 739 }
748 740
749 for (i = 0; i < access_size; i++) { 741 for (i = 0; i < access_size; i++) {
750 if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) { 742 if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
751 verbose("invalid indirect read from stack off %d+%d size %d\n", 743 verbose("invalid indirect read from stack off %d+%d size %d\n",
752 off, i, access_size); 744 off, i, access_size);
753 return -EACCES; 745 return -EACCES;
@@ -1180,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
1180 return 0; 1172 return 0;
1181} 1173}
1182 1174
1175/* verify safety of LD_ABS|LD_IND instructions:
1176 * - they can only appear in the programs where ctx == skb
1177 * - since they are wrappers of function calls, they scratch R1-R5 registers,
1178 * preserve R6-R9, and store return value into R0
1179 *
1180 * Implicit input:
1181 * ctx == skb == R6 == CTX
1182 *
1183 * Explicit input:
1184 * SRC == any register
1185 * IMM == 32-bit immediate
1186 *
1187 * Output:
1188 * R0 - 8/16/32-bit skb data converted to cpu endianness
1189 */
1190static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
1191{
1192 struct reg_state *regs = env->cur_state.regs;
1193 u8 mode = BPF_MODE(insn->code);
1194 struct reg_state *reg;
1195 int i, err;
1196
1197 if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
1198 verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
1199 return -EINVAL;
1200 }
1201
1202 if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
1203 (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
1204 verbose("BPF_LD_ABS uses reserved fields\n");
1205 return -EINVAL;
1206 }
1207
1208 /* check whether implicit source operand (register R6) is readable */
1209 err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
1210 if (err)
1211 return err;
1212
1213 if (regs[BPF_REG_6].type != PTR_TO_CTX) {
1214 verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
1215 return -EINVAL;
1216 }
1217
1218 if (mode == BPF_IND) {
1219 /* check explicit source operand */
1220 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1221 if (err)
1222 return err;
1223 }
1224
1225 /* reset caller saved regs to unreadable */
1226 for (i = 0; i < CALLER_SAVED_REGS; i++) {
1227 reg = regs + caller_saved[i];
1228 reg->type = NOT_INIT;
1229 reg->imm = 0;
1230 }
1231
1232 /* mark destination R0 register as readable, since it contains
1233 * the value fetched from the packet
1234 */
1235 regs[BPF_REG_0].type = UNKNOWN_VALUE;
1236 return 0;
1237}
1238
1183/* non-recursive DFS pseudo code 1239/* non-recursive DFS pseudo code
1184 * 1 procedure DFS-iterative(G,v): 1240 * 1 procedure DFS-iterative(G,v):
1185 * 2 label v as discovered 1241 * 2 label v as discovered
@@ -1417,12 +1473,33 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
1417 } 1473 }
1418 1474
1419 for (i = 0; i < MAX_BPF_STACK; i++) { 1475 for (i = 0; i < MAX_BPF_STACK; i++) {
1420 if (memcmp(&old->stack[i], &cur->stack[i], 1476 if (old->stack_slot_type[i] == STACK_INVALID)
1421 sizeof(old->stack[0])) != 0) { 1477 continue;
1422 if (old->stack[i].stype == STACK_INVALID) 1478 if (old->stack_slot_type[i] != cur->stack_slot_type[i])
1423 continue; 1479 /* Ex: old explored (safe) state has STACK_SPILL in
1480 * this stack slot, but current has has STACK_MISC ->
1481 * this verifier states are not equivalent,
1482 * return false to continue verification of this path
1483 */
1424 return false; 1484 return false;
1425 } 1485 if (i % BPF_REG_SIZE)
1486 continue;
1487 if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
1488 &cur->spilled_regs[i / BPF_REG_SIZE],
1489 sizeof(old->spilled_regs[0])))
1490 /* when explored and current stack slot types are
1491 * the same, check that stored pointers types
1492 * are the same as well.
1493 * Ex: explored safe path could have stored
1494 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
1495 * but current path has stored:
1496 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
1497 * such verifier states are not equivalent.
1498 * return false to continue verification of this path
1499 */
1500 return false;
1501 else
1502 continue;
1426 } 1503 }
1427 return true; 1504 return true;
1428} 1505}
@@ -1664,8 +1741,10 @@ process_bpf_exit:
1664 u8 mode = BPF_MODE(insn->code); 1741 u8 mode = BPF_MODE(insn->code);
1665 1742
1666 if (mode == BPF_ABS || mode == BPF_IND) { 1743 if (mode == BPF_ABS || mode == BPF_IND) {
1667 verbose("LD_ABS is not supported yet\n"); 1744 err = check_ld_abs(env, insn);
1668 return -EINVAL; 1745 if (err)
1746 return err;
1747
1669 } else if (mode == BPF_IMM) { 1748 } else if (mode == BPF_IMM) {
1670 err = check_ld_imm(env, insn); 1749 err = check_ld_imm(env, insn);
1671 if (err) 1750 if (err)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 136eceadeed1..04cfe8ace520 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -277,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
277 if (!(cgrp->root->subsys_mask & (1 << ss->id))) 277 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
278 return NULL; 278 return NULL;
279 279
280 /*
281 * This function is used while updating css associations and thus
282 * can't test the csses directly. Use ->child_subsys_mask.
283 */
280 while (cgroup_parent(cgrp) && 284 while (cgroup_parent(cgrp) &&
281 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) 285 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
282 cgrp = cgroup_parent(cgrp); 286 cgrp = cgroup_parent(cgrp);
@@ -284,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
284 return cgroup_css(cgrp, ss); 288 return cgroup_css(cgrp, ss);
285} 289}
286 290
291/**
292 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
293 * @cgrp: the cgroup of interest
294 * @ss: the subsystem of interest
295 *
296 * Find and get the effective css of @cgrp for @ss. The effective css is
297 * defined as the matching css of the nearest ancestor including self which
298 * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
299 * the root css is returned, so this function always returns a valid css.
300 * The returned css must be put using css_put().
301 */
302struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
303 struct cgroup_subsys *ss)
304{
305 struct cgroup_subsys_state *css;
306
307 rcu_read_lock();
308
309 do {
310 css = cgroup_css(cgrp, ss);
311
312 if (css && css_tryget_online(css))
313 goto out_unlock;
314 cgrp = cgroup_parent(cgrp);
315 } while (cgrp);
316
317 css = init_css_set.subsys[ss->id];
318 css_get(css);
319out_unlock:
320 rcu_read_unlock();
321 return css;
322}
323
287/* convenient tests for these bits */ 324/* convenient tests for these bits */
288static inline bool cgroup_is_dead(const struct cgroup *cgrp) 325static inline bool cgroup_is_dead(const struct cgroup *cgrp)
289{ 326{
@@ -1019,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp)
1019} 1056}
1020 1057
1021/** 1058/**
1022 * cgroup_refresh_child_subsys_mask - update child_subsys_mask 1059 * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
1023 * @cgrp: the target cgroup 1060 * @cgrp: the target cgroup
1061 * @subtree_control: the new subtree_control mask to consider
1024 * 1062 *
1025 * On the default hierarchy, a subsystem may request other subsystems to be 1063 * On the default hierarchy, a subsystem may request other subsystems to be
1026 * enabled together through its ->depends_on mask. In such cases, more 1064 * enabled together through its ->depends_on mask. In such cases, more
1027 * subsystems than specified in "cgroup.subtree_control" may be enabled. 1065 * subsystems than specified in "cgroup.subtree_control" may be enabled.
1028 * 1066 *
1029 * This function determines which subsystems need to be enabled given the 1067 * This function calculates which subsystems need to be enabled if
1030 * current @cgrp->subtree_control and records it in 1068 * @subtree_control is to be applied to @cgrp. The returned mask is always
1031 * @cgrp->child_subsys_mask. The resulting mask is always a superset of 1069 * a superset of @subtree_control and follows the usual hierarchy rules.
1032 * @cgrp->subtree_control and follows the usual hierarchy rules.
1033 */ 1070 */
1034static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) 1071static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
1072 unsigned int subtree_control)
1035{ 1073{
1036 struct cgroup *parent = cgroup_parent(cgrp); 1074 struct cgroup *parent = cgroup_parent(cgrp);
1037 unsigned int cur_ss_mask = cgrp->subtree_control; 1075 unsigned int cur_ss_mask = subtree_control;
1038 struct cgroup_subsys *ss; 1076 struct cgroup_subsys *ss;
1039 int ssid; 1077 int ssid;
1040 1078
1041 lockdep_assert_held(&cgroup_mutex); 1079 lockdep_assert_held(&cgroup_mutex);
1042 1080
1043 if (!cgroup_on_dfl(cgrp)) { 1081 if (!cgroup_on_dfl(cgrp))
1044 cgrp->child_subsys_mask = cur_ss_mask; 1082 return cur_ss_mask;
1045 return;
1046 }
1047 1083
1048 while (true) { 1084 while (true) {
1049 unsigned int new_ss_mask = cur_ss_mask; 1085 unsigned int new_ss_mask = cur_ss_mask;
@@ -1067,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1067 cur_ss_mask = new_ss_mask; 1103 cur_ss_mask = new_ss_mask;
1068 } 1104 }
1069 1105
1070 cgrp->child_subsys_mask = cur_ss_mask; 1106 return cur_ss_mask;
1107}
1108
1109/**
1110 * cgroup_refresh_child_subsys_mask - update child_subsys_mask
1111 * @cgrp: the target cgroup
1112 *
1113 * Update @cgrp->child_subsys_mask according to the current
1114 * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
1115 */
1116static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1117{
1118 cgrp->child_subsys_mask =
1119 cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
1071} 1120}
1072 1121
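cgroup_calc_child_subsys_mask(), whose loop body sits between these two hunks, is a fixed-point computation: starting from the requested subtree_control mask it keeps OR-ing in the ->depends_on mask of every enabled subsystem until the mask stops growing, so implicit dependencies are pulled in transitively. A self-contained sketch of that closure, with a made-up dependency table and subsystem IDs:

#include <stdio.h>

#define NR_SUBSYS 4

/* Illustrative table only: depends_on[i] is a bitmask of subsystems that
 * must be enabled whenever subsystem i is enabled. */
static const unsigned int depends_on[NR_SUBSYS] = {
	[0] = 0,
	[1] = 1u << 0,		/* 1 pulls in 0 */
	[2] = 1u << 1,		/* 2 pulls in 1, which in turn pulls in 0 */
	[3] = 0,
};

/* Grow @mask until it contains every transitive dependency. */
static unsigned int calc_child_subsys_mask(unsigned int mask)
{
	for (;;) {
		unsigned int new_mask = mask;
		int i;

		for (i = 0; i < NR_SUBSYS; i++)
			if (mask & (1u << i))
				new_mask |= depends_on[i];

		if (new_mask == mask)
			return mask;		/* fixed point reached */
		mask = new_mask;
	}
}

int main(void)
{
	/* Enabling only subsystem 2 transitively enables 1 and 0 as well. */
	printf("0x%x\n", calc_child_subsys_mask(1u << 2));	/* prints 0x7 */
	return 0;
}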
1073/** 1122/**
@@ -1860,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb)
1860 * 1909 *
1861 * And don't kill the default root. 1910 * And don't kill the default root.
1862 */ 1911 */
1863 if (css_has_online_children(&root->cgrp.self) || 1912 if (!list_empty(&root->cgrp.self.children) ||
1864 root == &cgrp_dfl_root) 1913 root == &cgrp_dfl_root)
1865 cgroup_put(&root->cgrp); 1914 cgroup_put(&root->cgrp);
1866 else 1915 else
@@ -2641,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2641 loff_t off) 2690 loff_t off)
2642{ 2691{
2643 unsigned int enable = 0, disable = 0; 2692 unsigned int enable = 0, disable = 0;
2644 unsigned int css_enable, css_disable, old_ctrl, new_ctrl; 2693 unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
2645 struct cgroup *cgrp, *child; 2694 struct cgroup *cgrp, *child;
2646 struct cgroup_subsys *ss; 2695 struct cgroup_subsys *ss;
2647 char *tok; 2696 char *tok;
@@ -2693,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2693 ret = -ENOENT; 2742 ret = -ENOENT;
2694 goto out_unlock; 2743 goto out_unlock;
2695 } 2744 }
2696
2697 /*
2698 * @ss is already enabled through dependency and
2699 * we'll just make it visible. Skip draining.
2700 */
2701 if (cgrp->child_subsys_mask & (1 << ssid))
2702 continue;
2703
2704 /*
2705 * Because css offlining is asynchronous, userland
2706 * might try to re-enable the same controller while
2707 * the previous instance is still around. In such
2708 * cases, wait till it's gone using offline_waitq.
2709 */
2710 cgroup_for_each_live_child(child, cgrp) {
2711 DEFINE_WAIT(wait);
2712
2713 if (!cgroup_css(child, ss))
2714 continue;
2715
2716 cgroup_get(child);
2717 prepare_to_wait(&child->offline_waitq, &wait,
2718 TASK_UNINTERRUPTIBLE);
2719 cgroup_kn_unlock(of->kn);
2720 schedule();
2721 finish_wait(&child->offline_waitq, &wait);
2722 cgroup_put(child);
2723
2724 return restart_syscall();
2725 }
2726 } else if (disable & (1 << ssid)) { 2745 } else if (disable & (1 << ssid)) {
2727 if (!(cgrp->subtree_control & (1 << ssid))) { 2746 if (!(cgrp->subtree_control & (1 << ssid))) {
2728 disable &= ~(1 << ssid); 2747 disable &= ~(1 << ssid);
@@ -2758,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2758 * subsystems than specified may need to be enabled or disabled 2777 * subsystems than specified may need to be enabled or disabled
2759 * depending on subsystem dependencies. 2778 * depending on subsystem dependencies.
2760 */ 2779 */
2761 cgrp->subtree_control |= enable; 2780 old_sc = cgrp->subtree_control;
2762 cgrp->subtree_control &= ~disable; 2781 old_ss = cgrp->child_subsys_mask;
2782 new_sc = (old_sc | enable) & ~disable;
2783 new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
2763 2784
2764 old_ctrl = cgrp->child_subsys_mask; 2785 css_enable = ~old_ss & new_ss;
2765 cgroup_refresh_child_subsys_mask(cgrp); 2786 css_disable = old_ss & ~new_ss;
2766 new_ctrl = cgrp->child_subsys_mask;
2767
2768 css_enable = ~old_ctrl & new_ctrl;
2769 css_disable = old_ctrl & ~new_ctrl;
2770 enable |= css_enable; 2787 enable |= css_enable;
2771 disable |= css_disable; 2788 disable |= css_disable;
2772 2789
2773 /* 2790 /*
2791 * Because css offlining is asynchronous, userland might try to
2792 * re-enable the same controller while the previous instance is
2793 * still around. In such cases, wait till it's gone using
2794 * offline_waitq.
2795 */
2796 for_each_subsys(ss, ssid) {
2797 if (!(css_enable & (1 << ssid)))
2798 continue;
2799
2800 cgroup_for_each_live_child(child, cgrp) {
2801 DEFINE_WAIT(wait);
2802
2803 if (!cgroup_css(child, ss))
2804 continue;
2805
2806 cgroup_get(child);
2807 prepare_to_wait(&child->offline_waitq, &wait,
2808 TASK_UNINTERRUPTIBLE);
2809 cgroup_kn_unlock(of->kn);
2810 schedule();
2811 finish_wait(&child->offline_waitq, &wait);
2812 cgroup_put(child);
2813
2814 return restart_syscall();
2815 }
2816 }
2817
2818 cgrp->subtree_control = new_sc;
2819 cgrp->child_subsys_mask = new_ss;
2820
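The rewritten write handler now works out the prospective subtree_control (new_sc) and child_subsys_mask (new_ss) up front, derives which csses actually need creating or destroying by diffing the old and new masks, and commits both fields only after the offline wait can no longer bounce the operation with restart_syscall(). The mask diff itself is two bit operations, shown standalone below with arbitrary example masks:

#include <stdio.h>

int main(void)
{
	unsigned int old_ss = 0x5;	/* subsystems that have csses today   */
	unsigned int new_ss = 0x6;	/* subsystems that should have them   */

	unsigned int css_enable  = ~old_ss & new_ss;	/* need creating   */
	unsigned int css_disable =  old_ss & ~new_ss;	/* need destroying */

	printf("enable=0x%x disable=0x%x\n", css_enable, css_disable);
	return 0;	/* prints enable=0x2 disable=0x1 */
}

Committing new_sc/new_ss only after the wait loop means a restarted syscall re-derives everything from unmodified state, which is also why err_undo_css further down can simply restore the two saved values.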
2821 /*
2774 * Create new csses or make the existing ones visible. A css is 2822 * Create new csses or make the existing ones visible. A css is
2775 * created invisible if it's being implicitly enabled through 2823 * created invisible if it's being implicitly enabled through
2776 * dependency. An invisible css is made visible when the userland 2824 * dependency. An invisible css is made visible when the userland
@@ -2825,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2825 } 2873 }
2826 } 2874 }
2827 2875
2876 /*
2877 * The effective csses of all the descendants (excluding @cgrp) may
2878 * have changed. Subsystems can optionally subscribe to this event
2879 * by implementing ->css_e_css_changed() which is invoked if any of
2880 * the effective csses seen from the css's cgroup may have changed.
2881 */
2882 for_each_subsys(ss, ssid) {
2883 struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
2884 struct cgroup_subsys_state *css;
2885
2886 if (!ss->css_e_css_changed || !this_css)
2887 continue;
2888
2889 css_for_each_descendant_pre(css, this_css)
2890 if (css != this_css)
2891 ss->css_e_css_changed(css);
2892 }
2893
2828 kernfs_activate(cgrp->kn); 2894 kernfs_activate(cgrp->kn);
2829 ret = 0; 2895 ret = 0;
2830out_unlock: 2896out_unlock:
@@ -2832,9 +2898,8 @@ out_unlock:
2832 return ret ?: nbytes; 2898 return ret ?: nbytes;
2833 2899
2834err_undo_css: 2900err_undo_css:
2835 cgrp->subtree_control &= ~enable; 2901 cgrp->subtree_control = old_sc;
2836 cgrp->subtree_control |= disable; 2902 cgrp->child_subsys_mask = old_ss;
2837 cgroup_refresh_child_subsys_mask(cgrp);
2838 2903
2839 for_each_subsys(ss, ssid) { 2904 for_each_subsys(ss, ssid) {
2840 if (!(enable & (1 << ssid))) 2905 if (!(enable & (1 << ssid)))
@@ -4370,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work)
4370 if (ss) { 4435 if (ss) {
4371 /* css release path */ 4436 /* css release path */
4372 cgroup_idr_remove(&ss->css_idr, css->id); 4437 cgroup_idr_remove(&ss->css_idr, css->id);
4438 if (ss->css_released)
4439 ss->css_released(css);
4373 } else { 4440 } else {
4374 /* cgroup release path */ 4441 /* cgroup release path */
4375 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 4442 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
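The css_released addition here and the css_e_css_changed walk a few hunks earlier follow the same convention for optional per-subsystem hooks: the core checks the function pointer and only calls it if the subsystem opted in, so uninterested subsystems pay nothing. A small standalone sketch of that NULL-checked hook convention; widget/widget_ops are invented names, not cgroup structures:

#include <stdio.h>

struct widget;

/* Optional hooks: any member may be left NULL. */
struct widget_ops {
	void (*released)(struct widget *w);
	void (*config_changed)(struct widget *w);
};

struct widget {
	const char *name;
	const struct widget_ops *ops;
};

static void widget_release(struct widget *w)
{
	/* core teardown work would happen here ... */
	if (w->ops && w->ops->released)
		w->ops->released(w);	/* only owners that opted in are called */
}

static void noisy_released(struct widget *w)
{
	printf("%s released\n", w->name);
}

static const struct widget_ops noisy_ops = { .released = noisy_released };

int main(void)
{
	struct widget quiet = { "quiet", NULL };
	struct widget noisy = { "noisy", &noisy_ops };

	widget_release(&quiet);	/* no hook, nothing printed */
	widget_release(&noisy);	/* prints "noisy released" */
	return 0;
}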
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 90a3d017b90c..5d220234b3ca 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -86,6 +86,16 @@ static struct {
86#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) 86#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
87#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) 87#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
88 88
89static void apply_puts_pending(int max)
90{
91 int delta;
92
93 if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
94 delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
95 cpu_hotplug.refcount -= delta;
96 }
97}
98
89void get_online_cpus(void) 99void get_online_cpus(void)
90{ 100{
91 might_sleep(); 101 might_sleep();
@@ -93,6 +103,7 @@ void get_online_cpus(void)
93 return; 103 return;
94 cpuhp_lock_acquire_read(); 104 cpuhp_lock_acquire_read();
95 mutex_lock(&cpu_hotplug.lock); 105 mutex_lock(&cpu_hotplug.lock);
106 apply_puts_pending(65536);
96 cpu_hotplug.refcount++; 107 cpu_hotplug.refcount++;
97 mutex_unlock(&cpu_hotplug.lock); 108 mutex_unlock(&cpu_hotplug.lock);
98} 109}
@@ -105,6 +116,7 @@ bool try_get_online_cpus(void)
105 if (!mutex_trylock(&cpu_hotplug.lock)) 116 if (!mutex_trylock(&cpu_hotplug.lock))
106 return false; 117 return false;
107 cpuhp_lock_acquire_tryread(); 118 cpuhp_lock_acquire_tryread();
119 apply_puts_pending(65536);
108 cpu_hotplug.refcount++; 120 cpu_hotplug.refcount++;
109 mutex_unlock(&cpu_hotplug.lock); 121 mutex_unlock(&cpu_hotplug.lock);
110 return true; 122 return true;
@@ -161,12 +173,7 @@ void cpu_hotplug_begin(void)
161 cpuhp_lock_acquire(); 173 cpuhp_lock_acquire();
162 for (;;) { 174 for (;;) {
163 mutex_lock(&cpu_hotplug.lock); 175 mutex_lock(&cpu_hotplug.lock);
164 if (atomic_read(&cpu_hotplug.puts_pending)) { 176 apply_puts_pending(1);
165 int delta;
166
167 delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
168 cpu_hotplug.refcount -= delta;
169 }
170 if (likely(!cpu_hotplug.refcount)) 177 if (likely(!cpu_hotplug.refcount))
171 break; 178 break;
172 __set_current_state(TASK_UNINTERRUPTIBLE); 179 __set_current_state(TASK_UNINTERRUPTIBLE);
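apply_puts_pending() centralizes what cpu_hotplug_begin() used to open-code: puts that could not take cpu_hotplug.lock are accumulated in an atomic counter, and whoever does hold the lock folds the whole backlog into the refcount with a single atomic_xchg(). The max argument lets the hot paths skip the drain until the backlog is large (65536) while cpu_hotplug_begin() drains at a threshold of one. A runnable userspace sketch of the same drain, with illustrative names and a plain int standing in for the lock-protected refcount:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int puts_pending;	/* bumped lock-free by deferred "put" callers */
static int refcount;		/* only touched with the (implied) lock held  */

/* Callers that cannot take the lock just defer their put. */
static void put_deferred(void)
{
	atomic_fetch_add(&puts_pending, 1);
}

/* With the lock held: fold deferred puts into the refcount once the
 * backlog reaches @max (use max == 1 to always drain). */
static void apply_puts_pending(int max)
{
	if (atomic_load(&puts_pending) >= max) {
		int delta = atomic_exchange(&puts_pending, 0);
		refcount -= delta;
	}
}

int main(void)
{
	refcount = 3;
	put_deferred();
	put_deferred();

	apply_puts_pending(65536);	/* below threshold: nothing happens */
	printf("refcount=%d pending=%d\n", refcount, atomic_load(&puts_pending));

	apply_puts_pending(1);		/* drain: 3 - 2 = 1 */
	printf("refcount=%d pending=%d\n", refcount, atomic_load(&puts_pending));
	return 0;
}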
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1f107c74087b..64b257f6bca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
248 if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) 248 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
249 249
250/* 250/*
251 * There are two global mutexes guarding cpuset structures - cpuset_mutex 251 * There are two global locks guarding cpuset structures - cpuset_mutex and
252 * and callback_mutex. The latter may nest inside the former. We also 252 * callback_lock. We also require taking task_lock() when dereferencing a
253 * require taking task_lock() when dereferencing a task's cpuset pointer. 253 * task's cpuset pointer. See "The task_lock() exception", at the end of this
254 * See "The task_lock() exception", at the end of this comment. 254 * comment.
255 * 255 *
256 * A task must hold both mutexes to modify cpusets. If a task holds 256 * A task must hold both locks to modify cpusets. If a task holds
257 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it 257 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
258 * is the only task able to also acquire callback_mutex and be able to 258 * is the only task able to also acquire callback_lock and be able to
259 * modify cpusets. It can perform various checks on the cpuset structure 259 * modify cpusets. It can perform various checks on the cpuset structure
260 * first, knowing nothing will change. It can also allocate memory while 260 * first, knowing nothing will change. It can also allocate memory while
261 * just holding cpuset_mutex. While it is performing these checks, various 261 * just holding cpuset_mutex. While it is performing these checks, various
262 * callback routines can briefly acquire callback_mutex to query cpusets. 262 * callback routines can briefly acquire callback_lock to query cpusets.
263 * Once it is ready to make the changes, it takes callback_mutex, blocking 263 * Once it is ready to make the changes, it takes callback_lock, blocking
264 * everyone else. 264 * everyone else.
265 * 265 *
266 * Calls to the kernel memory allocator can not be made while holding 266 * Calls to the kernel memory allocator can not be made while holding
267 * callback_mutex, as that would risk double tripping on callback_mutex 267 * callback_lock, as that would risk double tripping on callback_lock
268 * from one of the callbacks into the cpuset code from within 268 * from one of the callbacks into the cpuset code from within
269 * __alloc_pages(). 269 * __alloc_pages().
270 * 270 *
271 * If a task is only holding callback_mutex, then it has read-only 271 * If a task is only holding callback_lock, then it has read-only
272 * access to cpusets. 272 * access to cpusets.
273 * 273 *
274 * Now, the task_struct fields mems_allowed and mempolicy may be changed 274 * Now, the task_struct fields mems_allowed and mempolicy may be changed
275 * by other task, we use alloc_lock in the task_struct fields to protect 275 * by other task, we use alloc_lock in the task_struct fields to protect
276 * them. 276 * them.
277 * 277 *
278 * The cpuset_common_file_read() handlers only hold callback_mutex across 278 * The cpuset_common_file_read() handlers only hold callback_lock across
279 * small pieces of code, such as when reading out possibly multi-word 279 * small pieces of code, such as when reading out possibly multi-word
280 * cpumasks and nodemasks. 280 * cpumasks and nodemasks.
281 * 281 *
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
284 */ 284 */
285 285
286static DEFINE_MUTEX(cpuset_mutex); 286static DEFINE_MUTEX(cpuset_mutex);
287static DEFINE_MUTEX(callback_mutex); 287static DEFINE_SPINLOCK(callback_lock);
288 288
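Turning callback_mutex into the callback_lock spinlock (taken with the _irq/_irqsave variants in the later hunks) means cpuset queries can be answered from contexts that must not sleep. The discipline spelled out in the rewritten comment is what makes that safe: the lock is only ever held around short copy-in/copy-out of multi-word masks, never around allocation or other slow work. A userspace sketch of that copy-out-then-work pattern, using a C11 atomic_flag as a stand-in spinlock (cb_lock, set_mask and count_bits_in_mask are invented names):

#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

static atomic_flag callback_lock = ATOMIC_FLAG_INIT;

static void cb_lock(void)
{
	while (atomic_flag_test_and_set_explicit(&callback_lock, memory_order_acquire))
		;	/* spin */
}

static void cb_unlock(void)
{
	atomic_flag_clear_explicit(&callback_lock, memory_order_release);
}

/* Shared multi-word state, analogous to a cpumask. */
static unsigned long effective_mask[4] = { 0x1, 0x2, 0x3, 0x4 };

/* Writer: update the mask under the lock, nothing else. */
static void set_mask(const unsigned long new_mask[4])
{
	cb_lock();
	memcpy(effective_mask, new_mask, sizeof(effective_mask));
	cb_unlock();
}

/* Reader: copy the mask out under the lock, then work on the copy. */
static unsigned long count_bits_in_mask(void)
{
	unsigned long copy[4];
	unsigned long bits = 0;

	cb_lock();
	memcpy(copy, effective_mask, sizeof(copy));
	cb_unlock();

	for (int i = 0; i < 4; i++)
		bits += __builtin_popcountl(copy[i]);	/* GCC/Clang builtin */
	return bits;	/* any slow work happens outside the lock */
}

int main(void)
{
	const unsigned long m[4] = { 0xff, 0, 0, 0 };

	printf("%lu\n", count_bits_in_mask());	/* 5 */
	set_mask(m);
	printf("%lu\n", count_bits_in_mask());	/* 8 */
	return 0;
}

Keeping the critical section to a memcpy() is what allows the same lock to be taken with interrupts disabled elsewhere in the patch without hurting latency.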
289/* 289/*
290 * CPU / memory hotplug is handled asynchronously. 290 * CPU / memory hotplug is handled asynchronously.
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
329 * One way or another, we guarantee to return some non-empty subset 329 * One way or another, we guarantee to return some non-empty subset
330 * of cpu_online_mask. 330 * of cpu_online_mask.
331 * 331 *
332 * Call with callback_mutex held. 332 * Call with callback_lock or cpuset_mutex held.
333 */ 333 */
334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) 334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
335{ 335{
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
347 * One way or another, we guarantee to return some non-empty subset 347 * One way or another, we guarantee to return some non-empty subset
348 * of node_states[N_MEMORY]. 348 * of node_states[N_MEMORY].
349 * 349 *
350 * Call with callback_mutex held. 350 * Call with callback_lock or cpuset_mutex held.
351 */ 351 */
352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
353{ 353{
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
359/* 359/*
360 * update task's spread flag if cpuset's page/slab spread flag is set 360 * update task's spread flag if cpuset's page/slab spread flag is set
361 * 361 *
362 * Called with callback_mutex/cpuset_mutex held 362 * Call with callback_lock or cpuset_mutex held.
363 */ 363 */
364static void cpuset_update_task_spread_flag(struct cpuset *cs, 364static void cpuset_update_task_spread_flag(struct cpuset *cs,
365 struct task_struct *tsk) 365 struct task_struct *tsk)
@@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
506 goto out; 506 goto out;
507 } 507 }
508 508
509 /*
510 * We can't shrink if we won't have enough room for SCHED_DEADLINE
511 * tasks.
512 */
513 ret = -EBUSY;
514 if (is_cpu_exclusive(cur) &&
515 !cpuset_cpumask_can_shrink(cur->cpus_allowed,
516 trial->cpus_allowed))
517 goto out;
518
509 ret = 0; 519 ret = 0;
510out: 520out:
511 rcu_read_unlock(); 521 rcu_read_unlock();
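The new validate_change() test refuses to shrink an exclusive cpuset when the smaller CPU set could no longer carry the SCHED_DEADLINE bandwidth already admitted; cpuset_cpumask_can_shrink(), implemented on the scheduler side, performs the real check. Conceptually it reduces to "the total reserved utilization must still fit in the remaining CPUs". A hedged, standalone sketch of that feasibility idea with made-up reservations, not the scheduler's actual bandwidth arithmetic:

#include <stdbool.h>
#include <stdio.h>

/*
 * Reserved utilizations of deadline-style tasks, as fractions of one CPU
 * (e.g. 0.40 means runtime/period = 40%). Purely illustrative numbers.
 */
static const double reserved[] = { 0.40, 0.40, 0.40 };

static bool can_shrink_to(int ncpus)
{
	double total = 0.0;

	for (unsigned i = 0; i < sizeof(reserved) / sizeof(reserved[0]); i++)
		total += reserved[i];

	/* Feasible only if the already-admitted bandwidth still fits. */
	return total <= (double)ncpus;
}

int main(void)
{
	printf("shrink to 2 CPUs: %s\n", can_shrink_to(2) ? "ok" : "refused"); /* ok      */
	printf("shrink to 1 CPU:  %s\n", can_shrink_to(1) ? "ok" : "refused"); /* refused */
	return 0;
}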
@@ -876,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
876 continue; 886 continue;
877 rcu_read_unlock(); 887 rcu_read_unlock();
878 888
879 mutex_lock(&callback_mutex); 889 spin_lock_irq(&callback_lock);
880 cpumask_copy(cp->effective_cpus, new_cpus); 890 cpumask_copy(cp->effective_cpus, new_cpus);
881 mutex_unlock(&callback_mutex); 891 spin_unlock_irq(&callback_lock);
882 892
883 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 893 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
884 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 894 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -943,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
943 if (retval < 0) 953 if (retval < 0)
944 return retval; 954 return retval;
945 955
946 mutex_lock(&callback_mutex); 956 spin_lock_irq(&callback_lock);
947 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
948 mutex_unlock(&callback_mutex); 958 spin_unlock_irq(&callback_lock);
949 959
950 /* use trialcs->cpus_allowed as a temp variable */ 960 /* use trialcs->cpus_allowed as a temp variable */
951 update_cpumasks_hier(cs, trialcs->cpus_allowed); 961 update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1132,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1132 continue; 1142 continue;
1133 rcu_read_unlock(); 1143 rcu_read_unlock();
1134 1144
1135 mutex_lock(&callback_mutex); 1145 spin_lock_irq(&callback_lock);
1136 cp->effective_mems = *new_mems; 1146 cp->effective_mems = *new_mems;
1137 mutex_unlock(&callback_mutex); 1147 spin_unlock_irq(&callback_lock);
1138 1148
1139 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 1149 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
1140 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 1150 !nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1155,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1155 * mempolicies and if the cpuset is marked 'memory_migrate', 1165 * mempolicies and if the cpuset is marked 'memory_migrate',
1156 * migrate the tasks pages to the new memory. 1166 * migrate the tasks pages to the new memory.
1157 * 1167 *
1158 * Call with cpuset_mutex held. May take callback_mutex during call. 1168 * Call with cpuset_mutex held. May take callback_lock during call.
1159 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1169 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1160 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1170 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1161 * their mempolicies to the cpusets new mems_allowed. 1171 * their mempolicies to the cpusets new mems_allowed.
@@ -1202,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1202 if (retval < 0) 1212 if (retval < 0)
1203 goto done; 1213 goto done;
1204 1214
1205 mutex_lock(&callback_mutex); 1215 spin_lock_irq(&callback_lock);
1206 cs->mems_allowed = trialcs->mems_allowed; 1216 cs->mems_allowed = trialcs->mems_allowed;
1207 mutex_unlock(&callback_mutex); 1217 spin_unlock_irq(&callback_lock);
1208 1218
1209 /* use trialcs->mems_allowed as a temp variable */ 1219 /* use trialcs->mems_allowed as a temp variable */
1210 update_nodemasks_hier(cs, &cs->mems_allowed); 1220 update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1295,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1295 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) 1305 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1296 || (is_spread_page(cs) != is_spread_page(trialcs))); 1306 || (is_spread_page(cs) != is_spread_page(trialcs)));
1297 1307
1298 mutex_lock(&callback_mutex); 1308 spin_lock_irq(&callback_lock);
1299 cs->flags = trialcs->flags; 1309 cs->flags = trialcs->flags;
1300 mutex_unlock(&callback_mutex); 1310 spin_unlock_irq(&callback_lock);
1301 1311
1302 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1312 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1303 rebuild_sched_domains_locked(); 1313 rebuild_sched_domains_locked();
@@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1429 goto out_unlock; 1439 goto out_unlock;
1430 1440
1431 cgroup_taskset_for_each(task, tset) { 1441 cgroup_taskset_for_each(task, tset) {
1432 /* 1442 ret = task_can_attach(task, cs->cpus_allowed);
1433 * Kthreads which disallow setaffinity shouldn't be moved 1443 if (ret)
1434 * to a new cpuset; we don't want to change their cpu
1435 * affinity and isolating such threads by their set of
1436 * allowed nodes is unnecessary. Thus, cpusets are not
1437 * applicable for such threads. This prevents checking for
1438 * success of set_cpus_allowed_ptr() on all attached tasks
1439 * before cpus_allowed may be changed.
1440 */
1441 ret = -EINVAL;
1442 if (task->flags & PF_NO_SETAFFINITY)
1443 goto out_unlock; 1444 goto out_unlock;
1444 ret = security_task_setscheduler(task); 1445 ret = security_task_setscheduler(task);
1445 if (ret) 1446 if (ret)
@@ -1713,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1713 count = seq_get_buf(sf, &buf); 1714 count = seq_get_buf(sf, &buf);
1714 s = buf; 1715 s = buf;
1715 1716
1716 mutex_lock(&callback_mutex); 1717 spin_lock_irq(&callback_lock);
1717 1718
1718 switch (type) { 1719 switch (type) {
1719 case FILE_CPULIST: 1720 case FILE_CPULIST:
@@ -1740,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1740 seq_commit(sf, -1); 1741 seq_commit(sf, -1);
1741 } 1742 }
1742out_unlock: 1743out_unlock:
1743 mutex_unlock(&callback_mutex); 1744 spin_unlock_irq(&callback_lock);
1744 return ret; 1745 return ret;
1745} 1746}
1746 1747
@@ -1957,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1957 1958
1958 cpuset_inc(); 1959 cpuset_inc();
1959 1960
1960 mutex_lock(&callback_mutex); 1961 spin_lock_irq(&callback_lock);
1961 if (cgroup_on_dfl(cs->css.cgroup)) { 1962 if (cgroup_on_dfl(cs->css.cgroup)) {
1962 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1963 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1963 cs->effective_mems = parent->effective_mems; 1964 cs->effective_mems = parent->effective_mems;
1964 } 1965 }
1965 mutex_unlock(&callback_mutex); 1966 spin_unlock_irq(&callback_lock);
1966 1967
1967 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1968 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1968 goto out_unlock; 1969 goto out_unlock;
@@ -1989,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1989 } 1990 }
1990 rcu_read_unlock(); 1991 rcu_read_unlock();
1991 1992
1992 mutex_lock(&callback_mutex); 1993 spin_lock_irq(&callback_lock);
1993 cs->mems_allowed = parent->mems_allowed; 1994 cs->mems_allowed = parent->mems_allowed;
1994 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 1995 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1995 mutex_unlock(&callback_mutex); 1996 spin_unlock_irq(&callback_lock);
1996out_unlock: 1997out_unlock:
1997 mutex_unlock(&cpuset_mutex); 1998 mutex_unlock(&cpuset_mutex);
1998 return 0; 1999 return 0;
@@ -2031,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
2031static void cpuset_bind(struct cgroup_subsys_state *root_css) 2032static void cpuset_bind(struct cgroup_subsys_state *root_css)
2032{ 2033{
2033 mutex_lock(&cpuset_mutex); 2034 mutex_lock(&cpuset_mutex);
2034 mutex_lock(&callback_mutex); 2035 spin_lock_irq(&callback_lock);
2035 2036
2036 if (cgroup_on_dfl(root_css->cgroup)) { 2037 if (cgroup_on_dfl(root_css->cgroup)) {
2037 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 2038 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2042,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
2042 top_cpuset.mems_allowed = top_cpuset.effective_mems; 2043 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2043 } 2044 }
2044 2045
2045 mutex_unlock(&callback_mutex); 2046 spin_unlock_irq(&callback_lock);
2046 mutex_unlock(&cpuset_mutex); 2047 mutex_unlock(&cpuset_mutex);
2047} 2048}
2048 2049
@@ -2127,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
2127{ 2128{
2128 bool is_empty; 2129 bool is_empty;
2129 2130
2130 mutex_lock(&callback_mutex); 2131 spin_lock_irq(&callback_lock);
2131 cpumask_copy(cs->cpus_allowed, new_cpus); 2132 cpumask_copy(cs->cpus_allowed, new_cpus);
2132 cpumask_copy(cs->effective_cpus, new_cpus); 2133 cpumask_copy(cs->effective_cpus, new_cpus);
2133 cs->mems_allowed = *new_mems; 2134 cs->mems_allowed = *new_mems;
2134 cs->effective_mems = *new_mems; 2135 cs->effective_mems = *new_mems;
2135 mutex_unlock(&callback_mutex); 2136 spin_unlock_irq(&callback_lock);
2136 2137
2137 /* 2138 /*
2138 * Don't call update_tasks_cpumask() if the cpuset becomes empty, 2139 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2169,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs,
2169 if (nodes_empty(*new_mems)) 2170 if (nodes_empty(*new_mems))
2170 *new_mems = parent_cs(cs)->effective_mems; 2171 *new_mems = parent_cs(cs)->effective_mems;
2171 2172
2172 mutex_lock(&callback_mutex); 2173 spin_lock_irq(&callback_lock);
2173 cpumask_copy(cs->effective_cpus, new_cpus); 2174 cpumask_copy(cs->effective_cpus, new_cpus);
2174 cs->effective_mems = *new_mems; 2175 cs->effective_mems = *new_mems;
2175 mutex_unlock(&callback_mutex); 2176 spin_unlock_irq(&callback_lock);
2176 2177
2177 if (cpus_updated) 2178 if (cpus_updated)
2178 update_tasks_cpumask(cs); 2179 update_tasks_cpumask(cs);
@@ -2258,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2258 2259
2259 /* synchronize cpus_allowed to cpu_active_mask */ 2260 /* synchronize cpus_allowed to cpu_active_mask */
2260 if (cpus_updated) { 2261 if (cpus_updated) {
2261 mutex_lock(&callback_mutex); 2262 spin_lock_irq(&callback_lock);
2262 if (!on_dfl) 2263 if (!on_dfl)
2263 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 2264 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2264 cpumask_copy(top_cpuset.effective_cpus, &new_cpus); 2265 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2265 mutex_unlock(&callback_mutex); 2266 spin_unlock_irq(&callback_lock);
2266 /* we don't mess with cpumasks of tasks in top_cpuset */ 2267 /* we don't mess with cpumasks of tasks in top_cpuset */
2267 } 2268 }
2268 2269
2269 /* synchronize mems_allowed to N_MEMORY */ 2270 /* synchronize mems_allowed to N_MEMORY */
2270 if (mems_updated) { 2271 if (mems_updated) {
2271 mutex_lock(&callback_mutex); 2272 spin_lock_irq(&callback_lock);
2272 if (!on_dfl) 2273 if (!on_dfl)
2273 top_cpuset.mems_allowed = new_mems; 2274 top_cpuset.mems_allowed = new_mems;
2274 top_cpuset.effective_mems = new_mems; 2275 top_cpuset.effective_mems = new_mems;
2275 mutex_unlock(&callback_mutex); 2276 spin_unlock_irq(&callback_lock);
2276 update_tasks_nodemask(&top_cpuset); 2277 update_tasks_nodemask(&top_cpuset);
2277 } 2278 }
2278 2279
@@ -2365,11 +2366,13 @@ void __init cpuset_init_smp(void)
2365 2366
2366void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2367void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2367{ 2368{
2368 mutex_lock(&callback_mutex); 2369 unsigned long flags;
2370
2371 spin_lock_irqsave(&callback_lock, flags);
2369 rcu_read_lock(); 2372 rcu_read_lock();
2370 guarantee_online_cpus(task_cs(tsk), pmask); 2373 guarantee_online_cpus(task_cs(tsk), pmask);
2371 rcu_read_unlock(); 2374 rcu_read_unlock();
2372 mutex_unlock(&callback_mutex); 2375 spin_unlock_irqrestore(&callback_lock, flags);
2373} 2376}
2374 2377
2375void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2378void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2415,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void)
2415nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2418nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2416{ 2419{
2417 nodemask_t mask; 2420 nodemask_t mask;
2421 unsigned long flags;
2418 2422
2419 mutex_lock(&callback_mutex); 2423 spin_lock_irqsave(&callback_lock, flags);
2420 rcu_read_lock(); 2424 rcu_read_lock();
2421 guarantee_online_mems(task_cs(tsk), &mask); 2425 guarantee_online_mems(task_cs(tsk), &mask);
2422 rcu_read_unlock(); 2426 rcu_read_unlock();
2423 mutex_unlock(&callback_mutex); 2427 spin_unlock_irqrestore(&callback_lock, flags);
2424 2428
2425 return mask; 2429 return mask;
2426} 2430}
@@ -2439,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2439/* 2443/*
2440 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or 2444 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
2441 * mem_hardwall ancestor to the specified cpuset. Call holding 2445 * mem_hardwall ancestor to the specified cpuset. Call holding
2442 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2446 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
2443 * (an unusual configuration), then returns the root cpuset. 2447 * (an unusual configuration), then returns the root cpuset.
2444 */ 2448 */
2445static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) 2449static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2450,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2450} 2454}
2451 2455
2452/** 2456/**
2453 * cpuset_node_allowed_softwall - Can we allocate on a memory node? 2457 * cpuset_node_allowed - Can we allocate on a memory node?
2454 * @node: is this an allowed node? 2458 * @node: is this an allowed node?
2455 * @gfp_mask: memory allocation flags 2459 * @gfp_mask: memory allocation flags
2456 * 2460 *
@@ -2462,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2462 * flag, yes. 2466 * flag, yes.
2463 * Otherwise, no. 2467 * Otherwise, no.
2464 * 2468 *
2465 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
2466 * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
2467 * might sleep, and might allow a node from an enclosing cpuset.
2468 *
2469 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
2470 * cpusets, and never sleeps.
2471 *
2472 * The __GFP_THISNODE placement logic is really handled elsewhere, 2469 * The __GFP_THISNODE placement logic is really handled elsewhere,
2473 * by forcibly using a zonelist starting at a specified node, and by 2470 * by forcibly using a zonelist starting at a specified node, and by
2474 * (in get_page_from_freelist()) refusing to consider the zones for 2471 * (in get_page_from_freelist()) refusing to consider the zones for
@@ -2481,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2481 * GFP_KERNEL allocations are not so marked, so can escape to the 2478 * GFP_KERNEL allocations are not so marked, so can escape to the
2482 * nearest enclosing hardwalled ancestor cpuset. 2479 * nearest enclosing hardwalled ancestor cpuset.
2483 * 2480 *
2484 * Scanning up parent cpusets requires callback_mutex. The 2481 * Scanning up parent cpusets requires callback_lock. The
2485 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 2482 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
2486 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the 2483 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
2487 * current tasks mems_allowed came up empty on the first pass over 2484 * current tasks mems_allowed came up empty on the first pass over
2488 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the 2485 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
2489 * cpuset are short of memory, might require taking the callback_mutex 2486 * cpuset are short of memory, might require taking the callback_lock.
2490 * mutex.
2491 * 2487 *
2492 * The first call here from mm/page_alloc:get_page_from_freelist() 2488 * The first call here from mm/page_alloc:get_page_from_freelist()
2493 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, 2489 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2504,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2504 * TIF_MEMDIE - any node ok 2500 * TIF_MEMDIE - any node ok
2505 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok 2501 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
2506 * GFP_USER - only nodes in current tasks mems allowed ok. 2502 * GFP_USER - only nodes in current tasks mems allowed ok.
2507 *
2508 * Rule:
2509 * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
2510 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2511 * the code that might scan up ancestor cpusets and sleep.
2512 */ 2503 */
2513int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2504int __cpuset_node_allowed(int node, gfp_t gfp_mask)
2514{ 2505{
2515 struct cpuset *cs; /* current cpuset ancestors */ 2506 struct cpuset *cs; /* current cpuset ancestors */
2516 int allowed; /* is allocation in zone z allowed? */ 2507 int allowed; /* is allocation in zone z allowed? */
2508 unsigned long flags;
2517 2509
2518 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2510 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2519 return 1; 2511 return 1;
2520 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2521 if (node_isset(node, current->mems_allowed)) 2512 if (node_isset(node, current->mems_allowed))
2522 return 1; 2513 return 1;
2523 /* 2514 /*
@@ -2533,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2533 return 1; 2524 return 1;
2534 2525
2535 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2526 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2536 mutex_lock(&callback_mutex); 2527 spin_lock_irqsave(&callback_lock, flags);
2537 2528
2538 rcu_read_lock(); 2529 rcu_read_lock();
2539 cs = nearest_hardwall_ancestor(task_cs(current)); 2530 cs = nearest_hardwall_ancestor(task_cs(current));
2540 allowed = node_isset(node, cs->mems_allowed); 2531 allowed = node_isset(node, cs->mems_allowed);
2541 rcu_read_unlock(); 2532 rcu_read_unlock();
2542 2533
2543 mutex_unlock(&callback_mutex); 2534 spin_unlock_irqrestore(&callback_lock, flags);
2544 return allowed; 2535 return allowed;
2545} 2536}
2546 2537
2547/*
2548 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
2549 * @node: is this an allowed node?
2550 * @gfp_mask: memory allocation flags
2551 *
2552 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2553 * set, yes, we can always allocate. If node is in our task's mems_allowed,
2554 * yes. If the task has been OOM killed and has access to memory reserves as
2555 * specified by the TIF_MEMDIE flag, yes.
2556 * Otherwise, no.
2557 *
2558 * The __GFP_THISNODE placement logic is really handled elsewhere,
2559 * by forcibly using a zonelist starting at a specified node, and by
2560 * (in get_page_from_freelist()) refusing to consider the zones for
2561 * any node on the zonelist except the first. By the time any such
2562 * calls get to this routine, we should just shut up and say 'yes'.
2563 *
2564 * Unlike the cpuset_node_allowed_softwall() variant, above,
2565 * this variant requires that the node be in the current task's
2566 * mems_allowed or that we're in interrupt. It does not scan up the
2567 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2568 * It never sleeps.
2569 */
2570int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2571{
2572 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2573 return 1;
2574 if (node_isset(node, current->mems_allowed))
2575 return 1;
2576 /*
2577 * Allow tasks that have access to memory reserves because they have
2578 * been OOM killed to get memory anywhere.
2579 */
2580 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2581 return 1;
2582 return 0;
2583}
2584
2585/** 2538/**
2586 * cpuset_mem_spread_node() - On which node to begin search for a file page 2539 * cpuset_mem_spread_node() - On which node to begin search for a file page
2587 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2540 * cpuset_slab_spread_node() - On which node to begin search for a slab page
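With callback_lock now a spinlock, the ancestor scan in __cpuset_node_allowed() no longer sleeps, so the separate hardwall variant removed above is unnecessary and every caller goes through a single predicate. Its body is a short decision ladder: allow in interrupt or for __GFP_THISNODE, allow nodes in the task's own mems_allowed, allow OOM victims, stop at __GFP_HARDWALL, otherwise consult the nearest hardwalled ancestor. A simplified, runnable sketch of that ladder; the flag values and ctx fields below are stand-ins for the kernel's gfp flags and task state:

#include <stdbool.h>
#include <stdio.h>

#define GFP_THISNODE  0x1
#define GFP_HARDWALL  0x2

struct ctx {
	bool in_interrupt;
	bool oom_victim;
	unsigned long mems_allowed;		/* one bit per node                 */
	unsigned long hardwall_ancestor_mems;	/* nearest hardwalled ancestor's set */
};

static bool node_allowed(const struct ctx *c, int node, unsigned flags)
{
	if (c->in_interrupt || (flags & GFP_THISNODE))
		return true;
	if (c->mems_allowed & (1ul << node))
		return true;
	if (c->oom_victim)			/* may dip into reserves anywhere */
		return true;
	if (flags & GFP_HARDWALL)		/* caller insists on the strict set */
		return false;
	/* otherwise fall back to the nearest hardwalled ancestor's nodes */
	return c->hardwall_ancestor_mems & (1ul << node);
}

int main(void)
{
	struct ctx c = {
		.mems_allowed = 1ul << 0,
		.hardwall_ancestor_mems = (1ul << 0) | (1ul << 1),
	};

	printf("%d\n", node_allowed(&c, 1, GFP_HARDWALL));	/* 0: strict set only */
	printf("%d\n", node_allowed(&c, 1, 0));			/* 1: ancestor allows */
	return 0;
}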
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1adf62b39b96..07ce18ca71e0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -27,6 +27,9 @@
27 * version 2. This program is licensed "as is" without any warranty of any 27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied. 28 * kind, whether express or implied.
29 */ 29 */
30
31#define pr_fmt(fmt) "KGDB: " fmt
32
30#include <linux/pid_namespace.h> 33#include <linux/pid_namespace.h>
31#include <linux/clocksource.h> 34#include <linux/clocksource.h>
32#include <linux/serial_core.h> 35#include <linux/serial_core.h>
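Defining pr_fmt before the first #include is what lets the rest of this patch replace open-coded printk(KERN_* "KGDB: ...") calls with bare pr_err()/pr_info()/pr_crit(): the kernel's pr_* macros paste pr_fmt() onto every format string, so the prefix is applied once, in one place. A userspace imitation of the mechanism, where the printf-based pr_* macros are stand-ins for the kernel's:

#include <stdio.h>

/* Must be defined before any pr_*() user, just as at the top of the file. */
#define pr_fmt(fmt) "KGDB: " fmt

/* Minimal stand-ins for the kernel's printk wrappers. */
#define pr_err(fmt, ...)  fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_info("Registered I/O driver %s\n", "kgdboc");
	pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n",
	       0xdeadbeefUL);
	return 0;	/* both lines come out prefixed with "KGDB: " */
}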
@@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr)
196 return err; 199 return err;
197 err = kgdb_arch_remove_breakpoint(&tmp); 200 err = kgdb_arch_remove_breakpoint(&tmp);
198 if (err) 201 if (err)
199 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " 202 pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n",
200 "memory destroyed at: %lx", addr); 203 addr);
201 return err; 204 return err;
202} 205}
203 206
@@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void)
256 error = kgdb_arch_set_breakpoint(&kgdb_break[i]); 259 error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
257 if (error) { 260 if (error) {
258 ret = error; 261 ret = error;
259 printk(KERN_INFO "KGDB: BP install failed: %lx", 262 pr_info("BP install failed: %lx\n",
260 kgdb_break[i].bpt_addr); 263 kgdb_break[i].bpt_addr);
261 continue; 264 continue;
262 } 265 }
263 266
@@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void)
319 continue; 322 continue;
320 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 323 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
321 if (error) { 324 if (error) {
322 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", 325 pr_info("BP remove failed: %lx\n",
323 kgdb_break[i].bpt_addr); 326 kgdb_break[i].bpt_addr);
324 ret = error; 327 ret = error;
325 } 328 }
326 329
@@ -367,7 +370,7 @@ int dbg_remove_all_break(void)
367 goto setundefined; 370 goto setundefined;
368 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 371 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
369 if (error) 372 if (error)
370 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", 373 pr_err("breakpoint remove failed: %lx\n",
371 kgdb_break[i].bpt_addr); 374 kgdb_break[i].bpt_addr);
372setundefined: 375setundefined:
373 kgdb_break[i].state = BP_UNDEFINED; 376 kgdb_break[i].state = BP_UNDEFINED;
@@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait)
400 if (print_wait) { 403 if (print_wait) {
401#ifdef CONFIG_KGDB_KDB 404#ifdef CONFIG_KGDB_KDB
402 if (!dbg_kdb_mode) 405 if (!dbg_kdb_mode)
403 printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); 406 pr_crit("waiting... or $3#33 for KDB\n");
404#else 407#else
405 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); 408 pr_crit("Waiting for remote debugger\n");
406#endif 409#endif
407 } 410 }
408 return 1; 411 return 1;
@@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
430 exception_level = 0; 433 exception_level = 0;
431 kgdb_skipexception(ks->ex_vector, ks->linux_regs); 434 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
432 dbg_activate_sw_breakpoints(); 435 dbg_activate_sw_breakpoints();
433 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", 436 pr_crit("re-enter error: breakpoint removed %lx\n", addr);
434 addr);
435 WARN_ON_ONCE(1); 437 WARN_ON_ONCE(1);
436 438
437 return 1; 439 return 1;
@@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
444 panic("Recursive entry to debugger"); 446 panic("Recursive entry to debugger");
445 } 447 }
446 448
447 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); 449 pr_crit("re-enter exception: ALL breakpoints killed\n");
448#ifdef CONFIG_KGDB_KDB 450#ifdef CONFIG_KGDB_KDB
449 /* Allow kdb to debug itself one level */ 451 /* Allow kdb to debug itself one level */
450 return 0; 452 return 0;
@@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
471 int cpu; 473 int cpu;
472 int trace_on = 0; 474 int trace_on = 0;
473 int online_cpus = num_online_cpus(); 475 int online_cpus = num_online_cpus();
476 u64 time_left;
474 477
475 kgdb_info[ks->cpu].enter_kgdb++; 478 kgdb_info[ks->cpu].enter_kgdb++;
476 kgdb_info[ks->cpu].exception_state |= exception_state; 479 kgdb_info[ks->cpu].exception_state |= exception_state;
@@ -595,9 +598,13 @@ return_normal:
595 /* 598 /*
596 * Wait for the other CPUs to be notified and be waiting for us: 599 * Wait for the other CPUs to be notified and be waiting for us:
597 */ 600 */
598 while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + 601 time_left = loops_per_jiffy * HZ;
599 atomic_read(&slaves_in_kgdb)) != online_cpus) 602 while (kgdb_do_roundup && --time_left &&
603 (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
604 online_cpus)
600 cpu_relax(); 605 cpu_relax();
606 if (!time_left)
607 pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
601 608
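The roundup wait used to spin forever if a secondary CPU never answered; bounding it with a crude iteration budget (loops_per_jiffy * HZ) turns a silent hang into a reported timeout after busy-waiting on the order of a second. The shape of that bounded wait, in a runnable userspace form with a hypothetical all_cpus_rounded_up() predicate and an arbitrary budget:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for "all other CPUs have checked in"; here it never becomes true. */
static bool all_cpus_rounded_up(void)
{
	return false;
}

int main(void)
{
	unsigned long long time_left = 100000000ULL;	/* crude iteration budget */

	while (--time_left && !all_cpus_rounded_up())
		;	/* cpu_relax() equivalent: just burn an iteration */

	if (!time_left)
		fprintf(stderr, "timed out waiting for secondary CPUs\n");
	else
		printf("all CPUs present\n");
	return 0;
}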
602 /* 609 /*
603 * At this point the primary processor is completely 610 * At this point the primary processor is completely
@@ -795,15 +802,15 @@ static struct console kgdbcons = {
795static void sysrq_handle_dbg(int key) 802static void sysrq_handle_dbg(int key)
796{ 803{
797 if (!dbg_io_ops) { 804 if (!dbg_io_ops) {
798 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); 805 pr_crit("ERROR: No KGDB I/O module available\n");
799 return; 806 return;
800 } 807 }
801 if (!kgdb_connected) { 808 if (!kgdb_connected) {
802#ifdef CONFIG_KGDB_KDB 809#ifdef CONFIG_KGDB_KDB
803 if (!dbg_kdb_mode) 810 if (!dbg_kdb_mode)
804 printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); 811 pr_crit("KGDB or $3#33 for KDB\n");
805#else 812#else
806 printk(KERN_CRIT "Entering KGDB\n"); 813 pr_crit("Entering KGDB\n");
807#endif 814#endif
808 } 815 }
809 816
@@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void)
945{ 952{
946 kgdb_break_asap = 0; 953 kgdb_break_asap = 0;
947 954
948 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); 955 pr_crit("Waiting for connection from remote gdb...\n");
949 kgdb_breakpoint(); 956 kgdb_breakpoint();
950} 957}
951 958
@@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
964 if (dbg_io_ops) { 971 if (dbg_io_ops) {
965 spin_unlock(&kgdb_registration_lock); 972 spin_unlock(&kgdb_registration_lock);
966 973
967 printk(KERN_ERR "kgdb: Another I/O driver is already " 974 pr_err("Another I/O driver is already registered with KGDB\n");
968 "registered with KGDB.\n");
969 return -EBUSY; 975 return -EBUSY;
970 } 976 }
971 977
@@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
981 987
982 spin_unlock(&kgdb_registration_lock); 988 spin_unlock(&kgdb_registration_lock);
983 989
984 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", 990 pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
985 new_dbg_io_ops->name);
986 991
987 /* Arm KGDB now. */ 992 /* Arm KGDB now. */
988 kgdb_register_callbacks(); 993 kgdb_register_callbacks();
@@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
1017 1022
1018 spin_unlock(&kgdb_registration_lock); 1023 spin_unlock(&kgdb_registration_lock);
1019 1024
1020 printk(KERN_INFO 1025 pr_info("Unregistered I/O driver %s, debugger disabled\n",
1021 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1022 old_dbg_io_ops->name); 1026 old_dbg_io_ops->name);
1023} 1027}
1024EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); 1028EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index b20d544f20c2..e1dbf4a2c69e 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -531,22 +531,29 @@ void __init kdb_initbptab(void)
531 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) 531 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
532 bp->bp_free = 1; 532 bp->bp_free = 1;
533 533
534 kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", 534 kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
535 "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); 535 "Set/Display breakpoints", 0,
536 kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", 536 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
537 "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); 537 kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
538 "Display breakpoints", 0,
539 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
538 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) 540 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
539 kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", 541 kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
540 "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); 542 "[datar [length]|dataw [length]] Set hw brk", 0,
541 kdb_register_repeat("bc", kdb_bc, "<bpnum>", 543 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
542 "Clear Breakpoint", 0, KDB_REPEAT_NONE); 544 kdb_register_flags("bc", kdb_bc, "<bpnum>",
543 kdb_register_repeat("be", kdb_bc, "<bpnum>", 545 "Clear Breakpoint", 0,
544 "Enable Breakpoint", 0, KDB_REPEAT_NONE); 546 KDB_ENABLE_FLOW_CTRL);
545 kdb_register_repeat("bd", kdb_bc, "<bpnum>", 547 kdb_register_flags("be", kdb_bc, "<bpnum>",
546 "Disable Breakpoint", 0, KDB_REPEAT_NONE); 548 "Enable Breakpoint", 0,
547 549 KDB_ENABLE_FLOW_CTRL);
548 kdb_register_repeat("ss", kdb_ss, "", 550 kdb_register_flags("bd", kdb_bc, "<bpnum>",
549 "Single Step", 1, KDB_REPEAT_NO_ARGS); 551 "Disable Breakpoint", 0,
552 KDB_ENABLE_FLOW_CTRL);
553
554 kdb_register_flags("ss", kdb_ss, "",
555 "Single Step", 1,
556 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
550 /* 557 /*
551 * Architecture dependent initialization. 558 * Architecture dependent initialization.
552 */ 559 */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8859ca34dcfe..15e1a7af5dd0 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks)
129 ks->pass_exception = 1; 129 ks->pass_exception = 1;
130 KDB_FLAG_SET(CATASTROPHIC); 130 KDB_FLAG_SET(CATASTROPHIC);
131 } 131 }
132 /* set CATASTROPHIC if the system contains unresponsive processors */
133 for_each_online_cpu(i)
134 if (!kgdb_info[i].enter_kgdb)
135 KDB_FLAG_SET(CATASTROPHIC);
132 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 136 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
133 KDB_STATE_CLEAR(SSBPT); 137 KDB_STATE_CLEAR(SSBPT);
134 KDB_STATE_CLEAR(DOING_SS); 138 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 379650b984f8..7b40c5f07dce 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -12,6 +12,7 @@
12 */ 12 */
13 13
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/types.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/kernel.h> 17#include <linux/kernel.h>
17#include <linux/kmsg_dump.h> 18#include <linux/kmsg_dump.h>
@@ -23,6 +24,7 @@
23#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
24#include <linux/atomic.h> 25#include <linux/atomic.h>
25#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/moduleparam.h>
26#include <linux/mm.h> 28#include <linux/mm.h>
27#include <linux/init.h> 29#include <linux/init.h>
28#include <linux/kallsyms.h> 30#include <linux/kallsyms.h>
@@ -42,6 +44,12 @@
42#include <linux/slab.h> 44#include <linux/slab.h>
43#include "kdb_private.h" 45#include "kdb_private.h"
44 46
47#undef MODULE_PARAM_PREFIX
48#define MODULE_PARAM_PREFIX "kdb."
49
50static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
51module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
52
45#define GREP_LEN 256 53#define GREP_LEN 256
46char kdb_grep_string[GREP_LEN]; 54char kdb_grep_string[GREP_LEN];
47int kdb_grepping_flag; 55int kdb_grepping_flag;
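Overriding MODULE_PARAM_PREFIX before the module_param_named() call is how the new knob ends up named kdb.cmd_enable instead of carrying the default KBUILD_MODNAME-based prefix. The sketch below is a hypothetical standalone module using the same pattern, not code from this patch; when such code is built into the kernel (the kdb case), the chosen prefix becomes the name used on the kernel command line.

// Hypothetical module illustrating the MODULE_PARAM_PREFIX override.
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

/* Pick our own parameter namespace, as kdb_main.c does with "kdb.". */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "demo."

static int cmd_enabled = 1;
/* When built in, settable at boot as demo.cmd_enable=<n>; the sysfs node
 * uses the name "cmd_enable" with the 0600 permissions given here. */
module_param_named(cmd_enable, cmd_enabled, int, 0600);

static int __init demo_init(void)
{
	pr_info("demo: cmd_enable=%d\n", cmd_enabled);
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");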
@@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = {
121 KDBMSG(BADLENGTH, "Invalid length field"), 129 KDBMSG(BADLENGTH, "Invalid length field"),
122 KDBMSG(NOBP, "No Breakpoint exists"), 130 KDBMSG(NOBP, "No Breakpoint exists"),
123 KDBMSG(BADADDR, "Invalid address"), 131 KDBMSG(BADADDR, "Invalid address"),
132 KDBMSG(NOPERM, "Permission denied"),
124}; 133};
125#undef KDBMSG 134#undef KDBMSG
126 135
@@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu)
188} 197}
189 198
190/* 199/*
200 * Check whether the flags of the current command and the permissions
201 * of the kdb console has allow a command to be run.
202 */
203static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
204 bool no_args)
205{
206 /* permissions comes from userspace so needs massaging slightly */
207 permissions &= KDB_ENABLE_MASK;
208 permissions |= KDB_ENABLE_ALWAYS_SAFE;
209
210 /* some commands change group when launched with no arguments */
211 if (no_args)
212 permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT;
213
214 flags |= KDB_ENABLE_ALL;
215
216 return permissions & flags;
217}
218
219/*
191 * kdbgetenv - This function will return the character string value of 220 * kdbgetenv - This function will return the character string value of
192 * an environment variable. 221 * an environment variable.
193 * Parameters: 222 * Parameters:
@@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
476 kdb_symtab_t symtab; 505 kdb_symtab_t symtab;
477 506
478 /* 507 /*
508 * If the enable flags prohibit both arbitrary memory access
509 * and flow control then there are no reasonable grounds to
510 * provide symbol lookup.
511 */
512 if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL,
513 kdb_cmd_enabled, false))
514 return KDB_NOPERM;
515
516 /*
479 * Process arguments which follow the following syntax: 517 * Process arguments which follow the following syntax:
480 * 518 *
481 * symbol | numeric-address [+/- numeric-offset] 519 * symbol | numeric-address [+/- numeric-offset]
@@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
641 if (!s->count) 679 if (!s->count)
642 s->usable = 0; 680 s->usable = 0;
643 if (s->usable) 681 if (s->usable)
644 kdb_register(s->name, kdb_exec_defcmd, 682 /* macros are always safe because when executed each
645 s->usage, s->help, 0); 683 * internal command re-enters kdb_parse() and is
684 * safety checked individually.
685 */
686 kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
687 s->help, 0,
688 KDB_ENABLE_ALWAYS_SAFE);
646 return 0; 689 return 0;
647 } 690 }
648 if (!s->usable) 691 if (!s->usable)
@@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr)
1003 1046
1004 if (i < kdb_max_commands) { 1047 if (i < kdb_max_commands) {
1005 int result; 1048 int result;
1049
1050 if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
1051 return KDB_NOPERM;
1052
1006 KDB_STATE_SET(CMD); 1053 KDB_STATE_SET(CMD);
1007 result = (*tp->cmd_func)(argc-1, (const char **)argv); 1054 result = (*tp->cmd_func)(argc-1, (const char **)argv);
1008 if (result && ignore_errors && result > KDB_CMD_GO) 1055 if (result && ignore_errors && result > KDB_CMD_GO)
1009 result = 0; 1056 result = 0;
1010 KDB_STATE_CLEAR(CMD); 1057 KDB_STATE_CLEAR(CMD);
1011 switch (tp->cmd_repeat) { 1058
1012 case KDB_REPEAT_NONE: 1059 if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
1013 argc = 0; 1060 return result;
1014 if (argv[0]) 1061
1015 *(argv[0]) = '\0'; 1062 argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
1016 break; 1063 if (argv[argc])
1017 case KDB_REPEAT_NO_ARGS: 1064 *(argv[argc]) = '\0';
1018 argc = 1;
1019 if (argv[1])
1020 *(argv[1]) = '\0';
1021 break;
1022 case KDB_REPEAT_WITH_ARGS:
1023 break;
1024 }
1025 return result; 1065 return result;
1026 } 1066 }
1027 1067
@@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv)
1921 */ 1961 */
1922static int kdb_sr(int argc, const char **argv) 1962static int kdb_sr(int argc, const char **argv)
1923{ 1963{
1964 bool check_mask =
1965 !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false);
1966
1924 if (argc != 1) 1967 if (argc != 1)
1925 return KDB_ARGCOUNT; 1968 return KDB_ARGCOUNT;
1969
1926 kdb_trap_printk++; 1970 kdb_trap_printk++;
1927 __handle_sysrq(*argv[1], false); 1971 __handle_sysrq(*argv[1], check_mask);
1928 kdb_trap_printk--; 1972 kdb_trap_printk--;
1929 1973
1930 return 0; 1974 return 0;
@@ -1979,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv)
1979 kdb_printf("%-20s%8u 0x%p ", mod->name, 2023 kdb_printf("%-20s%8u 0x%p ", mod->name,
1980 mod->core_size, (void *)mod); 2024 mod->core_size, (void *)mod);
1981#ifdef CONFIG_MODULE_UNLOAD 2025#ifdef CONFIG_MODULE_UNLOAD
1982 kdb_printf("%4ld ", module_refcount(mod)); 2026 kdb_printf("%4d ", module_refcount(mod));
1983#endif 2027#endif
1984 if (mod->state == MODULE_STATE_GOING) 2028 if (mod->state == MODULE_STATE_GOING)
1985 kdb_printf(" (Unloading)"); 2029 kdb_printf(" (Unloading)");
@@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void)
2157 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { 2201 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
2158 if (!cpu_online(i)) { 2202 if (!cpu_online(i)) {
2159 state = 'F'; /* cpu is offline */ 2203 state = 'F'; /* cpu is offline */
2204 } else if (!kgdb_info[i].enter_kgdb) {
2205 state = 'D'; /* cpu is online but unresponsive */
2160 } else { 2206 } else {
2161 state = ' '; /* cpu is responding to kdb */ 2207 state = ' '; /* cpu is responding to kdb */
2162 if (kdb_task_state_char(KDB_TSK(i)) == 'I') 2208 if (kdb_task_state_char(KDB_TSK(i)) == 'I')
@@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
2210 /* 2256 /*
2211 * Validate cpunum 2257 * Validate cpunum
2212 */ 2258 */
2213 if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) 2259 if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
2214 return KDB_BADCPUNUM; 2260 return KDB_BADCPUNUM;
2215 2261
2216 dbg_switch_cpu = cpunum; 2262 dbg_switch_cpu = cpunum;
@@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv)
2375 return 0; 2421 return 0;
2376 if (!kt->cmd_name) 2422 if (!kt->cmd_name)
2377 continue; 2423 continue;
2424 if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
2425 continue;
2378 if (strlen(kt->cmd_usage) > 20) 2426 if (strlen(kt->cmd_usage) > 20)
2379 space = "\n "; 2427 space = "\n ";
2380 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, 2428 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
@@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv)
2629} 2677}
2630 2678
2631/* 2679/*
2632 * kdb_register_repeat - This function is used to register a kernel 2680 * kdb_register_flags - This function is used to register a kernel
2633 * debugger command. 2681 * debugger command.
2634 * Inputs: 2682 * Inputs:
2635 * cmd Command name 2683 * cmd Command name
@@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv)
2641 * zero for success, one if a duplicate command. 2689 * zero for success, one if a duplicate command.
2642 */ 2690 */
2643#define kdb_command_extend 50 /* arbitrary */ 2691#define kdb_command_extend 50 /* arbitrary */
2644int kdb_register_repeat(char *cmd, 2692int kdb_register_flags(char *cmd,
2645 kdb_func_t func, 2693 kdb_func_t func,
2646 char *usage, 2694 char *usage,
2647 char *help, 2695 char *help,
2648 short minlen, 2696 short minlen,
2649 kdb_repeat_t repeat) 2697 kdb_cmdflags_t flags)
2650{ 2698{
2651 int i; 2699 int i;
2652 kdbtab_t *kp; 2700 kdbtab_t *kp;
@@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd,
2694 kp->cmd_func = func; 2742 kp->cmd_func = func;
2695 kp->cmd_usage = usage; 2743 kp->cmd_usage = usage;
2696 kp->cmd_help = help; 2744 kp->cmd_help = help;
2697 kp->cmd_flags = 0;
2698 kp->cmd_minlen = minlen; 2745 kp->cmd_minlen = minlen;
2699 kp->cmd_repeat = repeat; 2746 kp->cmd_flags = flags;
2700 2747
2701 return 0; 2748 return 0;
2702} 2749}
2703EXPORT_SYMBOL_GPL(kdb_register_repeat); 2750EXPORT_SYMBOL_GPL(kdb_register_flags);
2704 2751
2705 2752
2706/* 2753/*
2707 * kdb_register - Compatibility register function for commands that do 2754 * kdb_register - Compatibility register function for commands that do
2708 * not need to specify a repeat state. Equivalent to 2755 * not need to specify a repeat state. Equivalent to
2709 * kdb_register_repeat with KDB_REPEAT_NONE. 2756 * kdb_register_flags with flags set to 0.
2710 * Inputs: 2757 * Inputs:
2711 * cmd Command name 2758 * cmd Command name
2712 * func Function to execute the command 2759 * func Function to execute the command
@@ -2721,8 +2768,7 @@ int kdb_register(char *cmd,
2721 char *help, 2768 char *help,
2722 short minlen) 2769 short minlen)
2723{ 2770{
2724 return kdb_register_repeat(cmd, func, usage, help, minlen, 2771 return kdb_register_flags(cmd, func, usage, help, minlen, 0);
2725 KDB_REPEAT_NONE);
2726} 2772}
2727EXPORT_SYMBOL_GPL(kdb_register); 2773EXPORT_SYMBOL_GPL(kdb_register);
2728 2774
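For reference, a minimal sketch of how a caller might use the new registration interface. The command name, handler and flag choice below are hypothetical and only illustrate the kdb_register_flags() signature shown above; the declaration is assumed to be exported via <linux/kdb.h> together with the KDB_ENABLE_* flags.

#include <linux/kdb.h>

/* Hypothetical diagnostic command; kdb_func_t is int (*)(int, const char **). */
static int kdb_hello(int argc, const char **argv)
{
	kdb_printf("hello from kdb\n");
	return 0;
}

static int __init hello_kdb_init(void)
{
	/* Read-only and side-effect free, so mark it always safe to run. */
	return kdb_register_flags("hello", kdb_hello, "",
				  "Print a test message", 0,
				  KDB_ENABLE_ALWAYS_SAFE);
}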
@@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void)
2764 for_each_kdbcmd(kp, i) 2810 for_each_kdbcmd(kp, i)
2765 kp->cmd_name = NULL; 2811 kp->cmd_name = NULL;
2766 2812
2767 kdb_register_repeat("md", kdb_md, "<vaddr>", 2813 kdb_register_flags("md", kdb_md, "<vaddr>",
2768 "Display Memory Contents, also mdWcN, e.g. md8c1", 1, 2814 "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
2769 KDB_REPEAT_NO_ARGS); 2815 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2770 kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", 2816 kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
2771 "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); 2817 "Display Raw Memory", 0,
2772 kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", 2818 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2773 "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); 2819 kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
2774 kdb_register_repeat("mds", kdb_md, "<vaddr>", 2820 "Display Physical Memory", 0,
2775 "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); 2821 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2776 kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", 2822 kdb_register_flags("mds", kdb_md, "<vaddr>",
2777 "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); 2823 "Display Memory Symbolically", 0,
2778 kdb_register_repeat("go", kdb_go, "[<vaddr>]", 2824 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2779 "Continue Execution", 1, KDB_REPEAT_NONE); 2825 kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
2780 kdb_register_repeat("rd", kdb_rd, "", 2826 "Modify Memory Contents", 0,
2781 "Display Registers", 0, KDB_REPEAT_NONE); 2827 KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
2782 kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", 2828 kdb_register_flags("go", kdb_go, "[<vaddr>]",
2783 "Modify Registers", 0, KDB_REPEAT_NONE); 2829 "Continue Execution", 1,
2784 kdb_register_repeat("ef", kdb_ef, "<vaddr>", 2830 KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
2785 "Display exception frame", 0, KDB_REPEAT_NONE); 2831 kdb_register_flags("rd", kdb_rd, "",
2786 kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", 2832 "Display Registers", 0,
2787 "Stack traceback", 1, KDB_REPEAT_NONE); 2833 KDB_ENABLE_REG_READ);
2788 kdb_register_repeat("btp", kdb_bt, "<pid>", 2834 kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
2789 "Display stack for process <pid>", 0, KDB_REPEAT_NONE); 2835 "Modify Registers", 0,
2790 kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", 2836 KDB_ENABLE_REG_WRITE);
2791 "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); 2837 kdb_register_flags("ef", kdb_ef, "<vaddr>",
2792 kdb_register_repeat("btc", kdb_bt, "", 2838 "Display exception frame", 0,
2793 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); 2839 KDB_ENABLE_MEM_READ);
2794 kdb_register_repeat("btt", kdb_bt, "<vaddr>", 2840 kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
2841 "Stack traceback", 1,
2842 KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
2843 kdb_register_flags("btp", kdb_bt, "<pid>",
2844 "Display stack for process <pid>", 0,
2845 KDB_ENABLE_INSPECT);
2846 kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
2847 "Backtrace all processes matching state flag", 0,
2848 KDB_ENABLE_INSPECT);
2849 kdb_register_flags("btc", kdb_bt, "",
2850 "Backtrace current process on each cpu", 0,
2851 KDB_ENABLE_INSPECT);
2852 kdb_register_flags("btt", kdb_bt, "<vaddr>",
2795 "Backtrace process given its struct task address", 0, 2853 "Backtrace process given its struct task address", 0,
2796 KDB_REPEAT_NONE); 2854 KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
2797 kdb_register_repeat("env", kdb_env, "", 2855 kdb_register_flags("env", kdb_env, "",
2798 "Show environment variables", 0, KDB_REPEAT_NONE); 2856 "Show environment variables", 0,
2799 kdb_register_repeat("set", kdb_set, "", 2857 KDB_ENABLE_ALWAYS_SAFE);
2800 "Set environment variables", 0, KDB_REPEAT_NONE); 2858 kdb_register_flags("set", kdb_set, "",
2801 kdb_register_repeat("help", kdb_help, "", 2859 "Set environment variables", 0,
2802 "Display Help Message", 1, KDB_REPEAT_NONE); 2860 KDB_ENABLE_ALWAYS_SAFE);
2803 kdb_register_repeat("?", kdb_help, "", 2861 kdb_register_flags("help", kdb_help, "",
2804 "Display Help Message", 0, KDB_REPEAT_NONE); 2862 "Display Help Message", 1,
2805 kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", 2863 KDB_ENABLE_ALWAYS_SAFE);
2806 "Switch to new cpu", 0, KDB_REPEAT_NONE); 2864 kdb_register_flags("?", kdb_help, "",
2807 kdb_register_repeat("kgdb", kdb_kgdb, "", 2865 "Display Help Message", 0,
2808 "Enter kgdb mode", 0, KDB_REPEAT_NONE); 2866 KDB_ENABLE_ALWAYS_SAFE);
2809 kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", 2867 kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
2810 "Display active task list", 0, KDB_REPEAT_NONE); 2868 "Switch to new cpu", 0,
2811 kdb_register_repeat("pid", kdb_pid, "<pidnum>", 2869 KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
2812 "Switch to another task", 0, KDB_REPEAT_NONE); 2870 kdb_register_flags("kgdb", kdb_kgdb, "",
2813 kdb_register_repeat("reboot", kdb_reboot, "", 2871 "Enter kgdb mode", 0, 0);
2814 "Reboot the machine immediately", 0, KDB_REPEAT_NONE); 2872 kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
2873 "Display active task list", 0,
2874 KDB_ENABLE_INSPECT);
2875 kdb_register_flags("pid", kdb_pid, "<pidnum>",
2876 "Switch to another task", 0,
2877 KDB_ENABLE_INSPECT);
2878 kdb_register_flags("reboot", kdb_reboot, "",
2879 "Reboot the machine immediately", 0,
2880 KDB_ENABLE_REBOOT);
2815#if defined(CONFIG_MODULES) 2881#if defined(CONFIG_MODULES)
2816 kdb_register_repeat("lsmod", kdb_lsmod, "", 2882 kdb_register_flags("lsmod", kdb_lsmod, "",
2817 "List loaded kernel modules", 0, KDB_REPEAT_NONE); 2883 "List loaded kernel modules", 0,
2884 KDB_ENABLE_INSPECT);
2818#endif 2885#endif
2819#if defined(CONFIG_MAGIC_SYSRQ) 2886#if defined(CONFIG_MAGIC_SYSRQ)
2820 kdb_register_repeat("sr", kdb_sr, "<key>", 2887 kdb_register_flags("sr", kdb_sr, "<key>",
2821 "Magic SysRq key", 0, KDB_REPEAT_NONE); 2888 "Magic SysRq key", 0,
2889 KDB_ENABLE_ALWAYS_SAFE);
2822#endif 2890#endif
2823#if defined(CONFIG_PRINTK) 2891#if defined(CONFIG_PRINTK)
2824 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", 2892 kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
2825 "Display syslog buffer", 0, KDB_REPEAT_NONE); 2893 "Display syslog buffer", 0,
2894 KDB_ENABLE_ALWAYS_SAFE);
2826#endif 2895#endif
2827 if (arch_kgdb_ops.enable_nmi) { 2896 if (arch_kgdb_ops.enable_nmi) {
2828 kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", 2897 kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
2829 "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); 2898 "Disable NMI entry to KDB", 0,
2830 } 2899 KDB_ENABLE_ALWAYS_SAFE);
2831 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", 2900 }
2832 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); 2901 kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2833 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", 2902 "Define a set of commands, down to endefcmd", 0,
2834 "Send a signal to a process", 0, KDB_REPEAT_NONE); 2903 KDB_ENABLE_ALWAYS_SAFE);
2835 kdb_register_repeat("summary", kdb_summary, "", 2904 kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
2836 "Summarize the system", 4, KDB_REPEAT_NONE); 2905 "Send a signal to a process", 0,
2837 kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", 2906 KDB_ENABLE_SIGNAL);
2838 "Display per_cpu variables", 3, KDB_REPEAT_NONE); 2907 kdb_register_flags("summary", kdb_summary, "",
2839 kdb_register_repeat("grephelp", kdb_grep_help, "", 2908 "Summarize the system", 4,
2840 "Display help on | grep", 0, KDB_REPEAT_NONE); 2909 KDB_ENABLE_ALWAYS_SAFE);
2910 kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2911 "Display per_cpu variables", 3,
2912 KDB_ENABLE_MEM_READ);
2913 kdb_register_flags("grephelp", kdb_grep_help, "",
2914 "Display help on | grep", 0,
2915 KDB_ENABLE_ALWAYS_SAFE);
2841} 2916}
2842 2917
2843/* Execute any commands defined in kdb_cmds. */ 2918/* Execute any commands defined in kdb_cmds. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 7afd3c8c41d5..eaacd1693954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -172,10 +172,9 @@ typedef struct _kdbtab {
172 kdb_func_t cmd_func; /* Function to execute command */ 172 kdb_func_t cmd_func; /* Function to execute command */
173 char *cmd_usage; /* Usage String for this command */ 173 char *cmd_usage; /* Usage String for this command */
174 char *cmd_help; /* Help message for this command */ 174 char *cmd_help; /* Help message for this command */
175 short cmd_flags; /* Parsing flags */
176 short cmd_minlen; /* Minimum legal # command 175 short cmd_minlen; /* Minimum legal # command
177 * chars required */ 176 * chars required */
178 kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ 177 kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
179} kdbtab_t; 178} kdbtab_t;
180 179
181extern int kdb_bt(int, const char **); /* KDB display back trace */ 180extern int kdb_bt(int, const char **); /* KDB display back trace */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e19d3ebc29c..b4a696c4dc76 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -614,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
614 if (!f.file) 614 if (!f.file)
615 return -EBADF; 615 return -EBADF;
616 616
617 css = css_tryget_online_from_dir(f.file->f_dentry, 617 css = css_tryget_online_from_dir(f.file->f_path.dentry,
618 &perf_event_cgrp_subsys); 618 &perf_event_cgrp_subsys);
619 if (IS_ERR(css)) { 619 if (IS_ERR(css)) {
620 ret = PTR_ERR(css); 620 ret = PTR_ERR(css);
@@ -4461,18 +4461,14 @@ perf_output_sample_regs(struct perf_output_handle *handle,
4461} 4461}
4462 4462
4463static void perf_sample_regs_user(struct perf_regs *regs_user, 4463static void perf_sample_regs_user(struct perf_regs *regs_user,
4464 struct pt_regs *regs) 4464 struct pt_regs *regs,
4465 struct pt_regs *regs_user_copy)
4465{ 4466{
4466 if (!user_mode(regs)) { 4467 if (user_mode(regs)) {
4467 if (current->mm) 4468 regs_user->abi = perf_reg_abi(current);
4468 regs = task_pt_regs(current);
4469 else
4470 regs = NULL;
4471 }
4472
4473 if (regs) {
4474 regs_user->abi = perf_reg_abi(current);
4475 regs_user->regs = regs; 4469 regs_user->regs = regs;
4470 } else if (current->mm) {
4471 perf_get_regs_user(regs_user, regs, regs_user_copy);
4476 } else { 4472 } else {
4477 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; 4473 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4478 regs_user->regs = NULL; 4474 regs_user->regs = NULL;
@@ -4951,7 +4947,8 @@ void perf_prepare_sample(struct perf_event_header *header,
4951 } 4947 }
4952 4948
4953 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) 4949 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
4954 perf_sample_regs_user(&data->regs_user, regs); 4950 perf_sample_regs_user(&data->regs_user, regs,
4951 &data->regs_user_copy);
4955 4952
4956 if (sample_type & PERF_SAMPLE_REGS_USER) { 4953 if (sample_type & PERF_SAMPLE_REGS_USER) {
4957 /* regs dump ABI info */ 4954 /* regs dump ABI info */
@@ -5892,6 +5889,8 @@ end:
5892 rcu_read_unlock(); 5889 rcu_read_unlock();
5893} 5890}
5894 5891
5892DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
5893
5895int perf_swevent_get_recursion_context(void) 5894int perf_swevent_get_recursion_context(void)
5896{ 5895{
5897 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); 5896 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
@@ -5907,21 +5906,30 @@ inline void perf_swevent_put_recursion_context(int rctx)
5907 put_recursion_context(swhash->recursion, rctx); 5906 put_recursion_context(swhash->recursion, rctx);
5908} 5907}
5909 5908
5910void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 5909void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5911{ 5910{
5912 struct perf_sample_data data; 5911 struct perf_sample_data data;
5913 int rctx;
5914 5912
5915 preempt_disable_notrace(); 5913 if (WARN_ON_ONCE(!regs))
5916 rctx = perf_swevent_get_recursion_context();
5917 if (rctx < 0)
5918 return; 5914 return;
5919 5915
5920 perf_sample_data_init(&data, addr, 0); 5916 perf_sample_data_init(&data, addr, 0);
5921
5922 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); 5917 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5918}
5919
5920void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5921{
5922 int rctx;
5923
5924 preempt_disable_notrace();
5925 rctx = perf_swevent_get_recursion_context();
5926 if (unlikely(rctx < 0))
5927 goto fail;
5928
5929 ___perf_sw_event(event_id, nr, regs, addr);
5923 5930
5924 perf_swevent_put_recursion_context(rctx); 5931 perf_swevent_put_recursion_context(rctx);
5932fail:
5925 preempt_enable_notrace(); 5933 preempt_enable_notrace();
5926} 5934}
5927 5935
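A rough sketch of the resulting calling convention (illustrative only; the event id is just an existing software event used as a placeholder):

/* Fast path: for callers that take care of preemption and recursion
 * protection themselves, emit the sample directly. */
___perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, regs, 0);

/* Normal path: the wrapper disables preemption and handles the
 * recursion context before delegating to the inner function. */
__perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, regs, 0);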
@@ -6779,7 +6787,6 @@ skip_type:
6779 __perf_event_init_context(&cpuctx->ctx); 6787 __perf_event_init_context(&cpuctx->ctx);
6780 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 6788 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
6781 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 6789 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6782 cpuctx->ctx.type = cpu_context;
6783 cpuctx->ctx.pmu = pmu; 6790 cpuctx->ctx.pmu = pmu;
6784 6791
6785 __perf_cpu_hrtimer_init(cpuctx, cpu); 6792 __perf_cpu_hrtimer_init(cpuctx, cpu);
@@ -7423,7 +7430,19 @@ SYSCALL_DEFINE5(perf_event_open,
7423 * task or CPU context: 7430 * task or CPU context:
7424 */ 7431 */
7425 if (move_group) { 7432 if (move_group) {
7426 if (group_leader->ctx->type != ctx->type) 7433 /*
7434 * Make sure we're both on the same task, or both
7435 * per-cpu events.
7436 */
7437 if (group_leader->ctx->task != ctx->task)
7438 goto err_context;
7439
7440 /*
7441 * Make sure we're both events for the same CPU;
7442 * grouping events for different CPUs is broken; since
7443 * you can never concurrently schedule them anyhow.
7444 */
7445 if (group_leader->cpu != event->cpu)
7427 goto err_context; 7446 goto err_context;
7428 } else { 7447 } else {
7429 if (group_leader->ctx != ctx) 7448 if (group_leader->ctx != ctx)
@@ -7477,11 +7496,11 @@ SYSCALL_DEFINE5(perf_event_open,
7477 7496
7478 if (move_group) { 7497 if (move_group) {
7479 synchronize_rcu(); 7498 synchronize_rcu();
7480 perf_install_in_context(ctx, group_leader, event->cpu); 7499 perf_install_in_context(ctx, group_leader, group_leader->cpu);
7481 get_ctx(ctx); 7500 get_ctx(ctx);
7482 list_for_each_entry(sibling, &group_leader->sibling_list, 7501 list_for_each_entry(sibling, &group_leader->sibling_list,
7483 group_entry) { 7502 group_entry) {
7484 perf_install_in_context(ctx, sibling, event->cpu); 7503 perf_install_in_context(ctx, sibling, sibling->cpu);
7485 get_ctx(ctx); 7504 get_ctx(ctx);
7486 } 7505 }
7487 } 7506 }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ed8f2cde34c5..cb346f26a22d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
193 } 193 }
194 194
195 flush_cache_page(vma, addr, pte_pfn(*ptep)); 195 flush_cache_page(vma, addr, pte_pfn(*ptep));
196 ptep_clear_flush(vma, addr, ptep); 196 ptep_clear_flush_notify(vma, addr, ptep);
197 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 197 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
198 198
199 page_remove_rmap(page); 199 page_remove_rmap(page);
@@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
724 int more = 0; 724 int more = 0;
725 725
726 again: 726 again:
727 mutex_lock(&mapping->i_mmap_mutex); 727 i_mmap_lock_read(mapping);
728 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 728 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
729 if (!valid_vma(vma, is_register)) 729 if (!valid_vma(vma, is_register))
730 continue; 730 continue;
731 731
732 if (!prev && !more) { 732 if (!prev && !more) {
733 /* 733 /*
734 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through 734 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
735 * reclaim. This is optimistic, no harm done if it fails. 735 * reclaim. This is optimistic, no harm done if it fails.
736 */ 736 */
737 prev = kmalloc(sizeof(struct map_info), 737 prev = kmalloc(sizeof(struct map_info),
@@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
755 info->mm = vma->vm_mm; 755 info->mm = vma->vm_mm;
756 info->vaddr = offset_to_vaddr(vma, offset); 756 info->vaddr = offset_to_vaddr(vma, offset);
757 } 757 }
758 mutex_unlock(&mapping->i_mmap_mutex); 758 i_mmap_unlock_read(mapping);
759 759
760 if (!more) 760 if (!more)
761 goto out; 761 goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index 5d30019ff953..6806c55475ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk)
118 } 118 }
119 119
120 /* 120 /*
121 * Accumulate here the counters for all threads but the group leader 121 * Accumulate here the counters for all threads as they die. We could
122 * as they die, so they can be added into the process-wide totals 122 * skip the group leader because it is the last user of signal_struct,
123 * when those are taken. The group leader stays around as a zombie as 123 * but we want to avoid the race with thread_group_cputime() which can
124 * long as there are other threads. When it gets reaped, the exit.c 124 * see the empty ->thread_head list.
125 * code will add its counts into these totals. We won't ever get here
126 * for the group leader, since it will have been the last reference on
127 * the signal_struct.
128 */ 125 */
129 task_cputime(tsk, &utime, &stime); 126 task_cputime(tsk, &utime, &stime);
130 write_seqlock(&sig->stats_lock); 127 write_seqlock(&sig->stats_lock);
@@ -215,27 +212,6 @@ repeat:
215} 212}
216 213
217/* 214/*
218 * This checks not only the pgrp, but falls back on the pid if no
219 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
220 * without this...
221 *
222 * The caller must hold rcu lock or the tasklist lock.
223 */
224struct pid *session_of_pgrp(struct pid *pgrp)
225{
226 struct task_struct *p;
227 struct pid *sid = NULL;
228
229 p = pid_task(pgrp, PIDTYPE_PGID);
230 if (p == NULL)
231 p = pid_task(pgrp, PIDTYPE_PID);
232 if (p != NULL)
233 sid = task_session(p);
234
235 return sid;
236}
237
238/*
239 * Determine if a process group is "orphaned", according to the POSIX 215 * Determine if a process group is "orphaned", according to the POSIX
240 * definition in 2.2.2.52. Orphaned process groups are not to be affected 216 * definition in 2.2.2.52. Orphaned process groups are not to be affected
241 * by terminal-generated stop signals. Newly orphaned process groups are 217 * by terminal-generated stop signals. Newly orphaned process groups are
@@ -462,6 +438,44 @@ static void exit_mm(struct task_struct *tsk)
462 clear_thread_flag(TIF_MEMDIE); 438 clear_thread_flag(TIF_MEMDIE);
463} 439}
464 440
441static struct task_struct *find_alive_thread(struct task_struct *p)
442{
443 struct task_struct *t;
444
445 for_each_thread(p, t) {
446 if (!(t->flags & PF_EXITING))
447 return t;
448 }
449 return NULL;
450}
451
452static struct task_struct *find_child_reaper(struct task_struct *father)
453 __releases(&tasklist_lock)
454 __acquires(&tasklist_lock)
455{
456 struct pid_namespace *pid_ns = task_active_pid_ns(father);
457 struct task_struct *reaper = pid_ns->child_reaper;
458
459 if (likely(reaper != father))
460 return reaper;
461
462 reaper = find_alive_thread(father);
463 if (reaper) {
464 pid_ns->child_reaper = reaper;
465 return reaper;
466 }
467
468 write_unlock_irq(&tasklist_lock);
469 if (unlikely(pid_ns == &init_pid_ns)) {
470 panic("Attempted to kill init! exitcode=0x%08x\n",
471 father->signal->group_exit_code ?: father->exit_code);
472 }
473 zap_pid_ns_processes(pid_ns);
474 write_lock_irq(&tasklist_lock);
475
476 return father;
477}
478
465/* 479/*
466 * When we die, we re-parent all our children, and try to: 480 * When we die, we re-parent all our children, and try to:
467 * 1. give them to another thread in our thread group, if such a member exists 481 * 1. give them to another thread in our thread group, if such a member exists
@@ -469,58 +483,36 @@ static void exit_mm(struct task_struct *tsk)
469 * child_subreaper for its children (like a service manager) 483 * child_subreaper for its children (like a service manager)
470 * 3. give it to the init process (PID 1) in our pid namespace 484 * 3. give it to the init process (PID 1) in our pid namespace
471 */ 485 */
472static struct task_struct *find_new_reaper(struct task_struct *father) 486static struct task_struct *find_new_reaper(struct task_struct *father,
473 __releases(&tasklist_lock) 487 struct task_struct *child_reaper)
474 __acquires(&tasklist_lock)
475{ 488{
476 struct pid_namespace *pid_ns = task_active_pid_ns(father); 489 struct task_struct *thread, *reaper;
477 struct task_struct *thread;
478 490
479 thread = father; 491 thread = find_alive_thread(father);
480 while_each_thread(father, thread) { 492 if (thread)
481 if (thread->flags & PF_EXITING)
482 continue;
483 if (unlikely(pid_ns->child_reaper == father))
484 pid_ns->child_reaper = thread;
485 return thread; 493 return thread;
486 }
487
488 if (unlikely(pid_ns->child_reaper == father)) {
489 write_unlock_irq(&tasklist_lock);
490 if (unlikely(pid_ns == &init_pid_ns)) {
491 panic("Attempted to kill init! exitcode=0x%08x\n",
492 father->signal->group_exit_code ?:
493 father->exit_code);
494 }
495
496 zap_pid_ns_processes(pid_ns);
497 write_lock_irq(&tasklist_lock);
498 } else if (father->signal->has_child_subreaper) {
499 struct task_struct *reaper;
500 494
495 if (father->signal->has_child_subreaper) {
501 /* 496 /*
502 * Find the first ancestor marked as child_subreaper. 497 * Find the first ->is_child_subreaper ancestor in our pid_ns.
503 * Note that the code below checks same_thread_group(reaper, 498 * We start from father to ensure we can not look into another
504 * pid_ns->child_reaper). This is what we need to DTRT in a 499 * namespace, this is safe because all its threads are dead.
505 * PID namespace. However we still need the check above, see
506 * http://marc.info/?l=linux-kernel&m=131385460420380
507 */ 500 */
508 for (reaper = father->real_parent; 501 for (reaper = father;
509 reaper != &init_task; 502 !same_thread_group(reaper, child_reaper);
510 reaper = reaper->real_parent) { 503 reaper = reaper->real_parent) {
511 if (same_thread_group(reaper, pid_ns->child_reaper)) 504 /* call_usermodehelper() descendants need this check */
505 if (reaper == &init_task)
512 break; 506 break;
513 if (!reaper->signal->is_child_subreaper) 507 if (!reaper->signal->is_child_subreaper)
514 continue; 508 continue;
515 thread = reaper; 509 thread = find_alive_thread(reaper);
516 do { 510 if (thread)
517 if (!(thread->flags & PF_EXITING)) 511 return thread;
518 return reaper;
519 } while_each_thread(reaper, thread);
520 } 512 }
521 } 513 }
522 514
523 return pid_ns->child_reaper; 515 return child_reaper;
524} 516}
525 517
526/* 518/*
@@ -529,15 +521,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
529static void reparent_leader(struct task_struct *father, struct task_struct *p, 521static void reparent_leader(struct task_struct *father, struct task_struct *p,
530 struct list_head *dead) 522 struct list_head *dead)
531{ 523{
532 list_move_tail(&p->sibling, &p->real_parent->children); 524 if (unlikely(p->exit_state == EXIT_DEAD))
533
534 if (p->exit_state == EXIT_DEAD)
535 return;
536 /*
537 * If this is a threaded reparent there is no need to
538 * notify anyone anything has happened.
539 */
540 if (same_thread_group(p->real_parent, father))
541 return; 525 return;
542 526
543 /* We don't want people slaying init. */ 527 /* We don't want people slaying init. */
@@ -548,49 +532,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
548 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 532 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
549 if (do_notify_parent(p, p->exit_signal)) { 533 if (do_notify_parent(p, p->exit_signal)) {
550 p->exit_state = EXIT_DEAD; 534 p->exit_state = EXIT_DEAD;
551 list_move_tail(&p->sibling, dead); 535 list_add(&p->ptrace_entry, dead);
552 } 536 }
553 } 537 }
554 538
555 kill_orphaned_pgrp(p, father); 539 kill_orphaned_pgrp(p, father);
556} 540}
557 541
558static void forget_original_parent(struct task_struct *father) 542/*
543 * This does two things:
544 *
545 * A. Make init inherit all the child processes
546 * B. Check to see if any process groups have become orphaned
547 * as a result of our exiting, and if they have any stopped
548 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
549 */
550static void forget_original_parent(struct task_struct *father,
551 struct list_head *dead)
559{ 552{
560 struct task_struct *p, *n, *reaper; 553 struct task_struct *p, *t, *reaper;
561 LIST_HEAD(dead_children);
562 554
563 write_lock_irq(&tasklist_lock); 555 if (unlikely(!list_empty(&father->ptraced)))
564 /* 556 exit_ptrace(father, dead);
565 * Note that exit_ptrace() and find_new_reaper() might
566 * drop tasklist_lock and reacquire it.
567 */
568 exit_ptrace(father);
569 reaper = find_new_reaper(father);
570 557
571 list_for_each_entry_safe(p, n, &father->children, sibling) { 558 /* Can drop and reacquire tasklist_lock */
572 struct task_struct *t = p; 559 reaper = find_child_reaper(father);
560 if (list_empty(&father->children))
561 return;
573 562
574 do { 563 reaper = find_new_reaper(father, reaper);
564 list_for_each_entry(p, &father->children, sibling) {
565 for_each_thread(p, t) {
575 t->real_parent = reaper; 566 t->real_parent = reaper;
576 if (t->parent == father) { 567 BUG_ON((!t->ptrace) != (t->parent == father));
577 BUG_ON(t->ptrace); 568 if (likely(!t->ptrace))
578 t->parent = t->real_parent; 569 t->parent = t->real_parent;
579 }
580 if (t->pdeath_signal) 570 if (t->pdeath_signal)
581 group_send_sig_info(t->pdeath_signal, 571 group_send_sig_info(t->pdeath_signal,
582 SEND_SIG_NOINFO, t); 572 SEND_SIG_NOINFO, t);
583 } while_each_thread(p, t); 573 }
584 reparent_leader(father, p, &dead_children); 574 /*
585 } 575 * If this is a threaded reparent there is no need to
586 write_unlock_irq(&tasklist_lock); 576 * notify anyone anything has happened.
587 577 */
588 BUG_ON(!list_empty(&father->children)); 578 if (!same_thread_group(reaper, father))
589 579 reparent_leader(father, p, dead);
590 list_for_each_entry_safe(p, n, &dead_children, sibling) {
591 list_del_init(&p->sibling);
592 release_task(p);
593 } 580 }
581 list_splice_tail_init(&father->children, &reaper->children);
594} 582}
595 583
596/* 584/*
@@ -600,18 +588,12 @@ static void forget_original_parent(struct task_struct *father)
600static void exit_notify(struct task_struct *tsk, int group_dead) 588static void exit_notify(struct task_struct *tsk, int group_dead)
601{ 589{
602 bool autoreap; 590 bool autoreap;
603 591 struct task_struct *p, *n;
604 /* 592 LIST_HEAD(dead);
605 * This does two things:
606 *
607 * A. Make init inherit all the child processes
608 * B. Check to see if any process groups have become orphaned
609 * as a result of our exiting, and if they have any stopped
610 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
611 */
612 forget_original_parent(tsk);
613 593
614 write_lock_irq(&tasklist_lock); 594 write_lock_irq(&tasklist_lock);
595 forget_original_parent(tsk, &dead);
596
615 if (group_dead) 597 if (group_dead)
616 kill_orphaned_pgrp(tsk->group_leader, NULL); 598 kill_orphaned_pgrp(tsk->group_leader, NULL);
617 599
@@ -629,15 +611,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
629 } 611 }
630 612
631 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; 613 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
614 if (tsk->exit_state == EXIT_DEAD)
615 list_add(&tsk->ptrace_entry, &dead);
632 616
633 /* mt-exec, de_thread() is waiting for group leader */ 617 /* mt-exec, de_thread() is waiting for group leader */
634 if (unlikely(tsk->signal->notify_count < 0)) 618 if (unlikely(tsk->signal->notify_count < 0))
635 wake_up_process(tsk->signal->group_exit_task); 619 wake_up_process(tsk->signal->group_exit_task);
636 write_unlock_irq(&tasklist_lock); 620 write_unlock_irq(&tasklist_lock);
637 621
638 /* If the process is dead, release it - nobody will wait for it */ 622 list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
639 if (autoreap) 623 list_del_init(&p->ptrace_entry);
640 release_task(tsk); 624 release_task(p);
625 }
641} 626}
642 627
643#ifdef CONFIG_DEBUG_STACK_USAGE 628#ifdef CONFIG_DEBUG_STACK_USAGE
@@ -982,8 +967,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
982 */ 967 */
983static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 968static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
984{ 969{
985 unsigned long state; 970 int state, retval, status;
986 int retval, status, traced;
987 pid_t pid = task_pid_vnr(p); 971 pid_t pid = task_pid_vnr(p);
988 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 972 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
989 struct siginfo __user *infop; 973 struct siginfo __user *infop;
@@ -997,6 +981,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
997 981
998 get_task_struct(p); 982 get_task_struct(p);
999 read_unlock(&tasklist_lock); 983 read_unlock(&tasklist_lock);
984 sched_annotate_sleep();
985
1000 if ((exit_code & 0x7f) == 0) { 986 if ((exit_code & 0x7f) == 0) {
1001 why = CLD_EXITED; 987 why = CLD_EXITED;
1002 status = exit_code >> 8; 988 status = exit_code >> 8;
@@ -1006,21 +992,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1006 } 992 }
1007 return wait_noreap_copyout(wo, p, pid, uid, why, status); 993 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1008 } 994 }
1009
1010 traced = ptrace_reparented(p);
1011 /* 995 /*
1012 * Move the task's state to DEAD/TRACE, only one thread can do this. 996 * Move the task's state to DEAD/TRACE, only one thread can do this.
1013 */ 997 */
1014 state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; 998 state = (ptrace_reparented(p) && thread_group_leader(p)) ?
999 EXIT_TRACE : EXIT_DEAD;
1015 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) 1000 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1016 return 0; 1001 return 0;
1017 /* 1002 /*
1018 * It can be ptraced but not reparented, check 1003 * We own this thread, nobody else can reap it.
1019 * thread_group_leader() to filter out sub-threads. 1004 */
1005 read_unlock(&tasklist_lock);
1006 sched_annotate_sleep();
1007
1008 /*
1009 * Check thread_group_leader() to exclude the traced sub-threads.
1020 */ 1010 */
1021 if (likely(!traced) && thread_group_leader(p)) { 1011 if (state == EXIT_DEAD && thread_group_leader(p)) {
1022 struct signal_struct *psig; 1012 struct signal_struct *sig = p->signal;
1023 struct signal_struct *sig; 1013 struct signal_struct *psig = current->signal;
1024 unsigned long maxrss; 1014 unsigned long maxrss;
1025 cputime_t tgutime, tgstime; 1015 cputime_t tgutime, tgstime;
1026 1016
@@ -1032,21 +1022,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1032 * accumulate in the parent's signal_struct c* fields. 1022 * accumulate in the parent's signal_struct c* fields.
1033 * 1023 *
1034 * We don't bother to take a lock here to protect these 1024 * We don't bother to take a lock here to protect these
1035 * p->signal fields, because they are only touched by 1025 * p->signal fields because the whole thread group is dead
1036 * __exit_signal, which runs with tasklist_lock 1026 * and nobody can change them.
1037 * write-locked anyway, and so is excluded here. We do 1027 *
1038 * need to protect the access to parent->signal fields, 1028 * psig->stats_lock also protects us from our sub-theads
1039 * as other threads in the parent group can be right 1028 * psig->stats_lock also protects us from our sub-threads
1040 * here reaping other children at the same time. 1030 * we change k_getrusage()-like users to rely on this lock
1031 * we have to take ->siglock as well.
1041 * 1032 *
1042 * We use thread_group_cputime_adjusted() to get times for 1033 * We use thread_group_cputime_adjusted() to get times for
1043 * the thread group, which consolidates times for all threads 1034 * the thread group, which consolidates times for all threads
1044 * in the group including the group leader. 1035 * in the group including the group leader.
1045 */ 1036 */
1046 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1037 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1047 spin_lock_irq(&p->real_parent->sighand->siglock); 1038 spin_lock_irq(&current->sighand->siglock);
1048 psig = p->real_parent->signal;
1049 sig = p->signal;
1050 write_seqlock(&psig->stats_lock); 1039 write_seqlock(&psig->stats_lock);
1051 psig->cutime += tgutime + sig->cutime; 1040 psig->cutime += tgutime + sig->cutime;
1052 psig->cstime += tgstime + sig->cstime; 1041 psig->cstime += tgstime + sig->cstime;
@@ -1071,15 +1060,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1071 task_io_accounting_add(&psig->ioac, &p->ioac); 1060 task_io_accounting_add(&psig->ioac, &p->ioac);
1072 task_io_accounting_add(&psig->ioac, &sig->ioac); 1061 task_io_accounting_add(&psig->ioac, &sig->ioac);
1073 write_sequnlock(&psig->stats_lock); 1062 write_sequnlock(&psig->stats_lock);
1074 spin_unlock_irq(&p->real_parent->sighand->siglock); 1063 spin_unlock_irq(&current->sighand->siglock);
1075 } 1064 }
1076 1065
1077 /*
1078 * Now we are sure this task is interesting, and no other
1079 * thread can reap it because we its state == DEAD/TRACE.
1080 */
1081 read_unlock(&tasklist_lock);
1082
1083 retval = wo->wo_rusage 1066 retval = wo->wo_rusage
1084 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1067 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1085 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1068 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
@@ -1210,6 +1193,7 @@ unlock_sig:
1210 pid = task_pid_vnr(p); 1193 pid = task_pid_vnr(p);
1211 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1194 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1212 read_unlock(&tasklist_lock); 1195 read_unlock(&tasklist_lock);
1196 sched_annotate_sleep();
1213 1197
1214 if (unlikely(wo->wo_flags & WNOWAIT)) 1198 if (unlikely(wo->wo_flags & WNOWAIT))
1215 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); 1199 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
@@ -1272,6 +1256,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1272 pid = task_pid_vnr(p); 1256 pid = task_pid_vnr(p);
1273 get_task_struct(p); 1257 get_task_struct(p);
1274 read_unlock(&tasklist_lock); 1258 read_unlock(&tasklist_lock);
1259 sched_annotate_sleep();
1275 1260
1276 if (!wo->wo_info) { 1261 if (!wo->wo_info) {
1277 retval = wo->wo_rusage 1262 retval = wo->wo_rusage
@@ -1302,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1302static int wait_consider_task(struct wait_opts *wo, int ptrace, 1287static int wait_consider_task(struct wait_opts *wo, int ptrace,
1303 struct task_struct *p) 1288 struct task_struct *p)
1304{ 1289{
1290 /*
1291 * We can race with wait_task_zombie() from another thread.
1292 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1293 * can't confuse the checks below.
1294 */
1295 int exit_state = ACCESS_ONCE(p->exit_state);
1305 int ret; 1296 int ret;
1306 1297
1307 if (unlikely(p->exit_state == EXIT_DEAD)) 1298 if (unlikely(exit_state == EXIT_DEAD))
1308 return 0; 1299 return 0;
1309 1300
1310 ret = eligible_child(wo, p); 1301 ret = eligible_child(wo, p);
@@ -1325,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1325 return 0; 1316 return 0;
1326 } 1317 }
1327 1318
1328 if (unlikely(p->exit_state == EXIT_TRACE)) { 1319 if (unlikely(exit_state == EXIT_TRACE)) {
1329 /* 1320 /*
1330 * ptrace == 0 means we are the natural parent. In this case 1321 * ptrace == 0 means we are the natural parent. In this case
1331 * we should clear notask_error, debugger will notify us. 1322 * we should clear notask_error, debugger will notify us.
@@ -1352,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1352 } 1343 }
1353 1344
1354 /* slay zombie? */ 1345 /* slay zombie? */
1355 if (p->exit_state == EXIT_ZOMBIE) { 1346 if (exit_state == EXIT_ZOMBIE) {
1356 /* we don't reap group leaders with subthreads */ 1347 /* we don't reap group leaders with subthreads */
1357 if (!delay_group_leader(p)) { 1348 if (!delay_group_leader(p)) {
1358 /* 1349 /*
diff --git a/kernel/extable.c b/kernel/extable.c
index d8a6446adbcb..c98f926277a8 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,6 +18,7 @@
18#include <linux/ftrace.h> 18#include <linux/ftrace.h>
19#include <linux/memory.h> 19#include <linux/memory.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/ftrace.h>
21#include <linux/mutex.h> 22#include <linux/mutex.h>
22#include <linux/init.h> 23#include <linux/init.h>
23 24
@@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr)
102 return 1; 103 return 1;
103 if (is_module_text_address(addr)) 104 if (is_module_text_address(addr))
104 return 1; 105 return 1;
106 if (is_ftrace_trampoline(addr))
107 return 1;
105 /* 108 /*
106 * There might be init symbols in saved stacktraces. 109 * There might be init symbols in saved stacktraces.
107 * Give those symbols a chance to be printed in 110 * Give those symbols a chance to be printed in
@@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr)
119{ 122{
120 if (core_kernel_text(addr)) 123 if (core_kernel_text(addr))
121 return 1; 124 return 1;
122 return is_module_text_address(addr); 125 if (is_module_text_address(addr))
126 return 1;
127 return is_ftrace_trampoline(addr);
123} 128}
124 129
125/* 130/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b7d746d6d62..4dc2ddade9f1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
433 get_file(file); 433 get_file(file);
434 if (tmp->vm_flags & VM_DENYWRITE) 434 if (tmp->vm_flags & VM_DENYWRITE)
435 atomic_dec(&inode->i_writecount); 435 atomic_dec(&inode->i_writecount);
436 mutex_lock(&mapping->i_mmap_mutex); 436 i_mmap_lock_write(mapping);
437 if (tmp->vm_flags & VM_SHARED) 437 if (tmp->vm_flags & VM_SHARED)
438 atomic_inc(&mapping->i_mmap_writable); 438 atomic_inc(&mapping->i_mmap_writable);
439 flush_dcache_mmap_lock(mapping); 439 flush_dcache_mmap_lock(mapping);
@@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
445 vma_interval_tree_insert_after(tmp, mpnt, 445 vma_interval_tree_insert_after(tmp, mpnt,
446 &mapping->i_mmap); 446 &mapping->i_mmap);
447 flush_dcache_mmap_unlock(mapping); 447 flush_dcache_mmap_unlock(mapping);
448 mutex_unlock(&mapping->i_mmap_mutex); 448 i_mmap_unlock_write(mapping);
449 } 449 }
450 450
451 /* 451 /*
@@ -1022,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
1022{ 1022{
1023 if (atomic_dec_and_test(&sighand->count)) { 1023 if (atomic_dec_and_test(&sighand->count)) {
1024 signalfd_cleanup(sighand); 1024 signalfd_cleanup(sighand);
1025 /*
1026 * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
1027 * without an RCU grace period, see __lock_task_sighand().
1028 */
1025 kmem_cache_free(sighand_cachep, sighand); 1029 kmem_cache_free(sighand_cachep, sighand);
1026 } 1030 }
1027} 1031}
1028 1032
1029
1030/* 1033/*
1031 * Initialize POSIX timer handling for a thread group. 1034 * Initialize POSIX timer handling for a thread group.
1032 */ 1035 */
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3b7408759bdf..c92e44855ddd 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -32,10 +32,13 @@ config GCOV_KERNEL
32 Note that the debugfs filesystem has to be mounted to access 32 Note that the debugfs filesystem has to be mounted to access
33 profiling data. 33 profiling data.
34 34
35config ARCH_HAS_GCOV_PROFILE_ALL
36 def_bool n
37
35config GCOV_PROFILE_ALL 38config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 39 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 40 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 41 depends on ARCH_HAS_GCOV_PROFILE_ALL
39 default n 42 default n
40 ---help--- 43 ---help---
41 This option activates profiling for the entire kernel. 44 This option activates profiling for the entire kernel.
diff --git a/kernel/groups.c b/kernel/groups.c
index 451698f86cfa..664411f171b5 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -6,6 +6,7 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
9#include <linux/user_namespace.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
10 11
11/* init to 2 - one for init_task, one to ensure it is never freed */ 12/* init to 2 - one for init_task, one to ensure it is never freed */
@@ -213,6 +214,14 @@ out:
213 return i; 214 return i;
214} 215}
215 216
217bool may_setgroups(void)
218{
219 struct user_namespace *user_ns = current_user_ns();
220
221 return ns_capable(user_ns, CAP_SETGID) &&
222 userns_may_setgroups(user_ns);
223}
224
216/* 225/*
217 * SMP: Our groups are copy-on-write. We can set them safely 226 * SMP: Our groups are copy-on-write. We can set them safely
218 * without another task interfering. 227 * without another task interfering.
@@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
223 struct group_info *group_info; 232 struct group_info *group_info;
224 int retval; 233 int retval;
225 234
226 if (!ns_capable(current_user_ns(), CAP_SETGID)) 235 if (!may_setgroups())
227 return -EPERM; 236 return -EPERM;
228 if ((unsigned)gidsetsize > NGROUPS_MAX) 237 if ((unsigned)gidsetsize > NGROUPS_MAX)
229 return -EINVAL; 238 return -EINVAL;
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 225086b2652e..9a76e3beda54 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -55,6 +55,21 @@ config GENERIC_IRQ_CHIP
55config IRQ_DOMAIN 55config IRQ_DOMAIN
56 bool 56 bool
57 57
58# Support for hierarchical irq domains
59config IRQ_DOMAIN_HIERARCHY
60 bool
61 select IRQ_DOMAIN
62
63# Generic MSI interrupt support
64config GENERIC_MSI_IRQ
65 bool
66
67# Generic MSI hierarchical interrupt domain support
68config GENERIC_MSI_IRQ_DOMAIN
69 bool
70 select IRQ_DOMAIN_HIERARCHY
71 select GENERIC_MSI_IRQ
72
58config HANDLE_DOMAIN_IRQ 73config HANDLE_DOMAIN_IRQ
59 bool 74 bool
60 75
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index fff17381f0af..d12123526e2b 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
6obj-$(CONFIG_PROC_FS) += proc.o 6obj-$(CONFIG_PROC_FS) += proc.o
7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
8obj-$(CONFIG_PM_SLEEP) += pm.o 8obj-$(CONFIG_PM_SLEEP) += pm.o
9obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index e5202f00cabc..6f1c7a566b95 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/irqdomain.h>
18 19
19#include <trace/events/irq.h> 20#include <trace/events/irq.h>
20 21
@@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend)
178 irq_state_clr_disabled(desc); 179 irq_state_clr_disabled(desc);
179 desc->depth = 0; 180 desc->depth = 0;
180 181
182 irq_domain_activate_irq(&desc->irq_data);
181 if (desc->irq_data.chip->irq_startup) { 183 if (desc->irq_data.chip->irq_startup) {
182 ret = desc->irq_data.chip->irq_startup(&desc->irq_data); 184 ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
183 irq_state_clr_masked(desc); 185 irq_state_clr_masked(desc);
@@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc)
199 desc->irq_data.chip->irq_disable(&desc->irq_data); 201 desc->irq_data.chip->irq_disable(&desc->irq_data);
200 else 202 else
201 desc->irq_data.chip->irq_mask(&desc->irq_data); 203 desc->irq_data.chip->irq_mask(&desc->irq_data);
204 irq_domain_deactivate_irq(&desc->irq_data);
202 irq_state_set_masked(desc); 205 irq_state_set_masked(desc);
203} 206}
204 207
@@ -728,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
728 if (!handle) { 731 if (!handle) {
729 handle = handle_bad_irq; 732 handle = handle_bad_irq;
730 } else { 733 } else {
731 if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) 734 struct irq_data *irq_data = &desc->irq_data;
735#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
736 /*
737 * With hierarchical domains we might run into a
738 * situation where the outermost chip is not yet set
739 * up, but the inner chips are there. Instead of
740 * bailing we install the handler, but obviously we
741 * cannot enable/startup the interrupt at this point.
742 */
743 while (irq_data) {
744 if (irq_data->chip != &no_irq_chip)
745 break;
746 /*
747 * Bail out if the outer chip is not set up
748 * and the interrupt is supposed to be started
749 * right away.
750 */
751 if (WARN_ON(is_chained))
752 goto out;
753 /* Try the parent */
754 irq_data = irq_data->parent_data;
755 }
756#endif
757 if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
732 goto out; 758 goto out;
733 } 759 }
734 760
@@ -847,3 +873,105 @@ void irq_cpu_offline(void)
847 raw_spin_unlock_irqrestore(&desc->lock, flags); 873 raw_spin_unlock_irqrestore(&desc->lock, flags);
848 } 874 }
849} 875}
876
877#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
878/**
879 * irq_chip_ack_parent - Acknowledge the parent interrupt
880 * @data: Pointer to interrupt specific data
881 */
882void irq_chip_ack_parent(struct irq_data *data)
883{
884 data = data->parent_data;
885 data->chip->irq_ack(data);
886}
887
888/**
889 * irq_chip_mask_parent - Mask the parent interrupt
890 * @data: Pointer to interrupt specific data
891 */
892void irq_chip_mask_parent(struct irq_data *data)
893{
894 data = data->parent_data;
895 data->chip->irq_mask(data);
896}
897
898/**
899 * irq_chip_unmask_parent - Unmask the parent interrupt
900 * @data: Pointer to interrupt specific data
901 */
902void irq_chip_unmask_parent(struct irq_data *data)
903{
904 data = data->parent_data;
905 data->chip->irq_unmask(data);
906}
907
908/**
909 * irq_chip_eoi_parent - Invoke EOI on the parent interrupt
910 * @data: Pointer to interrupt specific data
911 */
912void irq_chip_eoi_parent(struct irq_data *data)
913{
914 data = data->parent_data;
915 data->chip->irq_eoi(data);
916}
917
918/**
919 * irq_chip_set_affinity_parent - Set affinity on the parent interrupt
920 * @data: Pointer to interrupt specific data
921 * @dest: The affinity mask to set
922 * @force: Flag to enforce setting (disable online checks)
923 *
924 * Conditional, as the underlying parent chip might not implement it.
925 */
926int irq_chip_set_affinity_parent(struct irq_data *data,
927 const struct cpumask *dest, bool force)
928{
929 data = data->parent_data;
930 if (data->chip->irq_set_affinity)
931 return data->chip->irq_set_affinity(data, dest, force);
932
933 return -ENOSYS;
934}
935
936/**
937 * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
938 * @data: Pointer to interrupt specific data
939 *
940 * Iterate through the domain hierarchy of the interrupt and check
941 * whether a hw retrigger function exists. If yes, invoke it.
942 */
943int irq_chip_retrigger_hierarchy(struct irq_data *data)
944{
945 for (data = data->parent_data; data; data = data->parent_data)
946 if (data->chip && data->chip->irq_retrigger)
947 return data->chip->irq_retrigger(data);
948
949 return -ENOSYS;
950}
951#endif
952
953/**
954 * irq_chip_compose_msi_msg - Compose an MSI message for an irq chip
955 * @data: Pointer to interrupt specific data
956 * @msg: Pointer to the MSI message
957 *
958 * For hierarchical domains we find the first chip in the hierarchy
959 * which implements the irq_compose_msi_msg callback. For non
960 * hierarchical we use the top level chip.
961 */
962int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
963{
964 struct irq_data *pos = NULL;
965
966#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
967 for (; data; data = data->parent_data)
968#endif
969 if (data->chip && data->chip->irq_compose_msi_msg)
970 pos = data;
971 if (!pos)
972 return -ENOSYS;
973
974 pos->chip->irq_compose_msi_msg(pos, msg);
975
976 return 0;
977}
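To show how a chip in a child domain is expected to use the new parent helpers, here is a minimal, hypothetical irq_chip that simply delegates everything to its parent. Only the callback helpers come from this patch; the chip itself and its name are made up.

static struct irq_chip example_child_chip = {
	.name			= "example-child",
	.irq_ack		= irq_chip_ack_parent,
	.irq_mask		= irq_chip_mask_parent,
	.irq_unmask		= irq_chip_unmask_parent,
	.irq_eoi		= irq_chip_eoi_parent,
	.irq_set_affinity	= irq_chip_set_affinity_parent,
	.irq_retrigger		= irq_chip_retrigger_hierarchy,
};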
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index cf80e7b0ddab..61024e8abdef 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
39 u32 mask = d->mask; 39 u32 mask = d->mask;
40 40
41 irq_gc_lock(gc); 41 irq_gc_lock(gc);
42 irq_reg_writel(mask, gc->reg_base + ct->regs.disable); 42 irq_reg_writel(gc, mask, ct->regs.disable);
43 *ct->mask_cache &= ~mask; 43 *ct->mask_cache &= ~mask;
44 irq_gc_unlock(gc); 44 irq_gc_unlock(gc);
45} 45}
@@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d)
59 59
60 irq_gc_lock(gc); 60 irq_gc_lock(gc);
61 *ct->mask_cache |= mask; 61 *ct->mask_cache |= mask;
62 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); 62 irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
63 irq_gc_unlock(gc); 63 irq_gc_unlock(gc);
64} 64}
65EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); 65EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
@@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
79 79
80 irq_gc_lock(gc); 80 irq_gc_lock(gc);
81 *ct->mask_cache &= ~mask; 81 *ct->mask_cache &= ~mask;
82 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); 82 irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
83 irq_gc_unlock(gc); 83 irq_gc_unlock(gc);
84} 84}
85EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); 85EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
@@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
98 u32 mask = d->mask; 98 u32 mask = d->mask;
99 99
100 irq_gc_lock(gc); 100 irq_gc_lock(gc);
101 irq_reg_writel(mask, gc->reg_base + ct->regs.enable); 101 irq_reg_writel(gc, mask, ct->regs.enable);
102 *ct->mask_cache |= mask; 102 *ct->mask_cache |= mask;
103 irq_gc_unlock(gc); 103 irq_gc_unlock(gc);
104} 104}
@@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d)
114 u32 mask = d->mask; 114 u32 mask = d->mask;
115 115
116 irq_gc_lock(gc); 116 irq_gc_lock(gc);
117 irq_reg_writel(mask, gc->reg_base + ct->regs.ack); 117 irq_reg_writel(gc, mask, ct->regs.ack);
118 irq_gc_unlock(gc); 118 irq_gc_unlock(gc);
119} 119}
120EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); 120EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
@@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
130 u32 mask = ~d->mask; 130 u32 mask = ~d->mask;
131 131
132 irq_gc_lock(gc); 132 irq_gc_lock(gc);
133 irq_reg_writel(mask, gc->reg_base + ct->regs.ack); 133 irq_reg_writel(gc, mask, ct->regs.ack);
134 irq_gc_unlock(gc); 134 irq_gc_unlock(gc);
135} 135}
136 136
@@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
145 u32 mask = d->mask; 145 u32 mask = d->mask;
146 146
147 irq_gc_lock(gc); 147 irq_gc_lock(gc);
148 irq_reg_writel(mask, gc->reg_base + ct->regs.mask); 148 irq_reg_writel(gc, mask, ct->regs.mask);
149 irq_reg_writel(mask, gc->reg_base + ct->regs.ack); 149 irq_reg_writel(gc, mask, ct->regs.ack);
150 irq_gc_unlock(gc); 150 irq_gc_unlock(gc);
151} 151}
152 152
@@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d)
161 u32 mask = d->mask; 161 u32 mask = d->mask;
162 162
163 irq_gc_lock(gc); 163 irq_gc_lock(gc);
164 irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); 164 irq_reg_writel(gc, mask, ct->regs.eoi);
165 irq_gc_unlock(gc); 165 irq_gc_unlock(gc);
166} 166}
167 167
@@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
191 return 0; 191 return 0;
192} 192}
193 193
194static u32 irq_readl_be(void __iomem *addr)
195{
196 return ioread32be(addr);
197}
198
199static void irq_writel_be(u32 val, void __iomem *addr)
200{
201 iowrite32be(val, addr);
202}
203
194static void 204static void
195irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, 205irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
196 int num_ct, unsigned int irq_base, 206 int num_ct, unsigned int irq_base,
@@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
245 } 255 }
246 ct[i].mask_cache = mskptr; 256 ct[i].mask_cache = mskptr;
247 if (flags & IRQ_GC_INIT_MASK_CACHE) 257 if (flags & IRQ_GC_INIT_MASK_CACHE)
248 *mskptr = irq_reg_readl(gc->reg_base + mskreg); 258 *mskptr = irq_reg_readl(gc, mskreg);
249 } 259 }
250} 260}
251 261
@@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
300 dgc->gc[i] = gc = tmp; 310 dgc->gc[i] = gc = tmp;
301 irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, 311 irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
302 NULL, handler); 312 NULL, handler);
313
303 gc->domain = d; 314 gc->domain = d;
315 if (gcflags & IRQ_GC_BE_IO) {
316 gc->reg_readl = &irq_readl_be;
317 gc->reg_writel = &irq_writel_be;
318 }
319
304 raw_spin_lock_irqsave(&gc_lock, flags); 320 raw_spin_lock_irqsave(&gc_lock, flags);
305 list_add_tail(&gc->list, &gc_list); 321 list_add_tail(&gc->list, &gc_list);
306 raw_spin_unlock_irqrestore(&gc_lock, flags); 322 raw_spin_unlock_irqrestore(&gc_lock, flags);
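With irq_reg_readl()/irq_reg_writel() now taking the generic chip and a register offset, a driver for a big-endian controller only needs to pass the new IRQ_GC_BE_IO flag when allocating its chips. A hedged sketch, assuming the usual irq_alloc_domain_generic_chips() signature and a previously created domain; the names here are placeholders:

	int ret;

	/* 32 interrupts per chip, one chip type, level flow handler;
	 * register accesses go through the big-endian helpers selected
	 * by IRQ_GC_BE_IO above. */
	ret = irq_alloc_domain_generic_chips(domain, 32, 1, "example-gc",
					     handle_level_irq, 0, 0,
					     IRQ_GC_INIT_MASK_CACHE |
					     IRQ_GC_BE_IO);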
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4332d766619d..df553b0af936 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
78 78
79#ifdef CONFIG_SPARSE_IRQ 79#ifdef CONFIG_SPARSE_IRQ
80static inline void irq_mark_irq(unsigned int irq) { } 80static inline void irq_mark_irq(unsigned int irq) { }
81extern void irq_lock_sparse(void);
82extern void irq_unlock_sparse(void);
81#else 83#else
82extern void irq_mark_irq(unsigned int irq); 84extern void irq_mark_irq(unsigned int irq);
85static inline void irq_lock_sparse(void) { }
86static inline void irq_unlock_sparse(void) { }
83#endif 87#endif
84 88
85extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 89extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a1782f88f0af..99793b9b6d23 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -132,6 +132,16 @@ static void free_masks(struct irq_desc *desc)
132static inline void free_masks(struct irq_desc *desc) { } 132static inline void free_masks(struct irq_desc *desc) { }
133#endif 133#endif
134 134
135void irq_lock_sparse(void)
136{
137 mutex_lock(&sparse_irq_lock);
138}
139
140void irq_unlock_sparse(void)
141{
142 mutex_unlock(&sparse_irq_lock);
143}
144
135static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) 145static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
136{ 146{
137 struct irq_desc *desc; 147 struct irq_desc *desc;
@@ -168,6 +178,12 @@ static void free_desc(unsigned int irq)
168 178
169 unregister_irq_proc(irq, desc); 179 unregister_irq_proc(irq, desc);
170 180
181 /*
182 * sparse_irq_lock protects also show_interrupts() and
183 * kstat_irq_usr(). Once we deleted the descriptor from the
184 * sparse tree we can free it. Access in proc will fail to
185 * lookup the descriptor.
186 */
171 mutex_lock(&sparse_irq_lock); 187 mutex_lock(&sparse_irq_lock);
172 delete_irq_desc(irq); 188 delete_irq_desc(irq);
173 mutex_unlock(&sparse_irq_lock); 189 mutex_unlock(&sparse_irq_lock);
@@ -574,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq)
574 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); 590 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
575} 591}
576 592
593/**
594 * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu
595 * @irq: The interrupt number
596 * @cpu: The cpu number
597 *
598 * Returns the sum of interrupt counts on @cpu since boot for
599 * @irq. The caller must ensure that the interrupt is not removed
600 * concurrently.
601 */
577unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 602unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
578{ 603{
579 struct irq_desc *desc = irq_to_desc(irq); 604 struct irq_desc *desc = irq_to_desc(irq);
@@ -582,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
582 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 607 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
583} 608}
584 609
610/**
611 * kstat_irqs - Get the statistics for an interrupt
612 * @irq: The interrupt number
613 *
614 * Returns the sum of interrupt counts on all cpus since boot for
615 * @irq. The caller must ensure that the interrupt is not removed
616 * concurrently.
617 */
585unsigned int kstat_irqs(unsigned int irq) 618unsigned int kstat_irqs(unsigned int irq)
586{ 619{
587 struct irq_desc *desc = irq_to_desc(irq); 620 struct irq_desc *desc = irq_to_desc(irq);
@@ -594,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq)
594 sum += *per_cpu_ptr(desc->kstat_irqs, cpu); 627 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
595 return sum; 628 return sum;
596} 629}
630
631/**
632 * kstat_irqs_usr - Get the statistics for an interrupt
633 * @irq: The interrupt number
634 *
635 * Returns the sum of interrupt counts on all cpus since boot for
636 * @irq. Contrary to kstat_irqs() this can be called from any
637 * preemptible context. It's protected against concurrent removal of
638 * an interrupt descriptor when sparse irqs are enabled.
639 */
640unsigned int kstat_irqs_usr(unsigned int irq)
641{
642 int sum;
643
644 irq_lock_sparse();
645 sum = kstat_irqs(irq);
646 irq_unlock_sparse();
647 return sum;
648}
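
With kstat_irqs_usr() available to the rest of the kernel (its declaration lands in <linux/kernel_stat.h> elsewhere in the series), readers outside the irq core no longer need to know about sparse_irq_lock. A minimal sketch of a proc/debugfs style consumer; the foo_ name and the seq_file plumbing are illustrative only:

#include <linux/interrupt.h>
#include <linux/irqnr.h>
#include <linux/kernel_stat.h>
#include <linux/seq_file.h>

static int foo_irq_summary_show(struct seq_file *m, void *v)
{
	int irq;

	for (irq = 0; irq < nr_irqs; irq++) {
		/* Safe from preemptible context; locking is handled internally */
		unsigned int count = kstat_irqs_usr(irq);

		if (count)
			seq_printf(m, "irq %d: %u\n", irq, count);
	}
	return 0;
}
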
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 6534ff6ce02e..7fac311057b8 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex);
23static DEFINE_MUTEX(revmap_trees_mutex); 23static DEFINE_MUTEX(revmap_trees_mutex);
24static struct irq_domain *irq_default_domain; 24static struct irq_domain *irq_default_domain;
25 25
26static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
27 irq_hw_number_t hwirq, int node);
28static void irq_domain_check_hierarchy(struct irq_domain *domain);
29
26/** 30/**
27 * __irq_domain_add() - Allocate a new irq_domain data structure 31 * __irq_domain_add() - Allocate a new irq_domain data structure
28 * @of_node: optional device-tree node of the interrupt controller 32 * @of_node: optional device-tree node of the interrupt controller
@@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain;
30 * @hwirq_max: Maximum number of interrupts supported by controller 34 * @hwirq_max: Maximum number of interrupts supported by controller
31 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no 35 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
32 * direct mapping 36 * direct mapping
33 * @ops: map/unmap domain callbacks 37 * @ops: domain callbacks
34 * @host_data: Controller private data pointer 38 * @host_data: Controller private data pointer
35 * 39 *
 36 * Allocates and initializes an irq_domain structure. 40 * Allocates and initializes an irq_domain structure.
@@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
56 domain->hwirq_max = hwirq_max; 60 domain->hwirq_max = hwirq_max;
57 domain->revmap_size = size; 61 domain->revmap_size = size;
58 domain->revmap_direct_max_irq = direct_max; 62 domain->revmap_direct_max_irq = direct_max;
63 irq_domain_check_hierarchy(domain);
59 64
60 mutex_lock(&irq_domain_mutex); 65 mutex_lock(&irq_domain_mutex);
61 list_add(&domain->link, &irq_domain_list); 66 list_add(&domain->link, &irq_domain_list);
@@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove);
109 * @first_irq: first number of irq block assigned to the domain, 114 * @first_irq: first number of irq block assigned to the domain,
110 * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then 115 * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
111 * pre-map all of the irqs in the domain to virqs starting at first_irq. 116 * pre-map all of the irqs in the domain to virqs starting at first_irq.
112 * @ops: map/unmap domain callbacks 117 * @ops: domain callbacks
113 * @host_data: Controller private data pointer 118 * @host_data: Controller private data pointer
114 * 119 *
115 * Allocates an irq_domain, and optionally if first_irq is positive then also 120 * Allocates an irq_domain, and optionally if first_irq is positive then also
@@ -174,10 +179,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
174 179
175 domain = __irq_domain_add(of_node, first_hwirq + size, 180 domain = __irq_domain_add(of_node, first_hwirq + size,
176 first_hwirq + size, 0, ops, host_data); 181 first_hwirq + size, 0, ops, host_data);
177 if (!domain) 182 if (domain)
178 return NULL; 183 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
179
180 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
181 184
182 return domain; 185 return domain;
183} 186}
@@ -388,7 +391,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
388unsigned int irq_create_mapping(struct irq_domain *domain, 391unsigned int irq_create_mapping(struct irq_domain *domain,
389 irq_hw_number_t hwirq) 392 irq_hw_number_t hwirq)
390{ 393{
391 unsigned int hint;
392 int virq; 394 int virq;
393 395
394 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); 396 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
@@ -410,12 +412,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
410 } 412 }
411 413
412 /* Allocate a virtual interrupt number */ 414 /* Allocate a virtual interrupt number */
413 hint = hwirq % nr_irqs; 415 virq = irq_domain_alloc_descs(-1, 1, hwirq,
414 if (hint == 0) 416 of_node_to_nid(domain->of_node));
415 hint++;
416 virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
417 if (virq <= 0)
418 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
419 if (virq <= 0) { 417 if (virq <= 0) {
420 pr_debug("-> virq allocation failed\n"); 418 pr_debug("-> virq allocation failed\n");
421 return 0; 419 return 0;
@@ -471,7 +469,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
471 struct irq_domain *domain; 469 struct irq_domain *domain;
472 irq_hw_number_t hwirq; 470 irq_hw_number_t hwirq;
473 unsigned int type = IRQ_TYPE_NONE; 471 unsigned int type = IRQ_TYPE_NONE;
474 unsigned int virq; 472 int virq;
475 473
476 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; 474 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
477 if (!domain) { 475 if (!domain) {
@@ -489,10 +487,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
489 return 0; 487 return 0;
490 } 488 }
491 489
492 /* Create mapping */ 490 if (irq_domain_is_hierarchy(domain)) {
493 virq = irq_create_mapping(domain, hwirq); 491 /*
494 if (!virq) 492 * If we've already configured this interrupt,
495 return virq; 493 * don't do it again, or hell will break loose.
494 */
495 virq = irq_find_mapping(domain, hwirq);
496 if (virq)
497 return virq;
498
499 virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data);
500 if (virq <= 0)
501 return 0;
502 } else {
503 /* Create mapping */
504 virq = irq_create_mapping(domain, hwirq);
505 if (!virq)
506 return virq;
507 }
496 508
497 /* Set type if specified and different than the current one */ 509 /* Set type if specified and different than the current one */
498 if (type != IRQ_TYPE_NONE && 510 if (type != IRQ_TYPE_NONE &&
@@ -540,8 +552,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
540 return 0; 552 return 0;
541 553
542 if (hwirq < domain->revmap_direct_max_irq) { 554 if (hwirq < domain->revmap_direct_max_irq) {
543 data = irq_get_irq_data(hwirq); 555 data = irq_domain_get_irq_data(domain, hwirq);
544 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 556 if (data && data->hwirq == hwirq)
545 return hwirq; 557 return hwirq;
546 } 558 }
547 559
@@ -709,3 +721,518 @@ const struct irq_domain_ops irq_domain_simple_ops = {
709 .xlate = irq_domain_xlate_onetwocell, 721 .xlate = irq_domain_xlate_onetwocell,
710}; 722};
711EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 723EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
724
725static int irq_domain_alloc_descs(int virq, unsigned int cnt,
726 irq_hw_number_t hwirq, int node)
727{
728 unsigned int hint;
729
730 if (virq >= 0) {
731 virq = irq_alloc_descs(virq, virq, cnt, node);
732 } else {
733 hint = hwirq % nr_irqs;
734 if (hint == 0)
735 hint++;
736 virq = irq_alloc_descs_from(hint, cnt, node);
737 if (virq <= 0 && hint > 1)
738 virq = irq_alloc_descs_from(1, cnt, node);
739 }
740
741 return virq;
742}
743
744#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
745/**
746 * irq_domain_add_hierarchy - Add an irq domain into the hierarchy
747 * @parent: Parent irq domain to associate with the new domain
748 * @flags: Irq domain flags associated to the domain
749 * @size: Size of the domain. See below
750 * @node: Optional device-tree node of the interrupt controller
751 * @ops: Pointer to the interrupt domain callbacks
752 * @host_data: Controller private data pointer
753 *
754 * If @size is 0 a tree domain is created, otherwise a linear domain.
755 *
756 * If successful, the parent is associated with the new domain and the
757 * domain flags are set.
758 * Returns pointer to IRQ domain, or NULL on failure.
759 */
760struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent,
761 unsigned int flags,
762 unsigned int size,
763 struct device_node *node,
764 const struct irq_domain_ops *ops,
765 void *host_data)
766{
767 struct irq_domain *domain;
768
769 if (size)
770 domain = irq_domain_add_linear(node, size, ops, host_data);
771 else
772 domain = irq_domain_add_tree(node, ops, host_data);
773 if (domain) {
774 domain->parent = parent;
775 domain->flags |= flags;
776 }
777
778 return domain;
779}
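
Together with irq_domain_set_hwirq_and_chip() and irq_domain_alloc_irqs_parent() further down, this is roughly how a stacked irqchip plugs into the hierarchy. The sketch below is not part of the patch; the foo_ names are placeholders, the hwirq translation is left out, and irq_chip_mask_parent()/irq_chip_unmask_parent() are the parent-forwarding helpers added to chip.c in the same series. The call pattern mirrors the msi.c code added later in this series:

#include <linux/irq.h>
#include <linux/irqdomain.h>

static struct irq_chip foo_chip = {
	.name		= "foo",
	.irq_mask	= irq_chip_mask_parent,		/* forward to the parent chip */
	.irq_unmask	= irq_chip_unmask_parent,
};

static int foo_domain_alloc(struct irq_domain *domain, unsigned int virq,
			    unsigned int nr_irqs, void *arg)
{
	irq_hw_number_t hwirq = 0;	/* placeholder: derive from *arg in a real driver */
	unsigned int i;
	int ret;

	/* Let the parent domain allocate its resources first */
	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_irqs; i++)
		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
					      &foo_chip, NULL);
	return 0;
}

static const struct irq_domain_ops foo_domain_ops = {
	.alloc	= foo_domain_alloc,
	.free	= irq_domain_free_irqs_common,
};

static struct irq_domain *foo_add_domain(struct irq_domain *parent,
					 struct device_node *node)
{
	/* size == 0: use the radix tree revmap; no extra flags */
	return irq_domain_add_hierarchy(parent, 0, 0, node,
					&foo_domain_ops, NULL);
}
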
780
781static void irq_domain_insert_irq(int virq)
782{
783 struct irq_data *data;
784
785 for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
786 struct irq_domain *domain = data->domain;
787 irq_hw_number_t hwirq = data->hwirq;
788
789 if (hwirq < domain->revmap_size) {
790 domain->linear_revmap[hwirq] = virq;
791 } else {
792 mutex_lock(&revmap_trees_mutex);
793 radix_tree_insert(&domain->revmap_tree, hwirq, data);
794 mutex_unlock(&revmap_trees_mutex);
795 }
796
797 /* If not already assigned, give the domain the chip's name */
798 if (!domain->name && data->chip)
799 domain->name = data->chip->name;
800 }
801
802 irq_clear_status_flags(virq, IRQ_NOREQUEST);
803}
804
805static void irq_domain_remove_irq(int virq)
806{
807 struct irq_data *data;
808
809 irq_set_status_flags(virq, IRQ_NOREQUEST);
810 irq_set_chip_and_handler(virq, NULL, NULL);
811 synchronize_irq(virq);
812 smp_mb();
813
814 for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
815 struct irq_domain *domain = data->domain;
816 irq_hw_number_t hwirq = data->hwirq;
817
818 if (hwirq < domain->revmap_size) {
819 domain->linear_revmap[hwirq] = 0;
820 } else {
821 mutex_lock(&revmap_trees_mutex);
822 radix_tree_delete(&domain->revmap_tree, hwirq);
823 mutex_unlock(&revmap_trees_mutex);
824 }
825 }
826}
827
828static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
829 struct irq_data *child)
830{
831 struct irq_data *irq_data;
832
833 irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
834 if (irq_data) {
835 child->parent_data = irq_data;
836 irq_data->irq = child->irq;
837 irq_data->node = child->node;
838 irq_data->domain = domain;
839 }
840
841 return irq_data;
842}
843
844static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
845{
846 struct irq_data *irq_data, *tmp;
847 int i;
848
849 for (i = 0; i < nr_irqs; i++) {
850 irq_data = irq_get_irq_data(virq + i);
851 tmp = irq_data->parent_data;
852 irq_data->parent_data = NULL;
853 irq_data->domain = NULL;
854
855 while (tmp) {
856 irq_data = tmp;
857 tmp = tmp->parent_data;
858 kfree(irq_data);
859 }
860 }
861}
862
863static int irq_domain_alloc_irq_data(struct irq_domain *domain,
864 unsigned int virq, unsigned int nr_irqs)
865{
866 struct irq_data *irq_data;
867 struct irq_domain *parent;
868 int i;
869
870 /* The outermost irq_data is embedded in struct irq_desc */
871 for (i = 0; i < nr_irqs; i++) {
872 irq_data = irq_get_irq_data(virq + i);
873 irq_data->domain = domain;
874
875 for (parent = domain->parent; parent; parent = parent->parent) {
876 irq_data = irq_domain_insert_irq_data(parent, irq_data);
877 if (!irq_data) {
878 irq_domain_free_irq_data(virq, i + 1);
879 return -ENOMEM;
880 }
881 }
882 }
883
884 return 0;
885}
886
887/**
888 * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
889 * @domain: domain to match
890 * @virq: IRQ number to get irq_data
891 */
892struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
893 unsigned int virq)
894{
895 struct irq_data *irq_data;
896
897 for (irq_data = irq_get_irq_data(virq); irq_data;
898 irq_data = irq_data->parent_data)
899 if (irq_data->domain == domain)
900 return irq_data;
901
902 return NULL;
903}
904
905/**
906 * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain
907 * @domain: Interrupt domain to match
908 * @virq: IRQ number
909 * @hwirq: The hwirq number
910 * @chip: The associated interrupt chip
911 * @chip_data: The associated chip data
912 */
913int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
914 irq_hw_number_t hwirq, struct irq_chip *chip,
915 void *chip_data)
916{
917 struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
918
919 if (!irq_data)
920 return -ENOENT;
921
922 irq_data->hwirq = hwirq;
923 irq_data->chip = chip ? chip : &no_irq_chip;
924 irq_data->chip_data = chip_data;
925
926 return 0;
927}
928
929/**
930 * irq_domain_set_info - Set the complete data for a @virq in @domain
931 * @domain: Interrupt domain to match
932 * @virq: IRQ number
933 * @hwirq: The hardware interrupt number
934 * @chip: The associated interrupt chip
935 * @chip_data: The associated interrupt chip data
936 * @handler: The interrupt flow handler
937 * @handler_data: The interrupt flow handler data
938 * @handler_name: The interrupt handler name
939 */
940void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
941 irq_hw_number_t hwirq, struct irq_chip *chip,
942 void *chip_data, irq_flow_handler_t handler,
943 void *handler_data, const char *handler_name)
944{
945 irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data);
946 __irq_set_handler(virq, handler, 0, handler_name);
947 irq_set_handler_data(virq, handler_data);
948}
949
950/**
951 * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
952 * @irq_data: The pointer to irq_data
953 */
954void irq_domain_reset_irq_data(struct irq_data *irq_data)
955{
956 irq_data->hwirq = 0;
957 irq_data->chip = &no_irq_chip;
958 irq_data->chip_data = NULL;
959}
960
961/**
962 * irq_domain_free_irqs_common - Clear irq_data and free the parent
963 * @domain: Interrupt domain to match
964 * @virq: IRQ number to start with
965 * @nr_irqs: The number of irqs to free
966 */
967void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq,
968 unsigned int nr_irqs)
969{
970 struct irq_data *irq_data;
971 int i;
972
973 for (i = 0; i < nr_irqs; i++) {
974 irq_data = irq_domain_get_irq_data(domain, virq + i);
975 if (irq_data)
976 irq_domain_reset_irq_data(irq_data);
977 }
978 irq_domain_free_irqs_parent(domain, virq, nr_irqs);
979}
980
981/**
982 * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent
983 * @domain: Interrupt domain to match
984 * @virq: IRQ number to start with
985 * @nr_irqs: The number of irqs to free
986 */
987void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
988 unsigned int nr_irqs)
989{
990 int i;
991
992 for (i = 0; i < nr_irqs; i++) {
993 irq_set_handler_data(virq + i, NULL);
994 irq_set_handler(virq + i, NULL);
995 }
996 irq_domain_free_irqs_common(domain, virq, nr_irqs);
997}
998
999static bool irq_domain_is_auto_recursive(struct irq_domain *domain)
1000{
1001 return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE;
1002}
1003
1004static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
1005 unsigned int irq_base,
1006 unsigned int nr_irqs)
1007{
1008 domain->ops->free(domain, irq_base, nr_irqs);
1009 if (irq_domain_is_auto_recursive(domain)) {
1010 BUG_ON(!domain->parent);
1011 irq_domain_free_irqs_recursive(domain->parent, irq_base,
1012 nr_irqs);
1013 }
1014}
1015
1016static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
1017 unsigned int irq_base,
1018 unsigned int nr_irqs, void *arg)
1019{
1020 int ret = 0;
1021 struct irq_domain *parent = domain->parent;
1022 bool recursive = irq_domain_is_auto_recursive(domain);
1023
1024 BUG_ON(recursive && !parent);
1025 if (recursive)
1026 ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
1027 nr_irqs, arg);
1028 if (ret >= 0)
1029 ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
1030 if (ret < 0 && recursive)
1031 irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
1032
1033 return ret;
1034}
1035
1036/**
1037 * __irq_domain_alloc_irqs - Allocate IRQs from domain
1038 * @domain: domain to allocate from
1039 * @irq_base: allocate specified IRQ number if irq_base >= 0
1040 * @nr_irqs: number of IRQs to allocate
1041 * @node: NUMA node id for memory allocation
1042 * @arg: domain specific argument
1043 * @realloc: IRQ descriptors have already been allocated if true
1044 *
1045 * Allocate IRQ numbers and initialize all data structures to support
1046 * hierarchical IRQ domains.
1047 * Parameter @realloc is mainly to support legacy IRQs.
1048 * Returns an error code or the allocated IRQ number.
1049 *
1050 * The whole process of setting up an IRQ has been split into two steps.
1051 * The first step, __irq_domain_alloc_irqs(), allocates the IRQ
1052 * descriptors and the required hardware resources. The second step,
1053 * irq_domain_activate_irq(), programs the hardware with the preallocated
1054 * resources. In this way, it's easier to roll back when the resource
1055 * allocation fails.
1056 */
1057int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
1058 unsigned int nr_irqs, int node, void *arg,
1059 bool realloc)
1060{
1061 int i, ret, virq;
1062
1063 if (domain == NULL) {
1064 domain = irq_default_domain;
1065 if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
1066 return -EINVAL;
1067 }
1068
1069 if (!domain->ops->alloc) {
1070 pr_debug("domain->ops->alloc() is NULL\n");
1071 return -ENOSYS;
1072 }
1073
1074 if (realloc && irq_base >= 0) {
1075 virq = irq_base;
1076 } else {
1077 virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
1078 if (virq < 0) {
1079 pr_debug("cannot allocate IRQ(base %d, count %d)\n",
1080 irq_base, nr_irqs);
1081 return virq;
1082 }
1083 }
1084
1085 if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) {
1086 pr_debug("cannot allocate memory for IRQ%d\n", virq);
1087 ret = -ENOMEM;
1088 goto out_free_desc;
1089 }
1090
1091 mutex_lock(&irq_domain_mutex);
1092 ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg);
1093 if (ret < 0) {
1094 mutex_unlock(&irq_domain_mutex);
1095 goto out_free_irq_data;
1096 }
1097 for (i = 0; i < nr_irqs; i++)
1098 irq_domain_insert_irq(virq + i);
1099 mutex_unlock(&irq_domain_mutex);
1100
1101 return virq;
1102
1103out_free_irq_data:
1104 irq_domain_free_irq_data(virq, nr_irqs);
1105out_free_desc:
1106 irq_free_descs(virq, nr_irqs);
1107 return ret;
1108}
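
To make the two-step split concrete, here is a minimal sketch of a caller on top of a hierarchical domain (not from this patch; the allocation argument and error handling are schematic). In the tree, the activate step is normally driven by arch or bus code when the interrupt is started up, rather than by individual drivers:

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/numa.h>

static int foo_setup_one_irq(struct irq_domain *domain, void *alloc_arg)
{
	int virq;

	/* Step 1: allocate the descriptor and the hardware resources */
	virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, alloc_arg);
	if (virq <= 0)
		return virq ? virq : -ENOSPC;

	/* Step 2: program the interrupt controllers along the hierarchy */
	irq_domain_activate_irq(irq_get_irq_data(virq));

	return virq;
}
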
1109
1110/**
1111 * irq_domain_free_irqs - Free IRQ number and associated data structures
1112 * @virq: base IRQ number
1113 * @nr_irqs: number of IRQs to free
1114 */
1115void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
1116{
1117 struct irq_data *data = irq_get_irq_data(virq);
1118 int i;
1119
1120 if (WARN(!data || !data->domain || !data->domain->ops->free,
1121 "NULL pointer, cannot free irq\n"))
1122 return;
1123
1124 mutex_lock(&irq_domain_mutex);
1125 for (i = 0; i < nr_irqs; i++)
1126 irq_domain_remove_irq(virq + i);
1127 irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs);
1128 mutex_unlock(&irq_domain_mutex);
1129
1130 irq_domain_free_irq_data(virq, nr_irqs);
1131 irq_free_descs(virq, nr_irqs);
1132}
1133
1134/**
1135 * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
1136 * @irq_base: Base IRQ number
1137 * @nr_irqs: Number of IRQs to allocate
1138 * @arg: Allocation data (arch/domain specific)
1139 *
1140 * Check whether the domain has been set up recursively. If not, allocate
1141 * through the parent domain.
1142 */
1143int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
1144 unsigned int irq_base, unsigned int nr_irqs,
1145 void *arg)
1146{
1147 /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */
1148 if (irq_domain_is_auto_recursive(domain))
1149 return 0;
1150
1151 domain = domain->parent;
1152 if (domain)
1153 return irq_domain_alloc_irqs_recursive(domain, irq_base,
1154 nr_irqs, arg);
1155 return -ENOSYS;
1156}
1157
1158/**
1159 * irq_domain_free_irqs_parent - Free interrupts from parent domain
1160 * @irq_base: Base IRQ number
1161 * @nr_irqs: Number of IRQs to free
1162 *
1163 * Check whether the domain has been set up recursively. If not, free
1164 * through the parent domain.
1165 */
1166void irq_domain_free_irqs_parent(struct irq_domain *domain,
1167 unsigned int irq_base, unsigned int nr_irqs)
1168{
1169 /* irq_domain_free_irqs_recursive() will call parent's free */
1170 if (!irq_domain_is_auto_recursive(domain) && domain->parent)
1171 irq_domain_free_irqs_recursive(domain->parent, irq_base,
1172 nr_irqs);
1173}
1174
1175/**
1176 * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
1177 * interrupt
1178 * @irq_data: outermost irq_data associated with interrupt
1179 *
1180 * This is the second step to call domain_ops->activate to program interrupt
1181 * controllers, so the interrupt could actually get delivered.
1182 */
1183void irq_domain_activate_irq(struct irq_data *irq_data)
1184{
1185 if (irq_data && irq_data->domain) {
1186 struct irq_domain *domain = irq_data->domain;
1187
1188 if (irq_data->parent_data)
1189 irq_domain_activate_irq(irq_data->parent_data);
1190 if (domain->ops->activate)
1191 domain->ops->activate(domain, irq_data);
1192 }
1193}
1194
1195/**
1196 * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to
1197 * deactivate interrupt
1198 * @irq_data: outermost irq_data associated with interrupt
1199 *
1200 * It calls domain_ops->deactivate to program interrupt controllers to disable
1201 * interrupt delivery.
1202 */
1203void irq_domain_deactivate_irq(struct irq_data *irq_data)
1204{
1205 if (irq_data && irq_data->domain) {
1206 struct irq_domain *domain = irq_data->domain;
1207
1208 if (domain->ops->deactivate)
1209 domain->ops->deactivate(domain, irq_data);
1210 if (irq_data->parent_data)
1211 irq_domain_deactivate_irq(irq_data->parent_data);
1212 }
1213}
1214
1215static void irq_domain_check_hierarchy(struct irq_domain *domain)
1216{
1217 /* Hierarchy irq_domains must implement callback alloc() */
1218 if (domain->ops->alloc)
1219 domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
1220}
1221#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
1222/**
1223 * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
1224 * @domain: domain to match
1225 * @virq: IRQ number to get irq_data
1226 */
1227struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
1228 unsigned int virq)
1229{
1230 struct irq_data *irq_data = irq_get_irq_data(virq);
1231
1232 return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
1233}
1234
1235static void irq_domain_check_hierarchy(struct irq_domain *domain)
1236{
1237}
1238#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0a9104b4608b..80692373abd6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -183,6 +183,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
183 ret = chip->irq_set_affinity(data, mask, force); 183 ret = chip->irq_set_affinity(data, mask, force);
184 switch (ret) { 184 switch (ret) {
185 case IRQ_SET_MASK_OK: 185 case IRQ_SET_MASK_OK:
186 case IRQ_SET_MASK_OK_DONE:
186 cpumask_copy(data->affinity, mask); 187 cpumask_copy(data->affinity, mask);
187 case IRQ_SET_MASK_OK_NOCOPY: 188 case IRQ_SET_MASK_OK_NOCOPY:
188 irq_set_thread_affinity(desc); 189 irq_set_thread_affinity(desc);
@@ -600,6 +601,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
600 601
601 switch (ret) { 602 switch (ret) {
602 case IRQ_SET_MASK_OK: 603 case IRQ_SET_MASK_OK:
604 case IRQ_SET_MASK_OK_DONE:
603 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); 605 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
604 irqd_set(&desc->irq_data, flags); 606 irqd_set(&desc->irq_data, flags);
605 607
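
The core treats the new IRQ_SET_MASK_OK_DONE return value like IRQ_SET_MASK_OK (the affinity mask is copied), while signalling to stacked callers such as msi_domain_set_affinity() below that the parent chip has already done all the work and no further programming, e.g. rewriting an MSI message, is needed. A hypothetical parent-chip callback returning it (placeholder names, not from this patch):

#include <linux/cpumask.h>
#include <linux/irq.h>

static void foo_program_route(irq_hw_number_t hwirq, unsigned int cpu)
{
	/* placeholder: write the routing registers for @hwirq here */
}

static int foo_parent_set_affinity(struct irq_data *d,
				   const struct cpumask *mask, bool force)
{
	foo_program_route(d->hwirq, cpumask_first(mask));
	/* Everything is done here; children must not touch the hardware again */
	return IRQ_SET_MASK_OK_DONE;
}
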
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
new file mode 100644
index 000000000000..3e18163f336f
--- /dev/null
+++ b/kernel/irq/msi.c
@@ -0,0 +1,330 @@
1/*
2 * linux/kernel/irq/msi.c
3 *
4 * Copyright (C) 2014 Intel Corp.
5 * Author: Jiang Liu <jiang.liu@linux.intel.com>
6 *
7 * This file is licensed under GPLv2.
8 *
9 * This file contains common code to support Message Signalled Interrupts for
10 * PCI-compatible and non-PCI-compatible devices.
11 */
12#include <linux/types.h>
13#include <linux/device.h>
14#include <linux/irq.h>
15#include <linux/irqdomain.h>
16#include <linux/msi.h>
17
18/* Temporary solution for building, will be removed later */
19#include <linux/pci.h>
20
21void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
22{
23 *msg = entry->msg;
24}
25
26void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
27{
28 struct msi_desc *entry = irq_get_msi_desc(irq);
29
30 __get_cached_msi_msg(entry, msg);
31}
32EXPORT_SYMBOL_GPL(get_cached_msi_msg);
33
34#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
35static inline void irq_chip_write_msi_msg(struct irq_data *data,
36 struct msi_msg *msg)
37{
38 data->chip->irq_write_msi_msg(data, msg);
39}
40
41/**
42 * msi_domain_set_affinity - Generic affinity setter function for MSI domains
43 * @irq_data: The irq data associated to the interrupt
44 * @mask: The affinity mask to set
45 * @force: Flag to enforce setting (disable online checks)
46 *
47 * Intended to be used by MSI interrupt controllers which are
48 * implemented with hierarchical domains.
49 */
50int msi_domain_set_affinity(struct irq_data *irq_data,
51 const struct cpumask *mask, bool force)
52{
53 struct irq_data *parent = irq_data->parent_data;
54 struct msi_msg msg;
55 int ret;
56
57 ret = parent->chip->irq_set_affinity(parent, mask, force);
58 if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
59 BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
60 irq_chip_write_msi_msg(irq_data, &msg);
61 }
62
63 return ret;
64}
65
66static void msi_domain_activate(struct irq_domain *domain,
67 struct irq_data *irq_data)
68{
69 struct msi_msg msg;
70
71 BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
72 irq_chip_write_msi_msg(irq_data, &msg);
73}
74
75static void msi_domain_deactivate(struct irq_domain *domain,
76 struct irq_data *irq_data)
77{
78 struct msi_msg msg;
79
80 memset(&msg, 0, sizeof(msg));
81 irq_chip_write_msi_msg(irq_data, &msg);
82}
83
84static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
85 unsigned int nr_irqs, void *arg)
86{
87 struct msi_domain_info *info = domain->host_data;
88 struct msi_domain_ops *ops = info->ops;
89 irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
90 int i, ret;
91
92 if (irq_find_mapping(domain, hwirq) > 0)
93 return -EEXIST;
94
95 ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
96 if (ret < 0)
97 return ret;
98
99 for (i = 0; i < nr_irqs; i++) {
100 ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
101 if (ret < 0) {
102 if (ops->msi_free) {
103 for (i--; i > 0; i--)
104 ops->msi_free(domain, info, virq + i);
105 }
106 irq_domain_free_irqs_top(domain, virq, nr_irqs);
107 return ret;
108 }
109 }
110
111 return 0;
112}
113
114static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
115 unsigned int nr_irqs)
116{
117 struct msi_domain_info *info = domain->host_data;
118 int i;
119
120 if (info->ops->msi_free) {
121 for (i = 0; i < nr_irqs; i++)
122 info->ops->msi_free(domain, info, virq + i);
123 }
124 irq_domain_free_irqs_top(domain, virq, nr_irqs);
125}
126
127static struct irq_domain_ops msi_domain_ops = {
128 .alloc = msi_domain_alloc,
129 .free = msi_domain_free,
130 .activate = msi_domain_activate,
131 .deactivate = msi_domain_deactivate,
132};
133
134#ifdef GENERIC_MSI_DOMAIN_OPS
135static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
136 msi_alloc_info_t *arg)
137{
138 return arg->hwirq;
139}
140
141static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
142 int nvec, msi_alloc_info_t *arg)
143{
144 memset(arg, 0, sizeof(*arg));
145 return 0;
146}
147
148static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
149 struct msi_desc *desc)
150{
151 arg->desc = desc;
152}
153#else
154#define msi_domain_ops_get_hwirq NULL
155#define msi_domain_ops_prepare NULL
156#define msi_domain_ops_set_desc NULL
157#endif /* !GENERIC_MSI_DOMAIN_OPS */
158
159static int msi_domain_ops_init(struct irq_domain *domain,
160 struct msi_domain_info *info,
161 unsigned int virq, irq_hw_number_t hwirq,
162 msi_alloc_info_t *arg)
163{
164 irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip,
165 info->chip_data);
166 if (info->handler && info->handler_name) {
167 __irq_set_handler(virq, info->handler, 0, info->handler_name);
168 if (info->handler_data)
169 irq_set_handler_data(virq, info->handler_data);
170 }
171 return 0;
172}
173
174static int msi_domain_ops_check(struct irq_domain *domain,
175 struct msi_domain_info *info,
176 struct device *dev)
177{
178 return 0;
179}
180
181static struct msi_domain_ops msi_domain_ops_default = {
182 .get_hwirq = msi_domain_ops_get_hwirq,
183 .msi_init = msi_domain_ops_init,
184 .msi_check = msi_domain_ops_check,
185 .msi_prepare = msi_domain_ops_prepare,
186 .set_desc = msi_domain_ops_set_desc,
187};
188
189static void msi_domain_update_dom_ops(struct msi_domain_info *info)
190{
191 struct msi_domain_ops *ops = info->ops;
192
193 if (ops == NULL) {
194 info->ops = &msi_domain_ops_default;
195 return;
196 }
197
198 if (ops->get_hwirq == NULL)
199 ops->get_hwirq = msi_domain_ops_default.get_hwirq;
200 if (ops->msi_init == NULL)
201 ops->msi_init = msi_domain_ops_default.msi_init;
202 if (ops->msi_check == NULL)
203 ops->msi_check = msi_domain_ops_default.msi_check;
204 if (ops->msi_prepare == NULL)
205 ops->msi_prepare = msi_domain_ops_default.msi_prepare;
206 if (ops->set_desc == NULL)
207 ops->set_desc = msi_domain_ops_default.set_desc;
208}
209
210static void msi_domain_update_chip_ops(struct msi_domain_info *info)
211{
212 struct irq_chip *chip = info->chip;
213
214 BUG_ON(!chip);
215 if (!chip->irq_mask)
216 chip->irq_mask = pci_msi_mask_irq;
217 if (!chip->irq_unmask)
218 chip->irq_unmask = pci_msi_unmask_irq;
219 if (!chip->irq_set_affinity)
220 chip->irq_set_affinity = msi_domain_set_affinity;
221}
222
223/**
224 * msi_create_irq_domain - Create an MSI interrupt domain
225 * @node: Optional device-tree node of the interrupt controller
226 * @info: MSI domain info
227 * @parent: Parent irq domain
228 */
229struct irq_domain *msi_create_irq_domain(struct device_node *node,
230 struct msi_domain_info *info,
231 struct irq_domain *parent)
232{
233 if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
234 msi_domain_update_dom_ops(info);
235 if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
236 msi_domain_update_chip_ops(info);
237
238 return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops,
239 info);
240}
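
A sketch of how an irqchip driver might use this; only the two flags and the call itself come from this file, the chip and the foo_ names are placeholders. Leaving info->ops NULL is fine because MSI_FLAG_USE_DEF_DOM_OPS fills in msi_domain_ops_default:

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/msi.h>

static void foo_write_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
	/* placeholder: program the doorbell address/data pair into the hardware */
}

static struct irq_chip foo_msi_chip = {
	.name			= "foo-MSI",
	.irq_write_msi_msg	= foo_write_msi_msg,
};

static struct msi_domain_info foo_msi_info = {
	/* defaults cover the domain ops and mask/unmask/affinity chip ops */
	.flags	= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
	.chip	= &foo_msi_chip,
};

static struct irq_domain *foo_create_msi_domain(struct device_node *node,
						struct irq_domain *parent)
{
	return msi_create_irq_domain(node, &foo_msi_info, parent);
}
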
241
242/**
243 * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
244 * @domain: The domain to allocate from
245 * @dev: Pointer to device struct of the device for which the interrupts
246 * are allocated
247 * @nvec: The number of interrupts to allocate
248 *
249 * Returns 0 on success or an error code.
250 */
251int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
252 int nvec)
253{
254 struct msi_domain_info *info = domain->host_data;
255 struct msi_domain_ops *ops = info->ops;
256 msi_alloc_info_t arg;
257 struct msi_desc *desc;
258 int i, ret, virq = -1;
259
260 ret = ops->msi_check(domain, info, dev);
261 if (ret == 0)
262 ret = ops->msi_prepare(domain, dev, nvec, &arg);
263 if (ret)
264 return ret;
265
266 for_each_msi_entry(desc, dev) {
267 ops->set_desc(&arg, desc);
268 if (info->flags & MSI_FLAG_IDENTITY_MAP)
269 virq = (int)ops->get_hwirq(info, &arg);
270 else
271 virq = -1;
272
273 virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
274 dev_to_node(dev), &arg, false);
275 if (virq < 0) {
276 ret = -ENOSPC;
277 if (ops->handle_error)
278 ret = ops->handle_error(domain, desc, ret);
279 if (ops->msi_finish)
280 ops->msi_finish(&arg, ret);
281 return ret;
282 }
283
284 for (i = 0; i < desc->nvec_used; i++)
285 irq_set_msi_desc_off(virq, i, desc);
286 }
287
288 if (ops->msi_finish)
289 ops->msi_finish(&arg, 0);
290
291 for_each_msi_entry(desc, dev) {
292 if (desc->nvec_used == 1)
293 dev_dbg(dev, "irq %d for MSI\n", virq);
294 else
295 dev_dbg(dev, "irq [%d-%d] for MSI\n",
296 virq, virq + desc->nvec_used - 1);
297 }
298
299 return 0;
300}
301
302/**
303 * msi_domain_free_irqs - Free interrupts from an MSI interrupt @domain associated to @dev
304 * @domain: The domain managing the interrupts
305 * @dev: Pointer to device struct of the device for which the interrupts
306 * are freed
307 */
308void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
309{
310 struct msi_desc *desc;
311
312 for_each_msi_entry(desc, dev) {
313 irq_domain_free_irqs(desc->irq, desc->nvec_used);
314 desc->irq = 0;
315 }
316}
317
318/**
319 * msi_get_domain_info - Get the MSI interrupt domain info for @domain
320 * @domain: The interrupt domain to retrieve data from
321 *
322 * Returns the pointer to the msi_domain_info stored in
323 * @domain->host_data.
324 */
325struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
326{
327 return (struct msi_domain_info *)domain->host_data;
328}
329
330#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index ac1ba2f11032..9dc9bfd8a678 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -15,6 +15,23 @@
15 15
16#include "internals.h" 16#include "internals.h"
17 17
18/*
19 * Access rules:
20 *
21 * procfs protects read/write of /proc/irq/N/ files against a
22 * concurrent free of the interrupt descriptor. remove_proc_entry()
23 * immediately prevents new read/writes to happen and waits for
24 * already running read/write functions to complete.
25 *
26 * We remove the proc entries first and then delete the interrupt
27 * descriptor from the radix tree and free it. So it is guaranteed
28 * that irq_to_desc(N) is valid as long as the read/writes are
29 * permitted by procfs.
30 *
31 * The read from /proc/interrupts is a different problem because there
32 * is no protection. So the lookup and the access to irqdesc
33 * information must be protected by sparse_irq_lock.
34 */
18static struct proc_dir_entry *root_irq_dir; 35static struct proc_dir_entry *root_irq_dir;
19 36
20#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
@@ -437,9 +454,10 @@ int show_interrupts(struct seq_file *p, void *v)
437 seq_putc(p, '\n'); 454 seq_putc(p, '\n');
438 } 455 }
439 456
457 irq_lock_sparse();
440 desc = irq_to_desc(i); 458 desc = irq_to_desc(i);
441 if (!desc) 459 if (!desc)
442 return 0; 460 goto outsparse;
443 461
444 raw_spin_lock_irqsave(&desc->lock, flags); 462 raw_spin_lock_irqsave(&desc->lock, flags);
445 for_each_online_cpu(j) 463 for_each_online_cpu(j)
@@ -479,6 +497,8 @@ int show_interrupts(struct seq_file *p, void *v)
479 seq_putc(p, '\n'); 497 seq_putc(p, '\n');
480out: 498out:
481 raw_spin_unlock_irqrestore(&desc->lock, flags); 499 raw_spin_unlock_irqrestore(&desc->lock, flags);
500outsparse:
501 irq_unlock_sparse();
482 return 0; 502 return 0;
483} 503}
484#endif 504#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 3ab9048483fa..cbf9fb899d92 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
175 175
176void irq_work_tick(void) 176void irq_work_tick(void)
177{ 177{
178 struct llist_head *raised = &__get_cpu_var(raised_list); 178 struct llist_head *raised = this_cpu_ptr(&raised_list);
179 179
180 if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) 180 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
181 irq_work_run_list(raised); 181 irq_work_run_list(raised);
182 irq_work_run_list(&__get_cpu_var(lazy_list)); 182 irq_work_run_list(this_cpu_ptr(&lazy_list));
183} 183}
184 184
185/* 185/*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2abf9f6e9a61..9a8a01abbaed 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
600 if (!kexec_on_panic) { 600 if (!kexec_on_panic) {
601 image->swap_page = kimage_alloc_control_pages(image, 0); 601 image->swap_page = kimage_alloc_control_pages(image, 0);
602 if (!image->swap_page) { 602 if (!image->swap_page) {
603 pr_err(KERN_ERR "Could not allocate swap buffer\n"); 603 pr_err("Could not allocate swap buffer\n");
604 goto out_free_control_pages; 604 goto out_free_control_pages;
605 } 605 }
606 } 606 }
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 80f7a6d00519..2777f40a9c7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -47,13 +47,6 @@ extern int max_threads;
47 47
48static struct workqueue_struct *khelper_wq; 48static struct workqueue_struct *khelper_wq;
49 49
50/*
51 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
52 * locking to protect this global - it is private to the singleton khelper
53 * thread and should only ever be modified by that thread.
54 */
55static const struct task_struct *kmod_thread_locker;
56
57#define CAP_BSET (void *)1 50#define CAP_BSET (void *)1
58#define CAP_PI (void *)2 51#define CAP_PI (void *)2
59 52
@@ -223,7 +216,6 @@ static void umh_complete(struct subprocess_info *sub_info)
223static int ____call_usermodehelper(void *data) 216static int ____call_usermodehelper(void *data)
224{ 217{
225 struct subprocess_info *sub_info = data; 218 struct subprocess_info *sub_info = data;
226 int wait = sub_info->wait & ~UMH_KILLABLE;
227 struct cred *new; 219 struct cred *new;
228 int retval; 220 int retval;
229 221
@@ -267,20 +259,13 @@ static int ____call_usermodehelper(void *data)
267out: 259out:
268 sub_info->retval = retval; 260 sub_info->retval = retval;
269 /* wait_for_helper() will call umh_complete if UMH_WAIT_PROC. */ 261 /* wait_for_helper() will call umh_complete if UMH_WAIT_PROC. */
270 if (wait != UMH_WAIT_PROC) 262 if (!(sub_info->wait & UMH_WAIT_PROC))
271 umh_complete(sub_info); 263 umh_complete(sub_info);
272 if (!retval) 264 if (!retval)
273 return 0; 265 return 0;
274 do_exit(0); 266 do_exit(0);
275} 267}
276 268
277static int call_helper(void *data)
278{
279 /* Worker thread started blocking khelper thread. */
280 kmod_thread_locker = current;
281 return ____call_usermodehelper(data);
282}
283
284/* Keventd can't block, but this (a child) can. */ 269/* Keventd can't block, but this (a child) can. */
285static int wait_for_helper(void *data) 270static int wait_for_helper(void *data)
286{ 271{
@@ -323,21 +308,14 @@ static void __call_usermodehelper(struct work_struct *work)
323{ 308{
324 struct subprocess_info *sub_info = 309 struct subprocess_info *sub_info =
325 container_of(work, struct subprocess_info, work); 310 container_of(work, struct subprocess_info, work);
326 int wait = sub_info->wait & ~UMH_KILLABLE;
327 pid_t pid; 311 pid_t pid;
328 312
329 /* CLONE_VFORK: wait until the usermode helper has execve'd 313 if (sub_info->wait & UMH_WAIT_PROC)
330 * successfully We need the data structures to stay around
331 * until that is done. */
332 if (wait == UMH_WAIT_PROC)
333 pid = kernel_thread(wait_for_helper, sub_info, 314 pid = kernel_thread(wait_for_helper, sub_info,
334 CLONE_FS | CLONE_FILES | SIGCHLD); 315 CLONE_FS | CLONE_FILES | SIGCHLD);
335 else { 316 else
336 pid = kernel_thread(call_helper, sub_info, 317 pid = kernel_thread(____call_usermodehelper, sub_info,
337 CLONE_VFORK | SIGCHLD); 318 SIGCHLD);
338 /* Worker thread stopped blocking khelper thread. */
339 kmod_thread_locker = NULL;
340 }
341 319
342 if (pid < 0) { 320 if (pid < 0) {
343 sub_info->retval = pid; 321 sub_info->retval = pid;
@@ -571,17 +549,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
571 goto out; 549 goto out;
572 } 550 }
573 /* 551 /*
574 * Worker thread must not wait for khelper thread at below
575 * wait_for_completion() if the thread was created with CLONE_VFORK
576 * flag, for khelper thread is already waiting for the thread at
577 * wait_for_completion() in do_fork().
578 */
579 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
580 retval = -EBUSY;
581 goto out;
582 }
583
584 /*
585 * Set the completion pointer only if there is a waiter. 552 * Set the completion pointer only if there is a waiter.
586 * This makes it possible to use umh_complete to free 553 * This makes it possible to use umh_complete to free
587 * the data structure in case of UMH_NO_WAIT. 554 * the data structure in case of UMH_NO_WAIT.
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3995f546d0f3..ee619929cf90 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -127,7 +127,7 @@ static void *alloc_insn_page(void)
127 127
128static void free_insn_page(void *page) 128static void free_insn_page(void *page)
129{ 129{
130 module_free(NULL, page); 130 module_memfree(page);
131} 131}
132 132
133struct kprobe_insn_cache kprobe_insn_slots = { 133struct kprobe_insn_cache kprobe_insn_slots = {
@@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
915#ifdef CONFIG_KPROBES_ON_FTRACE 915#ifdef CONFIG_KPROBES_ON_FTRACE
916static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { 916static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
917 .func = kprobe_ftrace_handler, 917 .func = kprobe_ftrace_handler,
918 .flags = FTRACE_OPS_FL_SAVE_REGS, 918 .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
919}; 919};
920static int kprobe_ftrace_enabled; 920static int kprobe_ftrace_enabled;
921 921
@@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p)
1410 return ret; 1410 return ret;
1411} 1411}
1412 1412
1413static int check_kprobe_address_safe(struct kprobe *p, 1413int __weak arch_check_ftrace_location(struct kprobe *p)
1414 struct module **probed_mod)
1415{ 1414{
1416 int ret = 0;
1417 unsigned long ftrace_addr; 1415 unsigned long ftrace_addr;
1418 1416
1419 /*
1420 * If the address is located on a ftrace nop, set the
1421 * breakpoint to the following instruction.
1422 */
1423 ftrace_addr = ftrace_location((unsigned long)p->addr); 1417 ftrace_addr = ftrace_location((unsigned long)p->addr);
1424 if (ftrace_addr) { 1418 if (ftrace_addr) {
1425#ifdef CONFIG_KPROBES_ON_FTRACE 1419#ifdef CONFIG_KPROBES_ON_FTRACE
@@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p,
1431 return -EINVAL; 1425 return -EINVAL;
1432#endif 1426#endif
1433 } 1427 }
1428 return 0;
1429}
1434 1430
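
Since the check is now a __weak function, an architecture can override it with its own policy, for instance to reject probes that fall inside its ftrace trampolines. A hypothetical override, not part of this patch:

#include <linux/kprobes.h>

static bool foo_addr_in_ftrace_trampoline(unsigned long addr)
{
	return false;	/* placeholder for a real arch-specific range check */
}

/* A non-weak definition in arch code overrides the weak default above */
int arch_check_ftrace_location(struct kprobe *p)
{
	if (foo_addr_in_ftrace_trampoline((unsigned long)p->addr))
		return -EINVAL;
	return 0;
}
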
1431static int check_kprobe_address_safe(struct kprobe *p,
1432 struct module **probed_mod)
1433{
1434 int ret;
1435
1436 ret = arch_check_ftrace_location(p);
1437 if (ret)
1438 return ret;
1435 jump_label_lock(); 1439 jump_label_lock();
1436 preempt_disable(); 1440 preempt_disable();
1437 1441
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 5cf6731b98e9..3ef3736002d8 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock)
80 DEBUG_LOCKS_WARN_ON(lock->owner != current); 80 DEBUG_LOCKS_WARN_ON(lock->owner != current);
81 81
82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
83 mutex_clear_owner(lock);
84 } 83 }
85 84
86 /* 85 /*
87 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug 86 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
88 * mutexes so that we can do it here after we've verified state. 87 * mutexes so that we can do it here after we've verified state.
89 */ 88 */
89 mutex_clear_owner(lock);
90 atomic_set(&lock->count, 1); 90 atomic_set(&lock->count, 1);
91} 91}
92 92
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index dadbf88c22c4..454195194d4a 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -378,8 +378,14 @@ done:
378 * reschedule now, before we try-lock the mutex. This avoids getting 378 * reschedule now, before we try-lock the mutex. This avoids getting
379 * scheduled out right after we obtained the mutex. 379 * scheduled out right after we obtained the mutex.
380 */ 380 */
381 if (need_resched()) 381 if (need_resched()) {
382 /*
383 * We _should_ have TASK_RUNNING here, but just in case
384 * we do not, make it so, otherwise we might get stuck.
385 */
386 __set_current_state(TASK_RUNNING);
382 schedule_preempt_disabled(); 387 schedule_preempt_disabled();
388 }
383 389
384 return false; 390 return false;
385} 391}
diff --git a/kernel/module.c b/kernel/module.c
index 88cec1ddb1e3..d856e96a3cce 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -42,7 +42,6 @@
42#include <linux/vermagic.h> 42#include <linux/vermagic.h>
43#include <linux/notifier.h> 43#include <linux/notifier.h>
44#include <linux/sched.h> 44#include <linux/sched.h>
45#include <linux/stop_machine.h>
46#include <linux/device.h> 45#include <linux/device.h>
47#include <linux/string.h> 46#include <linux/string.h>
48#include <linux/mutex.h> 47#include <linux/mutex.h>
@@ -98,7 +97,7 @@
98 * 1) List of modules (also safely readable with preempt_disable), 97 * 1) List of modules (also safely readable with preempt_disable),
99 * 2) module_use links, 98 * 2) module_use links,
100 * 3) module_addr_min/module_addr_max. 99 * 3) module_addr_min/module_addr_max.
101 * (delete uses stop_machine/add uses RCU list operations). */ 100 * (delete and add uses RCU list operations). */
102DEFINE_MUTEX(module_mutex); 101DEFINE_MUTEX(module_mutex);
103EXPORT_SYMBOL_GPL(module_mutex); 102EXPORT_SYMBOL_GPL(module_mutex);
104static LIST_HEAD(modules); 103static LIST_HEAD(modules);
@@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
158 * Protected by module_mutex. */ 157 * Protected by module_mutex. */
159static unsigned long module_addr_min = -1UL, module_addr_max = 0; 158static unsigned long module_addr_min = -1UL, module_addr_max = 0;
160 159
161int register_module_notifier(struct notifier_block * nb) 160int register_module_notifier(struct notifier_block *nb)
162{ 161{
163 return blocking_notifier_chain_register(&module_notify_list, nb); 162 return blocking_notifier_chain_register(&module_notify_list, nb);
164} 163}
165EXPORT_SYMBOL(register_module_notifier); 164EXPORT_SYMBOL(register_module_notifier);
166 165
167int unregister_module_notifier(struct notifier_block * nb) 166int unregister_module_notifier(struct notifier_block *nb)
168{ 167{
169 return blocking_notifier_chain_unregister(&module_notify_list, nb); 168 return blocking_notifier_chain_unregister(&module_notify_list, nb);
170} 169}
@@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
628 627
629EXPORT_TRACEPOINT_SYMBOL(module_get); 628EXPORT_TRACEPOINT_SYMBOL(module_get);
630 629
630/* MODULE_REF_BASE is the base reference count held by the module loader. */
631#define MODULE_REF_BASE 1
632
631/* Init the unload section of the module. */ 633/* Init the unload section of the module. */
632static int module_unload_init(struct module *mod) 634static int module_unload_init(struct module *mod)
633{ 635{
634 mod->refptr = alloc_percpu(struct module_ref); 636 /*
635 if (!mod->refptr) 637 * Initialize reference counter to MODULE_REF_BASE.
636 return -ENOMEM; 638 * refcnt == 0 means module is going.
639 */
640 atomic_set(&mod->refcnt, MODULE_REF_BASE);
637 641
638 INIT_LIST_HEAD(&mod->source_list); 642 INIT_LIST_HEAD(&mod->source_list);
639 INIT_LIST_HEAD(&mod->target_list); 643 INIT_LIST_HEAD(&mod->target_list);
640 644
641 /* Hold reference count during initialization. */ 645 /* Hold reference count during initialization. */
642 raw_cpu_write(mod->refptr->incs, 1); 646 atomic_inc(&mod->refcnt);
643 647
644 return 0; 648 return 0;
645} 649}
@@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod)
721 kfree(use); 725 kfree(use);
722 } 726 }
723 mutex_unlock(&module_mutex); 727 mutex_unlock(&module_mutex);
724
725 free_percpu(mod->refptr);
726} 728}
727 729
728#ifdef CONFIG_MODULE_FORCE_UNLOAD 730#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -740,60 +742,48 @@ static inline int try_force_unload(unsigned int flags)
740} 742}
741#endif /* CONFIG_MODULE_FORCE_UNLOAD */ 743#endif /* CONFIG_MODULE_FORCE_UNLOAD */
742 744
743struct stopref 745/* Try to release refcount of module, 0 means success. */
746static int try_release_module_ref(struct module *mod)
744{ 747{
745 struct module *mod; 748 int ret;
746 int flags;
747 int *forced;
748};
749 749
750/* Whole machine is stopped with interrupts off when this runs. */ 750 /* Try to decrement refcnt which we set at loading */
751static int __try_stop_module(void *_sref) 751 ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
752{ 752 BUG_ON(ret < 0);
753 struct stopref *sref = _sref; 753 if (ret)
754 /* Someone can put this right now, recover with checking */
755 ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
756
757 return ret;
758}
754 759
760static int try_stop_module(struct module *mod, int flags, int *forced)
761{
755 /* If it's not unused, quit unless we're forcing. */ 762 /* If it's not unused, quit unless we're forcing. */
756 if (module_refcount(sref->mod) != 0) { 763 if (try_release_module_ref(mod) != 0) {
757 if (!(*sref->forced = try_force_unload(sref->flags))) 764 *forced = try_force_unload(flags);
765 if (!(*forced))
758 return -EWOULDBLOCK; 766 return -EWOULDBLOCK;
759 } 767 }
760 768
761 /* Mark it as dying. */ 769 /* Mark it as dying. */
762 sref->mod->state = MODULE_STATE_GOING; 770 mod->state = MODULE_STATE_GOING;
763 return 0;
764}
765 771
766static int try_stop_module(struct module *mod, int flags, int *forced) 772 return 0;
767{
768 struct stopref sref = { mod, flags, forced };
769
770 return stop_machine(__try_stop_module, &sref, NULL);
771} 773}
772 774
773unsigned long module_refcount(struct module *mod) 775/**
776 * module_refcount - return the refcount or -1 if unloading
777 *
778 * @mod: the module we're checking
779 *
780 * Returns:
781 * -1 if the module is in the process of unloading
782 * otherwise the number of references in the kernel to the module
783 */
784int module_refcount(struct module *mod)
774{ 785{
775 unsigned long incs = 0, decs = 0; 786 return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
776 int cpu;
777
778 for_each_possible_cpu(cpu)
779 decs += per_cpu_ptr(mod->refptr, cpu)->decs;
780 /*
781 * ensure the incs are added up after the decs.
782 * module_put ensures incs are visible before decs with smp_wmb.
783 *
784 * This 2-count scheme avoids the situation where the refcount
785 * for CPU0 is read, then CPU0 increments the module refcount,
786 * then CPU1 drops that refcount, then the refcount for CPU1 is
787 * read. We would record a decrement but not its corresponding
788 * increment so we would see a low count (disaster).
789 *
790 * Rare situation? But module_refcount can be preempted, and we
791 * might be tallying up 4096+ CPUs. So it is not impossible.
792 */
793 smp_rmb();
794 for_each_possible_cpu(cpu)
795 incs += per_cpu_ptr(mod->refptr, cpu)->incs;
796 return incs - decs;
797} 787}
798EXPORT_SYMBOL(module_refcount); 788EXPORT_SYMBOL(module_refcount);
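
The switch to a single atomic counter keeps the call-site semantics: try_module_get() simply fails once the count has dropped to zero, module_put() decrements it, and module_refcount() now returns a plain int (-1 while unloading). A minimal usage sketch with placeholder foo_ naming:

#include <linux/module.h>

static int foo_pin_module(struct module *owner)
{
	if (!try_module_get(owner))	/* fails once refcnt has reached zero */
		return -ENODEV;

	pr_info("%s now has %i references\n",
		owner->name, module_refcount(owner));

	module_put(owner);		/* atomic_dec_if_positive() underneath */
	return 0;
}
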
799 789
@@ -875,10 +865,12 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
875 struct module_use *use; 865 struct module_use *use;
876 int printed_something = 0; 866 int printed_something = 0;
877 867
878 seq_printf(m, " %lu ", module_refcount(mod)); 868 seq_printf(m, " %i ", module_refcount(mod));
879 869
880 /* Always include a trailing , so userspace can differentiate 870 /*
881 between this and the old multi-field proc format. */ 871 * Always include a trailing , so userspace can differentiate
872 * between this and the old multi-field proc format.
873 */
882 list_for_each_entry(use, &mod->source_list, source_list) { 874 list_for_each_entry(use, &mod->source_list, source_list) {
883 printed_something = 1; 875 printed_something = 1;
884 seq_printf(m, "%s,", use->source->name); 876 seq_printf(m, "%s,", use->source->name);
@@ -886,11 +878,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
886 878
887 if (mod->init != NULL && mod->exit == NULL) { 879 if (mod->init != NULL && mod->exit == NULL) {
888 printed_something = 1; 880 printed_something = 1;
889 seq_printf(m, "[permanent],"); 881 seq_puts(m, "[permanent],");
890 } 882 }
891 883
892 if (!printed_something) 884 if (!printed_something)
893 seq_printf(m, "-"); 885 seq_puts(m, "-");
894} 886}
895 887
896void __symbol_put(const char *symbol) 888void __symbol_put(const char *symbol)
@@ -925,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
925static ssize_t show_refcnt(struct module_attribute *mattr, 917static ssize_t show_refcnt(struct module_attribute *mattr,
926 struct module_kobject *mk, char *buffer) 918 struct module_kobject *mk, char *buffer)
927{ 919{
928 return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); 920 return sprintf(buffer, "%i\n", module_refcount(mk->mod));
929} 921}
930 922
931static struct module_attribute modinfo_refcnt = 923static struct module_attribute modinfo_refcnt =
@@ -935,7 +927,7 @@ void __module_get(struct module *module)
935{ 927{
936 if (module) { 928 if (module) {
937 preempt_disable(); 929 preempt_disable();
938 __this_cpu_inc(module->refptr->incs); 930 atomic_inc(&module->refcnt);
939 trace_module_get(module, _RET_IP_); 931 trace_module_get(module, _RET_IP_);
940 preempt_enable(); 932 preempt_enable();
941 } 933 }
@@ -948,11 +940,11 @@ bool try_module_get(struct module *module)
948 940
949 if (module) { 941 if (module) {
950 preempt_disable(); 942 preempt_disable();
951 943 /* Note: here, we can fail to get a reference */
952 if (likely(module_is_live(module))) { 944 if (likely(module_is_live(module) &&
953 __this_cpu_inc(module->refptr->incs); 945 atomic_inc_not_zero(&module->refcnt) != 0))
954 trace_module_get(module, _RET_IP_); 946 trace_module_get(module, _RET_IP_);
955 } else 947 else
956 ret = false; 948 ret = false;
957 949
958 preempt_enable(); 950 preempt_enable();
@@ -963,11 +955,12 @@ EXPORT_SYMBOL(try_module_get);
963 955
964void module_put(struct module *module) 956void module_put(struct module *module)
965{ 957{
958 int ret;
959
966 if (module) { 960 if (module) {
967 preempt_disable(); 961 preempt_disable();
968 smp_wmb(); /* see comment in module_refcount */ 962 ret = atomic_dec_if_positive(&module->refcnt);
969 __this_cpu_inc(module->refptr->decs); 963 WARN_ON(ret < 0); /* Failed to put refcount */
970
971 trace_module_put(module, _RET_IP_); 964 trace_module_put(module, _RET_IP_);
972 preempt_enable(); 965 preempt_enable();
973 } 966 }
@@ -978,7 +971,7 @@ EXPORT_SYMBOL(module_put);
978static inline void print_unload_info(struct seq_file *m, struct module *mod) 971static inline void print_unload_info(struct seq_file *m, struct module *mod)
979{ 972{
980 /* We don't know the usage count, or what modules are using. */ 973 /* We don't know the usage count, or what modules are using. */
981 seq_printf(m, " - -"); 974 seq_puts(m, " - -");
982} 975}
983 976
984static inline void module_unload_free(struct module *mod) 977static inline void module_unload_free(struct module *mod)
@@ -1131,7 +1124,7 @@ static unsigned long maybe_relocated(unsigned long crc,
1131static int check_version(Elf_Shdr *sechdrs, 1124static int check_version(Elf_Shdr *sechdrs,
1132 unsigned int versindex, 1125 unsigned int versindex,
1133 const char *symname, 1126 const char *symname,
1134 struct module *mod, 1127 struct module *mod,
1135 const unsigned long *crc, 1128 const unsigned long *crc,
1136 const struct module *crc_owner) 1129 const struct module *crc_owner)
1137{ 1130{
@@ -1165,7 +1158,7 @@ static int check_version(Elf_Shdr *sechdrs,
1165 return 0; 1158 return 0;
1166 1159
1167bad_version: 1160bad_version:
1168 printk("%s: disagrees about version of symbol %s\n", 1161 pr_warn("%s: disagrees about version of symbol %s\n",
1169 mod->name, symname); 1162 mod->name, symname);
1170 return 0; 1163 return 0;
1171} 1164}
@@ -1200,7 +1193,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1200static inline int check_version(Elf_Shdr *sechdrs, 1193static inline int check_version(Elf_Shdr *sechdrs,
1201 unsigned int versindex, 1194 unsigned int versindex,
1202 const char *symname, 1195 const char *symname,
1203 struct module *mod, 1196 struct module *mod,
1204 const unsigned long *crc, 1197 const unsigned long *crc,
1205 const struct module *crc_owner) 1198 const struct module *crc_owner)
1206{ 1199{
@@ -1288,15 +1281,13 @@ static inline bool sect_empty(const Elf_Shdr *sect)
1288 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; 1281 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1289} 1282}
1290 1283
1291struct module_sect_attr 1284struct module_sect_attr {
1292{
1293 struct module_attribute mattr; 1285 struct module_attribute mattr;
1294 char *name; 1286 char *name;
1295 unsigned long address; 1287 unsigned long address;
1296}; 1288};
1297 1289
1298struct module_sect_attrs 1290struct module_sect_attrs {
1299{
1300 struct attribute_group grp; 1291 struct attribute_group grp;
1301 unsigned int nsections; 1292 unsigned int nsections;
1302 struct module_sect_attr attrs[0]; 1293 struct module_sect_attr attrs[0];
@@ -1550,7 +1541,8 @@ static int module_add_modinfo_attrs(struct module *mod)
1550 (attr->test && attr->test(mod))) { 1541 (attr->test && attr->test(mod))) {
1551 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1542 memcpy(temp_attr, attr, sizeof(*temp_attr));
1552 sysfs_attr_init(&temp_attr->attr); 1543 sysfs_attr_init(&temp_attr->attr);
1553 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1544 error = sysfs_create_file(&mod->mkobj.kobj,
1545 &temp_attr->attr);
1554 ++temp_attr; 1546 ++temp_attr;
1555 } 1547 }
1556 } 1548 }
@@ -1566,7 +1558,7 @@ static void module_remove_modinfo_attrs(struct module *mod)
1566 /* pick a field to test for end of list */ 1558 /* pick a field to test for end of list */
1567 if (!attr->attr.name) 1559 if (!attr->attr.name)
1568 break; 1560 break;
1569 sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); 1561 sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
1570 if (attr->free) 1562 if (attr->free)
1571 attr->free(mod); 1563 attr->free(mod);
1572 } 1564 }
@@ -1697,18 +1689,6 @@ static void mod_sysfs_teardown(struct module *mod)
1697 mod_sysfs_fini(mod); 1689 mod_sysfs_fini(mod);
1698} 1690}
1699 1691
1700/*
1701 * unlink the module with the whole machine is stopped with interrupts off
1702 * - this defends against kallsyms not taking locks
1703 */
1704static int __unlink_module(void *_mod)
1705{
1706 struct module *mod = _mod;
1707 list_del(&mod->list);
1708 module_bug_cleanup(mod);
1709 return 0;
1710}
1711
1712#ifdef CONFIG_DEBUG_SET_MODULE_RONX 1692#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1713/* 1693/*
1714 * LKM RO/NX protection: protect module's text/ro-data 1694 * LKM RO/NX protection: protect module's text/ro-data
@@ -1824,7 +1804,7 @@ static void unset_module_core_ro_nx(struct module *mod) { }
1824static void unset_module_init_ro_nx(struct module *mod) { } 1804static void unset_module_init_ro_nx(struct module *mod) { }
1825#endif 1805#endif
1826 1806
1827void __weak module_free(struct module *mod, void *module_region) 1807void __weak module_memfree(void *module_region)
1828{ 1808{
1829 vfree(module_region); 1809 vfree(module_region);
1830} 1810}
@@ -1833,6 +1813,10 @@ void __weak module_arch_cleanup(struct module *mod)
1833{ 1813{
1834} 1814}
1835 1815
1816void __weak module_arch_freeing_init(struct module *mod)
1817{
1818}
1819
1836/* Free a module, remove from lists, etc. */ 1820/* Free a module, remove from lists, etc. */
1837static void free_module(struct module *mod) 1821static void free_module(struct module *mod)
1838{ 1822{
@@ -1860,12 +1844,18 @@ static void free_module(struct module *mod)
1860 1844
1861 /* Now we can delete it from the lists */ 1845 /* Now we can delete it from the lists */
1862 mutex_lock(&module_mutex); 1846 mutex_lock(&module_mutex);
1863 stop_machine(__unlink_module, mod, NULL); 1847 /* Unlink carefully: kallsyms could be walking list. */
1848 list_del_rcu(&mod->list);
1849 /* Remove this module from bug list, this uses list_del_rcu */
1850 module_bug_cleanup(mod);
1851 /* Wait for RCU synchronizing before releasing mod->list and buglist. */
1852 synchronize_rcu();
1864 mutex_unlock(&module_mutex); 1853 mutex_unlock(&module_mutex);
1865 1854
1866 /* This may be NULL, but that's OK */ 1855 /* This may be NULL, but that's OK */
1867 unset_module_init_ro_nx(mod); 1856 unset_module_init_ro_nx(mod);
1868 module_free(mod, mod->module_init); 1857 module_arch_freeing_init(mod);
1858 module_memfree(mod->module_init);
1869 kfree(mod->args); 1859 kfree(mod->args);
1870 percpu_modfree(mod); 1860 percpu_modfree(mod);
1871 1861
@@ -1874,7 +1864,7 @@ static void free_module(struct module *mod)
1874 1864
1875 /* Finally, free the core (containing the module structure) */ 1865 /* Finally, free the core (containing the module structure) */
1876 unset_module_core_ro_nx(mod); 1866 unset_module_core_ro_nx(mod);
1877 module_free(mod, mod->module_core); 1867 module_memfree(mod->module_core);
1878 1868
1879#ifdef CONFIG_MPU 1869#ifdef CONFIG_MPU
1880 update_protections(current->mm); 1870 update_protections(current->mm);
@@ -1955,7 +1945,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1955 /* We compiled with -fno-common. These are not 1945 /* We compiled with -fno-common. These are not
1956 supposed to happen. */ 1946 supposed to happen. */
1957 pr_debug("Common symbol: %s\n", name); 1947 pr_debug("Common symbol: %s\n", name);
1958 printk("%s: please compile with -fno-common\n", 1948 pr_warn("%s: please compile with -fno-common\n",
1959 mod->name); 1949 mod->name);
1960 ret = -ENOEXEC; 1950 ret = -ENOEXEC;
1961 break; 1951 break;
@@ -2259,7 +2249,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
2259} 2249}
2260 2250
2261static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, 2251static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2262 unsigned int shnum) 2252 unsigned int shnum)
2263{ 2253{
2264 const Elf_Shdr *sec; 2254 const Elf_Shdr *sec;
2265 2255
@@ -2735,7 +2725,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
2735 * This shouldn't happen with same compiler and binutils 2725 * This shouldn't happen with same compiler and binutils
2736 * building all parts of the module. 2726 * building all parts of the module.
2737 */ 2727 */
2738 printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", 2728 pr_warn("%s: has both .ctors and .init_array.\n",
2739 mod->name); 2729 mod->name);
2740 return -EINVAL; 2730 return -EINVAL;
2741 } 2731 }
@@ -2809,7 +2799,7 @@ static int move_module(struct module *mod, struct load_info *info)
2809 */ 2799 */
2810 kmemleak_ignore(ptr); 2800 kmemleak_ignore(ptr);
2811 if (!ptr) { 2801 if (!ptr) {
2812 module_free(mod, mod->module_core); 2802 module_memfree(mod->module_core);
2813 return -ENOMEM; 2803 return -ENOMEM;
2814 } 2804 }
2815 memset(ptr, 0, mod->init_size); 2805 memset(ptr, 0, mod->init_size);
@@ -2954,8 +2944,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2954static void module_deallocate(struct module *mod, struct load_info *info) 2944static void module_deallocate(struct module *mod, struct load_info *info)
2955{ 2945{
2956 percpu_modfree(mod); 2946 percpu_modfree(mod);
2957 module_free(mod, mod->module_init); 2947 module_arch_freeing_init(mod);
2958 module_free(mod, mod->module_core); 2948 module_memfree(mod->module_init);
2949 module_memfree(mod->module_core);
2959} 2950}
2960 2951
2961int __weak module_finalize(const Elf_Ehdr *hdr, 2952int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3007,10 +2998,31 @@ static void do_mod_ctors(struct module *mod)
3007#endif 2998#endif
3008} 2999}
3009 3000
3001/* For freeing module_init on success, in case kallsyms traversing */
3002struct mod_initfree {
3003 struct rcu_head rcu;
3004 void *module_init;
3005};
3006
3007static void do_free_init(struct rcu_head *head)
3008{
3009 struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
3010 module_memfree(m->module_init);
3011 kfree(m);
3012}
3013
3010/* This is where the real work happens */ 3014/* This is where the real work happens */
3011static int do_init_module(struct module *mod) 3015static int do_init_module(struct module *mod)
3012{ 3016{
3013 int ret = 0; 3017 int ret = 0;
3018 struct mod_initfree *freeinit;
3019
3020 freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
3021 if (!freeinit) {
3022 ret = -ENOMEM;
3023 goto fail;
3024 }
3025 freeinit->module_init = mod->module_init;
3014 3026
3015 /* 3027 /*
3016 * We want to find out whether @mod uses async during init. Clear 3028 * We want to find out whether @mod uses async during init. Clear
@@ -3023,16 +3035,7 @@ static int do_init_module(struct module *mod)
3023 if (mod->init != NULL) 3035 if (mod->init != NULL)
3024 ret = do_one_initcall(mod->init); 3036 ret = do_one_initcall(mod->init);
3025 if (ret < 0) { 3037 if (ret < 0) {
3026 /* Init routine failed: abort. Try to protect us from 3038 goto fail_free_freeinit;
3027 buggy refcounters. */
3028 mod->state = MODULE_STATE_GOING;
3029 synchronize_sched();
3030 module_put(mod);
3031 blocking_notifier_call_chain(&module_notify_list,
3032 MODULE_STATE_GOING, mod);
3033 free_module(mod);
3034 wake_up_all(&module_wq);
3035 return ret;
3036 } 3039 }
3037 if (ret > 0) { 3040 if (ret > 0) {
3038 pr_warn("%s: '%s'->init suspiciously returned %d, it should " 3041 pr_warn("%s: '%s'->init suspiciously returned %d, it should "
@@ -3077,15 +3080,35 @@ static int do_init_module(struct module *mod)
3077 mod->strtab = mod->core_strtab; 3080 mod->strtab = mod->core_strtab;
3078#endif 3081#endif
3079 unset_module_init_ro_nx(mod); 3082 unset_module_init_ro_nx(mod);
3080 module_free(mod, mod->module_init); 3083 module_arch_freeing_init(mod);
3081 mod->module_init = NULL; 3084 mod->module_init = NULL;
3082 mod->init_size = 0; 3085 mod->init_size = 0;
3083 mod->init_ro_size = 0; 3086 mod->init_ro_size = 0;
3084 mod->init_text_size = 0; 3087 mod->init_text_size = 0;
3088 /*
3089 * We want to free module_init, but be aware that kallsyms may be
3090 * walking this with preempt disabled. In all the failure paths,
3091 * we call synchronize_rcu/synchronize_sched, but we don't want
3092 * to slow down the success path, so use actual RCU here.
3093 */
3094 call_rcu(&freeinit->rcu, do_free_init);
3085 mutex_unlock(&module_mutex); 3095 mutex_unlock(&module_mutex);
3086 wake_up_all(&module_wq); 3096 wake_up_all(&module_wq);
3087 3097
3088 return 0; 3098 return 0;
3099
3100fail_free_freeinit:
3101 kfree(freeinit);
3102fail:
3103 /* Try to protect us from buggy refcounters. */
3104 mod->state = MODULE_STATE_GOING;
3105 synchronize_sched();
3106 module_put(mod);
3107 blocking_notifier_call_chain(&module_notify_list,
3108 MODULE_STATE_GOING, mod);
3109 free_module(mod);
3110 wake_up_all(&module_wq);
3111 return ret;
3089} 3112}
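
The do_init_module() rework above allocates the freeinit bookkeeping before running ->init(), funnels every failure through labelled goto targets instead of duplicating the MODULE_STATE_GOING teardown inline, and on success hands the init section to call_rcu() so kallsyms walkers are not raced. A small sketch of the allocate-early, unwind-with-goto shape in plain userspace C (the names are hypothetical stand-ins, not the kernel functions):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct initfree { void *init_mem; };       /* stand-in for struct mod_initfree */

    static int run_init(void)  { return 0; }   /* pretend ->init() succeeded */
    static void teardown(void) { puts("rolling back (GOING, notifiers, free)"); }

    static int do_init_thing(void *init_mem)
    {
        struct initfree *freeinit;
        int ret;

        /*
         * Allocate the bookkeeping the success path needs *before* running
         * init, so a late -ENOMEM cannot strand a half-initialised object:
         * every failure below funnels through one unwind path.
         */
        freeinit = malloc(sizeof(*freeinit));
        if (!freeinit) {
            ret = -ENOMEM;
            goto fail;
        }
        freeinit->init_mem = init_mem;

        ret = run_init();
        if (ret < 0)
            goto fail_free_freeinit;

        /* Success: hand init_mem off for deferred freeing (call_rcu in the
         * kernel); this sketch just frees the bookkeeping immediately. */
        printf("init done, %p queued for deferred free\n", freeinit->init_mem);
        free(freeinit);
        return 0;

    fail_free_freeinit:
        free(freeinit);
    fail:
        teardown();
        return ret;
    }

    int main(void)
    {
        int dummy;
        return do_init_thing(&dummy) ? 1 : 0;
    }
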
3090 3113
3091static int may_init_module(void) 3114static int may_init_module(void)
@@ -3097,6 +3120,32 @@ static int may_init_module(void)
3097} 3120}
3098 3121
3099/* 3122/*
3123 * Can't use wait_event_interruptible() because our condition
3124 * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
3125 */
3126static int wait_finished_loading(struct module *mod)
3127{
3128 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3129 int ret = 0;
3130
3131 add_wait_queue(&module_wq, &wait);
3132 for (;;) {
3133 if (finished_loading(mod->name))
3134 break;
3135
3136 if (signal_pending(current)) {
3137 ret = -ERESTARTSYS;
3138 break;
3139 }
3140
3141 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3142 }
3143 remove_wait_queue(&module_wq, &wait);
3144
3145 return ret;
3146}
3147
3148/*
3100 * We try to place it in the list now to make sure it's unique before 3149 * We try to place it in the list now to make sure it's unique before
3101 * we dedicate too many resources. In particular, temporary percpu 3150 * we dedicate too many resources. In particular, temporary percpu
3102 * memory exhaustion. 3151 * memory exhaustion.
@@ -3116,8 +3165,8 @@ again:
3116 || old->state == MODULE_STATE_UNFORMED) { 3165 || old->state == MODULE_STATE_UNFORMED) {
3117 /* Wait in case it fails to load. */ 3166 /* Wait in case it fails to load. */
3118 mutex_unlock(&module_mutex); 3167 mutex_unlock(&module_mutex);
3119 err = wait_event_interruptible(module_wq, 3168
3120 finished_loading(mod->name)); 3169 err = wait_finished_loading(mod);
3121 if (err) 3170 if (err)
3122 goto out_unlocked; 3171 goto out_unlocked;
3123 goto again; 3172 goto again;
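
The wait_finished_loading() helper above exists because, per the comment in the hunk, finished_loading() itself takes a blocking primitive (mutex_lock), which is not allowed inside a wait_event_interruptible() condition; the wait is therefore open-coded with wait_woken(), re-checking the predicate around each sleep and bailing out with -ERESTARTSYS on a pending signal. The general re-check-around-each-sleep shape looks like this in portable userspace C with a condition variable (illustrative only; the kernel's wait_woken() variant exists precisely because its predicate can itself sleep):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static bool finished;            /* set by the "loader" thread */
    static bool interrupted;         /* stands in for signal_pending() */

    /* Wait until the predicate holds, re-evaluating it after every wakeup. */
    static int wait_finished(void)
    {
        int ret = 0;

        pthread_mutex_lock(&lock);
        while (!finished) {
            if (interrupted) {       /* like returning -ERESTARTSYS */
                ret = -1;
                break;
            }
            pthread_cond_wait(&cond, &lock);
        }
        pthread_mutex_unlock(&lock);
        return ret;
    }

    static void *loader(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        finished = true;                 /* module finished loading (or failed) */
        pthread_cond_broadcast(&cond);   /* like wake_up_all(&module_wq) */
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, loader, NULL);
        printf("wait returned %d\n", wait_finished());
        pthread_join(t, NULL);
        return 0;
    }
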
@@ -3176,7 +3225,7 @@ out:
3176 3225
3177static int unknown_module_param_cb(char *param, char *val, const char *modname) 3226static int unknown_module_param_cb(char *param, char *val, const char *modname)
3178{ 3227{
3179 /* Check for magic 'dyndbg' arg */ 3228 /* Check for magic 'dyndbg' arg */
3180 int ret = ddebug_dyndbg_module_param_cb(param, val, modname); 3229 int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
3181 if (ret != 0) 3230 if (ret != 0)
3182 pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); 3231 pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
@@ -3326,6 +3375,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
3326 /* Unlink carefully: kallsyms could be walking list. */ 3375 /* Unlink carefully: kallsyms could be walking list. */
3327 list_del_rcu(&mod->list); 3376 list_del_rcu(&mod->list);
3328 wake_up_all(&module_wq); 3377 wake_up_all(&module_wq);
3378 /* Wait for RCU synchronizing before releasing mod->list. */
3379 synchronize_rcu();
3329 mutex_unlock(&module_mutex); 3380 mutex_unlock(&module_mutex);
3330 free_module: 3381 free_module:
3331 module_deallocate(mod, info); 3382 module_deallocate(mod, info);
@@ -3659,8 +3710,8 @@ static int m_show(struct seq_file *m, void *p)
3659 3710
3660 /* Informative for users. */ 3711 /* Informative for users. */
3661 seq_printf(m, " %s", 3712 seq_printf(m, " %s",
3662 mod->state == MODULE_STATE_GOING ? "Unloading": 3713 mod->state == MODULE_STATE_GOING ? "Unloading" :
3663 mod->state == MODULE_STATE_COMING ? "Loading": 3714 mod->state == MODULE_STATE_COMING ? "Loading" :
3664 "Live"); 3715 "Live");
3665 /* Used by oprofile and other similar tools. */ 3716 /* Used by oprofile and other similar tools. */
3666 seq_printf(m, " 0x%pK", mod->module_core); 3717 seq_printf(m, " 0x%pK", mod->module_core);
@@ -3669,7 +3720,7 @@ static int m_show(struct seq_file *m, void *p)
3669 if (mod->taints) 3720 if (mod->taints)
3670 seq_printf(m, " %s", module_flags(mod, buf)); 3721 seq_printf(m, " %s", module_flags(mod, buf));
3671 3722
3672 seq_printf(m, "\n"); 3723 seq_puts(m, "\n");
3673 return 0; 3724 return 0;
3674} 3725}
3675 3726
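
A recurring mechanical change in this file is seq_printf() calls with no conversion specifiers becoming seq_puts(): puts-style output does not re-parse the text as a format string, so it is cheaper and cannot be tripped up by a stray '%'. The same distinction in plain stdio terms:

    #include <stdio.h>

    int main(void)
    {
        const char *msg = "100% done,";      /* note the '%' */

        /* Passing text as the format string re-parses it; a bare
         * printf(msg) would misread the '%'. That re-parsing is what
         * seq_puts() avoids for literal strings. */
        printf("%s\n", msg);                 /* safe: text goes through "%s" */
        fputs(msg, stdout);                  /* what seq_puts() does: copy the bytes */
        fputs("\n", stdout);
        return 0;
    }
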
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ef42d0ab3115..49746c81ad8d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p)
220 220
221SYSCALL_DEFINE2(setns, int, fd, int, nstype) 221SYSCALL_DEFINE2(setns, int, fd, int, nstype)
222{ 222{
223 const struct proc_ns_operations *ops;
224 struct task_struct *tsk = current; 223 struct task_struct *tsk = current;
225 struct nsproxy *new_nsproxy; 224 struct nsproxy *new_nsproxy;
226 struct proc_ns *ei;
227 struct file *file; 225 struct file *file;
226 struct ns_common *ns;
228 int err; 227 int err;
229 228
230 file = proc_ns_fget(fd); 229 file = proc_ns_fget(fd);
@@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
232 return PTR_ERR(file); 231 return PTR_ERR(file);
233 232
234 err = -EINVAL; 233 err = -EINVAL;
235 ei = get_proc_ns(file_inode(file)); 234 ns = get_proc_ns(file_inode(file));
236 ops = ei->ns_ops; 235 if (nstype && (ns->ops->type != nstype))
237 if (nstype && (ops->type != nstype))
238 goto out; 236 goto out;
239 237
240 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); 238 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
@@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
243 goto out; 241 goto out;
244 } 242 }
245 243
246 err = ops->install(new_nsproxy, ei->ns); 244 err = ns->ops->install(new_nsproxy, ns);
247 if (err) { 245 if (err) {
248 free_nsproxy(new_nsproxy); 246 free_nsproxy(new_nsproxy);
249 goto out; 247 goto out;
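
The setns() path above now works on the embedded struct ns_common and its ops pointer instead of the old proc_ns wrapper, so the flow is fd -> ns_common -> ops->install, with the nstype argument only adding a sanity check on the fd's type. The userspace interface is unchanged; a minimal caller (typically needs CAP_SYS_ADMIN, error handling trimmed) looks roughly like:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        char path[64];
        int fd;

        if (argc < 2) {
            fprintf(stderr, "usage: %s <pid>\n", argv[0]);
            return 1;
        }

        /* Open a namespace handle of an existing task... */
        snprintf(path, sizeof(path), "/proc/%s/ns/pid", argv[1]);
        fd = open(path, O_RDONLY);
        if (fd < 0) {
            perror("open");
            return 1;
        }

        /* ...and ask the kernel to install it. Passing 0 skips the type
         * check; CLONE_NEWPID makes the kernel verify the fd really is a
         * pid-namespace handle. */
        if (setns(fd, CLONE_NEWPID) < 0)
            perror("setns");
        close(fd);
        return 0;
    }
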
diff --git a/kernel/panic.c b/kernel/panic.c
index cf80672b7924..4d8d6f906dec 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,6 +33,7 @@ static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35static bool crash_kexec_post_notifiers; 35static bool crash_kexec_post_notifiers;
36int panic_on_warn __read_mostly;
36 37
37int panic_timeout = CONFIG_PANIC_TIMEOUT; 38int panic_timeout = CONFIG_PANIC_TIMEOUT;
38EXPORT_SYMBOL_GPL(panic_timeout); 39EXPORT_SYMBOL_GPL(panic_timeout);
@@ -428,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
428 if (args) 429 if (args)
429 vprintk(args->fmt, args->args); 430 vprintk(args->fmt, args->args);
430 431
432 if (panic_on_warn) {
433 /*
434 * This thread may hit another WARN() in the panic path.
435 * Resetting this prevents additional WARN() from panicking the
436 * system on this thread. Other threads are blocked by the
437 * panic_mutex in panic().
438 */
439 panic_on_warn = 0;
440 panic("panic_on_warn set ...\n");
441 }
442
431 print_modules(); 443 print_modules();
432 dump_stack(); 444 dump_stack();
433 print_oops_end_marker(); 445 print_oops_end_marker();
@@ -485,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
485 497
486core_param(panic, panic_timeout, int, 0644); 498core_param(panic, panic_timeout, int, 0644);
487core_param(pause_on_oops, pause_on_oops, int, 0644); 499core_param(pause_on_oops, pause_on_oops, int, 0644);
500core_param(panic_on_warn, panic_on_warn, int, 0644);
488 501
489static int __init setup_crash_kexec_post_notifiers(char *s) 502static int __init setup_crash_kexec_post_notifiers(char *s)
490{ 503{
diff --git a/kernel/params.c b/kernel/params.c
index db97b791390f..728e05b167de 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -603,74 +603,70 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
603 const struct kernel_param *kp, 603 const struct kernel_param *kp,
604 const char *name) 604 const char *name)
605{ 605{
606 struct module_param_attrs *new; 606 struct module_param_attrs *new_mp;
607 struct attribute **attrs; 607 struct attribute **new_attrs;
608 int err, num; 608 unsigned int i;
609 609
610 /* We don't bother calling this with invisible parameters. */ 610 /* We don't bother calling this with invisible parameters. */
611 BUG_ON(!kp->perm); 611 BUG_ON(!kp->perm);
612 612
613 if (!mk->mp) { 613 if (!mk->mp) {
614 num = 0; 614 /* First allocation. */
615 attrs = NULL; 615 mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL);
616 } else { 616 if (!mk->mp)
617 num = mk->mp->num; 617 return -ENOMEM;
618 attrs = mk->mp->grp.attrs; 618 mk->mp->grp.name = "parameters";
619 /* NULL-terminated attribute array. */
620 mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]),
621 GFP_KERNEL);
622 /* Caller will cleanup via free_module_param_attrs */
623 if (!mk->mp->grp.attrs)
624 return -ENOMEM;
619 } 625 }
620 626
621 /* Enlarge. */ 627 /* Enlarge allocations. */
622 new = krealloc(mk->mp, 628 new_mp = krealloc(mk->mp,
623 sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), 629 sizeof(*mk->mp) +
624 GFP_KERNEL); 630 sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
625 if (!new) { 631 GFP_KERNEL);
626 kfree(attrs); 632 if (!new_mp)
627 err = -ENOMEM; 633 return -ENOMEM;
628 goto fail; 634 mk->mp = new_mp;
629 }
630 /* Despite looking like the typical realloc() bug, this is safe.
631 * We *want* the old 'attrs' to be freed either way, and we'll store
632 * the new one in the success case. */
633 attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
634 if (!attrs) {
635 err = -ENOMEM;
636 goto fail_free_new;
637 }
638 635
639 /* Sysfs wants everything zeroed. */ 636 /* Extra pointer for NULL terminator */
640 memset(new, 0, sizeof(*new)); 637 new_attrs = krealloc(mk->mp->grp.attrs,
641 memset(&new->attrs[num], 0, sizeof(new->attrs[num])); 638 sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
642 memset(&attrs[num], 0, sizeof(attrs[num])); 639 GFP_KERNEL);
643 new->grp.name = "parameters"; 640 if (!new_attrs)
644 new->grp.attrs = attrs; 641 return -ENOMEM;
642 mk->mp->grp.attrs = new_attrs;
645 643
646 /* Tack new one on the end. */ 644 /* Tack new one on the end. */
647 sysfs_attr_init(&new->attrs[num].mattr.attr); 645 memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0]));
648 new->attrs[num].param = kp; 646 sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
649 new->attrs[num].mattr.show = param_attr_show; 647 mk->mp->attrs[mk->mp->num].param = kp;
650 new->attrs[num].mattr.store = param_attr_store; 648 mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
651 new->attrs[num].mattr.attr.name = (char *)name; 649 /* Do not allow runtime DAC changes to make param writable. */
652 new->attrs[num].mattr.attr.mode = kp->perm; 650 if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
653 new->num = num+1; 651 mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
652 else
653 mk->mp->attrs[mk->mp->num].mattr.store = NULL;
654 mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
655 mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
656 mk->mp->num++;
654 657
655 /* Fix up all the pointers, since krealloc can move us */ 658 /* Fix up all the pointers, since krealloc can move us */
656 for (num = 0; num < new->num; num++) 659 for (i = 0; i < mk->mp->num; i++)
657 new->grp.attrs[num] = &new->attrs[num].mattr.attr; 660 mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr;
658 new->grp.attrs[num] = NULL; 661 mk->mp->grp.attrs[mk->mp->num] = NULL;
659
660 mk->mp = new;
661 return 0; 662 return 0;
662
663fail_free_new:
664 kfree(new);
665fail:
666 mk->mp = NULL;
667 return err;
668} 663}
669 664
670#ifdef CONFIG_MODULES 665#ifdef CONFIG_MODULES
671static void free_module_param_attrs(struct module_kobject *mk) 666static void free_module_param_attrs(struct module_kobject *mk)
672{ 667{
673 kfree(mk->mp->grp.attrs); 668 if (mk->mp)
669 kfree(mk->mp->grp.attrs);
674 kfree(mk->mp); 670 kfree(mk->mp);
675 mk->mp = NULL; 671 mk->mp = NULL;
676} 672}
@@ -695,8 +691,10 @@ int module_param_sysfs_setup(struct module *mod,
695 if (kparam[i].perm == 0) 691 if (kparam[i].perm == 0)
696 continue; 692 continue;
697 err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); 693 err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
698 if (err) 694 if (err) {
695 free_module_param_attrs(&mod->mkobj);
699 return err; 696 return err;
697 }
700 params = true; 698 params = true;
701 } 699 }
702 700
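
The add_sysfs_param() rewrite above grows mk->mp and its NULL-terminated grp.attrs array in place with krealloc(), lets free_module_param_attrs() clean up on failure instead of juggling temporaries, and refuses to install a ->store hook for parameters whose permissions are not writable. The same grow-a-NULL-terminated-pointer-array pattern in userspace C (names are illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    struct param_table {
        size_t num;
        const char **attrs;      /* num entries plus a trailing NULL, like grp.attrs */
    };

    /* Append one name, keeping the array NULL-terminated throughout. */
    static int add_param(struct param_table *t, const char *name)
    {
        const char **grown;

        /* One extra slot for the new entry, one for the NULL terminator. */
        grown = realloc(t->attrs, (t->num + 2) * sizeof(*grown));
        if (!grown)
            return -1;           /* caller frees whatever we already built */
        t->attrs = grown;

        t->attrs[t->num++] = name;
        t->attrs[t->num] = NULL;
        return 0;
    }

    int main(void)
    {
        struct param_table t = { 0, NULL };
        const char **p;

        add_param(&t, "debug");
        add_param(&t, "verbose");
        for (p = t.attrs; p && *p; p++)
            printf("param: %s\n", *p);
        free(t.attrs);
        return 0;
    }
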
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a26698144..cd36a5e0d173 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = {
79 .level = 0, 79 .level = 0,
80 .child_reaper = &init_task, 80 .child_reaper = &init_task,
81 .user_ns = &init_user_ns, 81 .user_ns = &init_user_ns,
82 .proc_inum = PROC_PID_INIT_INO, 82 .ns.inum = PROC_PID_INIT_INO,
83#ifdef CONFIG_PID_NS
84 .ns.ops = &pidns_operations,
85#endif
83}; 86};
84EXPORT_SYMBOL_GPL(init_pid_ns); 87EXPORT_SYMBOL_GPL(init_pid_ns);
85 88
@@ -341,6 +344,8 @@ out:
341 344
342out_unlock: 345out_unlock:
343 spin_unlock_irq(&pidmap_lock); 346 spin_unlock_irq(&pidmap_lock);
347 put_pid_ns(ns);
348
344out_free: 349out_free:
345 while (++i <= ns->level) 350 while (++i <= ns->level)
346 free_pidmap(pid->numbers + i); 351 free_pidmap(pid->numbers + i);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8eb761b..a65ba137fd15 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
105 if (ns->pid_cachep == NULL) 105 if (ns->pid_cachep == NULL)
106 goto out_free_map; 106 goto out_free_map;
107 107
108 err = proc_alloc_inum(&ns->proc_inum); 108 err = ns_alloc_inum(&ns->ns);
109 if (err) 109 if (err)
110 goto out_free_map; 110 goto out_free_map;
111 ns->ns.ops = &pidns_operations;
111 112
112 kref_init(&ns->kref); 113 kref_init(&ns->kref);
113 ns->level = level; 114 ns->level = level;
@@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
142{ 143{
143 int i; 144 int i;
144 145
145 proc_free_inum(ns->proc_inum); 146 ns_free_inum(&ns->ns);
146 for (i = 0; i < PIDMAP_ENTRIES; i++) 147 for (i = 0; i < PIDMAP_ENTRIES; i++)
147 kfree(ns->pidmap[i].page); 148 kfree(ns->pidmap[i].page);
148 put_user_ns(ns->user_ns); 149 put_user_ns(ns->user_ns);
@@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
190 /* Don't allow any more processes into the pid namespace */ 191 /* Don't allow any more processes into the pid namespace */
191 disable_pid_allocation(pid_ns); 192 disable_pid_allocation(pid_ns);
192 193
193 /* Ignore SIGCHLD causing any terminated children to autoreap */ 194 /*
195 * Ignore SIGCHLD causing any terminated children to autoreap.
196 * This speeds up the namespace shutdown, plus see the comment
197 * below.
198 */
194 spin_lock_irq(&me->sighand->siglock); 199 spin_lock_irq(&me->sighand->siglock);
195 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; 200 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
196 spin_unlock_irq(&me->sighand->siglock); 201 spin_unlock_irq(&me->sighand->siglock);
@@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
223 } 228 }
224 read_unlock(&tasklist_lock); 229 read_unlock(&tasklist_lock);
225 230
226 /* Firstly reap the EXIT_ZOMBIE children we may have. */ 231 /*
232 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
233 * sys_wait4() will also block until our children traced from the
234 * parent namespace are detached and become EXIT_DEAD.
235 */
227 do { 236 do {
228 clear_thread_flag(TIF_SIGPENDING); 237 clear_thread_flag(TIF_SIGPENDING);
229 rc = sys_wait4(-1, NULL, __WALL, NULL); 238 rc = sys_wait4(-1, NULL, __WALL, NULL);
230 } while (rc != -ECHILD); 239 } while (rc != -ECHILD);
231 240
232 /* 241 /*
233 * sys_wait4() above can't reap the TASK_DEAD children. 242 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
234 * Make sure they all go away, see free_pid(). 243 * really care, we could reparent them to the global init. We could
244 * exit and reap ->child_reaper even if it is not the last thread in
245 * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
246 * pid_ns can not go away until proc_kill_sb() drops the reference.
247 *
248 * But this ns can also have other tasks injected by setns()+fork().
249 * Again, ignoring the user visible semantics we do not really need
250 * to wait until they are all reaped, but they can be reparented to
251 * us and thus we need to ensure that pid->child_reaper stays valid
252 * until they all go away. See free_pid()->wake_up_process().
253 *
254 * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
255 * if reparented.
235 */ 256 */
236 for (;;) { 257 for (;;) {
237 set_current_state(TASK_UNINTERRUPTIBLE); 258 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
313 return 0; 334 return 0;
314} 335}
315 336
316static void *pidns_get(struct task_struct *task) 337static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
338{
339 return container_of(ns, struct pid_namespace, ns);
340}
341
342static struct ns_common *pidns_get(struct task_struct *task)
317{ 343{
318 struct pid_namespace *ns; 344 struct pid_namespace *ns;
319 345
@@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task)
323 get_pid_ns(ns); 349 get_pid_ns(ns);
324 rcu_read_unlock(); 350 rcu_read_unlock();
325 351
326 return ns; 352 return ns ? &ns->ns : NULL;
327} 353}
328 354
329static void pidns_put(void *ns) 355static void pidns_put(struct ns_common *ns)
330{ 356{
331 put_pid_ns(ns); 357 put_pid_ns(to_pid_ns(ns));
332} 358}
333 359
334static int pidns_install(struct nsproxy *nsproxy, void *ns) 360static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
335{ 361{
336 struct pid_namespace *active = task_active_pid_ns(current); 362 struct pid_namespace *active = task_active_pid_ns(current);
337 struct pid_namespace *ancestor, *new = ns; 363 struct pid_namespace *ancestor, *new = to_pid_ns(ns);
338 364
339 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || 365 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
340 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 366 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
362 return 0; 388 return 0;
363} 389}
364 390
365static unsigned int pidns_inum(void *ns)
366{
367 struct pid_namespace *pid_ns = ns;
368 return pid_ns->proc_inum;
369}
370
371const struct proc_ns_operations pidns_operations = { 391const struct proc_ns_operations pidns_operations = {
372 .name = "pid", 392 .name = "pid",
373 .type = CLONE_NEWPID, 393 .type = CLONE_NEWPID,
374 .get = pidns_get, 394 .get = pidns_get,
375 .put = pidns_put, 395 .put = pidns_put,
376 .install = pidns_install, 396 .install = pidns_install,
377 .inum = pidns_inum,
378}; 397};
379 398
380static __init int pid_namespaces_init(void) 399static __init int pid_namespaces_init(void)
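
In the pid-namespace conversion above, the proc_ns_operations callbacks now traffic in struct ns_common * rather than void *; each namespace type converts back to its own structure with a container_of() helper (to_pid_ns()), and the per-type inum() hook disappears because the inode number lives in ns_common itself. container_of() is just pointer arithmetic over offsetof(); a standalone rendition with made-up field values:

    #include <stddef.h>
    #include <stdio.h>

    /* Minimal rendition of the kernel's container_of(). */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct ns_common {               /* the embedded, type-agnostic part */
        unsigned int inum;
    };

    struct pid_namespace {           /* the containing, type-specific object */
        int level;
        struct ns_common ns;
    };

    static struct pid_namespace *to_pid_ns(struct ns_common *ns)
    {
        return container_of(ns, struct pid_namespace, ns);
    }

    int main(void)
    {
        struct pid_namespace pidns = { .level = 2, .ns = { .inum = 1234 } };
        struct ns_common *generic = &pidns.ns;   /* what the generic code passes around */

        printf("level=%d inum=%u\n", to_pid_ns(generic)->level, generic->inum);
        return 0;
    }
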
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index bbef57f5bdfd..48b28d387c7f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -94,6 +94,7 @@ config PM_STD_PARTITION
94config PM_SLEEP 94config PM_SLEEP
95 def_bool y 95 def_bool y
96 depends on SUSPEND || HIBERNATE_CALLBACKS 96 depends on SUSPEND || HIBERNATE_CALLBACKS
97 select PM
97 98
98config PM_SLEEP_SMP 99config PM_SLEEP_SMP
99 def_bool y 100 def_bool y
@@ -129,24 +130,19 @@ config PM_WAKELOCKS_GC
129 depends on PM_WAKELOCKS 130 depends on PM_WAKELOCKS
130 default y 131 default y
131 132
132config PM_RUNTIME 133config PM
133 bool "Run-time PM core functionality" 134 bool "Device power management core functionality"
134 depends on !IA64_HP_SIM
135 ---help--- 135 ---help---
136 Enable functionality allowing I/O devices to be put into energy-saving 136 Enable functionality allowing I/O devices to be put into energy-saving
137 (low power) states at run time (or autosuspended) after a specified 137 (low power) states, for example after a specified period of inactivity
138 period of inactivity and woken up in response to a hardware-generated 138 (autosuspended), and woken up in response to a hardware-generated
139 wake-up event or a driver's request. 139 wake-up event or a driver's request.
140 140
141 Hardware support is generally required for this functionality to work 141 Hardware support is generally required for this functionality to work
142 and the bus type drivers of the buses the devices are on are 142 and the bus type drivers of the buses the devices are on are
143 responsible for the actual handling of the autosuspend requests and 143 responsible for the actual handling of device suspend requests and
144 wake-up events. 144 wake-up events.
145 145
146config PM
147 def_bool y
148 depends on PM_SLEEP || PM_RUNTIME
149
150config PM_DEBUG 146config PM_DEBUG
151 bool "Power Management Debug Support" 147 bool "Power Management Debug Support"
152 depends on PM 148 depends on PM
@@ -298,14 +294,9 @@ config PM_GENERIC_DOMAINS_SLEEP
298 def_bool y 294 def_bool y
299 depends on PM_SLEEP && PM_GENERIC_DOMAINS 295 depends on PM_SLEEP && PM_GENERIC_DOMAINS
300 296
301config PM_GENERIC_DOMAINS_RUNTIME
302 def_bool y
303 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
304
305config PM_GENERIC_DOMAINS_OF 297config PM_GENERIC_DOMAINS_OF
306 def_bool y 298 def_bool y
307 depends on PM_GENERIC_DOMAINS && OF 299 depends on PM_GENERIC_DOMAINS && OF
308 300
309config CPU_PM 301config CPU_PM
310 bool 302 bool
311 depends on SUSPEND || CPU_IDLE
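
The Kconfig change above folds CONFIG_PM_RUNTIME into CONFIG_PM: PM becomes the user-visible option with the merged help text, PM_SLEEP selects it, and PM_GENERIC_DOMAINS_RUNTIME goes away. For code that previously keyed off the runtime-PM symbol, the conditional compilation simply moves to the surviving symbol, roughly:

    #include <stdio.h>

    /* Pretend we are building with power management enabled. */
    #define CONFIG_PM 1

    /*
     * Previously this would have been guarded by CONFIG_PM_RUNTIME; after
     * the Kconfig consolidation the same code builds whenever CONFIG_PM is
     * set.
     */
    #ifdef CONFIG_PM
    static void runtime_suspend_hook(void)
    {
        puts("runtime PM callbacks compiled in");
    }
    #else
    static void runtime_suspend_hook(void) { }
    #endif

    int main(void)
    {
        runtime_suspend_hook();
        return 0;
    }
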
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1f35a3478f3c..2329daae5255 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,6 +28,7 @@
28#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/genhd.h> 30#include <linux/genhd.h>
31#include <linux/ktime.h>
31#include <trace/events/power.h> 32#include <trace/events/power.h>
32 33
33#include "power.h" 34#include "power.h"
@@ -232,20 +233,17 @@ static void platform_recover(int platform_mode)
232 * @nr_pages: Number of memory pages processed between @start and @stop. 233 * @nr_pages: Number of memory pages processed between @start and @stop.
233 * @msg: Additional diagnostic message to print. 234 * @msg: Additional diagnostic message to print.
234 */ 235 */
235void swsusp_show_speed(struct timeval *start, struct timeval *stop, 236void swsusp_show_speed(ktime_t start, ktime_t stop,
236 unsigned nr_pages, char *msg) 237 unsigned nr_pages, char *msg)
237{ 238{
239 ktime_t diff;
238 u64 elapsed_centisecs64; 240 u64 elapsed_centisecs64;
239 unsigned int centisecs; 241 unsigned int centisecs;
240 unsigned int k; 242 unsigned int k;
241 unsigned int kps; 243 unsigned int kps;
242 244
243 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); 245 diff = ktime_sub(stop, start);
244 /* 246 elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC);
245 * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
246 * it is obvious enough for what went wrong.
247 */
248 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
249 centisecs = elapsed_centisecs64; 247 centisecs = elapsed_centisecs64;
250 if (centisecs == 0) 248 if (centisecs == 0)
251 centisecs = 1; /* avoid div-by-zero */ 249 centisecs = 1; /* avoid div-by-zero */
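
swsusp_show_speed() and the image save/load timing switch from struct timeval plus do_gettimeofday() to ktime_t with ktime_get()/ktime_sub()/ktime_divns(), i.e. a monotonic clock and 64-bit nanosecond arithmetic. The equivalent userspace measurement, assuming POSIX clock_gettime():

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    #define NSEC_PER_SEC  1000000000LL
    #define NSEC_PER_MSEC 1000000LL

    static int64_t now_ns(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);     /* monotonic, like ktime_get() */
        return (int64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
    }

    int main(void)
    {
        int64_t start, stop, centisecs;
        unsigned long nr_pages = 12345;          /* pretend we copied this many pages */
        volatile unsigned long spin;

        start = now_ns();
        for (spin = 0; spin < 50000000UL; spin++)
            ;                                    /* stand-in for the real work */
        stop = now_ns();

        centisecs = (stop - start) / (10 * NSEC_PER_MSEC);   /* like ktime_divns() */
        if (centisecs == 0)
            centisecs = 1;                       /* avoid div-by-zero, as the kernel does */
        printf("%lu pages in %lld.%02lld seconds\n",
               nr_pages, (long long)(centisecs / 100), (long long)(centisecs % 100));
        return 0;
    }
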
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2df883a9d3cb..ce9b8328a689 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain);
174 174
175struct timeval; 175struct timeval;
176/* kernel/power/swsusp.c */ 176/* kernel/power/swsusp.c */
177extern void swsusp_show_speed(struct timeval *, struct timeval *, 177extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
178 unsigned int, char *);
179 178
180#ifdef CONFIG_SUSPEND 179#ifdef CONFIG_SUSPEND
181/* kernel/power/suspend.c */ 180/* kernel/power/suspend.c */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 791a61892bb5..0c40c16174b4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -28,6 +28,7 @@
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/ktime.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -1576,11 +1577,11 @@ int hibernate_preallocate_memory(void)
1576 struct zone *zone; 1577 struct zone *zone;
1577 unsigned long saveable, size, max_size, count, highmem, pages = 0; 1578 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1578 unsigned long alloc, save_highmem, pages_highmem, avail_normal; 1579 unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1579 struct timeval start, stop; 1580 ktime_t start, stop;
1580 int error; 1581 int error;
1581 1582
1582 printk(KERN_INFO "PM: Preallocating image memory... "); 1583 printk(KERN_INFO "PM: Preallocating image memory... ");
1583 do_gettimeofday(&start); 1584 start = ktime_get();
1584 1585
1585 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); 1586 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1586 if (error) 1587 if (error)
@@ -1709,9 +1710,9 @@ int hibernate_preallocate_memory(void)
1709 free_unnecessary_pages(); 1710 free_unnecessary_pages();
1710 1711
1711 out: 1712 out:
1712 do_gettimeofday(&stop); 1713 stop = ktime_get();
1713 printk(KERN_CONT "done (allocated %lu pages)\n", pages); 1714 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1714 swsusp_show_speed(&start, &stop, pages, "Allocated"); 1715 swsusp_show_speed(start, stop, pages, "Allocated");
1715 1716
1716 return 0; 1717 return 0;
1717 1718
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index aaa3261dea5d..570aff817543 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,6 +30,7 @@
30#include <linux/atomic.h> 30#include <linux/atomic.h>
31#include <linux/kthread.h> 31#include <linux/kthread.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/ktime.h>
33 34
34#include "power.h" 35#include "power.h"
35 36
@@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle,
445 int nr_pages; 446 int nr_pages;
446 int err2; 447 int err2;
447 struct bio *bio; 448 struct bio *bio;
448 struct timeval start; 449 ktime_t start;
449 struct timeval stop; 450 ktime_t stop;
450 451
451 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", 452 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
452 nr_to_write); 453 nr_to_write);
@@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle,
455 m = 1; 456 m = 1;
456 nr_pages = 0; 457 nr_pages = 0;
457 bio = NULL; 458 bio = NULL;
458 do_gettimeofday(&start); 459 start = ktime_get();
459 while (1) { 460 while (1) {
460 ret = snapshot_read_next(snapshot); 461 ret = snapshot_read_next(snapshot);
461 if (ret <= 0) 462 if (ret <= 0)
@@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle,
469 nr_pages++; 470 nr_pages++;
470 } 471 }
471 err2 = hib_wait_on_bio_chain(&bio); 472 err2 = hib_wait_on_bio_chain(&bio);
472 do_gettimeofday(&stop); 473 stop = ktime_get();
473 if (!ret) 474 if (!ret)
474 ret = err2; 475 ret = err2;
475 if (!ret) 476 if (!ret)
476 printk(KERN_INFO "PM: Image saving done.\n"); 477 printk(KERN_INFO "PM: Image saving done.\n");
477 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 478 swsusp_show_speed(start, stop, nr_to_write, "Wrote");
478 return ret; 479 return ret;
479} 480}
480 481
@@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
580 int nr_pages; 581 int nr_pages;
581 int err2; 582 int err2;
582 struct bio *bio; 583 struct bio *bio;
583 struct timeval start; 584 ktime_t start;
584 struct timeval stop; 585 ktime_t stop;
585 size_t off; 586 size_t off;
586 unsigned thr, run_threads, nr_threads; 587 unsigned thr, run_threads, nr_threads;
587 unsigned char *page = NULL; 588 unsigned char *page = NULL;
@@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
674 m = 1; 675 m = 1;
675 nr_pages = 0; 676 nr_pages = 0;
676 bio = NULL; 677 bio = NULL;
677 do_gettimeofday(&start); 678 start = ktime_get();
678 for (;;) { 679 for (;;) {
679 for (thr = 0; thr < nr_threads; thr++) { 680 for (thr = 0; thr < nr_threads; thr++) {
680 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 681 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
@@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
759 760
760out_finish: 761out_finish:
761 err2 = hib_wait_on_bio_chain(&bio); 762 err2 = hib_wait_on_bio_chain(&bio);
762 do_gettimeofday(&stop); 763 stop = ktime_get();
763 if (!ret) 764 if (!ret)
764 ret = err2; 765 ret = err2;
765 if (!ret) 766 if (!ret)
766 printk(KERN_INFO "PM: Image saving done.\n"); 767 printk(KERN_INFO "PM: Image saving done.\n");
767 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 768 swsusp_show_speed(start, stop, nr_to_write, "Wrote");
768out_clean: 769out_clean:
769 if (crc) { 770 if (crc) {
770 if (crc->thr) 771 if (crc->thr)
@@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle,
965{ 966{
966 unsigned int m; 967 unsigned int m;
967 int ret = 0; 968 int ret = 0;
968 struct timeval start; 969 ktime_t start;
969 struct timeval stop; 970 ktime_t stop;
970 struct bio *bio; 971 struct bio *bio;
971 int err2; 972 int err2;
972 unsigned nr_pages; 973 unsigned nr_pages;
@@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle,
978 m = 1; 979 m = 1;
979 nr_pages = 0; 980 nr_pages = 0;
980 bio = NULL; 981 bio = NULL;
981 do_gettimeofday(&start); 982 start = ktime_get();
982 for ( ; ; ) { 983 for ( ; ; ) {
983 ret = snapshot_write_next(snapshot); 984 ret = snapshot_write_next(snapshot);
984 if (ret <= 0) 985 if (ret <= 0)
@@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle,
996 nr_pages++; 997 nr_pages++;
997 } 998 }
998 err2 = hib_wait_on_bio_chain(&bio); 999 err2 = hib_wait_on_bio_chain(&bio);
999 do_gettimeofday(&stop); 1000 stop = ktime_get();
1000 if (!ret) 1001 if (!ret)
1001 ret = err2; 1002 ret = err2;
1002 if (!ret) { 1003 if (!ret) {
@@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle,
1005 if (!snapshot_image_loaded(snapshot)) 1006 if (!snapshot_image_loaded(snapshot))
1006 ret = -ENODATA; 1007 ret = -ENODATA;
1007 } 1008 }
1008 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1009 swsusp_show_speed(start, stop, nr_to_read, "Read");
1009 return ret; 1010 return ret;
1010} 1011}
1011 1012
@@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
1067 int ret = 0; 1068 int ret = 0;
1068 int eof = 0; 1069 int eof = 0;
1069 struct bio *bio; 1070 struct bio *bio;
1070 struct timeval start; 1071 ktime_t start;
1071 struct timeval stop; 1072 ktime_t stop;
1072 unsigned nr_pages; 1073 unsigned nr_pages;
1073 size_t off; 1074 size_t off;
1074 unsigned i, thr, run_threads, nr_threads; 1075 unsigned i, thr, run_threads, nr_threads;
@@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
1190 m = 1; 1191 m = 1;
1191 nr_pages = 0; 1192 nr_pages = 0;
1192 bio = NULL; 1193 bio = NULL;
1193 do_gettimeofday(&start); 1194 start = ktime_get();
1194 1195
1195 ret = snapshot_write_next(snapshot); 1196 ret = snapshot_write_next(snapshot);
1196 if (ret <= 0) 1197 if (ret <= 0)
@@ -1343,7 +1344,7 @@ out_finish:
1343 wait_event(crc->done, atomic_read(&crc->stop)); 1344 wait_event(crc->done, atomic_read(&crc->stop));
1344 atomic_set(&crc->stop, 0); 1345 atomic_set(&crc->stop, 0);
1345 } 1346 }
1346 do_gettimeofday(&stop); 1347 stop = ktime_get();
1347 if (!ret) { 1348 if (!ret) {
1348 printk(KERN_INFO "PM: Image loading done.\n"); 1349 printk(KERN_INFO "PM: Image loading done.\n");
1349 snapshot_write_finalize(snapshot); 1350 snapshot_write_finalize(snapshot);
@@ -1359,7 +1360,7 @@ out_finish:
1359 } 1360 }
1360 } 1361 }
1361 } 1362 }
1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1363 swsusp_show_speed(start, stop, nr_to_read, "Read");
1363out_clean: 1364out_clean:
1364 for (i = 0; i < ring_size; i++) 1365 for (i = 0; i < ring_size; i++)
1365 free_page((unsigned long)page[i]); 1366 free_page((unsigned long)page[i]);
@@ -1374,7 +1375,7 @@ out_clean:
1374 kthread_stop(data[thr].thr); 1375 kthread_stop(data[thr].thr);
1375 vfree(data); 1376 vfree(data);
1376 } 1377 }
1377 if (page) vfree(page); 1378 vfree(page);
1378 1379
1379 return ret; 1380 return ret;
1380} 1381}
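
Besides the ktime conversion, the cleanup in load_image_lzo() drops the `if (page)` guard because vfree(NULL) is defined as a no-op, mirroring free(NULL) in userspace:

    #include <stdlib.h>

    int main(void)
    {
        char *page = NULL;

        /* No guard needed: free(NULL), like the kernel's vfree(NULL) and
         * kfree(NULL), does nothing, so error paths can free unconditionally. */
        free(page);
        return 0;
    }
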
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ced2b84b1cb7..02d6b6d28796 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -62,9 +62,6 @@ int console_printk[4] = {
62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ 62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
63}; 63};
64 64
65/* Deferred messaged from sched code are marked by this special level */
66#define SCHED_MESSAGE_LOGLEVEL -2
67
68/* 65/*
69 * Low level drivers may need that to know if they can schedule in 66 * Low level drivers may need that to know if they can schedule in
70 * their unblank() callback or not. So let's export it. 67 * their unblank() callback or not. So let's export it.
@@ -480,7 +477,7 @@ static int syslog_action_restricted(int type)
480 type != SYSLOG_ACTION_SIZE_BUFFER; 477 type != SYSLOG_ACTION_SIZE_BUFFER;
481} 478}
482 479
483static int check_syslog_permissions(int type, bool from_file) 480int check_syslog_permissions(int type, bool from_file)
484{ 481{
485 /* 482 /*
486 * If this is from /proc/kmsg and we've already opened it, then we've 483 * If this is from /proc/kmsg and we've already opened it, then we've
@@ -1259,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1259int do_syslog(int type, char __user *buf, int len, bool from_file) 1256int do_syslog(int type, char __user *buf, int len, bool from_file)
1260{ 1257{
1261 bool clear = false; 1258 bool clear = false;
1262 static int saved_console_loglevel = -1; 1259 static int saved_console_loglevel = LOGLEVEL_DEFAULT;
1263 int error; 1260 int error;
1264 1261
1265 error = check_syslog_permissions(type, from_file); 1262 error = check_syslog_permissions(type, from_file);
@@ -1316,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1316 break; 1313 break;
1317 /* Disable logging to console */ 1314 /* Disable logging to console */
1318 case SYSLOG_ACTION_CONSOLE_OFF: 1315 case SYSLOG_ACTION_CONSOLE_OFF:
1319 if (saved_console_loglevel == -1) 1316 if (saved_console_loglevel == LOGLEVEL_DEFAULT)
1320 saved_console_loglevel = console_loglevel; 1317 saved_console_loglevel = console_loglevel;
1321 console_loglevel = minimum_console_loglevel; 1318 console_loglevel = minimum_console_loglevel;
1322 break; 1319 break;
1323 /* Enable logging to console */ 1320 /* Enable logging to console */
1324 case SYSLOG_ACTION_CONSOLE_ON: 1321 case SYSLOG_ACTION_CONSOLE_ON:
1325 if (saved_console_loglevel != -1) { 1322 if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
1326 console_loglevel = saved_console_loglevel; 1323 console_loglevel = saved_console_loglevel;
1327 saved_console_loglevel = -1; 1324 saved_console_loglevel = LOGLEVEL_DEFAULT;
1328 } 1325 }
1329 break; 1326 break;
1330 /* Set level of messages printed to console */ 1327 /* Set level of messages printed to console */
@@ -1336,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1336 len = minimum_console_loglevel; 1333 len = minimum_console_loglevel;
1337 console_loglevel = len; 1334 console_loglevel = len;
1338 /* Implicitly re-enable logging to console */ 1335 /* Implicitly re-enable logging to console */
1339 saved_console_loglevel = -1; 1336 saved_console_loglevel = LOGLEVEL_DEFAULT;
1340 error = 0; 1337 error = 0;
1341 break; 1338 break;
1342 /* Number of chars in the log buffer */ 1339 /* Number of chars in the log buffer */
@@ -1627,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level,
1627 int printed_len = 0; 1624 int printed_len = 0;
1628 bool in_sched = false; 1625 bool in_sched = false;
1629 /* cpu currently holding logbuf_lock in this function */ 1626 /* cpu currently holding logbuf_lock in this function */
1630 static volatile unsigned int logbuf_cpu = UINT_MAX; 1627 static unsigned int logbuf_cpu = UINT_MAX;
1631 1628
1632 if (level == SCHED_MESSAGE_LOGLEVEL) { 1629 if (level == LOGLEVEL_SCHED) {
1633 level = -1; 1630 level = LOGLEVEL_DEFAULT;
1634 in_sched = true; 1631 in_sched = true;
1635 } 1632 }
1636 1633
@@ -1695,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level,
1695 const char *end_of_header = printk_skip_level(text); 1692 const char *end_of_header = printk_skip_level(text);
1696 switch (kern_level) { 1693 switch (kern_level) {
1697 case '0' ... '7': 1694 case '0' ... '7':
1698 if (level == -1) 1695 if (level == LOGLEVEL_DEFAULT)
1699 level = kern_level - '0'; 1696 level = kern_level - '0';
1697 /* fallthrough */
1700 case 'd': /* KERN_DEFAULT */ 1698 case 'd': /* KERN_DEFAULT */
1701 lflags |= LOG_PREFIX; 1699 lflags |= LOG_PREFIX;
1702 } 1700 }
@@ -1710,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1710 } 1708 }
1711 } 1709 }
1712 1710
1713 if (level == -1) 1711 if (level == LOGLEVEL_DEFAULT)
1714 level = default_message_loglevel; 1712 level = default_message_loglevel;
1715 1713
1716 if (dict) 1714 if (dict)
@@ -1788,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit);
1788 1786
1789asmlinkage int vprintk(const char *fmt, va_list args) 1787asmlinkage int vprintk(const char *fmt, va_list args)
1790{ 1788{
1791 return vprintk_emit(0, -1, NULL, 0, fmt, args); 1789 return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1792} 1790}
1793EXPORT_SYMBOL(vprintk); 1791EXPORT_SYMBOL(vprintk);
1794 1792
@@ -1807,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level,
1807} 1805}
1808EXPORT_SYMBOL(printk_emit); 1806EXPORT_SYMBOL(printk_emit);
1809 1807
1808int vprintk_default(const char *fmt, va_list args)
1809{
1810 int r;
1811
1812#ifdef CONFIG_KGDB_KDB
1813 if (unlikely(kdb_trap_printk)) {
1814 r = vkdb_printf(fmt, args);
1815 return r;
1816 }
1817#endif
1818 r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1819
1820 return r;
1821}
1822EXPORT_SYMBOL_GPL(vprintk_default);
1823
1824/*
1825 * This allows printk to be diverted to another function per cpu.
1826 * This is useful for calling printk functions from within NMI
1827 * without worrying about race conditions that can lock up the
1828 * box.
1829 */
1830DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
1831
1810/** 1832/**
1811 * printk - print a kernel message 1833 * printk - print a kernel message
1812 * @fmt: format string 1834 * @fmt: format string
@@ -1830,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit);
1830 */ 1852 */
1831asmlinkage __visible int printk(const char *fmt, ...) 1853asmlinkage __visible int printk(const char *fmt, ...)
1832{ 1854{
1855 printk_func_t vprintk_func;
1833 va_list args; 1856 va_list args;
1834 int r; 1857 int r;
1835 1858
1836#ifdef CONFIG_KGDB_KDB
1837 if (unlikely(kdb_trap_printk)) {
1838 va_start(args, fmt);
1839 r = vkdb_printf(fmt, args);
1840 va_end(args);
1841 return r;
1842 }
1843#endif
1844 va_start(args, fmt); 1859 va_start(args, fmt);
1845 r = vprintk_emit(0, -1, NULL, 0, fmt, args); 1860
1861 /*
1862 * If a caller overrides the per_cpu printk_func, then it needs
1863 * to disable preemption when calling printk(). Otherwise
1864 * the printk_func should be set to the default. No need to
1865 * disable preemption here.
1866 */
1867 vprintk_func = this_cpu_read(printk_func);
1868 r = vprintk_func(fmt, args);
1869
1846 va_end(args); 1870 va_end(args);
1847 1871
1848 return r; 1872 return r;
@@ -1876,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
1876 bool syslog, char *buf, size_t size) { return 0; } 1900 bool syslog, char *buf, size_t size) { return 0; }
1877static size_t cont_print_text(char *text, size_t size) { return 0; } 1901static size_t cont_print_text(char *text, size_t size) { return 0; }
1878 1902
1903/* Still needs to be defined for users */
1904DEFINE_PER_CPU(printk_func_t, printk_func);
1905
1879#endif /* CONFIG_PRINTK */ 1906#endif /* CONFIG_PRINTK */
1880 1907
1881#ifdef CONFIG_EARLY_PRINTK 1908#ifdef CONFIG_EARLY_PRINTK
1882struct console *early_console; 1909struct console *early_console;
1883 1910
1884void early_vprintk(const char *fmt, va_list ap)
1885{
1886 if (early_console) {
1887 char buf[512];
1888 int n = vscnprintf(buf, sizeof(buf), fmt, ap);
1889
1890 early_console->write(early_console, buf, n);
1891 }
1892}
1893
1894asmlinkage __visible void early_printk(const char *fmt, ...) 1911asmlinkage __visible void early_printk(const char *fmt, ...)
1895{ 1912{
1896 va_list ap; 1913 va_list ap;
1914 char buf[512];
1915 int n;
1916
1917 if (!early_console)
1918 return;
1897 1919
1898 va_start(ap, fmt); 1920 va_start(ap, fmt);
1899 early_vprintk(fmt, ap); 1921 n = vscnprintf(buf, sizeof(buf), fmt, ap);
1900 va_end(ap); 1922 va_end(ap);
1923
1924 early_console->write(early_console, buf, n);
1901} 1925}
1902#endif 1926#endif
1903 1927
@@ -2634,7 +2658,7 @@ int printk_deferred(const char *fmt, ...)
2634 2658
2635 preempt_disable(); 2659 preempt_disable();
2636 va_start(args, fmt); 2660 va_start(args, fmt);
2637 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); 2661 r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
2638 va_end(args); 2662 va_end(args);
2639 2663
2640 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); 2664 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
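
In the printk changes above, printk() no longer hard-codes the kdb hook: it reads a per-CPU function pointer, printk_func, which defaults to vprintk_default() and, per the added comment, can be overridden (for example from NMI context) to divert output safely; the magic -1/-2 loglevels also become the named LOGLEVEL_DEFAULT/LOGLEVEL_SCHED constants. A userspace model of a per-context output hook with a sane default, using thread-local storage in place of per-CPU data (names are illustrative):

    #include <stdarg.h>
    #include <stdio.h>

    typedef int (*printk_func_t)(const char *fmt, va_list args);

    static int vprintk_default(const char *fmt, va_list args)
    {
        return vfprintf(stdout, fmt, args);
    }

    /* Stand-in for DEFINE_PER_CPU(printk_func_t, printk_func): one hook per thread. */
    static _Thread_local printk_func_t printk_func = vprintk_default;

    static int my_printk(const char *fmt, ...)
    {
        va_list args;
        int r;

        va_start(args, fmt);
        r = printk_func(fmt, args);      /* indirect call, normally the default path */
        va_end(args);
        return r;
    }

    /* A diverted sink, e.g. something that only records the message elsewhere. */
    static int vprintk_diverted(const char *fmt, va_list args)
    {
        return vfprintf(stderr, fmt, args);
    }

    int main(void)
    {
        my_printk("hello via the default path\n");

        printk_func = vprintk_diverted;  /* a caller overrides the hook... */
        my_printk("hello via the diverted path\n");
        printk_func = vprintk_default;   /* ...and restores it afterwards */
        return 0;
    }
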
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54e75226c2c4..1eb9d90c3af9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
485 485
486/* 486/*
487 * Detach all tasks we were using ptrace on. Called with tasklist held 487 * Detach all tasks we were using ptrace on. Called with tasklist held
488 * for writing, and returns with it held too. But note it can release 488 * for writing.
489 * and reacquire the lock.
490 */ 489 */
491void exit_ptrace(struct task_struct *tracer) 490void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
492 __releases(&tasklist_lock)
493 __acquires(&tasklist_lock)
494{ 491{
495 struct task_struct *p, *n; 492 struct task_struct *p, *n;
496 LIST_HEAD(ptrace_dead);
497
498 if (likely(list_empty(&tracer->ptraced)))
499 return;
500 493
501 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 494 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
502 if (unlikely(p->ptrace & PT_EXITKILL)) 495 if (unlikely(p->ptrace & PT_EXITKILL))
503 send_sig_info(SIGKILL, SEND_SIG_FORCED, p); 496 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
504 497
505 if (__ptrace_detach(tracer, p)) 498 if (__ptrace_detach(tracer, p))
506 list_add(&p->ptrace_entry, &ptrace_dead); 499 list_add(&p->ptrace_entry, dead);
507 }
508
509 write_unlock_irq(&tasklist_lock);
510 BUG_ON(!list_empty(&tracer->ptraced));
511
512 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
513 list_del_init(&p->ptrace_entry);
514 release_task(p);
515 } 500 }
516
517 write_lock_irq(&tasklist_lock);
518} 501}
519 502
520int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 503int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
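
exit_ptrace() above no longer drops and re-takes tasklist_lock itself: it is called with the lock held for writing, only unlinks the tracees onto a caller-supplied dead list, and the caller is then expected to run release_task() on them after the lock is dropped. That gather-under-the-lock, act-after-unlocking pattern in userspace terms:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct task {
        struct task *next;
        int pid;
    };

    static pthread_mutex_t tasklist_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct task *tracees;             /* protected by tasklist_lock */

    /* Phase 1: under the lock, only unlink and collect; do no heavy work. */
    static struct task *collect_dead(void)
    {
        struct task *dead;

        pthread_mutex_lock(&tasklist_lock);
        dead = tracees;                      /* steal the whole list */
        tracees = NULL;
        pthread_mutex_unlock(&tasklist_lock);
        return dead;
    }

    /* Phase 2: with the lock dropped, do the expensive release work. */
    static void release_dead(struct task *dead)
    {
        while (dead) {
            struct task *next = dead->next;

            printf("releasing tracee %d\n", dead->pid);
            free(dead);
            dead = next;
        }
    }

    int main(void)
    {
        for (int pid = 100; pid < 103; pid++) {
            struct task *t = malloc(sizeof(*t));

            if (!t)
                return 1;
            t->pid = pid;
            t->next = tracees;
            tracees = t;
        }
        release_dead(collect_dead());
        return 0;
    }
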
diff --git a/kernel/range.c b/kernel/range.c
index 322ea8e93e4b..82cfc285b046 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -113,12 +113,12 @@ static int cmp_range(const void *x1, const void *x2)
113{ 113{
114 const struct range *r1 = x1; 114 const struct range *r1 = x1;
115 const struct range *r2 = x2; 115 const struct range *r2 = x2;
116 s64 start1, start2;
117 116
118 start1 = r1->start; 117 if (r1->start < r2->start)
119 start2 = r2->start; 118 return -1;
120 119 if (r1->start > r2->start)
121 return start1 - start2; 120 return 1;
121 return 0;
122} 122}
123 123
124int clean_sort_range(struct range *range, int az) 124int clean_sort_range(struct range *range, int az)
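
The cmp_range() fix above addresses a classic comparator bug: the old code computed start1 - start2 as s64 and returned it as int, so large differences could truncate and even flip sign, handing sort() an inconsistent ordering; the fix returns an explicit -1/0/1. The same bug and fix in qsort() terms:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct range { uint64_t start, end; };

    /* Buggy: the 64-bit difference is truncated to int and may change sign. */
    static int cmp_range_buggy(const void *x1, const void *x2)
    {
        const struct range *r1 = x1, *r2 = x2;

        return (int)((int64_t)r1->start - (int64_t)r2->start);
    }

    /* Fixed: compare explicitly, return only -1, 0 or 1. */
    static int cmp_range(const void *x1, const void *x2)
    {
        const struct range *r1 = x1, *r2 = x2;

        if (r1->start < r2->start)
            return -1;
        if (r1->start > r2->start)
            return 1;
        return 0;
    }

    int main(void)
    {
        /* 0x100000000 - 1 truncates to -1 as a 32-bit int on typical targets,
         * so the "bigger" range compares as smaller. */
        struct range a = { 0x100000000ULL, 0 }, b = { 1, 0 };

        printf("buggy: %d\n", cmp_range_buggy(&a, &b));
        printf("fixed: %d\n", cmp_range(&a, &b));

        struct range v[] = { a, b };
        qsort(v, 2, sizeof(v[0]), cmp_range);
        printf("sorted first start = %#llx\n", (unsigned long long)v[0].start);
        return 0;
    }
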
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 807ccfbf69b3..e6fae503d1bc 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,6 +1,6 @@
1obj-y += update.o srcu.o 1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
3obj-$(CONFIG_TREE_RCU) += tree.o 3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o 4obj-$(CONFIG_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o 5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
6obj-$(CONFIG_TINY_RCU) += tiny.o 6obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index ff1a6de62f17..07bb02eda844 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -135,4 +135,6 @@ int rcu_jiffies_till_stall_check(void);
135 */ 135 */
136#define TPS(x) tracepoint_string(x) 136#define TPS(x) tracepoint_string(x)
137 137
138void rcu_early_boot_tests(void);
139
138#endif /* __LINUX_RCU_H */ 140#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 240fa9094f83..4d559baf06e0 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -812,6 +812,7 @@ rcu_torture_cbflood(void *arg)
812 cur_ops->cb_barrier(); 812 cur_ops->cb_barrier();
813 stutter_wait("rcu_torture_cbflood"); 813 stutter_wait("rcu_torture_cbflood");
814 } while (!torture_must_stop()); 814 } while (!torture_must_stop());
815 vfree(rhp);
815 torture_kthread_stopping("rcu_torture_cbflood"); 816 torture_kthread_stopping("rcu_torture_cbflood");
816 return 0; 817 return 0;
817} 818}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c0623fc47125..0db5649f8817 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -247,7 +247,7 @@ void rcu_bh_qs(void)
247 * be called from hardirq context. It is normally called from the 247 * be called from hardirq context. It is normally called from the
248 * scheduling-clock interrupt. 248 * scheduling-clock interrupt.
249 */ 249 */
250void rcu_check_callbacks(int cpu, int user) 250void rcu_check_callbacks(int user)
251{ 251{
252 RCU_TRACE(check_cpu_stalls()); 252 RCU_TRACE(check_cpu_stalls());
253 if (user || rcu_is_cpu_rrupt_from_idle()) 253 if (user || rcu_is_cpu_rrupt_from_idle())
@@ -380,7 +380,9 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
380} 380}
381EXPORT_SYMBOL_GPL(call_rcu_bh); 381EXPORT_SYMBOL_GPL(call_rcu_bh);
382 382
383void rcu_init(void) 383void __init rcu_init(void)
384{ 384{
385 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 385 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
386
387 rcu_early_boot_tests();
386} 388}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9815447d22e0..7680fc275036 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -105,7 +105,7 @@ struct rcu_state sname##_state = { \
105 .name = RCU_STATE_NAME(sname), \ 105 .name = RCU_STATE_NAME(sname), \
106 .abbr = sabbr, \ 106 .abbr = sabbr, \
107}; \ 107}; \
108DEFINE_PER_CPU(struct rcu_data, sname##_data) 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
109 109
110RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 110RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
111RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 111RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
@@ -152,19 +152,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
152 */ 152 */
153static int rcu_scheduler_fully_active __read_mostly; 153static int rcu_scheduler_fully_active __read_mostly;
154 154
155#ifdef CONFIG_RCU_BOOST
156
157/*
158 * Control variables for per-CPU and per-rcu_node kthreads. These
159 * handle all flavors of RCU.
160 */
161static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
162DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
163DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
164DEFINE_PER_CPU(char, rcu_cpu_has_work);
165
166#endif /* #ifdef CONFIG_RCU_BOOST */
167
168static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 155static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
169static void invoke_rcu_core(void); 156static void invoke_rcu_core(void);
170static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -286,11 +273,11 @@ static void rcu_momentary_dyntick_idle(void)
286 * and requires special handling for preemptible RCU. 273 * and requires special handling for preemptible RCU.
287 * The caller must have disabled preemption. 274 * The caller must have disabled preemption.
288 */ 275 */
289void rcu_note_context_switch(int cpu) 276void rcu_note_context_switch(void)
290{ 277{
291 trace_rcu_utilization(TPS("Start context switch")); 278 trace_rcu_utilization(TPS("Start context switch"));
292 rcu_sched_qs(); 279 rcu_sched_qs();
293 rcu_preempt_note_context_switch(cpu); 280 rcu_preempt_note_context_switch();
294 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 281 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
295 rcu_momentary_dyntick_idle(); 282 rcu_momentary_dyntick_idle();
296 trace_rcu_utilization(TPS("End context switch")); 283 trace_rcu_utilization(TPS("End context switch"));
@@ -325,7 +312,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
325 unsigned long *maxj), 312 unsigned long *maxj),
326 bool *isidle, unsigned long *maxj); 313 bool *isidle, unsigned long *maxj);
327static void force_quiescent_state(struct rcu_state *rsp); 314static void force_quiescent_state(struct rcu_state *rsp);
328static int rcu_pending(int cpu); 315static int rcu_pending(void);
329 316
330/* 317/*
331 * Return the number of RCU-sched batches processed thus far for debug & stats. 318 * Return the number of RCU-sched batches processed thus far for debug & stats.
@@ -510,11 +497,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
510 * we really have entered idle, and must do the appropriate accounting. 497 * we really have entered idle, and must do the appropriate accounting.
511 * The caller must have disabled interrupts. 498 * The caller must have disabled interrupts.
512 */ 499 */
513static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 500static void rcu_eqs_enter_common(long long oldval, bool user)
514 bool user)
515{ 501{
516 struct rcu_state *rsp; 502 struct rcu_state *rsp;
517 struct rcu_data *rdp; 503 struct rcu_data *rdp;
504 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
518 505
519 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 506 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
520 if (!user && !is_idle_task(current)) { 507 if (!user && !is_idle_task(current)) {
@@ -531,7 +518,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
531 rdp = this_cpu_ptr(rsp->rda); 518 rdp = this_cpu_ptr(rsp->rda);
532 do_nocb_deferred_wakeup(rdp); 519 do_nocb_deferred_wakeup(rdp);
533 } 520 }
534 rcu_prepare_for_idle(smp_processor_id()); 521 rcu_prepare_for_idle();
535 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 522 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
536 smp_mb__before_atomic(); /* See above. */ 523 smp_mb__before_atomic(); /* See above. */
537 atomic_inc(&rdtp->dynticks); 524 atomic_inc(&rdtp->dynticks);
@@ -565,7 +552,7 @@ static void rcu_eqs_enter(bool user)
565 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 552 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
566 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { 553 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
567 rdtp->dynticks_nesting = 0; 554 rdtp->dynticks_nesting = 0;
568 rcu_eqs_enter_common(rdtp, oldval, user); 555 rcu_eqs_enter_common(oldval, user);
569 } else { 556 } else {
570 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 557 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
571 } 558 }
@@ -589,7 +576,7 @@ void rcu_idle_enter(void)
589 576
590 local_irq_save(flags); 577 local_irq_save(flags);
591 rcu_eqs_enter(false); 578 rcu_eqs_enter(false);
592 rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); 579 rcu_sysidle_enter(0);
593 local_irq_restore(flags); 580 local_irq_restore(flags);
594} 581}
595EXPORT_SYMBOL_GPL(rcu_idle_enter); 582EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -639,8 +626,8 @@ void rcu_irq_exit(void)
639 if (rdtp->dynticks_nesting) 626 if (rdtp->dynticks_nesting)
640 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); 627 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
641 else 628 else
642 rcu_eqs_enter_common(rdtp, oldval, true); 629 rcu_eqs_enter_common(oldval, true);
643 rcu_sysidle_enter(rdtp, 1); 630 rcu_sysidle_enter(1);
644 local_irq_restore(flags); 631 local_irq_restore(flags);
645} 632}
646 633
@@ -651,16 +638,17 @@ void rcu_irq_exit(void)
651 * we really have exited idle, and must do the appropriate accounting. 638 * we really have exited idle, and must do the appropriate accounting.
652 * The caller must have disabled interrupts. 639 * The caller must have disabled interrupts.
653 */ 640 */
654static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, 641static void rcu_eqs_exit_common(long long oldval, int user)
655 int user)
656{ 642{
643 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
644
657 rcu_dynticks_task_exit(); 645 rcu_dynticks_task_exit();
658 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ 646 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
659 atomic_inc(&rdtp->dynticks); 647 atomic_inc(&rdtp->dynticks);
660 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 648 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
661 smp_mb__after_atomic(); /* See above. */ 649 smp_mb__after_atomic(); /* See above. */
662 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 650 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
663 rcu_cleanup_after_idle(smp_processor_id()); 651 rcu_cleanup_after_idle();
664 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 652 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
665 if (!user && !is_idle_task(current)) { 653 if (!user && !is_idle_task(current)) {
666 struct task_struct *idle __maybe_unused = 654 struct task_struct *idle __maybe_unused =
@@ -691,7 +679,7 @@ static void rcu_eqs_exit(bool user)
691 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; 679 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
692 } else { 680 } else {
693 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 681 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
694 rcu_eqs_exit_common(rdtp, oldval, user); 682 rcu_eqs_exit_common(oldval, user);
695 } 683 }
696} 684}
697 685
@@ -712,7 +700,7 @@ void rcu_idle_exit(void)
712 700
713 local_irq_save(flags); 701 local_irq_save(flags);
714 rcu_eqs_exit(false); 702 rcu_eqs_exit(false);
715 rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); 703 rcu_sysidle_exit(0);
716 local_irq_restore(flags); 704 local_irq_restore(flags);
717} 705}
718EXPORT_SYMBOL_GPL(rcu_idle_exit); 706EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -763,8 +751,8 @@ void rcu_irq_enter(void)
763 if (oldval) 751 if (oldval)
764 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); 752 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
765 else 753 else
766 rcu_eqs_exit_common(rdtp, oldval, true); 754 rcu_eqs_exit_common(oldval, true);
767 rcu_sysidle_exit(rdtp, 1); 755 rcu_sysidle_exit(1);
768 local_irq_restore(flags); 756 local_irq_restore(flags);
769} 757}
770 758
@@ -2387,7 +2375,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2387 * invoked from the scheduling-clock interrupt. If rcu_pending returns 2375 * invoked from the scheduling-clock interrupt. If rcu_pending returns
2388 * false, there is no point in invoking rcu_check_callbacks(). 2376 * false, there is no point in invoking rcu_check_callbacks().
2389 */ 2377 */
2390void rcu_check_callbacks(int cpu, int user) 2378void rcu_check_callbacks(int user)
2391{ 2379{
2392 trace_rcu_utilization(TPS("Start scheduler-tick")); 2380 trace_rcu_utilization(TPS("Start scheduler-tick"));
2393 increment_cpu_stall_ticks(); 2381 increment_cpu_stall_ticks();
@@ -2419,8 +2407,8 @@ void rcu_check_callbacks(int cpu, int user)
2419 2407
2420 rcu_bh_qs(); 2408 rcu_bh_qs();
2421 } 2409 }
2422 rcu_preempt_check_callbacks(cpu); 2410 rcu_preempt_check_callbacks();
2423 if (rcu_pending(cpu)) 2411 if (rcu_pending())
2424 invoke_rcu_core(); 2412 invoke_rcu_core();
2425 if (user) 2413 if (user)
2426 rcu_note_voluntary_context_switch(current); 2414 rcu_note_voluntary_context_switch(current);
@@ -2963,6 +2951,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2963 */ 2951 */
2964void synchronize_sched_expedited(void) 2952void synchronize_sched_expedited(void)
2965{ 2953{
2954 cpumask_var_t cm;
2955 bool cma = false;
2956 int cpu;
2966 long firstsnap, s, snap; 2957 long firstsnap, s, snap;
2967 int trycount = 0; 2958 int trycount = 0;
2968 struct rcu_state *rsp = &rcu_sched_state; 2959 struct rcu_state *rsp = &rcu_sched_state;
@@ -2997,11 +2988,26 @@ void synchronize_sched_expedited(void)
2997 } 2988 }
2998 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2989 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2999 2990
2991 /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
2992 cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
2993 if (cma) {
2994 cpumask_copy(cm, cpu_online_mask);
2995 cpumask_clear_cpu(raw_smp_processor_id(), cm);
2996 for_each_cpu(cpu, cm) {
2997 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2998
2999 if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3000 cpumask_clear_cpu(cpu, cm);
3001 }
3002 if (cpumask_weight(cm) == 0)
3003 goto all_cpus_idle;
3004 }
3005
3000 /* 3006 /*
3001 * Each pass through the following loop attempts to force a 3007 * Each pass through the following loop attempts to force a
3002 * context switch on each CPU. 3008 * context switch on each CPU.
3003 */ 3009 */
3004 while (try_stop_cpus(cpu_online_mask, 3010 while (try_stop_cpus(cma ? cm : cpu_online_mask,
3005 synchronize_sched_expedited_cpu_stop, 3011 synchronize_sched_expedited_cpu_stop,
3006 NULL) == -EAGAIN) { 3012 NULL) == -EAGAIN) {
3007 put_online_cpus(); 3013 put_online_cpus();
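The additions to synchronize_sched_expedited() are spread over several hunks: the allocation and idle-CPU filtering above, a free_cpumask_var() on each early-return path below, and the all_cpus_idle label. Condensed into one place, with the retry loop, counters and error paths omitted, the pattern looks roughly like this sketch (kernel context only, same helpers as in the hunks, wrapper name invented):

static void demo_stop_busy_cpus(void)
{
	cpumask_var_t cm;
	bool cma;
	int cpu;

	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);	/* may fail: fall back below */
	if (cma) {
		cpumask_copy(cm, cpu_online_mask);
		cpumask_clear_cpu(raw_smp_processor_id(), cm);
		for_each_cpu(cpu, cm) {
			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

			/* An even dynticks value means the CPU is idle: skip it. */
			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
				cpumask_clear_cpu(cpu, cm);
		}
		if (cpumask_weight(cm) == 0)
			goto out;		/* every other CPU is already idle */
	}

	/* Force context switches only where needed (or everywhere on failure). */
	try_stop_cpus(cma ? cm : cpu_online_mask,
		      synchronize_sched_expedited_cpu_stop, NULL);
out:
	free_cpumask_var(cm);
}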
@@ -3013,6 +3019,7 @@ void synchronize_sched_expedited(void)
3013 /* ensure test happens before caller kfree */ 3019 /* ensure test happens before caller kfree */
3014 smp_mb__before_atomic(); /* ^^^ */ 3020 smp_mb__before_atomic(); /* ^^^ */
3015 atomic_long_inc(&rsp->expedited_workdone1); 3021 atomic_long_inc(&rsp->expedited_workdone1);
3022 free_cpumask_var(cm);
3016 return; 3023 return;
3017 } 3024 }
3018 3025
@@ -3022,6 +3029,7 @@ void synchronize_sched_expedited(void)
3022 } else { 3029 } else {
3023 wait_rcu_gp(call_rcu_sched); 3030 wait_rcu_gp(call_rcu_sched);
3024 atomic_long_inc(&rsp->expedited_normal); 3031 atomic_long_inc(&rsp->expedited_normal);
3032 free_cpumask_var(cm);
3025 return; 3033 return;
3026 } 3034 }
3027 3035
@@ -3031,6 +3039,7 @@ void synchronize_sched_expedited(void)
3031 /* ensure test happens before caller kfree */ 3039 /* ensure test happens before caller kfree */
3032 smp_mb__before_atomic(); /* ^^^ */ 3040 smp_mb__before_atomic(); /* ^^^ */
3033 atomic_long_inc(&rsp->expedited_workdone2); 3041 atomic_long_inc(&rsp->expedited_workdone2);
3042 free_cpumask_var(cm);
3034 return; 3043 return;
3035 } 3044 }
3036 3045
@@ -3045,6 +3054,7 @@ void synchronize_sched_expedited(void)
3045 /* CPU hotplug operation in flight, use normal GP. */ 3054 /* CPU hotplug operation in flight, use normal GP. */
3046 wait_rcu_gp(call_rcu_sched); 3055 wait_rcu_gp(call_rcu_sched);
3047 atomic_long_inc(&rsp->expedited_normal); 3056 atomic_long_inc(&rsp->expedited_normal);
3057 free_cpumask_var(cm);
3048 return; 3058 return;
3049 } 3059 }
3050 snap = atomic_long_read(&rsp->expedited_start); 3060 snap = atomic_long_read(&rsp->expedited_start);
@@ -3052,6 +3062,9 @@ void synchronize_sched_expedited(void)
3052 } 3062 }
3053 atomic_long_inc(&rsp->expedited_stoppedcpus); 3063 atomic_long_inc(&rsp->expedited_stoppedcpus);
3054 3064
3065all_cpus_idle:
3066 free_cpumask_var(cm);
3067
3055 /* 3068 /*
3056 * Everyone up to our most recent fetch is covered by our grace 3069 * Everyone up to our most recent fetch is covered by our grace
3057 * period. Update the counter, but only if our work is still 3070 * period. Update the counter, but only if our work is still
@@ -3143,12 +3156,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3143 * by the current CPU, returning 1 if so. This function is part of the 3156 * by the current CPU, returning 1 if so. This function is part of the
3144 * RCU implementation; it is -not- an exported member of the RCU API. 3157 * RCU implementation; it is -not- an exported member of the RCU API.
3145 */ 3158 */
3146static int rcu_pending(int cpu) 3159static int rcu_pending(void)
3147{ 3160{
3148 struct rcu_state *rsp; 3161 struct rcu_state *rsp;
3149 3162
3150 for_each_rcu_flavor(rsp) 3163 for_each_rcu_flavor(rsp)
3151 if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) 3164 if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda)))
3152 return 1; 3165 return 1;
3153 return 0; 3166 return 0;
3154} 3167}
@@ -3158,7 +3171,7 @@ static int rcu_pending(int cpu)
3158 * non-NULL, store an indication of whether all callbacks are lazy. 3171 * non-NULL, store an indication of whether all callbacks are lazy.
3159 * (If there are no callbacks, all of them are deemed to be lazy.) 3172 * (If there are no callbacks, all of them are deemed to be lazy.)
3160 */ 3173 */
3161static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) 3174static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
3162{ 3175{
3163 bool al = true; 3176 bool al = true;
3164 bool hc = false; 3177 bool hc = false;
@@ -3166,7 +3179,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
3166 struct rcu_state *rsp; 3179 struct rcu_state *rsp;
3167 3180
3168 for_each_rcu_flavor(rsp) { 3181 for_each_rcu_flavor(rsp) {
3169 rdp = per_cpu_ptr(rsp->rda, cpu); 3182 rdp = this_cpu_ptr(rsp->rda);
3170 if (!rdp->nxtlist) 3183 if (!rdp->nxtlist)
3171 continue; 3184 continue;
3172 hc = true; 3185 hc = true;
@@ -3485,8 +3498,10 @@ static int rcu_cpu_notify(struct notifier_block *self,
3485 case CPU_DEAD_FROZEN: 3498 case CPU_DEAD_FROZEN:
3486 case CPU_UP_CANCELED: 3499 case CPU_UP_CANCELED:
3487 case CPU_UP_CANCELED_FROZEN: 3500 case CPU_UP_CANCELED_FROZEN:
3488 for_each_rcu_flavor(rsp) 3501 for_each_rcu_flavor(rsp) {
3489 rcu_cleanup_dead_cpu(cpu, rsp); 3502 rcu_cleanup_dead_cpu(cpu, rsp);
3503 do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
3504 }
3490 break; 3505 break;
3491 default: 3506 default:
3492 break; 3507 break;
@@ -3766,6 +3781,8 @@ void __init rcu_init(void)
3766 pm_notifier(rcu_pm_notify, 0); 3781 pm_notifier(rcu_pm_notify, 0);
3767 for_each_online_cpu(cpu) 3782 for_each_online_cpu(cpu)
3768 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3783 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3784
3785 rcu_early_boot_tests();
3769} 3786}
3770 3787
3771#include "tree_plugin.h" 3788#include "tree_plugin.h"
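A recurring change in kernel/rcu/tree.c (and in tree.h and tree_plugin.h below) is that functions which always run on the current CPU lose their "int cpu" argument and switch from per_cpu()/per_cpu_ptr(..., cpu) to this_cpu accessors. A hypothetical, stand-alone illustration of the two styles (the demo_* names are invented for the example):

#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(int, demo_pending);

/* Old style: the caller passes a CPU, usually just smp_processor_id(). */
static int demo_check_old(int cpu)
{
	return per_cpu(demo_pending, cpu) != 0;
}

/*
 * New style: act on the local CPU directly.  The caller must already have
 * preemption (or interrupts) disabled so the CPU cannot change underneath.
 */
static int demo_check_new(void)
{
	return __this_cpu_read(demo_pending) != 0;
}

Dropping the parameter removes a class of bugs where the passed-in cpu silently disagrees with the CPU the function is actually running on.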
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bbdc45d8d74f..8e7b1843896e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -139,7 +139,7 @@ struct rcu_node {
139 unsigned long expmask; /* Groups that have ->blkd_tasks */ 139 unsigned long expmask; /* Groups that have ->blkd_tasks */
140 /* elements that need to drain to allow the */ 140 /* elements that need to drain to allow the */
141 /* current expedited grace period to */ 141 /* current expedited grace period to */
142 /* complete (only for TREE_PREEMPT_RCU). */ 142 /* complete (only for PREEMPT_RCU). */
143 unsigned long qsmaskinit; 143 unsigned long qsmaskinit;
144 /* Per-GP initial value for qsmask & expmask. */ 144 /* Per-GP initial value for qsmask & expmask. */
145 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 145 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -530,10 +530,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
530extern struct rcu_state rcu_bh_state; 530extern struct rcu_state rcu_bh_state;
531DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 531DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
532 532
533#ifdef CONFIG_TREE_PREEMPT_RCU 533#ifdef CONFIG_PREEMPT_RCU
534extern struct rcu_state rcu_preempt_state; 534extern struct rcu_state rcu_preempt_state;
535DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 535DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
536#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 536#endif /* #ifdef CONFIG_PREEMPT_RCU */
537 537
538#ifdef CONFIG_RCU_BOOST 538#ifdef CONFIG_RCU_BOOST
539DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 539DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -547,7 +547,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
547/* Forward declarations for rcutree_plugin.h */ 547/* Forward declarations for rcutree_plugin.h */
548static void rcu_bootup_announce(void); 548static void rcu_bootup_announce(void);
549long rcu_batches_completed(void); 549long rcu_batches_completed(void);
550static void rcu_preempt_note_context_switch(int cpu); 550static void rcu_preempt_note_context_switch(void);
551static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 551static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
552#ifdef CONFIG_HOTPLUG_CPU 552#ifdef CONFIG_HOTPLUG_CPU
553static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 553static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -561,12 +561,12 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
561 struct rcu_node *rnp, 561 struct rcu_node *rnp,
562 struct rcu_data *rdp); 562 struct rcu_data *rdp);
563#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 563#endif /* #ifdef CONFIG_HOTPLUG_CPU */
564static void rcu_preempt_check_callbacks(int cpu); 564static void rcu_preempt_check_callbacks(void);
565void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 565void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
566#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 566#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
567static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 567static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
568 bool wake); 568 bool wake);
569#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 569#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
570static void __init __rcu_init_preempt(void); 570static void __init __rcu_init_preempt(void);
571static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 571static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
572static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 572static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -579,8 +579,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
579#endif /* #ifdef CONFIG_RCU_BOOST */ 579#endif /* #ifdef CONFIG_RCU_BOOST */
580static void __init rcu_spawn_boost_kthreads(void); 580static void __init rcu_spawn_boost_kthreads(void);
581static void rcu_prepare_kthreads(int cpu); 581static void rcu_prepare_kthreads(int cpu);
582static void rcu_cleanup_after_idle(int cpu); 582static void rcu_cleanup_after_idle(void);
583static void rcu_prepare_for_idle(int cpu); 583static void rcu_prepare_for_idle(void);
584static void rcu_idle_count_callbacks_posted(void); 584static void rcu_idle_count_callbacks_posted(void);
585static void print_cpu_stall_info_begin(void); 585static void print_cpu_stall_info_begin(void);
586static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 586static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
@@ -606,8 +606,8 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
606#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 606#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
607static void __maybe_unused rcu_kick_nohz_cpu(int cpu); 607static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
608static bool init_nocb_callback_list(struct rcu_data *rdp); 608static bool init_nocb_callback_list(struct rcu_data *rdp);
609static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 609static void rcu_sysidle_enter(int irq);
610static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); 610static void rcu_sysidle_exit(int irq);
611static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 611static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
612 unsigned long *maxj); 612 unsigned long *maxj);
613static bool is_sysidle_rcu_state(struct rcu_state *rsp); 613static bool is_sysidle_rcu_state(struct rcu_state *rsp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c1d7f27bd38f..3ec85cb5d544 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -30,14 +30,24 @@
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include "../time/tick-internal.h" 31#include "../time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1
34
35#ifdef CONFIG_RCU_BOOST 33#ifdef CONFIG_RCU_BOOST
34
36#include "../locking/rtmutex_common.h" 35#include "../locking/rtmutex_common.h"
37#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO 36
38#else 37/* rcuc/rcub kthread realtime priority */
39#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 38static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
40#endif 39module_param(kthread_prio, int, 0644);
40
41/*
42 * Control variables for per-CPU and per-rcu_node kthreads. These
43 * handle all flavors of RCU.
44 */
45static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
46DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
47DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
48DEFINE_PER_CPU(char, rcu_cpu_has_work);
49
50#endif /* #ifdef CONFIG_RCU_BOOST */
41 51
42#ifdef CONFIG_RCU_NOCB_CPU 52#ifdef CONFIG_RCU_NOCB_CPU
43static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ 53static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
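The hunk above replaces the compile-time RCU_KTHREAD_PRIO / RCU_BOOST_PRIO constants with a kthread_prio module parameter seeded from CONFIG_RCU_KTHREAD_PRIO and later fed to sched_setscheduler_nocheck() (see the rcu_cpu_kthread_setup() and boost-kthread hunks further down). A minimal sketch of the same knob-plus-apply pattern; the demo_* names and the default value of 1 are invented for the example:

#include <linux/moduleparam.h>
#include <linux/sched.h>

static int demo_kthread_prio = 1;
module_param(demo_kthread_prio, int, 0644);	/* readable/writable via sysfs */

static void demo_kthread_setup(void)
{
	struct sched_param sp;

	sp.sched_priority = demo_kthread_prio;
	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}

The priority thereby becomes a boot-time (and, with mode 0644, runtime-visible) knob instead of a pair of compile-time defines.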
@@ -72,9 +82,6 @@ static void __init rcu_bootup_announce_oddness(void)
72#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 82#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
73 pr_info("\tRCU torture testing starts during boot.\n"); 83 pr_info("\tRCU torture testing starts during boot.\n");
74#endif 84#endif
75#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
76 pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
77#endif
78#if defined(CONFIG_RCU_CPU_STALL_INFO) 85#if defined(CONFIG_RCU_CPU_STALL_INFO)
79 pr_info("\tAdditional per-CPU info printed with stalls.\n"); 86 pr_info("\tAdditional per-CPU info printed with stalls.\n");
80#endif 87#endif
@@ -85,9 +92,12 @@ static void __init rcu_bootup_announce_oddness(void)
85 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 92 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
86 if (nr_cpu_ids != NR_CPUS) 93 if (nr_cpu_ids != NR_CPUS)
87 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 94 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
95#ifdef CONFIG_RCU_BOOST
96 pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
97#endif
88} 98}
89 99
90#ifdef CONFIG_TREE_PREEMPT_RCU 100#ifdef CONFIG_PREEMPT_RCU
91 101
92RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); 102RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
93static struct rcu_state *rcu_state_p = &rcu_preempt_state; 103static struct rcu_state *rcu_state_p = &rcu_preempt_state;
@@ -156,7 +166,7 @@ static void rcu_preempt_qs(void)
156 * 166 *
157 * Caller must disable preemption. 167 * Caller must disable preemption.
158 */ 168 */
159static void rcu_preempt_note_context_switch(int cpu) 169static void rcu_preempt_note_context_switch(void)
160{ 170{
161 struct task_struct *t = current; 171 struct task_struct *t = current;
162 unsigned long flags; 172 unsigned long flags;
@@ -167,7 +177,7 @@ static void rcu_preempt_note_context_switch(int cpu)
167 !t->rcu_read_unlock_special.b.blocked) { 177 !t->rcu_read_unlock_special.b.blocked) {
168 178
169 /* Possibly blocking in an RCU read-side critical section. */ 179 /* Possibly blocking in an RCU read-side critical section. */
170 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 180 rdp = this_cpu_ptr(rcu_preempt_state.rda);
171 rnp = rdp->mynode; 181 rnp = rdp->mynode;
172 raw_spin_lock_irqsave(&rnp->lock, flags); 182 raw_spin_lock_irqsave(&rnp->lock, flags);
173 smp_mb__after_unlock_lock(); 183 smp_mb__after_unlock_lock();
@@ -415,8 +425,6 @@ void rcu_read_unlock_special(struct task_struct *t)
415 } 425 }
416} 426}
417 427
418#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
419
420/* 428/*
421 * Dump detailed information for all tasks blocking the current RCU 429 * Dump detailed information for all tasks blocking the current RCU
422 * grace period on the specified rcu_node structure. 430 * grace period on the specified rcu_node structure.
@@ -451,14 +459,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
451 rcu_print_detail_task_stall_rnp(rnp); 459 rcu_print_detail_task_stall_rnp(rnp);
452} 460}
453 461
454#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
455
456static void rcu_print_detail_task_stall(struct rcu_state *rsp)
457{
458}
459
460#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
461
462#ifdef CONFIG_RCU_CPU_STALL_INFO 462#ifdef CONFIG_RCU_CPU_STALL_INFO
463 463
464static void rcu_print_task_stall_begin(struct rcu_node *rnp) 464static void rcu_print_task_stall_begin(struct rcu_node *rnp)
@@ -621,7 +621,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
621 * 621 *
622 * Caller must disable hard irqs. 622 * Caller must disable hard irqs.
623 */ 623 */
624static void rcu_preempt_check_callbacks(int cpu) 624static void rcu_preempt_check_callbacks(void)
625{ 625{
626 struct task_struct *t = current; 626 struct task_struct *t = current;
627 627
@@ -630,8 +630,8 @@ static void rcu_preempt_check_callbacks(int cpu)
630 return; 630 return;
631 } 631 }
632 if (t->rcu_read_lock_nesting > 0 && 632 if (t->rcu_read_lock_nesting > 0 &&
633 per_cpu(rcu_preempt_data, cpu).qs_pending && 633 __this_cpu_read(rcu_preempt_data.qs_pending) &&
634 !per_cpu(rcu_preempt_data, cpu).passed_quiesce) 634 !__this_cpu_read(rcu_preempt_data.passed_quiesce))
635 t->rcu_read_unlock_special.b.need_qs = true; 635 t->rcu_read_unlock_special.b.need_qs = true;
636} 636}
637 637
@@ -919,7 +919,7 @@ void exit_rcu(void)
919 __rcu_read_unlock(); 919 __rcu_read_unlock();
920} 920}
921 921
922#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 922#else /* #ifdef CONFIG_PREEMPT_RCU */
923 923
924static struct rcu_state *rcu_state_p = &rcu_sched_state; 924static struct rcu_state *rcu_state_p = &rcu_sched_state;
925 925
@@ -945,7 +945,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
945 * Because preemptible RCU does not exist, we never have to check for 945 * Because preemptible RCU does not exist, we never have to check for
946 * CPUs being in quiescent states. 946 * CPUs being in quiescent states.
947 */ 947 */
948static void rcu_preempt_note_context_switch(int cpu) 948static void rcu_preempt_note_context_switch(void)
949{ 949{
950} 950}
951 951
@@ -1017,7 +1017,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1017 * Because preemptible RCU does not exist, it never has any callbacks 1017 * Because preemptible RCU does not exist, it never has any callbacks
1018 * to check. 1018 * to check.
1019 */ 1019 */
1020static void rcu_preempt_check_callbacks(int cpu) 1020static void rcu_preempt_check_callbacks(void)
1021{ 1021{
1022} 1022}
1023 1023
@@ -1070,7 +1070,7 @@ void exit_rcu(void)
1070{ 1070{
1071} 1071}
1072 1072
1073#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1073#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
1074 1074
1075#ifdef CONFIG_RCU_BOOST 1075#ifdef CONFIG_RCU_BOOST
1076 1076
@@ -1326,7 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1326 smp_mb__after_unlock_lock(); 1326 smp_mb__after_unlock_lock();
1327 rnp->boost_kthread_task = t; 1327 rnp->boost_kthread_task = t;
1328 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1328 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1329 sp.sched_priority = RCU_BOOST_PRIO; 1329 sp.sched_priority = kthread_prio;
1330 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1330 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1331 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1331 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1332 return 0; 1332 return 0;
@@ -1343,7 +1343,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
1343{ 1343{
1344 struct sched_param sp; 1344 struct sched_param sp;
1345 1345
1346 sp.sched_priority = RCU_KTHREAD_PRIO; 1346 sp.sched_priority = kthread_prio;
1347 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1347 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1348} 1348}
1349 1349
@@ -1512,10 +1512,10 @@ static void rcu_prepare_kthreads(int cpu)
1512 * any flavor of RCU. 1512 * any flavor of RCU.
1513 */ 1513 */
1514#ifndef CONFIG_RCU_NOCB_CPU_ALL 1514#ifndef CONFIG_RCU_NOCB_CPU_ALL
1515int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1515int rcu_needs_cpu(unsigned long *delta_jiffies)
1516{ 1516{
1517 *delta_jiffies = ULONG_MAX; 1517 *delta_jiffies = ULONG_MAX;
1518 return rcu_cpu_has_callbacks(cpu, NULL); 1518 return rcu_cpu_has_callbacks(NULL);
1519} 1519}
1520#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 1520#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1521 1521
@@ -1523,7 +1523,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1523 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up 1523 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1524 * after it. 1524 * after it.
1525 */ 1525 */
1526static void rcu_cleanup_after_idle(int cpu) 1526static void rcu_cleanup_after_idle(void)
1527{ 1527{
1528} 1528}
1529 1529
@@ -1531,7 +1531,7 @@ static void rcu_cleanup_after_idle(int cpu)
1531 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, 1531 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1532 * is nothing. 1532 * is nothing.
1533 */ 1533 */
1534static void rcu_prepare_for_idle(int cpu) 1534static void rcu_prepare_for_idle(void)
1535{ 1535{
1536} 1536}
1537 1537
@@ -1624,15 +1624,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1624 * The caller must have disabled interrupts. 1624 * The caller must have disabled interrupts.
1625 */ 1625 */
1626#ifndef CONFIG_RCU_NOCB_CPU_ALL 1626#ifndef CONFIG_RCU_NOCB_CPU_ALL
1627int rcu_needs_cpu(int cpu, unsigned long *dj) 1627int rcu_needs_cpu(unsigned long *dj)
1628{ 1628{
1629 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1629 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1630 1630
1631 /* Snapshot to detect later posting of non-lazy callback. */ 1631 /* Snapshot to detect later posting of non-lazy callback. */
1632 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; 1632 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1633 1633
1634 /* If no callbacks, RCU doesn't need the CPU. */ 1634 /* If no callbacks, RCU doesn't need the CPU. */
1635 if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { 1635 if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
1636 *dj = ULONG_MAX; 1636 *dj = ULONG_MAX;
1637 return 0; 1637 return 0;
1638 } 1638 }
@@ -1666,12 +1666,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1666 * 1666 *
1667 * The caller must have disabled interrupts. 1667 * The caller must have disabled interrupts.
1668 */ 1668 */
1669static void rcu_prepare_for_idle(int cpu) 1669static void rcu_prepare_for_idle(void)
1670{ 1670{
1671#ifndef CONFIG_RCU_NOCB_CPU_ALL 1671#ifndef CONFIG_RCU_NOCB_CPU_ALL
1672 bool needwake; 1672 bool needwake;
1673 struct rcu_data *rdp; 1673 struct rcu_data *rdp;
1674 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1674 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1675 struct rcu_node *rnp; 1675 struct rcu_node *rnp;
1676 struct rcu_state *rsp; 1676 struct rcu_state *rsp;
1677 int tne; 1677 int tne;
@@ -1679,7 +1679,7 @@ static void rcu_prepare_for_idle(int cpu)
1679 /* Handle nohz enablement switches conservatively. */ 1679 /* Handle nohz enablement switches conservatively. */
1680 tne = ACCESS_ONCE(tick_nohz_active); 1680 tne = ACCESS_ONCE(tick_nohz_active);
1681 if (tne != rdtp->tick_nohz_enabled_snap) { 1681 if (tne != rdtp->tick_nohz_enabled_snap) {
1682 if (rcu_cpu_has_callbacks(cpu, NULL)) 1682 if (rcu_cpu_has_callbacks(NULL))
1683 invoke_rcu_core(); /* force nohz to see update. */ 1683 invoke_rcu_core(); /* force nohz to see update. */
1684 rdtp->tick_nohz_enabled_snap = tne; 1684 rdtp->tick_nohz_enabled_snap = tne;
1685 return; 1685 return;
@@ -1688,7 +1688,7 @@ static void rcu_prepare_for_idle(int cpu)
1688 return; 1688 return;
1689 1689
1690 /* If this is a no-CBs CPU, no callbacks, just return. */ 1690 /* If this is a no-CBs CPU, no callbacks, just return. */
1691 if (rcu_is_nocb_cpu(cpu)) 1691 if (rcu_is_nocb_cpu(smp_processor_id()))
1692 return; 1692 return;
1693 1693
1694 /* 1694 /*
@@ -1712,7 +1712,7 @@ static void rcu_prepare_for_idle(int cpu)
1712 return; 1712 return;
1713 rdtp->last_accelerate = jiffies; 1713 rdtp->last_accelerate = jiffies;
1714 for_each_rcu_flavor(rsp) { 1714 for_each_rcu_flavor(rsp) {
1715 rdp = per_cpu_ptr(rsp->rda, cpu); 1715 rdp = this_cpu_ptr(rsp->rda);
1716 if (!*rdp->nxttail[RCU_DONE_TAIL]) 1716 if (!*rdp->nxttail[RCU_DONE_TAIL])
1717 continue; 1717 continue;
1718 rnp = rdp->mynode; 1718 rnp = rdp->mynode;
@@ -1731,10 +1731,10 @@ static void rcu_prepare_for_idle(int cpu)
1731 * any grace periods that elapsed while the CPU was idle, and if any 1731 * any grace periods that elapsed while the CPU was idle, and if any
1732 * callbacks are now ready to invoke, initiate invocation. 1732 * callbacks are now ready to invoke, initiate invocation.
1733 */ 1733 */
1734static void rcu_cleanup_after_idle(int cpu) 1734static void rcu_cleanup_after_idle(void)
1735{ 1735{
1736#ifndef CONFIG_RCU_NOCB_CPU_ALL 1736#ifndef CONFIG_RCU_NOCB_CPU_ALL
1737 if (rcu_is_nocb_cpu(cpu)) 1737 if (rcu_is_nocb_cpu(smp_processor_id()))
1738 return; 1738 return;
1739 if (rcu_try_advance_all_cbs()) 1739 if (rcu_try_advance_all_cbs())
1740 invoke_rcu_core(); 1740 invoke_rcu_core();
@@ -2573,9 +2573,13 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
2573 rdp->nocb_leader = rdp_spawn; 2573 rdp->nocb_leader = rdp_spawn;
2574 if (rdp_last && rdp != rdp_spawn) 2574 if (rdp_last && rdp != rdp_spawn)
2575 rdp_last->nocb_next_follower = rdp; 2575 rdp_last->nocb_next_follower = rdp;
2576 rdp_last = rdp; 2576 if (rdp == rdp_spawn) {
2577 rdp = rdp->nocb_next_follower; 2577 rdp = rdp->nocb_next_follower;
2578 rdp_last->nocb_next_follower = NULL; 2578 } else {
2579 rdp_last = rdp;
2580 rdp = rdp->nocb_next_follower;
2581 rdp_last->nocb_next_follower = NULL;
2582 }
2579 } while (rdp); 2583 } while (rdp);
2580 rdp_spawn->nocb_next_follower = rdp_old_leader; 2584 rdp_spawn->nocb_next_follower = rdp_old_leader;
2581 } 2585 }
@@ -2761,9 +2765,10 @@ static int full_sysidle_state; /* Current system-idle state. */
2761 * to detect full-system idle states, not RCU quiescent states and grace 2765 * to detect full-system idle states, not RCU quiescent states and grace
2762 * periods. The caller must have disabled interrupts. 2766 * periods. The caller must have disabled interrupts.
2763 */ 2767 */
2764static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 2768static void rcu_sysidle_enter(int irq)
2765{ 2769{
2766 unsigned long j; 2770 unsigned long j;
2771 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
2767 2772
2768 /* If there are no nohz_full= CPUs, no need to track this. */ 2773 /* If there are no nohz_full= CPUs, no need to track this. */
2769 if (!tick_nohz_full_enabled()) 2774 if (!tick_nohz_full_enabled())
@@ -2832,8 +2837,10 @@ void rcu_sysidle_force_exit(void)
2832 * usermode execution does -not- count as idle here! The caller must 2837 * usermode execution does -not- count as idle here! The caller must
2833 * have disabled interrupts. 2838 * have disabled interrupts.
2834 */ 2839 */
2835static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 2840static void rcu_sysidle_exit(int irq)
2836{ 2841{
2842 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
2843
2837 /* If there are no nohz_full= CPUs, no need to track this. */ 2844 /* If there are no nohz_full= CPUs, no need to track this. */
2838 if (!tick_nohz_full_enabled()) 2845 if (!tick_nohz_full_enabled())
2839 return; 2846 return;
@@ -3127,11 +3134,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
3127 3134
3128#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3135#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3129 3136
3130static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 3137static void rcu_sysidle_enter(int irq)
3131{ 3138{
3132} 3139}
3133 3140
3134static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 3141static void rcu_sysidle_exit(int irq)
3135{ 3142{
3136} 3143}
3137 3144
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3ef8ba58694e..e0d31a345ee6 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -306,7 +306,7 @@ struct debug_obj_descr rcuhead_debug_descr = {
306EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 306EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
307#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 307#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
308 308
309#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 309#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
310void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, 310void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
311 unsigned long secs, 311 unsigned long secs,
312 unsigned long c_old, unsigned long c) 312 unsigned long c_old, unsigned long c)
@@ -531,7 +531,8 @@ static int __noreturn rcu_tasks_kthread(void *arg)
531 struct rcu_head *next; 531 struct rcu_head *next;
532 LIST_HEAD(rcu_tasks_holdouts); 532 LIST_HEAD(rcu_tasks_holdouts);
533 533
534 /* FIXME: Add housekeeping affinity. */ 534 /* Run on housekeeping CPUs by default. Sysadm can move if desired. */
535 housekeeping_affine(current);
535 536
536 /* 537 /*
537 * Each pass through the following loop makes one check for 538 * Each pass through the following loop makes one check for
@@ -690,3 +691,87 @@ static void rcu_spawn_tasks_kthread(void)
690} 691}
691 692
692#endif /* #ifdef CONFIG_TASKS_RCU */ 693#endif /* #ifdef CONFIG_TASKS_RCU */
694
695#ifdef CONFIG_PROVE_RCU
696
697/*
698 * Early boot self test parameters, one for each flavor
699 */
700static bool rcu_self_test;
701static bool rcu_self_test_bh;
702static bool rcu_self_test_sched;
703
704module_param(rcu_self_test, bool, 0444);
705module_param(rcu_self_test_bh, bool, 0444);
706module_param(rcu_self_test_sched, bool, 0444);
707
708static int rcu_self_test_counter;
709
710static void test_callback(struct rcu_head *r)
711{
712 rcu_self_test_counter++;
713 pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
714}
715
716static void early_boot_test_call_rcu(void)
717{
718 static struct rcu_head head;
719
720 call_rcu(&head, test_callback);
721}
722
723static void early_boot_test_call_rcu_bh(void)
724{
725 static struct rcu_head head;
726
727 call_rcu_bh(&head, test_callback);
728}
729
730static void early_boot_test_call_rcu_sched(void)
731{
732 static struct rcu_head head;
733
734 call_rcu_sched(&head, test_callback);
735}
736
737void rcu_early_boot_tests(void)
738{
739 pr_info("Running RCU self tests\n");
740
741 if (rcu_self_test)
742 early_boot_test_call_rcu();
743 if (rcu_self_test_bh)
744 early_boot_test_call_rcu_bh();
745 if (rcu_self_test_sched)
746 early_boot_test_call_rcu_sched();
747}
748
749static int rcu_verify_early_boot_tests(void)
750{
751 int ret = 0;
752 int early_boot_test_counter = 0;
753
754 if (rcu_self_test) {
755 early_boot_test_counter++;
756 rcu_barrier();
757 }
758 if (rcu_self_test_bh) {
759 early_boot_test_counter++;
760 rcu_barrier_bh();
761 }
762 if (rcu_self_test_sched) {
763 early_boot_test_counter++;
764 rcu_barrier_sched();
765 }
766
767 if (rcu_self_test_counter != early_boot_test_counter) {
768 WARN_ON(1);
769 ret = -1;
770 }
771
772 return ret;
773}
774late_initcall(rcu_verify_early_boot_tests);
775#else
776void rcu_early_boot_tests(void) {}
777#endif /* CONFIG_PROVE_RCU */
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
deleted file mode 100644
index e791130f85a7..000000000000
--- a/kernel/res_counter.c
+++ /dev/null
@@ -1,211 +0,0 @@
1/*
2 * resource cgroups
3 *
4 * Copyright 2007 OpenVZ SWsoft Inc
5 *
6 * Author: Pavel Emelianov <xemul@openvz.org>
7 *
8 */
9
10#include <linux/types.h>
11#include <linux/parser.h>
12#include <linux/fs.h>
13#include <linux/res_counter.h>
14#include <linux/uaccess.h>
15#include <linux/mm.h>
16
17void res_counter_init(struct res_counter *counter, struct res_counter *parent)
18{
19 spin_lock_init(&counter->lock);
20 counter->limit = RES_COUNTER_MAX;
21 counter->soft_limit = RES_COUNTER_MAX;
22 counter->parent = parent;
23}
24
25static u64 res_counter_uncharge_locked(struct res_counter *counter,
26 unsigned long val)
27{
28 if (WARN_ON(counter->usage < val))
29 val = counter->usage;
30
31 counter->usage -= val;
32 return counter->usage;
33}
34
35static int res_counter_charge_locked(struct res_counter *counter,
36 unsigned long val, bool force)
37{
38 int ret = 0;
39
40 if (counter->usage + val > counter->limit) {
41 counter->failcnt++;
42 ret = -ENOMEM;
43 if (!force)
44 return ret;
45 }
46
47 counter->usage += val;
48 if (counter->usage > counter->max_usage)
49 counter->max_usage = counter->usage;
50 return ret;
51}
52
53static int __res_counter_charge(struct res_counter *counter, unsigned long val,
54 struct res_counter **limit_fail_at, bool force)
55{
56 int ret, r;
57 unsigned long flags;
58 struct res_counter *c, *u;
59
60 r = ret = 0;
61 *limit_fail_at = NULL;
62 local_irq_save(flags);
63 for (c = counter; c != NULL; c = c->parent) {
64 spin_lock(&c->lock);
65 r = res_counter_charge_locked(c, val, force);
66 spin_unlock(&c->lock);
67 if (r < 0 && !ret) {
68 ret = r;
69 *limit_fail_at = c;
70 if (!force)
71 break;
72 }
73 }
74
75 if (ret < 0 && !force) {
76 for (u = counter; u != c; u = u->parent) {
77 spin_lock(&u->lock);
78 res_counter_uncharge_locked(u, val);
79 spin_unlock(&u->lock);
80 }
81 }
82 local_irq_restore(flags);
83
84 return ret;
85}
86
87int res_counter_charge(struct res_counter *counter, unsigned long val,
88 struct res_counter **limit_fail_at)
89{
90 return __res_counter_charge(counter, val, limit_fail_at, false);
91}
92
93int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
94 struct res_counter **limit_fail_at)
95{
96 return __res_counter_charge(counter, val, limit_fail_at, true);
97}
98
99u64 res_counter_uncharge_until(struct res_counter *counter,
100 struct res_counter *top,
101 unsigned long val)
102{
103 unsigned long flags;
104 struct res_counter *c;
105 u64 ret = 0;
106
107 local_irq_save(flags);
108 for (c = counter; c != top; c = c->parent) {
109 u64 r;
110 spin_lock(&c->lock);
111 r = res_counter_uncharge_locked(c, val);
112 if (c == counter)
113 ret = r;
114 spin_unlock(&c->lock);
115 }
116 local_irq_restore(flags);
117 return ret;
118}
119
120u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
121{
122 return res_counter_uncharge_until(counter, NULL, val);
123}
124
125static inline unsigned long long *
126res_counter_member(struct res_counter *counter, int member)
127{
128 switch (member) {
129 case RES_USAGE:
130 return &counter->usage;
131 case RES_MAX_USAGE:
132 return &counter->max_usage;
133 case RES_LIMIT:
134 return &counter->limit;
135 case RES_FAILCNT:
136 return &counter->failcnt;
137 case RES_SOFT_LIMIT:
138 return &counter->soft_limit;
139 };
140
141 BUG();
142 return NULL;
143}
144
145ssize_t res_counter_read(struct res_counter *counter, int member,
146 const char __user *userbuf, size_t nbytes, loff_t *pos,
147 int (*read_strategy)(unsigned long long val, char *st_buf))
148{
149 unsigned long long *val;
150 char buf[64], *s;
151
152 s = buf;
153 val = res_counter_member(counter, member);
154 if (read_strategy)
155 s += read_strategy(*val, s);
156 else
157 s += sprintf(s, "%llu\n", *val);
158 return simple_read_from_buffer((void __user *)userbuf, nbytes,
159 pos, buf, s - buf);
160}
161
162#if BITS_PER_LONG == 32
163u64 res_counter_read_u64(struct res_counter *counter, int member)
164{
165 unsigned long flags;
166 u64 ret;
167
168 spin_lock_irqsave(&counter->lock, flags);
169 ret = *res_counter_member(counter, member);
170 spin_unlock_irqrestore(&counter->lock, flags);
171
172 return ret;
173}
174#else
175u64 res_counter_read_u64(struct res_counter *counter, int member)
176{
177 return *res_counter_member(counter, member);
178}
179#endif
180
181int res_counter_memparse_write_strategy(const char *buf,
182 unsigned long long *resp)
183{
184 char *end;
185 unsigned long long res;
186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') {
189 int rc = kstrtoull(buf + 1, 10, &res);
190
191 if (rc)
192 return rc;
193 if (res != 1)
194 return -EINVAL;
195 *resp = RES_COUNTER_MAX;
196 return 0;
197 }
198
199 res = memparse(buf, &end);
200 if (*end != '\0')
201 return -EINVAL;
202
203 if (PAGE_ALIGN(res) >= res)
204 res = PAGE_ALIGN(res);
205 else
206 res = RES_COUNTER_MAX;
207
208 *resp = res;
209
210 return 0;
211}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..607f852b4d04 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
148 * 148 *
149 * This waits to be signaled for completion of a specific task. It is NOT 149 * This waits to be signaled for completion of a specific task. It is NOT
150 * interruptible and there is no timeout. The caller is accounted as waiting 150 * interruptible and there is no timeout. The caller is accounted as waiting
151 * for IO. 151 * for IO (which traditionally means blkio only).
152 */ 152 */
153void __sched wait_for_completion_io(struct completion *x) 153void __sched wait_for_completion_io(struct completion *x)
154{ 154{
@@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
163 * 163 *
164 * This waits for either a completion of a specific task to be signaled or for a 164 * This waits for either a completion of a specific task to be signaled or for a
165 * specified timeout to expire. The timeout is in jiffies. It is not 165 * specified timeout to expire. The timeout is in jiffies. It is not
166 * interruptible. The caller is accounted as waiting for IO. 166 * interruptible. The caller is accounted as waiting for IO (which traditionally
167 * means blkio only).
167 * 168 *
168 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left 169 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
169 * till timeout) if completed. 170 * till timeout) if completed.
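The completion.c hunks only sharpen the kernel-doc, clarifying that the "waiting for IO" accounting traditionally means blkio. For reference, a tiny hypothetical usage sketch of the API being documented; demo_done is an invented completion, presumably signalled with complete(&demo_done) from an interrupt handler elsewhere:

#include <linux/completion.h>

static DECLARE_COMPLETION(demo_done);

static void demo_wait_for_block_io(void)
{
	/* Accounted as I/O wait (traditionally blkio), unlike plain
	 * wait_for_completion(). */
	wait_for_completion_io(&demo_done);
}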
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 89e7283015a6..d22fb16a7153 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
1008 return cpu_curr(task_cpu(p)) == p; 1008 return cpu_curr(task_cpu(p)) == p;
1009} 1009}
1010 1010
1011/*
1012 * Can drop rq->lock because sched_class::switched_from() methods drop it.
1013 */
1011static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1014static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1012 const struct sched_class *prev_class, 1015 const struct sched_class *prev_class,
1013 int oldprio) 1016 int oldprio)
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1015 if (prev_class != p->sched_class) { 1018 if (prev_class != p->sched_class) {
1016 if (prev_class->switched_from) 1019 if (prev_class->switched_from)
1017 prev_class->switched_from(rq, p); 1020 prev_class->switched_from(rq, p);
1021 /* Possible rq->lock 'hole'. */
1018 p->sched_class->switched_to(rq, p); 1022 p->sched_class->switched_to(rq, p);
1019 } else if (oldprio != p->prio || dl_task(p)) 1023 } else if (oldprio != p->prio || dl_task(p))
1020 p->sched_class->prio_changed(rq, p, oldprio); 1024 p->sched_class->prio_changed(rq, p, oldprio);
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1054 * ttwu() will sort out the placement. 1058 * ttwu() will sort out the placement.
1055 */ 1059 */
1056 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1060 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1057 !(task_preempt_count(p) & PREEMPT_ACTIVE)); 1061 !p->on_rq);
1058 1062
1059#ifdef CONFIG_LOCKDEP 1063#ifdef CONFIG_LOCKDEP
1060 /* 1064 /*
@@ -1078,7 +1082,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1078 if (p->sched_class->migrate_task_rq) 1082 if (p->sched_class->migrate_task_rq)
1079 p->sched_class->migrate_task_rq(p, new_cpu); 1083 p->sched_class->migrate_task_rq(p, new_cpu);
1080 p->se.nr_migrations++; 1084 p->se.nr_migrations++;
1081 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1085 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
1082 } 1086 }
1083 1087
1084 __set_task_cpu(p, new_cpu); 1088 __set_task_cpu(p, new_cpu);
@@ -1407,7 +1411,8 @@ out:
1407static inline 1411static inline
1408int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1412int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1409{ 1413{
1410 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1414 if (p->nr_cpus_allowed > 1)
1415 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1411 1416
1412 /* 1417 /*
1413 * In order not to call set_task_cpu() on a blocking task we need 1418 * In order not to call set_task_cpu() on a blocking task we need
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu)
1623 struct rq *rq = cpu_rq(cpu); 1628 struct rq *rq = cpu_rq(cpu);
1624 unsigned long flags; 1629 unsigned long flags;
1625 1630
1626 if (!is_idle_task(rq->curr)) 1631 rcu_read_lock();
1627 return; 1632
1633 if (!is_idle_task(rcu_dereference(rq->curr)))
1634 goto out;
1628 1635
1629 if (set_nr_if_polling(rq->idle)) { 1636 if (set_nr_if_polling(rq->idle)) {
1630 trace_sched_wake_idle_without_ipi(cpu); 1637 trace_sched_wake_idle_without_ipi(cpu);
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu)
1635 /* Else cpu is not in idle, do nothing here */ 1642 /* Else cpu is not in idle, do nothing here */
1636 raw_spin_unlock_irqrestore(&rq->lock, flags); 1643 raw_spin_unlock_irqrestore(&rq->lock, flags);
1637 } 1644 }
1645
1646out:
1647 rcu_read_unlock();
1638} 1648}
1639 1649
1640bool cpus_share_cache(int this_cpu, int that_cpu) 1650bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1853 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1863 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1854 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1864 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1855 p->numa_work.next = &p->numa_work; 1865 p->numa_work.next = &p->numa_work;
1856 p->numa_faults_memory = NULL; 1866 p->numa_faults = NULL;
1857 p->numa_faults_buffer_memory = NULL;
1858 p->last_task_numa_placement = 0; 1867 p->last_task_numa_placement = 0;
1859 p->last_sum_exec_runtime = 0; 1868 p->last_sum_exec_runtime = 0;
1860 1869
1861 INIT_LIST_HEAD(&p->numa_entry);
1862 p->numa_group = NULL; 1870 p->numa_group = NULL;
1863#endif /* CONFIG_NUMA_BALANCING */ 1871#endif /* CONFIG_NUMA_BALANCING */
1864} 1872}
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
2034} 2042}
2035#endif 2043#endif
2036 2044
2037static inline
2038void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
2039{
2040 dl_b->total_bw -= tsk_bw;
2041}
2042
2043static inline
2044void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
2045{
2046 dl_b->total_bw += tsk_bw;
2047}
2048
2049static inline
2050bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
2051{
2052 return dl_b->bw != -1 &&
2053 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
2054}
2055
2056/* 2045/*
2057 * We must be sure that accepting a new task (or allowing changing the 2046 * We must be sure that accepting a new task (or allowing changing the
2058 * parameters of an existing one) is consistent with the bandwidth 2047 * parameters of an existing one) is consistent with the bandwidth
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
2220 2209
2221/** 2210/**
2222 * finish_task_switch - clean up after a task-switch 2211 * finish_task_switch - clean up after a task-switch
2223 * @rq: runqueue associated with task-switch
2224 * @prev: the thread we just switched away from. 2212 * @prev: the thread we just switched away from.
2225 * 2213 *
2226 * finish_task_switch must be called after the context switch, paired 2214 * finish_task_switch must be called after the context switch, paired
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
2232 * so, we finish that here outside of the runqueue lock. (Doing it 2220 * so, we finish that here outside of the runqueue lock. (Doing it
2233 * with the lock held can cause deadlocks; see schedule() for 2221 * with the lock held can cause deadlocks; see schedule() for
2234 * details.) 2222 * details.)
2223 *
2224 * The context switch has flipped the stack from under us and restored the
2225 * local variables which were saved when this task called schedule() in the
2226 * past. prev == current is still correct but we need to recalculate this_rq
2227 * because prev may have moved to another CPU.
2235 */ 2228 */
2236static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2229static struct rq *finish_task_switch(struct task_struct *prev)
2237 __releases(rq->lock) 2230 __releases(rq->lock)
2238{ 2231{
2232 struct rq *rq = this_rq();
2239 struct mm_struct *mm = rq->prev_mm; 2233 struct mm_struct *mm = rq->prev_mm;
2240 long prev_state; 2234 long prev_state;
2241 2235
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2275 } 2269 }
2276 2270
2277 tick_nohz_task_switch(current); 2271 tick_nohz_task_switch(current);
2272 return rq;
2278} 2273}
2279 2274
2280#ifdef CONFIG_SMP 2275#ifdef CONFIG_SMP
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq)
2309asmlinkage __visible void schedule_tail(struct task_struct *prev) 2304asmlinkage __visible void schedule_tail(struct task_struct *prev)
2310 __releases(rq->lock) 2305 __releases(rq->lock)
2311{ 2306{
2312 struct rq *rq = this_rq(); 2307 struct rq *rq;
2313
2314 finish_task_switch(rq, prev);
2315 2308
2316 /* 2309 /* finish_task_switch() drops rq->lock and enables preemption */
2317 * FIXME: do we need to worry about rq being invalidated by the 2310 preempt_disable();
2318 * task_switch? 2311 rq = finish_task_switch(prev);
2319 */
2320 post_schedule(rq); 2312 post_schedule(rq);
2313 preempt_enable();
2321 2314
2322 if (current->set_child_tid) 2315 if (current->set_child_tid)
2323 put_user(task_pid_vnr(current), current->set_child_tid); 2316 put_user(task_pid_vnr(current), current->set_child_tid);
2324} 2317}
2325 2318
2326/* 2319/*
2327 * context_switch - switch to the new MM and the new 2320 * context_switch - switch to the new MM and the new thread's register state.
2328 * thread's register state.
2329 */ 2321 */
2330static inline void 2322static inline struct rq *
2331context_switch(struct rq *rq, struct task_struct *prev, 2323context_switch(struct rq *rq, struct task_struct *prev,
2332 struct task_struct *next) 2324 struct task_struct *next)
2333{ 2325{
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
2366 context_tracking_task_switch(prev, next); 2358 context_tracking_task_switch(prev, next);
2367 /* Here we just switch the register state and the stack. */ 2359 /* Here we just switch the register state and the stack. */
2368 switch_to(prev, next, prev); 2360 switch_to(prev, next, prev);
2369
2370 barrier(); 2361 barrier();
2371 /* 2362
2372 * this_rq must be evaluated again because prev may have moved 2363 return finish_task_switch(prev);
2373 * CPUs since it called schedule(), thus the 'rq' on its stack
2374 * frame will be invalid.
2375 */
2376 finish_task_switch(this_rq(), prev);
2377} 2364}
2378 2365
2379/* 2366/*
@@ -2773,7 +2760,7 @@ need_resched:
2773 preempt_disable(); 2760 preempt_disable();
2774 cpu = smp_processor_id(); 2761 cpu = smp_processor_id();
2775 rq = cpu_rq(cpu); 2762 rq = cpu_rq(cpu);
2776 rcu_note_context_switch(cpu); 2763 rcu_note_context_switch();
2777 prev = rq->curr; 2764 prev = rq->curr;
2778 2765
2779 schedule_debug(prev); 2766 schedule_debug(prev);
@@ -2826,15 +2813,8 @@ need_resched:
2826 rq->curr = next; 2813 rq->curr = next;
2827 ++*switch_count; 2814 ++*switch_count;
2828 2815
2829 context_switch(rq, prev, next); /* unlocks the rq */ 2816 rq = context_switch(rq, prev, next); /* unlocks the rq */
2830 /* 2817 cpu = cpu_of(rq);
2831 * The context switch have flipped the stack from under us
2832 * and restored the local variables which were saved when
2833 * this task called schedule() in the past. prev == current
2834 * is still correct, but it can be moved to another cpu/rq.
2835 */
2836 cpu = smp_processor_id();
2837 rq = cpu_rq(cpu);
2838 } else 2818 } else
2839 raw_spin_unlock_irq(&rq->lock); 2819 raw_spin_unlock_irq(&rq->lock);
2840 2820
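The hunks above rework the switch path so that finish_task_switch() recomputes and returns the runqueue, instead of every caller re-deriving it after the stack switch. A condensed sketch of the resulting calling convention (simplified from the hunks; locking, tracing and mm handling omitted):

static struct rq *finish_task_switch(struct task_struct *prev)
{
        struct rq *rq = this_rq();       /* prev may have moved CPUs; recompute */
        /* ... finish the mm/state bookkeeping, drop rq->lock ... */
        return rq;
}

static inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
        switch_to(prev, next, prev);     /* stack and registers swapped here */
        barrier();
        return finish_task_switch(prev); /* the 'rq' argument is stale by now */
}

/* __schedule():  rq = context_switch(rq, prev, next);  cpu = cpu_of(rq); */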
@@ -4547,8 +4527,10 @@ void sched_show_task(struct task_struct *p)
4547#ifdef CONFIG_DEBUG_STACK_USAGE 4527#ifdef CONFIG_DEBUG_STACK_USAGE
4548 free = stack_not_used(p); 4528 free = stack_not_used(p);
4549#endif 4529#endif
4530 ppid = 0;
4550 rcu_read_lock(); 4531 rcu_read_lock();
4551 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4532 if (pid_alive(p))
4533 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4552 rcu_read_unlock(); 4534 rcu_read_unlock();
4553 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4535 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4554 task_pid_nr(p), ppid, 4536 task_pid_nr(p), ppid,
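The sched_show_task() hunk above initializes ppid to 0 and only dereferences p->real_parent when pid_alive(p) holds, so a task already detached from the pid hash does not have a stale parent pointer followed. The pattern, restated in isolation:

        int ppid = 0;

        rcu_read_lock();
        if (pid_alive(p))
                ppid = task_pid_nr(rcu_dereference(p->real_parent));
        rcu_read_unlock();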
@@ -4653,6 +4635,81 @@ void init_idle(struct task_struct *idle, int cpu)
4653#endif 4635#endif
4654} 4636}
4655 4637
4638int cpuset_cpumask_can_shrink(const struct cpumask *cur,
4639 const struct cpumask *trial)
4640{
4641 int ret = 1, trial_cpus;
4642 struct dl_bw *cur_dl_b;
4643 unsigned long flags;
4644
4645 rcu_read_lock_sched();
4646 cur_dl_b = dl_bw_of(cpumask_any(cur));
4647 trial_cpus = cpumask_weight(trial);
4648
4649 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
4650 if (cur_dl_b->bw != -1 &&
4651 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
4652 ret = 0;
4653 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
4654 rcu_read_unlock_sched();
4655
4656 return ret;
4657}
4658
4659int task_can_attach(struct task_struct *p,
4660 const struct cpumask *cs_cpus_allowed)
4661{
4662 int ret = 0;
4663
4664 /*
4665 * Kthreads which disallow setaffinity shouldn't be moved
4666 * to a new cpuset; we don't want to change their cpu
4667 * affinity and isolating such threads by their set of
4668 * allowed nodes is unnecessary. Thus, cpusets are not
4669 * applicable for such threads. This prevents checking for
4670 * success of set_cpus_allowed_ptr() on all attached tasks
4671 * before cpus_allowed may be changed.
4672 */
4673 if (p->flags & PF_NO_SETAFFINITY) {
4674 ret = -EINVAL;
4675 goto out;
4676 }
4677
4678#ifdef CONFIG_SMP
4679 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
4680 cs_cpus_allowed)) {
4681 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
4682 cs_cpus_allowed);
4683 struct dl_bw *dl_b;
4684 bool overflow;
4685 int cpus;
4686 unsigned long flags;
4687
4688 rcu_read_lock_sched();
4689 dl_b = dl_bw_of(dest_cpu);
4690 raw_spin_lock_irqsave(&dl_b->lock, flags);
4691 cpus = dl_bw_cpus(dest_cpu);
4692 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
4693 if (overflow)
4694 ret = -EBUSY;
4695 else {
4696 /*
4697 * We reserve space for this task in the destination
4698 * root_domain, as we can't fail after this point.
4699 * We will free resources in the source root_domain
4700 * later on (see set_cpus_allowed_dl()).
4701 */
4702 __dl_add(dl_b, p->dl.dl_bw);
4703 }
4704 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
4705 rcu_read_unlock_sched();
4706
4707 }
4708#endif
4709out:
4710 return ret;
4711}
4712
4656#ifdef CONFIG_SMP 4713#ifdef CONFIG_SMP
4657/* 4714/*
4658 * move_queued_task - move a queued task to new rq. 4715 * move_queued_task - move a queued task to new rq.
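cpuset_cpumask_can_shrink() and task_can_attach() added above give the cpuset code a way to validate SCHED_DEADLINE bandwidth before committing a change. A hedged sketch of how a cpuset-side caller might use them (the caller below is illustrative, not the actual kernel/cpuset.c code; 'cur' and 'trial' follow the parameter names in the hunk):

/* Illustrative only. */
static int can_apply_cpuset_change(struct task_struct *p,
                                   const struct cpumask *cur,
                                   const struct cpumask *trial)
{
        /* Refuse to shrink a cpuset whose remaining CPUs cannot carry the
         * deadline bandwidth already admitted on 'cur'. */
        if (!cpuset_cpumask_can_shrink(cur, trial))
                return -EBUSY;

        /* Refuse to attach a task whose own deadline bandwidth would
         * overflow the destination root_domain. */
        return task_can_attach(p, trial);
}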
@@ -6103,7 +6160,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
6103 6160
6104#ifdef CONFIG_NUMA 6161#ifdef CONFIG_NUMA
6105static int sched_domains_numa_levels; 6162static int sched_domains_numa_levels;
6163enum numa_topology_type sched_numa_topology_type;
6106static int *sched_domains_numa_distance; 6164static int *sched_domains_numa_distance;
6165int sched_max_numa_distance;
6107static struct cpumask ***sched_domains_numa_masks; 6166static struct cpumask ***sched_domains_numa_masks;
6108static int sched_domains_curr_level; 6167static int sched_domains_curr_level;
6109#endif 6168#endif
@@ -6275,7 +6334,7 @@ static void sched_numa_warn(const char *str)
6275 printk(KERN_WARNING "\n"); 6334 printk(KERN_WARNING "\n");
6276} 6335}
6277 6336
6278static bool find_numa_distance(int distance) 6337bool find_numa_distance(int distance)
6279{ 6338{
6280 int i; 6339 int i;
6281 6340
@@ -6290,6 +6349,56 @@ static bool find_numa_distance(int distance)
6290 return false; 6349 return false;
6291} 6350}
6292 6351
6352/*
6353 * A system can have three types of NUMA topology:
6354 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
6355 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
6356 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
6357 *
6358 * The difference between a glueless mesh topology and a backplane
6359 * topology lies in whether communication between not directly
6360 * connected nodes goes through intermediary nodes (where programs
6361 * could run), or through backplane controllers. This affects
6362 * placement of programs.
6363 *
6364 * The type of topology can be discerned with the following tests:
6365 * - If the maximum distance between any nodes is 1 hop, the system
6366 * is directly connected.
6367 * - If for two nodes A and B, located N > 1 hops away from each other,
6368 * there is an intermediary node C, which is < N hops away from both
6369 * nodes A and B, the system is a glueless mesh.
6370 */
6371static void init_numa_topology_type(void)
6372{
6373 int a, b, c, n;
6374
6375 n = sched_max_numa_distance;
6376
6377 if (n <= 1)
6378 sched_numa_topology_type = NUMA_DIRECT;
6379
6380 for_each_online_node(a) {
6381 for_each_online_node(b) {
6382 /* Find two nodes furthest removed from each other. */
6383 if (node_distance(a, b) < n)
6384 continue;
6385
6386 /* Is there an intermediary node between a and b? */
6387 for_each_online_node(c) {
6388 if (node_distance(a, c) < n &&
6389 node_distance(b, c) < n) {
6390 sched_numa_topology_type =
6391 NUMA_GLUELESS_MESH;
6392 return;
6393 }
6394 }
6395
6396 sched_numa_topology_type = NUMA_BACKPLANE;
6397 return;
6398 }
6399 }
6400}
6401
6293static void sched_init_numa(void) 6402static void sched_init_numa(void)
6294{ 6403{
6295 int next_distance, curr_distance = node_distance(0, 0); 6404 int next_distance, curr_distance = node_distance(0, 0);
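A worked example of the classification performed by init_numa_topology_type() above, for a hypothetical 4-node machine (distance values are illustrative; sched_max_numa_distance would be 30 here):

/*        node   0   1   2   3
 *          0   10  20  20  30
 *          1   20  10  20  20
 *          2   20  20  10  20
 *          3   30  20  20  10
 *
 * Nodes 0 and 3 are the furthest apart (distance 30 == sched_max_numa_distance).
 * Node 1 is closer than 30 to both of them, so an intermediary exists and the
 * system is classified NUMA_GLUELESS_MESH.  Had no such intermediary existed,
 * that pair would have forced NUMA_BACKPLANE; a machine where every node is
 * one hop from every other is NUMA_DIRECT.
 */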
@@ -6426,6 +6535,9 @@ static void sched_init_numa(void)
6426 sched_domain_topology = tl; 6535 sched_domain_topology = tl;
6427 6536
6428 sched_domains_numa_levels = level; 6537 sched_domains_numa_levels = level;
6538 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
6539
6540 init_numa_topology_type();
6429} 6541}
6430 6542
6431static void sched_domains_numa_masks_set(int cpu) 6543static void sched_domains_numa_masks_set(int cpu)
@@ -7001,9 +7113,6 @@ void __init sched_init(void)
7001#ifdef CONFIG_RT_GROUP_SCHED 7113#ifdef CONFIG_RT_GROUP_SCHED
7002 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7114 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7003#endif 7115#endif
7004#ifdef CONFIG_CPUMASK_OFFSTACK
7005 alloc_size += num_possible_cpus() * cpumask_size();
7006#endif
7007 if (alloc_size) { 7116 if (alloc_size) {
7008 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7117 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7009 7118
@@ -7023,13 +7132,13 @@ void __init sched_init(void)
7023 ptr += nr_cpu_ids * sizeof(void **); 7132 ptr += nr_cpu_ids * sizeof(void **);
7024 7133
7025#endif /* CONFIG_RT_GROUP_SCHED */ 7134#endif /* CONFIG_RT_GROUP_SCHED */
7135 }
7026#ifdef CONFIG_CPUMASK_OFFSTACK 7136#ifdef CONFIG_CPUMASK_OFFSTACK
7027 for_each_possible_cpu(i) { 7137 for_each_possible_cpu(i) {
7028 per_cpu(load_balance_mask, i) = (void *)ptr; 7138 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
7029 ptr += cpumask_size(); 7139 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7030 }
7031#endif /* CONFIG_CPUMASK_OFFSTACK */
7032 } 7140 }
7141#endif /* CONFIG_CPUMASK_OFFSTACK */
7033 7142
7034 init_rt_bandwidth(&def_rt_bandwidth, 7143 init_rt_bandwidth(&def_rt_bandwidth,
7035 global_rt_period(), global_rt_runtime()); 7144 global_rt_period(), global_rt_runtime());
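With the sched_init() change above, the per-CPU load_balance_mask is no longer carved out of the single kzalloc() block used for the group-scheduling pointers; each mask is allocated separately with kzalloc_node() so it lands on the owning CPU's home node. The pattern, in isolation:

#ifdef CONFIG_CPUMASK_OFFSTACK
        for_each_possible_cpu(i)
                per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
                        cpumask_size(), GFP_KERNEL, cpu_to_node(i));
#endif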
@@ -7178,6 +7287,25 @@ static inline int preempt_count_equals(int preempt_offset)
7178 7287
7179void __might_sleep(const char *file, int line, int preempt_offset) 7288void __might_sleep(const char *file, int line, int preempt_offset)
7180{ 7289{
7290 /*
7291 * Blocking primitives will set (and therefore destroy) current->state,
7292 * since we will exit with TASK_RUNNING make sure we enter with it,
7293 * otherwise we will destroy state.
7294 */
7295 if (WARN_ONCE(current->state != TASK_RUNNING,
7296 "do not call blocking ops when !TASK_RUNNING; "
7297 "state=%lx set at [<%p>] %pS\n",
7298 current->state,
7299 (void *)current->task_state_change,
7300 (void *)current->task_state_change))
7301 __set_current_state(TASK_RUNNING);
7302
7303 ___might_sleep(file, line, preempt_offset);
7304}
7305EXPORT_SYMBOL(__might_sleep);
7306
7307void ___might_sleep(const char *file, int line, int preempt_offset)
7308{
7181 static unsigned long prev_jiffy; /* ratelimiting */ 7309 static unsigned long prev_jiffy; /* ratelimiting */
7182 7310
7183 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 7311 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
@@ -7209,7 +7337,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
7209#endif 7337#endif
7210 dump_stack(); 7338 dump_stack();
7211} 7339}
7212EXPORT_SYMBOL(__might_sleep); 7340EXPORT_SYMBOL(___might_sleep);
7213#endif 7341#endif
7214 7342
7215#ifdef CONFIG_MAGIC_SYSRQ 7343#ifdef CONFIG_MAGIC_SYSRQ
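The __might_sleep()/___might_sleep() split above adds a check for "nested sleep" bugs: calling a blocking primitive after current->state has already been set to a sleeping state. A hedged sketch of the kind of wait loop the new WARN_ONCE() catches ('my_lock' and 'condition' are placeholders):

        set_current_state(TASK_UNINTERRUPTIBLE);
        while (!condition) {
                /* mutex_lock() may sleep; its __might_sleep() now warns because
                 * current->state is not TASK_RUNNING, and resets the state so
                 * the task cannot be left asleep by accident. */
                mutex_lock(&my_lock);
                recheck_condition();
                mutex_unlock(&my_lock);
                schedule();
                set_current_state(TASK_UNINTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);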
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c9796ad4a..020039bd1326 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp); 26int cpudl_init(struct cpudl *cp);
27void cpudl_cleanup(struct cpudl *cp); 27void cpudl_cleanup(struct cpudl *cp);
28#else
29#define cpudl_set(cp, cpu, dl) do { } while (0)
30#define cpudl_init() do { } while (0)
31#endif /* CONFIG_SMP */ 28#endif /* CONFIG_SMP */
32 29
33#endif /* _LINUX_CPUDL_H */ 30#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b033347fdfd..63cbb9ca0496 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
26void cpupri_set(struct cpupri *cp, int cpu, int pri); 26void cpupri_set(struct cpupri *cp, int cpu, int pri);
27int cpupri_init(struct cpupri *cp); 27int cpupri_init(struct cpupri *cp);
28void cpupri_cleanup(struct cpupri *cp); 28void cpupri_cleanup(struct cpupri *cp);
29#else
30#define cpupri_set(cp, cpu, pri) do { } while (0)
31#define cpupri_init() do { } while (0)
32#endif 29#endif
33 30
34#endif /* _LINUX_CPUPRI_H */ 31#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 28fa9d9e9201..b52092f2636d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -563,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
563{ 563{
564 struct hrtimer *timer = &dl_se->dl_timer; 564 struct hrtimer *timer = &dl_se->dl_timer;
565 565
566 if (hrtimer_active(timer)) {
567 hrtimer_try_to_cancel(timer);
568 return;
569 }
570
571 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 566 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
572 timer->function = dl_task_timer; 567 timer->function = dl_task_timer;
573} 568}
@@ -575,24 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
575static 570static
576int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) 571int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
577{ 572{
578 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); 573 return (dl_se->runtime <= 0);
579 int rorun = dl_se->runtime <= 0;
580
581 if (!rorun && !dmiss)
582 return 0;
583
584 /*
585 * If we are beyond our current deadline and we are still
586 * executing, then we have already used some of the runtime of
587 * the next instance. Thus, if we do not account that, we are
588 * stealing bandwidth from the system at each deadline miss!
589 */
590 if (dmiss) {
591 dl_se->runtime = rorun ? dl_se->runtime : 0;
592 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
593 }
594
595 return 1;
596} 574}
597 575
598extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); 576extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@@ -633,7 +611,7 @@ static void update_curr_dl(struct rq *rq)
633 611
634 sched_rt_avg_update(rq, delta_exec); 612 sched_rt_avg_update(rq, delta_exec);
635 613
636 dl_se->runtime -= delta_exec; 614 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
637 if (dl_runtime_exceeded(rq, dl_se)) { 615 if (dl_runtime_exceeded(rq, dl_se)) {
638 __dequeue_task_dl(rq, curr, 0); 616 __dequeue_task_dl(rq, curr, 0);
639 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) 617 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -831,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
831 * parameters of the task might need updating. Otherwise, 809 * parameters of the task might need updating. Otherwise,
832 * we want a replenishment of its runtime. 810 * we want a replenishment of its runtime.
833 */ 811 */
834 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) 812 if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
835 replenish_dl_entity(dl_se, pi_se);
836 else
837 update_dl_entity(dl_se, pi_se); 813 update_dl_entity(dl_se, pi_se);
814 else if (flags & ENQUEUE_REPLENISH)
815 replenish_dl_entity(dl_se, pi_se);
838 816
839 __enqueue_dl_entity(dl_se); 817 __enqueue_dl_entity(dl_se);
840} 818}
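Taken together, the deadline.c hunks above make throttling depend purely on runtime exhaustion, stop charging yielded time, and recompute (rather than replenish) the entity's parameters on wakeup. Condensed from the hunks (not a literal copy):

        /* update_curr_dl(), condensed */
        dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
        if (dl_se->runtime <= 0) {               /* dl_runtime_exceeded() */
                __dequeue_task_dl(rq, curr, 0);
                /* start_dl_timer(): stay throttled until the replenishment fires */
        }

        /* enqueue_dl_entity(), condensed */
        if (dl_se->dl_new || (flags & ENQUEUE_WAKEUP))
                update_dl_entity(dl_se, pi_se);    /* fresh deadline if stale */
        else if (flags & ENQUEUE_REPLENISH)
                replenish_dl_entity(dl_se, pi_se); /* timer-driven refill */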
@@ -933,7 +911,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
933 struct task_struct *curr; 911 struct task_struct *curr;
934 struct rq *rq; 912 struct rq *rq;
935 913
936 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 914 if (sd_flag != SD_BALANCE_WAKE)
937 goto out; 915 goto out;
938 916
939 rq = cpu_rq(cpu); 917 rq = cpu_rq(cpu);
@@ -1018,6 +996,10 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
1018{ 996{
1019 hrtick_start(rq, p->dl.runtime); 997 hrtick_start(rq, p->dl.runtime);
1020} 998}
999#else /* !CONFIG_SCHED_HRTICK */
1000static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
1001{
1002}
1021#endif 1003#endif
1022 1004
1023static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, 1005static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1071,10 +1053,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1071 /* Running task will never be pushed. */ 1053 /* Running task will never be pushed. */
1072 dequeue_pushable_dl_task(rq, p); 1054 dequeue_pushable_dl_task(rq, p);
1073 1055
1074#ifdef CONFIG_SCHED_HRTICK
1075 if (hrtick_enabled(rq)) 1056 if (hrtick_enabled(rq))
1076 start_hrtick_dl(rq, p); 1057 start_hrtick_dl(rq, p);
1077#endif
1078 1058
1079 set_post_schedule(rq); 1059 set_post_schedule(rq);
1080 1060
@@ -1093,10 +1073,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1093{ 1073{
1094 update_curr_dl(rq); 1074 update_curr_dl(rq);
1095 1075
1096#ifdef CONFIG_SCHED_HRTICK
1097 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) 1076 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
1098 start_hrtick_dl(rq, p); 1077 start_hrtick_dl(rq, p);
1099#endif
1100} 1078}
1101 1079
1102static void task_fork_dl(struct task_struct *p) 1080static void task_fork_dl(struct task_struct *p)
@@ -1333,6 +1311,7 @@ static int push_dl_task(struct rq *rq)
1333{ 1311{
1334 struct task_struct *next_task; 1312 struct task_struct *next_task;
1335 struct rq *later_rq; 1313 struct rq *later_rq;
1314 int ret = 0;
1336 1315
1337 if (!rq->dl.overloaded) 1316 if (!rq->dl.overloaded)
1338 return 0; 1317 return 0;
@@ -1378,7 +1357,6 @@ retry:
1378 * The task is still there. We don't try 1357 * The task is still there. We don't try
1379 * again, some other cpu will pull it when ready. 1358 * again, some other cpu will pull it when ready.
1380 */ 1359 */
1381 dequeue_pushable_dl_task(rq, next_task);
1382 goto out; 1360 goto out;
1383 } 1361 }
1384 1362
@@ -1394,6 +1372,7 @@ retry:
1394 deactivate_task(rq, next_task, 0); 1372 deactivate_task(rq, next_task, 0);
1395 set_task_cpu(next_task, later_rq->cpu); 1373 set_task_cpu(next_task, later_rq->cpu);
1396 activate_task(later_rq, next_task, 0); 1374 activate_task(later_rq, next_task, 0);
1375 ret = 1;
1397 1376
1398 resched_curr(later_rq); 1377 resched_curr(later_rq);
1399 1378
@@ -1402,7 +1381,7 @@ retry:
1402out: 1381out:
1403 put_task_struct(next_task); 1382 put_task_struct(next_task);
1404 1383
1405 return 1; 1384 return ret;
1406} 1385}
1407 1386
1408static void push_dl_tasks(struct rq *rq) 1387static void push_dl_tasks(struct rq *rq)
@@ -1508,7 +1487,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
1508 p->nr_cpus_allowed > 1 && 1487 p->nr_cpus_allowed > 1 &&
1509 dl_task(rq->curr) && 1488 dl_task(rq->curr) &&
1510 (rq->curr->nr_cpus_allowed < 2 || 1489 (rq->curr->nr_cpus_allowed < 2 ||
1511 dl_entity_preempt(&rq->curr->dl, &p->dl))) { 1490 !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
1512 push_dl_tasks(rq); 1491 push_dl_tasks(rq);
1513 } 1492 }
1514} 1493}
@@ -1517,10 +1496,33 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1517 const struct cpumask *new_mask) 1496 const struct cpumask *new_mask)
1518{ 1497{
1519 struct rq *rq; 1498 struct rq *rq;
1499 struct root_domain *src_rd;
1520 int weight; 1500 int weight;
1521 1501
1522 BUG_ON(!dl_task(p)); 1502 BUG_ON(!dl_task(p));
1523 1503
1504 rq = task_rq(p);
1505 src_rd = rq->rd;
1506 /*
1507 * Migrating a SCHED_DEADLINE task between exclusive
1508 * cpusets (different root_domains) entails a bandwidth
1509 * update. We already made space for us in the destination
1510 * domain (see cpuset_can_attach()).
1511 */
1512 if (!cpumask_intersects(src_rd->span, new_mask)) {
1513 struct dl_bw *src_dl_b;
1514
1515 src_dl_b = dl_bw_of(cpu_of(rq));
1516 /*
1517 * We now free resources of the root_domain we are migrating
1518 * off. In the worst case, sched_setattr() may temporarily fail
1519 * until we complete the update.
1520 */
1521 raw_spin_lock(&src_dl_b->lock);
1522 __dl_clear(src_dl_b, p->dl.dl_bw);
1523 raw_spin_unlock(&src_dl_b->lock);
1524 }
1525
1524 /* 1526 /*
1525 * Update only if the task is actually running (i.e., 1527 * Update only if the task is actually running (i.e.,
1526 * it is on the rq AND it is not throttled). 1528 * it is on the rq AND it is not throttled).
@@ -1537,8 +1539,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1537 if ((p->nr_cpus_allowed > 1) == (weight > 1)) 1539 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1538 return; 1540 return;
1539 1541
1540 rq = task_rq(p);
1541
1542 /* 1542 /*
1543 * The process used to be able to migrate OR it can now migrate 1543 * The process used to be able to migrate OR it can now migrate
1544 */ 1544 */
@@ -1586,22 +1586,48 @@ void init_sched_dl_class(void)
1586 1586
1587#endif /* CONFIG_SMP */ 1587#endif /* CONFIG_SMP */
1588 1588
1589/*
1590 * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
1591 */
1592static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
1593{
1594 struct hrtimer *dl_timer = &p->dl.dl_timer;
1595
1596 /* Nobody will change task's class if pi_lock is held */
1597 lockdep_assert_held(&p->pi_lock);
1598
1599 if (hrtimer_active(dl_timer)) {
1600 int ret = hrtimer_try_to_cancel(dl_timer);
1601
1602 if (unlikely(ret == -1)) {
1603 /*
1604 * Note, p may migrate OR new deadline tasks
1605 * may appear in rq when we are unlocking it.
1606 * A caller of us must be fine with that.
1607 */
1608 raw_spin_unlock(&rq->lock);
1609 hrtimer_cancel(dl_timer);
1610 raw_spin_lock(&rq->lock);
1611 }
1612 }
1613}
1614
1589static void switched_from_dl(struct rq *rq, struct task_struct *p) 1615static void switched_from_dl(struct rq *rq, struct task_struct *p)
1590{ 1616{
1591 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) 1617 cancel_dl_timer(rq, p);
1592 hrtimer_try_to_cancel(&p->dl.dl_timer);
1593 1618
1594 __dl_clear_params(p); 1619 __dl_clear_params(p);
1595 1620
1596#ifdef CONFIG_SMP
1597 /* 1621 /*
1598 * Since this might be the only -deadline task on the rq, 1622 * Since this might be the only -deadline task on the rq,
1599 * this is the right place to try to pull some other one 1623 * this is the right place to try to pull some other one
1600 * from an overloaded cpu, if any. 1624 * from an overloaded cpu, if any.
1601 */ 1625 */
1602 if (!rq->dl.dl_nr_running) 1626 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
1603 pull_dl_task(rq); 1627 return;
1604#endif 1628
1629 if (pull_dl_task(rq))
1630 resched_curr(rq);
1605} 1631}
1606 1632
1607/* 1633/*
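cancel_dl_timer() above uses the usual "try to cancel; if the callback is already running, drop the lock it wants and wait for it" idiom. In isolation (rq->lock stands for whatever lock the timer callback takes):

        int ret = hrtimer_try_to_cancel(timer);

        if (ret == -1) {
                /* The callback is running and may be spinning on rq->lock;
                 * release it so the callback can complete, then wait. */
                raw_spin_unlock(&rq->lock);
                hrtimer_cancel(timer);
                raw_spin_lock(&rq->lock);
        }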
@@ -1622,7 +1648,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1622 1648
1623 if (task_on_rq_queued(p) && rq->curr != p) { 1649 if (task_on_rq_queued(p) && rq->curr != p) {
1624#ifdef CONFIG_SMP 1650#ifdef CONFIG_SMP
1625 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1651 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
1652 push_dl_task(rq) && rq != task_rq(p))
1626 /* Only reschedule if pushing failed */ 1653 /* Only reschedule if pushing failed */
1627 check_resched = 0; 1654 check_resched = 0;
1628#endif /* CONFIG_SMP */ 1655#endif /* CONFIG_SMP */
@@ -1704,3 +1731,12 @@ const struct sched_class dl_sched_class = {
1704 1731
1705 .update_curr = update_curr_dl, 1732 .update_curr = update_curr_dl,
1706}; 1733};
1734
1735#ifdef CONFIG_SCHED_DEBUG
1736extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
1737
1738void print_dl_stats(struct seq_file *m, int cpu)
1739{
1740 print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
1741}
1742#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ce33780d8f20..92cc52001e74 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -261,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
261#undef P 261#undef P
262} 262}
263 263
264void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
265{
266 SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
267 SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
268}
269
264extern __read_mostly int sched_clock_running; 270extern __read_mostly int sched_clock_running;
265 271
266static void print_cpu(struct seq_file *m, int cpu) 272static void print_cpu(struct seq_file *m, int cpu)
@@ -329,6 +335,7 @@ do { \
329 spin_lock_irqsave(&sched_debug_lock, flags); 335 spin_lock_irqsave(&sched_debug_lock, flags);
330 print_cfs_stats(m, cpu); 336 print_cfs_stats(m, cpu);
331 print_rt_stats(m, cpu); 337 print_rt_stats(m, cpu);
338 print_dl_stats(m, cpu);
332 339
333 print_rq(m, rq, cpu); 340 print_rq(m, rq, cpu);
334 spin_unlock_irqrestore(&sched_debug_lock, flags); 341 spin_unlock_irqrestore(&sched_debug_lock, flags);
@@ -528,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
528 unsigned long nr_faults = -1; 535 unsigned long nr_faults = -1;
529 int cpu_current, home_node; 536 int cpu_current, home_node;
530 537
531 if (p->numa_faults_memory) 538 if (p->numa_faults)
532 nr_faults = p->numa_faults_memory[2*node + i]; 539 nr_faults = p->numa_faults[2*node + i];
533 540
534 cpu_current = !i ? (task_node(p) == node) : 541 cpu_current = !i ? (task_node(p) == node) :
535 (pol && node_isset(node, pol->v.nodes)); 542 (pol && node_isset(node, pol->v.nodes));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef2b104b254c..40667cbf371b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -873,7 +873,6 @@ struct numa_group {
873 spinlock_t lock; /* nr_tasks, tasks */ 873 spinlock_t lock; /* nr_tasks, tasks */
874 int nr_tasks; 874 int nr_tasks;
875 pid_t gid; 875 pid_t gid;
876 struct list_head task_list;
877 876
878 struct rcu_head rcu; 877 struct rcu_head rcu;
879 nodemask_t active_nodes; 878 nodemask_t active_nodes;
@@ -901,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p)
901 return p->numa_group ? p->numa_group->gid : 0; 900 return p->numa_group ? p->numa_group->gid : 0;
902} 901}
903 902
904static inline int task_faults_idx(int nid, int priv) 903/*
904 * The averaged statistics, shared & private, memory & cpu,
905 * occupy the first half of the array. The second half of the
906 * array is for current counters, which are averaged into the
907 * first set by task_numa_placement.
908 */
909static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
905{ 910{
906 return NR_NUMA_HINT_FAULT_TYPES * nid + priv; 911 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
907} 912}
908 913
909static inline unsigned long task_faults(struct task_struct *p, int nid) 914static inline unsigned long task_faults(struct task_struct *p, int nid)
910{ 915{
911 if (!p->numa_faults_memory) 916 if (!p->numa_faults)
912 return 0; 917 return 0;
913 918
914 return p->numa_faults_memory[task_faults_idx(nid, 0)] + 919 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
915 p->numa_faults_memory[task_faults_idx(nid, 1)]; 920 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
916} 921}
917 922
918static inline unsigned long group_faults(struct task_struct *p, int nid) 923static inline unsigned long group_faults(struct task_struct *p, int nid)
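A worked example of the flat index produced by the new task_faults_idx() above, assuming a 2-node system (nr_node_ids == 2) and NR_NUMA_HINT_FAULT_TYPES == 2:

/* index = NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv
 *
 *   task_faults_idx(NUMA_MEM,    0, 0) = 2 * (0*2 + 0) + 0 =  0
 *   task_faults_idx(NUMA_MEM,    1, 1) = 2 * (0*2 + 1) + 1 =  3
 *   task_faults_idx(NUMA_CPU,    0, 0) = 2 * (1*2 + 0) + 0 =  4
 *   task_faults_idx(NUMA_MEMBUF, 1, 1) = 2 * (2*2 + 1) + 1 = 11
 *   task_faults_idx(NUMA_CPUBUF, 0, 0) = 2 * (3*2 + 0) + 0 = 12
 *
 * The single numa_faults[] array is thus four contiguous regions
 * (MEM, CPU, MEMBUF, CPUBUF), each holding per-node shared/private
 * counters, which is what allows the separate numa_faults_memory /
 * numa_faults_cpu / *_buffer_* pointers to be dropped.
 */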
@@ -920,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
920 if (!p->numa_group) 925 if (!p->numa_group)
921 return 0; 926 return 0;
922 927
923 return p->numa_group->faults[task_faults_idx(nid, 0)] + 928 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
924 p->numa_group->faults[task_faults_idx(nid, 1)]; 929 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
925} 930}
926 931
927static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) 932static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
928{ 933{
929 return group->faults_cpu[task_faults_idx(nid, 0)] + 934 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
930 group->faults_cpu[task_faults_idx(nid, 1)]; 935 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
936}
937
938/* Handle placement on systems where not all nodes are directly connected. */
939static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
940 int maxdist, bool task)
941{
942 unsigned long score = 0;
943 int node;
944
945 /*
946 * All nodes are directly connected, and the same distance
947 * from each other. No need for fancy placement algorithms.
948 */
949 if (sched_numa_topology_type == NUMA_DIRECT)
950 return 0;
951
952 /*
953 * This code is called for each node, introducing N^2 complexity,
954 * which should be ok given the number of nodes rarely exceeds 8.
955 */
956 for_each_online_node(node) {
957 unsigned long faults;
958 int dist = node_distance(nid, node);
959
960 /*
961 * The furthest away nodes in the system are not interesting
962 * for placement; nid was already counted.
963 */
964 if (dist == sched_max_numa_distance || node == nid)
965 continue;
966
967 /*
968 * On systems with a backplane NUMA topology, compare groups
969 * of nodes, and move tasks towards the group with the most
970 * memory accesses. When comparing two nodes at distance
971 * "hoplimit", only nodes closer by than "hoplimit" are part
972 * of each group. Skip other nodes.
973 */
974 if (sched_numa_topology_type == NUMA_BACKPLANE &&
975 dist > maxdist)
976 continue;
977
978 /* Add up the faults from nearby nodes. */
979 if (task)
980 faults = task_faults(p, node);
981 else
982 faults = group_faults(p, node);
983
984 /*
985 * On systems with a glueless mesh NUMA topology, there are
986 * no fixed "groups of nodes". Instead, nodes that are not
987 * directly connected bounce traffic through intermediate
988 * nodes; a numa_group can occupy any set of nodes.
989 * The further away a node is, the less the faults count.
990 * This seems to result in good task placement.
991 */
992 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
993 faults *= (sched_max_numa_distance - dist);
994 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
995 }
996
997 score += faults;
998 }
999
1000 return score;
931} 1001}
932 1002
933/* 1003/*
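A small numeric example of the distance scaling applied by score_nearby_nodes() above in the glueless-mesh case, assuming sched_max_numa_distance == 30 and LOCAL_DISTANCE == 10 (typical SLIT values; illustrative only):

/* faults *= (sched_max_numa_distance - dist);
 * faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
 *
 *   dist == 20:  a node one hop away contributes (30-20)/(30-10) = 1/2 of
 *                its faults to the score;
 *   dist == 25:  (30-25)/(30-10) = 1/4 of its faults;
 *   dist == 30:  the furthest nodes are skipped before the scaling.
 */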
@@ -936,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
936 * larger multiplier, in order to group tasks together that are almost 1006 * larger multiplier, in order to group tasks together that are almost
937 * evenly spread out between numa nodes. 1007 * evenly spread out between numa nodes.
938 */ 1008 */
939static inline unsigned long task_weight(struct task_struct *p, int nid) 1009static inline unsigned long task_weight(struct task_struct *p, int nid,
1010 int dist)
940{ 1011{
941 unsigned long total_faults; 1012 unsigned long faults, total_faults;
942 1013
943 if (!p->numa_faults_memory) 1014 if (!p->numa_faults)
944 return 0; 1015 return 0;
945 1016
946 total_faults = p->total_numa_faults; 1017 total_faults = p->total_numa_faults;
@@ -948,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
948 if (!total_faults) 1019 if (!total_faults)
949 return 0; 1020 return 0;
950 1021
951 return 1000 * task_faults(p, nid) / total_faults; 1022 faults = task_faults(p, nid);
1023 faults += score_nearby_nodes(p, nid, dist, true);
1024
1025 return 1000 * faults / total_faults;
952} 1026}
953 1027
954static inline unsigned long group_weight(struct task_struct *p, int nid) 1028static inline unsigned long group_weight(struct task_struct *p, int nid,
1029 int dist)
955{ 1030{
956 if (!p->numa_group || !p->numa_group->total_faults) 1031 unsigned long faults, total_faults;
1032
1033 if (!p->numa_group)
1034 return 0;
1035
1036 total_faults = p->numa_group->total_faults;
1037
1038 if (!total_faults)
957 return 0; 1039 return 0;
958 1040
959 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 1041 faults = group_faults(p, nid);
1042 faults += score_nearby_nodes(p, nid, dist, false);
1043
1044 return 1000 * faults / total_faults;
960} 1045}
961 1046
962bool should_numa_migrate_memory(struct task_struct *p, struct page * page, 1047bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1089,6 +1174,7 @@ struct task_numa_env {
1089 struct numa_stats src_stats, dst_stats; 1174 struct numa_stats src_stats, dst_stats;
1090 1175
1091 int imbalance_pct; 1176 int imbalance_pct;
1177 int dist;
1092 1178
1093 struct task_struct *best_task; 1179 struct task_struct *best_task;
1094 long best_imp; 1180 long best_imp;
@@ -1168,6 +1254,7 @@ static void task_numa_compare(struct task_numa_env *env,
1168 long load; 1254 long load;
1169 long imp = env->p->numa_group ? groupimp : taskimp; 1255 long imp = env->p->numa_group ? groupimp : taskimp;
1170 long moveimp = imp; 1256 long moveimp = imp;
1257 int dist = env->dist;
1171 1258
1172 rcu_read_lock(); 1259 rcu_read_lock();
1173 1260
@@ -1208,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env,
1208 * in any group then look only at task weights. 1295 * in any group then look only at task weights.
1209 */ 1296 */
1210 if (cur->numa_group == env->p->numa_group) { 1297 if (cur->numa_group == env->p->numa_group) {
1211 imp = taskimp + task_weight(cur, env->src_nid) - 1298 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1212 task_weight(cur, env->dst_nid); 1299 task_weight(cur, env->dst_nid, dist);
1213 /* 1300 /*
1214 * Add some hysteresis to prevent swapping the 1301 * Add some hysteresis to prevent swapping the
1215 * tasks within a group over tiny differences. 1302 * tasks within a group over tiny differences.
@@ -1223,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env,
1223 * instead. 1310 * instead.
1224 */ 1311 */
1225 if (cur->numa_group) 1312 if (cur->numa_group)
1226 imp += group_weight(cur, env->src_nid) - 1313 imp += group_weight(cur, env->src_nid, dist) -
1227 group_weight(cur, env->dst_nid); 1314 group_weight(cur, env->dst_nid, dist);
1228 else 1315 else
1229 imp += task_weight(cur, env->src_nid) - 1316 imp += task_weight(cur, env->src_nid, dist) -
1230 task_weight(cur, env->dst_nid); 1317 task_weight(cur, env->dst_nid, dist);
1231 } 1318 }
1232 } 1319 }
1233 1320
@@ -1326,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p)
1326 }; 1413 };
1327 struct sched_domain *sd; 1414 struct sched_domain *sd;
1328 unsigned long taskweight, groupweight; 1415 unsigned long taskweight, groupweight;
1329 int nid, ret; 1416 int nid, ret, dist;
1330 long taskimp, groupimp; 1417 long taskimp, groupimp;
1331 1418
1332 /* 1419 /*
@@ -1354,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p)
1354 return -EINVAL; 1441 return -EINVAL;
1355 } 1442 }
1356 1443
1357 taskweight = task_weight(p, env.src_nid);
1358 groupweight = group_weight(p, env.src_nid);
1359 update_numa_stats(&env.src_stats, env.src_nid);
1360 env.dst_nid = p->numa_preferred_nid; 1444 env.dst_nid = p->numa_preferred_nid;
1361 taskimp = task_weight(p, env.dst_nid) - taskweight; 1445 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1362 groupimp = group_weight(p, env.dst_nid) - groupweight; 1446 taskweight = task_weight(p, env.src_nid, dist);
1447 groupweight = group_weight(p, env.src_nid, dist);
1448 update_numa_stats(&env.src_stats, env.src_nid);
1449 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1450 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1363 update_numa_stats(&env.dst_stats, env.dst_nid); 1451 update_numa_stats(&env.dst_stats, env.dst_nid);
1364 1452
1365 /* Try to find a spot on the preferred nid. */ 1453 /* Try to find a spot on the preferred nid. */
1366 task_numa_find_cpu(&env, taskimp, groupimp); 1454 task_numa_find_cpu(&env, taskimp, groupimp);
1367 1455
1368 /* No space available on the preferred nid. Look elsewhere. */ 1456 /*
1369 if (env.best_cpu == -1) { 1457 * Look at other nodes in these cases:
1458 * - there is no space available on the preferred_nid
1459 * - the task is part of a numa_group that is interleaved across
1460 * multiple NUMA nodes; in order to better consolidate the group,
1461 * we need to check other locations.
1462 */
1463 if (env.best_cpu == -1 || (p->numa_group &&
1464 nodes_weight(p->numa_group->active_nodes) > 1)) {
1370 for_each_online_node(nid) { 1465 for_each_online_node(nid) {
1371 if (nid == env.src_nid || nid == p->numa_preferred_nid) 1466 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1372 continue; 1467 continue;
1373 1468
1469 dist = node_distance(env.src_nid, env.dst_nid);
1470 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1471 dist != env.dist) {
1472 taskweight = task_weight(p, env.src_nid, dist);
1473 groupweight = group_weight(p, env.src_nid, dist);
1474 }
1475
1374 /* Only consider nodes where both task and groups benefit */ 1476 /* Only consider nodes where both task and groups benefit */
1375 taskimp = task_weight(p, nid) - taskweight; 1477 taskimp = task_weight(p, nid, dist) - taskweight;
1376 groupimp = group_weight(p, nid) - groupweight; 1478 groupimp = group_weight(p, nid, dist) - groupweight;
1377 if (taskimp < 0 && groupimp < 0) 1479 if (taskimp < 0 && groupimp < 0)
1378 continue; 1480 continue;
1379 1481
1482 env.dist = dist;
1380 env.dst_nid = nid; 1483 env.dst_nid = nid;
1381 update_numa_stats(&env.dst_stats, env.dst_nid); 1484 update_numa_stats(&env.dst_stats, env.dst_nid);
1382 task_numa_find_cpu(&env, taskimp, groupimp); 1485 task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1431,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1431 unsigned long interval = HZ; 1534 unsigned long interval = HZ;
1432 1535
1433 /* This task has no NUMA fault statistics yet */ 1536 /* This task has no NUMA fault statistics yet */
1434 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1537 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1435 return; 1538 return;
1436 1539
1437 /* Periodically retry migrating the task to the preferred node */ 1540 /* Periodically retry migrating the task to the preferred node */
@@ -1580,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1580 return delta; 1683 return delta;
1581} 1684}
1582 1685
1686/*
1687 * Determine the preferred nid for a task in a numa_group. This needs to
1688 * be done in a way that produces consistent results with group_weight,
1689 * otherwise workloads might not converge.
1690 */
1691static int preferred_group_nid(struct task_struct *p, int nid)
1692{
1693 nodemask_t nodes;
1694 int dist;
1695
1696 /* Direct connections between all NUMA nodes. */
1697 if (sched_numa_topology_type == NUMA_DIRECT)
1698 return nid;
1699
1700 /*
1701 * On a system with glueless mesh NUMA topology, group_weight
1702 * scores nodes according to the number of NUMA hinting faults on
1703 * both the node itself, and on nearby nodes.
1704 */
1705 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1706 unsigned long score, max_score = 0;
1707 int node, max_node = nid;
1708
1709 dist = sched_max_numa_distance;
1710
1711 for_each_online_node(node) {
1712 score = group_weight(p, node, dist);
1713 if (score > max_score) {
1714 max_score = score;
1715 max_node = node;
1716 }
1717 }
1718 return max_node;
1719 }
1720
1721 /*
1722 * Finding the preferred nid in a system with NUMA backplane
1723 * interconnect topology is more involved. The goal is to locate
1724 * tasks from numa_groups near each other in the system, and
1725 * untangle workloads from different sides of the system. This requires
1726 * searching down the hierarchy of node groups, recursively searching
1727 * inside the highest scoring group of nodes. The nodemask tricks
1728 * keep the complexity of the search down.
1729 */
1730 nodes = node_online_map;
1731 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1732 unsigned long max_faults = 0;
1733 nodemask_t max_group;
1734 int a, b;
1735
1736 /* Are there nodes at this distance from each other? */
1737 if (!find_numa_distance(dist))
1738 continue;
1739
1740 for_each_node_mask(a, nodes) {
1741 unsigned long faults = 0;
1742 nodemask_t this_group;
1743 nodes_clear(this_group);
1744
1745 /* Sum group's NUMA faults; includes a==b case. */
1746 for_each_node_mask(b, nodes) {
1747 if (node_distance(a, b) < dist) {
1748 faults += group_faults(p, b);
1749 node_set(b, this_group);
1750 node_clear(b, nodes);
1751 }
1752 }
1753
1754 /* Remember the top group. */
1755 if (faults > max_faults) {
1756 max_faults = faults;
1757 max_group = this_group;
1758 /*
1759 * subtle: at the smallest distance there is
1760 * just one node left in each "group", the
1761 * winner is the preferred nid.
1762 */
1763 nid = a;
1764 }
1765 }
1766 /* Next round, evaluate the nodes within max_group. */
1767 nodes = max_group;
1768 }
1769 return nid;
1770}
1771
1583static void task_numa_placement(struct task_struct *p) 1772static void task_numa_placement(struct task_struct *p)
1584{ 1773{
1585 int seq, nid, max_nid = -1, max_group_nid = -1; 1774 int seq, nid, max_nid = -1, max_group_nid = -1;
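The backplane branch of preferred_group_nid() above narrows a nodemask pass by pass. A sketch of how the search proceeds on a hypothetical 8-node machine built as two 4-node "cabinets":

/*   pass 1 (largest dist):  the online nodes split into groups {0-3} and
 *                           {4-7}; the group with more numa_group faults,
 *                           say {0-3}, becomes the new 'nodes' mask.
 *   pass 2 (smaller dist):  {0-3} splits into {0,1} and {2,3}; keep the
 *                           heavier one, say {2,3}.
 *   final pass:             every remaining node is its own group, so the
 *                           single heaviest node becomes the preferred nid.
 *
 * Each pass only searches inside the previous winner (nodes = max_group),
 * which keeps the cost of the recursion bounded.
 */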
@@ -1607,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p)
1607 1796
1608 /* Find the node with the highest number of faults */ 1797 /* Find the node with the highest number of faults */
1609 for_each_online_node(nid) { 1798 for_each_online_node(nid) {
1799 /* Keep track of the offsets in numa_faults array */
1800 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1610 unsigned long faults = 0, group_faults = 0; 1801 unsigned long faults = 0, group_faults = 0;
1611 int priv, i; 1802 int priv;
1612 1803
1613 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { 1804 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1614 long diff, f_diff, f_weight; 1805 long diff, f_diff, f_weight;
1615 1806
1616 i = task_faults_idx(nid, priv); 1807 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1808 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1809 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1810 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1617 1811
1618 /* Decay existing window, copy faults since last scan */ 1812 /* Decay existing window, copy faults since last scan */
1619 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; 1813 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1620 fault_types[priv] += p->numa_faults_buffer_memory[i]; 1814 fault_types[priv] += p->numa_faults[membuf_idx];
1621 p->numa_faults_buffer_memory[i] = 0; 1815 p->numa_faults[membuf_idx] = 0;
1622 1816
1623 /* 1817 /*
1624 * Normalize the faults_from, so all tasks in a group 1818 * Normalize the faults_from, so all tasks in a group
@@ -1628,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p)
1628 * faults are less important. 1822 * faults are less important.
1629 */ 1823 */
1630 f_weight = div64_u64(runtime << 16, period + 1); 1824 f_weight = div64_u64(runtime << 16, period + 1);
1631 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / 1825 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1632 (total_faults + 1); 1826 (total_faults + 1);
1633 f_diff = f_weight - p->numa_faults_cpu[i] / 2; 1827 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1634 p->numa_faults_buffer_cpu[i] = 0; 1828 p->numa_faults[cpubuf_idx] = 0;
1635 1829
1636 p->numa_faults_memory[i] += diff; 1830 p->numa_faults[mem_idx] += diff;
1637 p->numa_faults_cpu[i] += f_diff; 1831 p->numa_faults[cpu_idx] += f_diff;
1638 faults += p->numa_faults_memory[i]; 1832 faults += p->numa_faults[mem_idx];
1639 p->total_numa_faults += diff; 1833 p->total_numa_faults += diff;
1640 if (p->numa_group) { 1834 if (p->numa_group) {
1641 /* safe because we can only change our own group */ 1835 /*
1642 p->numa_group->faults[i] += diff; 1836 * safe because we can only change our own group
1643 p->numa_group->faults_cpu[i] += f_diff; 1837 *
1838 * mem_idx represents the offset for a given
1839 * nid and priv in a specific region because it
1840 * is at the beginning of the numa_faults array.
1841 */
1842 p->numa_group->faults[mem_idx] += diff;
1843 p->numa_group->faults_cpu[mem_idx] += f_diff;
1644 p->numa_group->total_faults += diff; 1844 p->numa_group->total_faults += diff;
1645 group_faults += p->numa_group->faults[i]; 1845 group_faults += p->numa_group->faults[mem_idx];
1646 } 1846 }
1647 } 1847 }
1648 1848
@@ -1662,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p)
1662 if (p->numa_group) { 1862 if (p->numa_group) {
1663 update_numa_active_node_mask(p->numa_group); 1863 update_numa_active_node_mask(p->numa_group);
1664 spin_unlock_irq(group_lock); 1864 spin_unlock_irq(group_lock);
1665 max_nid = max_group_nid; 1865 max_nid = preferred_group_nid(p, max_group_nid);
1666 } 1866 }
1667 1867
1668 if (max_faults) { 1868 if (max_faults) {
@@ -1705,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1705 1905
1706 atomic_set(&grp->refcount, 1); 1906 atomic_set(&grp->refcount, 1);
1707 spin_lock_init(&grp->lock); 1907 spin_lock_init(&grp->lock);
1708 INIT_LIST_HEAD(&grp->task_list);
1709 grp->gid = p->pid; 1908 grp->gid = p->pid;
1710 /* Second half of the array tracks nids where faults happen */ 1909 /* Second half of the array tracks nids where faults happen */
1711 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * 1910 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
@@ -1714,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1714 node_set(task_node(current), grp->active_nodes); 1913 node_set(task_node(current), grp->active_nodes);
1715 1914
1716 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 1915 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1717 grp->faults[i] = p->numa_faults_memory[i]; 1916 grp->faults[i] = p->numa_faults[i];
1718 1917
1719 grp->total_faults = p->total_numa_faults; 1918 grp->total_faults = p->total_numa_faults;
1720 1919
1721 list_add(&p->numa_entry, &grp->task_list);
1722 grp->nr_tasks++; 1920 grp->nr_tasks++;
1723 rcu_assign_pointer(p->numa_group, grp); 1921 rcu_assign_pointer(p->numa_group, grp);
1724 } 1922 }
@@ -1773,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1773 double_lock_irq(&my_grp->lock, &grp->lock); 1971 double_lock_irq(&my_grp->lock, &grp->lock);
1774 1972
1775 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { 1973 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1776 my_grp->faults[i] -= p->numa_faults_memory[i]; 1974 my_grp->faults[i] -= p->numa_faults[i];
1777 grp->faults[i] += p->numa_faults_memory[i]; 1975 grp->faults[i] += p->numa_faults[i];
1778 } 1976 }
1779 my_grp->total_faults -= p->total_numa_faults; 1977 my_grp->total_faults -= p->total_numa_faults;
1780 grp->total_faults += p->total_numa_faults; 1978 grp->total_faults += p->total_numa_faults;
1781 1979
1782 list_move(&p->numa_entry, &grp->task_list);
1783 my_grp->nr_tasks--; 1980 my_grp->nr_tasks--;
1784 grp->nr_tasks++; 1981 grp->nr_tasks++;
1785 1982
@@ -1799,27 +1996,23 @@ no_join:
1799void task_numa_free(struct task_struct *p) 1996void task_numa_free(struct task_struct *p)
1800{ 1997{
1801 struct numa_group *grp = p->numa_group; 1998 struct numa_group *grp = p->numa_group;
1802 void *numa_faults = p->numa_faults_memory; 1999 void *numa_faults = p->numa_faults;
1803 unsigned long flags; 2000 unsigned long flags;
1804 int i; 2001 int i;
1805 2002
1806 if (grp) { 2003 if (grp) {
1807 spin_lock_irqsave(&grp->lock, flags); 2004 spin_lock_irqsave(&grp->lock, flags);
1808 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 2005 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1809 grp->faults[i] -= p->numa_faults_memory[i]; 2006 grp->faults[i] -= p->numa_faults[i];
1810 grp->total_faults -= p->total_numa_faults; 2007 grp->total_faults -= p->total_numa_faults;
1811 2008
1812 list_del(&p->numa_entry);
1813 grp->nr_tasks--; 2009 grp->nr_tasks--;
1814 spin_unlock_irqrestore(&grp->lock, flags); 2010 spin_unlock_irqrestore(&grp->lock, flags);
1815 RCU_INIT_POINTER(p->numa_group, NULL); 2011 RCU_INIT_POINTER(p->numa_group, NULL);
1816 put_numa_group(grp); 2012 put_numa_group(grp);
1817 } 2013 }
1818 2014
1819 p->numa_faults_memory = NULL; 2015 p->numa_faults = NULL;
1820 p->numa_faults_buffer_memory = NULL;
1821 p->numa_faults_cpu= NULL;
1822 p->numa_faults_buffer_cpu = NULL;
1823 kfree(numa_faults); 2016 kfree(numa_faults);
1824} 2017}
1825 2018
@@ -1842,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1842 return; 2035 return;
1843 2036
1844 /* Allocate buffer to track faults on a per-node basis */ 2037 /* Allocate buffer to track faults on a per-node basis */
1845 if (unlikely(!p->numa_faults_memory)) { 2038 if (unlikely(!p->numa_faults)) {
1846 int size = sizeof(*p->numa_faults_memory) * 2039 int size = sizeof(*p->numa_faults) *
1847 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; 2040 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1848 2041
1849 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); 2042 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1850 if (!p->numa_faults_memory) 2043 if (!p->numa_faults)
1851 return; 2044 return;
1852 2045
1853 BUG_ON(p->numa_faults_buffer_memory);
1854 /*
1855 * The averaged statistics, shared & private, memory & cpu,
1856 * occupy the first half of the array. The second half of the
1857 * array is for current counters, which are averaged into the
1858 * first set by task_numa_placement.
1859 */
1860 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1861 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1862 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1863 p->total_numa_faults = 0; 2046 p->total_numa_faults = 0;
1864 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 2047 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1865 } 2048 }
@@ -1899,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1899 if (migrated) 2082 if (migrated)
1900 p->numa_pages_migrated += pages; 2083 p->numa_pages_migrated += pages;
1901 2084
1902 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 2085 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
1903 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 2086 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
1904 p->numa_faults_locality[local] += pages; 2087 p->numa_faults_locality[local] += pages;
1905} 2088}
1906 2089
@@ -3822,6 +4005,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
3822 4005
3823static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 4006static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3824{ 4007{
4008 /* init_cfs_bandwidth() was not called */
4009 if (!cfs_b->throttled_cfs_rq.next)
4010 return;
4011
3825 hrtimer_cancel(&cfs_b->period_timer); 4012 hrtimer_cancel(&cfs_b->period_timer);
3826 hrtimer_cancel(&cfs_b->slack_timer); 4013 hrtimer_cancel(&cfs_b->slack_timer);
3827} 4014}
@@ -4241,7 +4428,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4241 * wl = S * s'_i; see (2) 4428 * wl = S * s'_i; see (2)
4242 */ 4429 */
4243 if (W > 0 && w < W) 4430 if (W > 0 && w < W)
4244 wl = (w * tg->shares) / W; 4431 wl = (w * (long)tg->shares) / W;
4245 else 4432 else
4246 wl = tg->shares; 4433 wl = tg->shares;
4247 4434
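The (long) cast added to effective_load() above fixes a signed/unsigned promotion bug: w may be negative, and multiplying it by the unsigned long tg->shares converts it to a huge unsigned value before the division. A standalone illustration (plain C, not kernel code):

#include <stdio.h>

int main(void)
{
        long w = -512;                /* a negative weight delta */
        unsigned long shares = 1024;  /* tg->shares is unsigned long */
        long W = 2048;

        long bad  = (w * shares) / W;        /* w promoted to unsigned long */
        long good = (w * (long)shares) / W;  /* arithmetic stays signed */

        printf("bad=%ld good=%ld\n", bad, good);  /* bad is huge, good is -256 */
        return 0;
}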
@@ -4469,7 +4656,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4469 latest_idle_timestamp = rq->idle_stamp; 4656 latest_idle_timestamp = rq->idle_stamp;
4470 shallowest_idle_cpu = i; 4657 shallowest_idle_cpu = i;
4471 } 4658 }
4472 } else { 4659 } else if (shallowest_idle_cpu == -1) {
4473 load = weighted_cpuload(i); 4660 load = weighted_cpuload(i);
4474 if (load < min_load || (load == min_load && i == this_cpu)) { 4661 if (load < min_load || (load == min_load && i == this_cpu)) {
4475 min_load = load; 4662 min_load = load;
@@ -4547,9 +4734,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4547 int want_affine = 0; 4734 int want_affine = 0;
4548 int sync = wake_flags & WF_SYNC; 4735 int sync = wake_flags & WF_SYNC;
4549 4736
4550 if (p->nr_cpus_allowed == 1)
4551 return prev_cpu;
4552
4553 if (sd_flag & SD_BALANCE_WAKE) 4737 if (sd_flag & SD_BALANCE_WAKE)
4554 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); 4738 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4555 4739
@@ -5189,7 +5373,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5189 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5373 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5190 int src_nid, dst_nid; 5374 int src_nid, dst_nid;
5191 5375
5192 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5376 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
5193 !(env->sd->flags & SD_NUMA)) { 5377 !(env->sd->flags & SD_NUMA)) {
5194 return false; 5378 return false;
5195 } 5379 }
@@ -5228,7 +5412,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5228 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5412 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
5229 return false; 5413 return false;
5230 5414
5231 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) 5415 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5232 return false; 5416 return false;
5233 5417
5234 src_nid = cpu_to_node(env->src_cpu); 5418 src_nid = cpu_to_node(env->src_cpu);
@@ -6172,8 +6356,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6172 * with a large weight task outweighs the tasks on the system). 6356 * with a large weight task outweighs the tasks on the system).
6173 */ 6357 */
6174 if (prefer_sibling && sds->local && 6358 if (prefer_sibling && sds->local &&
6175 sds->local_stat.group_has_free_capacity) 6359 sds->local_stat.group_has_free_capacity) {
6176 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); 6360 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
6361 sgs->group_type = group_classify(sg, sgs);
6362 }
6177 6363
6178 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6364 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6179 sds->busiest = sg; 6365 sds->busiest = sg;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 20bca398084a..ee15f5a0d1c1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1301 struct task_struct *curr; 1301 struct task_struct *curr;
1302 struct rq *rq; 1302 struct rq *rq;
1303 1303
1304 if (p->nr_cpus_allowed == 1)
1305 goto out;
1306
1307 /* For anything but wake ups, just return the task_cpu */ 1304 /* For anything but wake ups, just return the task_cpu */
1308 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1305 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1309 goto out; 1306 goto out;
@@ -1351,16 +1348,22 @@ out:
1351 1348
1352static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1349static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1353{ 1350{
1354 if (rq->curr->nr_cpus_allowed == 1) 1351 /*
1352 * Current can't be migrated, useless to reschedule,
1353 * let's hope p can move out.
1354 */
1355 if (rq->curr->nr_cpus_allowed == 1 ||
1356 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1355 return; 1357 return;
1356 1358
1359 /*
1360 * p is migratable, so let's not schedule it and
1361 * see if it is pushed or pulled somewhere else.
1362 */
1357 if (p->nr_cpus_allowed != 1 1363 if (p->nr_cpus_allowed != 1
1358 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1364 && cpupri_find(&rq->rd->cpupri, p, NULL))
1359 return; 1365 return;
1360 1366
1361 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1362 return;
1363
1364 /* 1367 /*
1365 * There appears to be other cpus that can accept 1368 * There appears to be other cpus that can accept
1366 * current and none to run 'p', so lets reschedule 1369 * current and none to run 'p', so lets reschedule
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2df8ef067cc5..9a2a45c970e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -176,6 +176,25 @@ struct dl_bw {
176 u64 bw, total_bw; 176 u64 bw, total_bw;
177}; 177};
178 178
179static inline
180void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
181{
182 dl_b->total_bw -= tsk_bw;
183}
184
185static inline
186void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
187{
188 dl_b->total_bw += tsk_bw;
189}
190
191static inline
192bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
193{
194 return dl_b->bw != -1 &&
195 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
196}
197
179extern struct mutex sched_domains_mutex; 198extern struct mutex sched_domains_mutex;
180 199
181#ifdef CONFIG_CGROUP_SCHED 200#ifdef CONFIG_CGROUP_SCHED
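
The __dl_clear()/__dl_add()/__dl_overflow() helpers moved into sched.h implement the SCHED_DEADLINE bandwidth bookkeeping: a request is admitted only if replacing old_bw by new_bw keeps total_bw within bw * cpus, with bw == -1 meaning no limit. Below is a self-contained sketch of just that arithmetic; the struct name is reused, but the fixed-point values are invented for illustration.

/* Standalone demo of the __dl_overflow() admission test added above. */
#include <stdint.h>
#include <stdio.h>

struct dl_bw { uint64_t bw, total_bw; };

static int dl_overflow(struct dl_bw *dl_b, int cpus,
                       uint64_t old_bw, uint64_t new_bw)
{
        return dl_b->bw != (uint64_t)-1 &&
               dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
        /* say the per-CPU limit is 95%, expressed as 950 out of 1000 */
        struct dl_bw dl_b = { .bw = 950, .total_bw = 1800 };

        /* 2 CPUs -> 1900 units of capacity: adding a 200-unit task would
         * overflow, replacing an existing 200-unit task by a 250-unit one
         * would not */
        printf("new 200:  %s\n", dl_overflow(&dl_b, 2, 0, 200) ? "reject" : "admit");
        printf("200->250: %s\n", dl_overflow(&dl_b, 2, 200, 250) ? "reject" : "admit");
        return 0;
}
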
@@ -678,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq)
678 return rq->clock_task; 697 return rq->clock_task;
679} 698}
680 699
700#ifdef CONFIG_NUMA
701enum numa_topology_type {
702 NUMA_DIRECT,
703 NUMA_GLUELESS_MESH,
704 NUMA_BACKPLANE,
705};
706extern enum numa_topology_type sched_numa_topology_type;
707extern int sched_max_numa_distance;
708extern bool find_numa_distance(int distance);
709#endif
710
681#ifdef CONFIG_NUMA_BALANCING 711#ifdef CONFIG_NUMA_BALANCING
712/* The regions in numa_faults array from task_struct */
713enum numa_faults_stats {
714 NUMA_MEM = 0,
715 NUMA_CPU,
716 NUMA_MEMBUF,
717 NUMA_CPUBUF
718};
682extern void sched_setnuma(struct task_struct *p, int node); 719extern void sched_setnuma(struct task_struct *p, int node);
683extern int migrate_task_to(struct task_struct *p, int cpu); 720extern int migrate_task_to(struct task_struct *p, int cpu);
684extern int migrate_swap(struct task_struct *, struct task_struct *); 721extern int migrate_swap(struct task_struct *, struct task_struct *);
@@ -1127,6 +1164,11 @@ struct sched_class {
1127 void (*task_fork) (struct task_struct *p); 1164 void (*task_fork) (struct task_struct *p);
1128 void (*task_dead) (struct task_struct *p); 1165 void (*task_dead) (struct task_struct *p);
1129 1166
1167 /*
1168 * The switched_from() call is allowed to drop rq->lock, therefore we
1169	 * cannot assume the switched_from/switched_to pair is serialized by
1170 * rq->lock. They are however serialized by p->pi_lock.
1171 */
1130 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1172 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1131 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1173 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1132 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1174 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -1504,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1504extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); 1546extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1505extern void print_cfs_stats(struct seq_file *m, int cpu); 1547extern void print_cfs_stats(struct seq_file *m, int cpu);
1506extern void print_rt_stats(struct seq_file *m, int cpu); 1548extern void print_rt_stats(struct seq_file *m, int cpu);
1549extern void print_dl_stats(struct seq_file *m, int cpu);
1507 1550
1508extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1551extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1509extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1552extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 5a62915f47a8..852143a79f36 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12#include <linux/kthread.h>
12 13
13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) 14void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
14{ 15{
@@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *
297} 298}
298EXPORT_SYMBOL(autoremove_wake_function); 299EXPORT_SYMBOL(autoremove_wake_function);
299 300
301static inline bool is_kthread_should_stop(void)
302{
303 return (current->flags & PF_KTHREAD) && kthread_should_stop();
304}
305
306/*
307 * DEFINE_WAIT_FUNC(wait, woken_wake_func);
308 *
309 * add_wait_queue(&wq, &wait);
310 * for (;;) {
311 * if (condition)
312 * break;
313 *
314 * p->state = mode; condition = true;
315 * smp_mb(); // A smp_wmb(); // C
316 * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN;
317 * schedule() try_to_wake_up();
318 * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
319 * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
320 * smp_mb() // B smp_wmb(); // C
321 * wait->flags |= WQ_FLAG_WOKEN;
322 * }
323 * remove_wait_queue(&wq, &wait);
324 *
325 */
326long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
327{
328 set_current_state(mode); /* A */
329 /*
330 * The above implies an smp_mb(), which matches with the smp_wmb() from
331 * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
332 * also observe all state before the wakeup.
333 */
334 if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
335 timeout = schedule_timeout(timeout);
336 __set_current_state(TASK_RUNNING);
337
338 /*
339 * The below implies an smp_mb(), it too pairs with the smp_wmb() from
340 * woken_wake_function() such that we must either observe the wait
341 * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
342 * an event.
343 */
344 set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
345
346 return timeout;
347}
348EXPORT_SYMBOL(wait_woken);
349
350int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
351{
352 /*
353 * Although this function is called under waitqueue lock, LOCK
354 * doesn't imply write barrier and the users expects write
355 * barrier semantics on wakeup functions. The following
356 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
357 * and is paired with set_mb() in wait_woken().
358 */
359 smp_wmb(); /* C */
360 wait->flags |= WQ_FLAG_WOKEN;
361
362 return default_wake_function(wait, mode, sync, key);
363}
364EXPORT_SYMBOL(woken_wake_function);
365
300int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) 366int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
301{ 367{
302 struct wait_bit_key *key = arg; 368 struct wait_bit_key *key = arg;
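
The comment block above documents the WQ_FLAG_WOKEN handshake that wait_woken()/woken_wake_function() implement. The following is a userspace model of that handshake built on C11 atomics and POSIX threads rather than the kernel's barriers and schedule_timeout(); it is only meant to show why a wakeup recorded in the flag cannot be lost even if it races with the condition check.

/* Userspace model (assumption: C11 atomics + pthreads, not kernel
 * primitives) of the WQ_FLAG_WOKEN handshake documented above. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool woken;       /* stands in for wait->flags & WQ_FLAG_WOKEN */
static atomic_bool condition;   /* the event the waiter sleeps for */

static void *waker(void *arg)
{
        (void)arg;
        usleep(10 * 1000);
        atomic_store_explicit(&condition, true, memory_order_relaxed);
        /* release pairs with the waiter's acquire: seeing 'woken' implies
         * seeing 'condition' as well (like barrier C above) */
        atomic_store_explicit(&woken, true, memory_order_release);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waker, NULL);

        for (;;) {
                if (atomic_load_explicit(&condition, memory_order_relaxed))
                        break;
                /* like wait_woken(): only sleep if no wakeup was recorded,
                 * so a racing wakeup just shortens the wait */
                if (!atomic_load_explicit(&woken, memory_order_acquire))
                        usleep(1000);   /* stands in for schedule_timeout() */
                /* like barrier B: consume the flag before re-checking */
                atomic_store_explicit(&woken, false, memory_order_seq_cst);
        }

        pthread_join(t, NULL);
        printf("condition observed without losing the wakeup\n");
        return 0;
}
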
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f0876f9f6dd..16a305295256 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1275,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1275 local_irq_restore(*flags); 1275 local_irq_restore(*flags);
1276 break; 1276 break;
1277 } 1277 }
1278 1278 /*
1279 * This sighand can be already freed and even reused, but
1280 * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
1281 * initializes ->siglock: this slab can't go away, it has
1282 * the same object type, ->siglock can't be reinitialized.
1283 *
1284 * We need to ensure that tsk->sighand is still the same
1285 * after we take the lock, we can race with de_thread() or
1286 * __exit_signal(). In the latter case the next iteration
1287 * must see ->sighand == NULL.
1288 */
1279 spin_lock(&sighand->siglock); 1289 spin_lock(&sighand->siglock);
1280 if (likely(sighand == tsk->sighand)) { 1290 if (likely(sighand == tsk->sighand)) {
1281 rcu_read_unlock(); 1291 rcu_read_unlock();
@@ -1331,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1331 int error = -ESRCH; 1341 int error = -ESRCH;
1332 struct task_struct *p; 1342 struct task_struct *p;
1333 1343
1334 rcu_read_lock(); 1344 for (;;) {
1335retry: 1345 rcu_read_lock();
1336 p = pid_task(pid, PIDTYPE_PID); 1346 p = pid_task(pid, PIDTYPE_PID);
1337 if (p) { 1347 if (p)
1338 error = group_send_sig_info(sig, info, p); 1348 error = group_send_sig_info(sig, info, p);
1339 if (unlikely(error == -ESRCH)) 1349 rcu_read_unlock();
1340 /* 1350 if (likely(!p || error != -ESRCH))
1341 * The task was unhashed in between, try again. 1351 return error;
1342 * If it is dead, pid_task() will return NULL,
1343 * if we race with de_thread() it will find the
1344 * new leader.
1345 */
1346 goto retry;
1347 }
1348 rcu_read_unlock();
1349 1352
1350 return error; 1353 /*
1354 * The task was unhashed in between, try again. If it
1355 * is dead, pid_task() will return NULL, if we race with
1356 * de_thread() it will find the new leader.
1357 */
1358 }
1351} 1359}
1352 1360
1353int kill_proc_info(int sig, struct siginfo *info, pid_t pid) 1361int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
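
The kill_pid_info() rewrite above replaces the goto-based retry with a for (;;) loop that takes and drops the RCU read lock on every attempt. A tiny standalone sketch of that control flow follows; fake_lookup(), fake_send() and the single forced -ESRCH are invented purely so the loop goes around once.

/* Minimal sketch of the lookup/send/retry-on-ESRCH loop used above. */
#include <stdio.h>

#define ESRCH 3

static int racing = 1;                  /* pretend one concurrent de_thread() */

static int fake_lookup(void)            /* pid_task() stand-in */
{
        return 42;
}

static int fake_send(int task)          /* group_send_sig_info() stand-in */
{
        if (racing) {                   /* target was unhashed under us */
                racing = 0;
                return -ESRCH;
        }
        printf("signal delivered to %d\n", task);
        return 0;
}

int main(void)
{
        int error = -ESRCH;

        for (;;) {
                /* rcu_read_lock() would go here */
                int task = fake_lookup();

                if (task)
                        error = fake_send(task);
                /* rcu_read_unlock() would go here */
                if (!task || error != -ESRCH) {
                        printf("final error = %d\n", error);
                        return 0;
                }
                /* unhashed in between: try again */
        }
}
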
@@ -2748,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2748 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) 2756 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2749 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2757 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2750#endif 2758#endif
2759#ifdef SEGV_BNDERR
2760 err |= __put_user(from->si_lower, &to->si_lower);
2761 err |= __put_user(from->si_upper, &to->si_upper);
2762#endif
2751 break; 2763 break;
2752 case __SI_CHLD: 2764 case __SI_CHLD:
2753 err |= __put_user(from->si_pid, &to->si_pid); 2765 err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index eb89e1807408..f032fb5284e3 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data)
110 set_current_state(TASK_INTERRUPTIBLE); 110 set_current_state(TASK_INTERRUPTIBLE);
111 preempt_disable(); 111 preempt_disable();
112 if (kthread_should_stop()) { 112 if (kthread_should_stop()) {
113 set_current_state(TASK_RUNNING); 113 __set_current_state(TASK_RUNNING);
114 preempt_enable(); 114 preempt_enable();
115 if (ht->cleanup) 115 if (ht->cleanup)
116 ht->cleanup(td->cpu, cpu_online(td->cpu)); 116 ht->cleanup(td->cpu, cpu_online(td->cpu));
@@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data)
136 /* Check for state change setup */ 136 /* Check for state change setup */
137 switch (td->status) { 137 switch (td->status) {
138 case HP_THREAD_NONE: 138 case HP_THREAD_NONE:
139 __set_current_state(TASK_RUNNING);
139 preempt_enable(); 140 preempt_enable();
140 if (ht->setup) 141 if (ht->setup)
141 ht->setup(td->cpu); 142 ht->setup(td->cpu);
142 td->status = HP_THREAD_ACTIVE; 143 td->status = HP_THREAD_ACTIVE;
143 preempt_disable(); 144 continue;
144 break; 145
145 case HP_THREAD_PARKED: 146 case HP_THREAD_PARKED:
147 __set_current_state(TASK_RUNNING);
146 preempt_enable(); 148 preempt_enable();
147 if (ht->unpark) 149 if (ht->unpark)
148 ht->unpark(td->cpu); 150 ht->unpark(td->cpu);
149 td->status = HP_THREAD_ACTIVE; 151 td->status = HP_THREAD_ACTIVE;
150 preempt_disable(); 152 continue;
151 break;
152 } 153 }
153 154
154 if (!ht->thread_should_run(td->cpu)) { 155 if (!ht->thread_should_run(td->cpu)) {
155 preempt_enable(); 156 preempt_enable_no_resched();
156 schedule(); 157 schedule();
157 } else { 158 } else {
158 set_current_state(TASK_RUNNING); 159 __set_current_state(TASK_RUNNING);
159 preempt_enable(); 160 preempt_enable();
160 ht->thread_fn(td->cpu); 161 ht->thread_fn(td->cpu);
161 } 162 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0699add19164..501baa9ac1be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -656,7 +656,7 @@ static void run_ksoftirqd(unsigned int cpu)
656 * in the task stack here. 656 * in the task stack here.
657 */ 657 */
658 __do_softirq(); 658 __do_softirq();
659 rcu_note_context_switch(cpu); 659 rcu_note_context_switch();
660 local_irq_enable(); 660 local_irq_enable();
661 cond_resched(); 661 cond_resched();
662 return; 662 return;
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 00fe55cc5a82..b6e4c16377c7 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
25} 25}
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28int snprint_stack_trace(char *buf, size_t size,
29 struct stack_trace *trace, int spaces)
30{
31 int i;
32 unsigned long ip;
33 int generated;
34 int total = 0;
35
36 if (WARN_ON(!trace->entries))
37 return 0;
38
39 for (i = 0; i < trace->nr_entries; i++) {
40 ip = trace->entries[i];
41 generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
42 1 + spaces, ' ', (void *) ip, (void *) ip);
43
44 total += generated;
45
46 /* Assume that generated isn't a negative number */
47 if (generated >= size) {
48 buf += size;
49 size = 0;
50 } else {
51 buf += generated;
52 size -= generated;
53 }
54 }
55
56 return total;
57}
58EXPORT_SYMBOL_GPL(snprint_stack_trace);
59
28/* 60/*
29 * Architectures that do not implement save_stack_trace_tsk or 61 * Architectures that do not implement save_stack_trace_tsk or
30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning 62 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
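
The new snprint_stack_trace() keeps accumulating the length snprintf() would have produced even after the output buffer is exhausted, so callers learn the required size just as they do with snprintf() itself. Here is a self-contained userspace sketch of the same accounting; the fake string "entries" stand in for the %pS-formatted stack addresses.

/* Standalone illustration of the buffer accounting used above: keep
 * summing what snprintf() *would* have written, but stop advancing once
 * the buffer is exhausted. */
#include <stdio.h>

static int snprint_entries(char *buf, size_t size,
                           const char * const *entries, int n, int spaces)
{
        int total = 0;

        for (int i = 0; i < n; i++) {
                int generated = snprintf(buf, size, "%*c[<%p>] %s\n",
                                         1 + spaces, ' ',
                                         (const void *)entries[i], entries[i]);

                total += generated;
                /* assume generated is not negative, as the kernel code does */
                if ((size_t)generated >= size) {
                        buf += size;
                        size = 0;
                } else {
                        buf += generated;
                        size -= generated;
                }
        }
        return total;
}

int main(void)
{
        const char *entries[] = { "do_something+0x10", "caller+0x42" };
        char small[32];

        int need = snprint_entries(small, sizeof(small), entries, 2, 4);

        printf("wrote \"%s\", needed %d bytes\n", small, need);
        return 0;
}
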
diff --git a/kernel/sys.c b/kernel/sys.c
index 1eaa2f0b0246..ea9c88109894 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -91,6 +91,12 @@
91#ifndef SET_TSC_CTL 91#ifndef SET_TSC_CTL
92# define SET_TSC_CTL(a) (-EINVAL) 92# define SET_TSC_CTL(a) (-EINVAL)
93#endif 93#endif
94#ifndef MPX_ENABLE_MANAGEMENT
95# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL)
96#endif
97#ifndef MPX_DISABLE_MANAGEMENT
98# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
99#endif
94 100
95/* 101/*
96 * this is where the system-wide overflow UID and GID are defined, for 102 * this is where the system-wide overflow UID and GID are defined, for
@@ -2203,6 +2209,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2203 me->mm->def_flags &= ~VM_NOHUGEPAGE; 2209 me->mm->def_flags &= ~VM_NOHUGEPAGE;
2204 up_write(&me->mm->mmap_sem); 2210 up_write(&me->mm->mmap_sem);
2205 break; 2211 break;
2212 case PR_MPX_ENABLE_MANAGEMENT:
2213 if (arg2 || arg3 || arg4 || arg5)
2214 return -EINVAL;
2215 error = MPX_ENABLE_MANAGEMENT(me);
2216 break;
2217 case PR_MPX_DISABLE_MANAGEMENT:
2218 if (arg2 || arg3 || arg4 || arg5)
2219 return -EINVAL;
2220 error = MPX_DISABLE_MANAGEMENT(me);
2221 break;
2206 default: 2222 default:
2207 error = -EINVAL; 2223 error = -EINVAL;
2208 break; 2224 break;
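
The #ifndef MPX_ENABLE_MANAGEMENT/MPX_DISABLE_MANAGEMENT fallbacks above let the new prctl() cases compile on every architecture and simply return -EINVAL where the hooks are not provided. A tiny sketch of that arch-override pattern follows; ARCH_HAS_WIDGET and WIDGET_ENABLE are invented names, not kernel symbols.

/* Tiny sketch of the fallback-macro pattern used above: an architecture
 * that implements the hook defines the macro first, everyone else gets a
 * stub returning -EINVAL. */
#include <stdio.h>

#define EINVAL 22

#ifdef ARCH_HAS_WIDGET                          /* not defined in this build */
# define WIDGET_ENABLE(task)    arch_widget_enable(task)
#endif

#ifndef WIDGET_ENABLE
# define WIDGET_ENABLE(task)    (-EINVAL)       /* generic fallback, arg unused */
#endif

int main(void)
{
        int error = WIDGET_ENABLE(0);

        printf("WIDGET_ENABLE -> %d\n", error); /* -22: not supported here */
        return 0;
}
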
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 02aa4185b17e..5adcb0ae3a58 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -169,6 +169,8 @@ cond_syscall(ppc_rtas);
169cond_syscall(sys_spu_run); 169cond_syscall(sys_spu_run);
170cond_syscall(sys_spu_create); 170cond_syscall(sys_spu_create);
171cond_syscall(sys_subpage_prot); 171cond_syscall(sys_subpage_prot);
172cond_syscall(sys_s390_pci_mmio_read);
173cond_syscall(sys_s390_pci_mmio_write);
172 174
173/* mmu depending weak syscall entries */ 175/* mmu depending weak syscall entries */
174cond_syscall(sys_mprotect); 176cond_syscall(sys_mprotect);
@@ -224,3 +226,6 @@ cond_syscall(sys_seccomp);
224 226
225/* access BPF programs and maps */ 227/* access BPF programs and maps */
226cond_syscall(sys_bpf); 228cond_syscall(sys_bpf);
229
230/* execveat */
231cond_syscall(sys_execveat);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 15f2511a1b7c..137c7f69b264 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -623,6 +623,13 @@ static struct ctl_table kern_table[] = {
623 .mode = 0644, 623 .mode = 0644,
624 .proc_handler = proc_dointvec, 624 .proc_handler = proc_dointvec,
625 }, 625 },
626 {
627 .procname = "tracepoint_printk",
628 .data = &tracepoint_printk,
629 .maxlen = sizeof(tracepoint_printk),
630 .mode = 0644,
631 .proc_handler = proc_dointvec,
632 },
626#endif 633#endif
627#ifdef CONFIG_KEXEC 634#ifdef CONFIG_KEXEC
628 { 635 {
@@ -1104,6 +1111,15 @@ static struct ctl_table kern_table[] = {
1104 .proc_handler = proc_dointvec, 1111 .proc_handler = proc_dointvec,
1105 }, 1112 },
1106#endif 1113#endif
1114 {
1115 .procname = "panic_on_warn",
1116 .data = &panic_on_warn,
1117 .maxlen = sizeof(int),
1118 .mode = 0644,
1119 .proc_handler = proc_dointvec_minmax,
1120 .extra1 = &zero,
1121 .extra2 = &one,
1122 },
1107 { } 1123 { }
1108}; 1124};
1109 1125
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 9a4f750a2963..7e7746a42a62 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
140 { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
140 {} 141 {}
141}; 142};
142 143
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b312fcc73024..670fff88a961 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
459 stats = nla_data(na); 459 stats = nla_data(na);
460 memset(stats, 0, sizeof(*stats)); 460 memset(stats, 0, sizeof(*stats));
461 461
462 rc = cgroupstats_build(stats, f.file->f_dentry); 462 rc = cgroupstats_build(stats, f.file->f_path.dentry);
463 if (rc < 0) { 463 if (rc < 0) {
464 nlmsg_free(rep_skb); 464 nlmsg_free(rep_skb);
465 goto err; 465 goto err;
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 7347426fa68d..f622cf28628a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
13obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o 13obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
14obj-$(CONFIG_TIMER_STATS) += timer_stats.o 14obj-$(CONFIG_TIMER_STATS) += timer_stats.o
15obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o 15obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
16obj-$(CONFIG_TEST_UDELAY) += udelay_test.o 16obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
17 17
18$(obj)/time.o: $(obj)/timeconst.h 18$(obj)/time.o: $(obj)/timeconst.h
19 19
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2e949cc9c9f1..b79f39bda7e1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
792 /* Initialize mult/shift and max_idle_ns */ 792 /* Initialize mult/shift and max_idle_ns */
793 __clocksource_updatefreq_scale(cs, scale, freq); 793 __clocksource_updatefreq_scale(cs, scale, freq);
794 794
795 /* Add clocksource to the clcoksource list */ 795 /* Add clocksource to the clocksource list */
796 mutex_lock(&clocksource_mutex); 796 mutex_lock(&clocksource_mutex);
797 clocksource_enqueue(cs); 797 clocksource_enqueue(cs);
798 clocksource_enqueue_watchdog(cs); 798 clocksource_enqueue_watchdog(cs);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 87a346fd6d61..28bf91c60a0b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -633,6 +633,13 @@ int ntp_validate_timex(struct timex *txc)
633 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) 633 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
634 return -EPERM; 634 return -EPERM;
635 635
636 if (txc->modes & ADJ_FREQUENCY) {
637 if (LONG_MIN / PPM_SCALE > txc->freq)
638 return -EINVAL;
639 if (LONG_MAX / PPM_SCALE < txc->freq)
640 return -EINVAL;
641 }
642
636 return 0; 643 return 0;
637} 644}
638 645
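
The ADJ_FREQUENCY check added to ntp_validate_timex() rejects txc->freq values whose later multiplication by PPM_SCALE would overflow a long; dividing the limits instead of multiplying the input is what keeps the test itself overflow-free. A standalone sketch, with SCALE as an invented stand-in for PPM_SCALE:

/* Overflow-safe range check: compare freq against LONG_MIN/SCALE and
 * LONG_MAX/SCALE instead of computing freq * SCALE, which is exactly the
 * product that might overflow. */
#include <limits.h>
#include <stdio.h>

#define SCALE 65536L

static int freq_in_range(long freq)
{
        if (LONG_MIN / SCALE > freq)
                return 0;
        if (LONG_MAX / SCALE < freq)
                return 0;
        return 1;               /* freq * SCALE cannot overflow a long */
}

int main(void)
{
        printf("%d %d %d\n",
               freq_in_range(1000),                     /* fine */
               freq_in_range(LONG_MAX / SCALE),         /* still fine */
               freq_in_range(LONG_MAX / SCALE + 1));    /* rejected */
        return 0;
}
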
diff --git a/kernel/time/udelay_test.c b/kernel/time/test_udelay.c
index e622ba365a13..e622ba365a13 100644
--- a/kernel/time/udelay_test.c
+++ b/kernel/time/test_udelay.c
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7b5741fc4110..1363d58f07e9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -235,7 +235,7 @@ void tick_nohz_full_kick(void)
235 if (!tick_nohz_full_cpu(smp_processor_id())) 235 if (!tick_nohz_full_cpu(smp_processor_id()))
236 return; 236 return;
237 237
238 irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); 238 irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
239} 239}
240 240
241/* 241/*
@@ -585,7 +585,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
585 last_jiffies = jiffies; 585 last_jiffies = jiffies;
586 } while (read_seqretry(&jiffies_lock, seq)); 586 } while (read_seqretry(&jiffies_lock, seq));
587 587
588 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || 588 if (rcu_needs_cpu(&rcu_delta_jiffies) ||
589 arch_needs_cpu() || irq_work_needs_cpu()) { 589 arch_needs_cpu() || irq_work_needs_cpu()) {
590 next_jiffies = last_jiffies + 1; 590 next_jiffies = last_jiffies + 1;
591 delta_jiffies = 1; 591 delta_jiffies = 1;
@@ -847,7 +847,6 @@ void tick_nohz_idle_enter(void)
847 847
848 local_irq_enable(); 848 local_irq_enable();
849} 849}
850EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
851 850
852/** 851/**
853 * tick_nohz_irq_exit - update next tick event from interrupt exit 852 * tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -974,7 +973,6 @@ void tick_nohz_idle_exit(void)
974 973
975 local_irq_enable(); 974 local_irq_enable();
976} 975}
977EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
978 976
979static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 977static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
980{ 978{
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a9ae20fb0b11..2c85b7724af4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
196 if (tv) { 196 if (tv) {
197 if (copy_from_user(&user_tv, tv, sizeof(*tv))) 197 if (copy_from_user(&user_tv, tv, sizeof(*tv)))
198 return -EFAULT; 198 return -EFAULT;
199
200 if (!timeval_valid(&user_tv))
201 return -EINVAL;
202
199 new_ts.tv_sec = user_tv.tv_sec; 203 new_ts.tv_sec = user_tv.tv_sec;
200 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; 204 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
201 } 205 }
@@ -304,7 +308,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
304} 308}
305EXPORT_SYMBOL(timespec_trunc); 309EXPORT_SYMBOL(timespec_trunc);
306 310
307/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 311/*
312 * mktime64 - Converts date to seconds.
313 * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
308 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 314 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
309 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. 315 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
310 * 316 *
@@ -314,15 +320,10 @@ EXPORT_SYMBOL(timespec_trunc);
314 * -year/100+year/400 terms, and add 10.] 320 * -year/100+year/400 terms, and add 10.]
315 * 321 *
316 * This algorithm was first published by Gauss (I think). 322 * This algorithm was first published by Gauss (I think).
317 *
318 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
319 * machines where long is 32-bit! (However, as time_t is signed, we
320 * will already get problems at other places on 2038-01-19 03:14:08)
321 */ 323 */
322unsigned long 324time64_t mktime64(const unsigned int year0, const unsigned int mon0,
323mktime(const unsigned int year0, const unsigned int mon0, 325 const unsigned int day, const unsigned int hour,
324 const unsigned int day, const unsigned int hour, 326 const unsigned int min, const unsigned int sec)
325 const unsigned int min, const unsigned int sec)
326{ 327{
327 unsigned int mon = mon0, year = year0; 328 unsigned int mon = mon0, year = year0;
328 329
@@ -332,15 +333,14 @@ mktime(const unsigned int year0, const unsigned int mon0,
332 year -= 1; 333 year -= 1;
333 } 334 }
334 335
335 return ((((unsigned long) 336 return ((((time64_t)
336 (year/4 - year/100 + year/400 + 367*mon/12 + day) + 337 (year/4 - year/100 + year/400 + 367*mon/12 + day) +
337 year*365 - 719499 338 year*365 - 719499
338 )*24 + hour /* now have hours */ 339 )*24 + hour /* now have hours */
339 )*60 + min /* now have minutes */ 340 )*60 + min /* now have minutes */
340 )*60 + sec; /* finally seconds */ 341 )*60 + sec; /* finally seconds */
341} 342}
342 343EXPORT_SYMBOL(mktime64);
343EXPORT_SYMBOL(mktime);
344 344
345/** 345/**
346 * set_normalized_timespec - set timespec sec and nsec parts and normalize 346 * set_normalized_timespec - set timespec sec and nsec parts and normalize
@@ -745,6 +745,7 @@ u64 nsecs_to_jiffies64(u64 n)
745 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); 745 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
746#endif 746#endif
747} 747}
748EXPORT_SYMBOL(nsecs_to_jiffies64);
748 749
749/** 750/**
750 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 751 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
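
The mktime() -> mktime64() conversion above widens the return type to time64_t, which is why the old warning about wrapping on 2106-02-07 06:28:16 could be dropped. Below is a self-contained userspace version of the same Gauss-style conversion for reference; the Jan/Feb-as-months-13/14 adjustment is not visible in the hunk context and is reproduced here as an assumption so the formula works on its own.

/* Userspace version of the date-to-seconds formula, widened to int64_t. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static int64_t my_mktime64(unsigned int year, unsigned int mon,
                           unsigned int day, unsigned int hour,
                           unsigned int min, unsigned int sec)
{
        /* count Jan and Feb as months 13 and 14 of the previous year,
         * which puts the leap day at the end of the counting window */
        if ((int)(mon -= 2) <= 0) {
                mon += 12;
                year -= 1;
        }

        return ((((int64_t)
                  (year/4 - year/100 + year/400 + 367*mon/12 + day) +
                  year*365 - 719499
                )*24 + hour     /* now have hours */
              )*60 + min        /* now have minutes */
            )*60 + sec;         /* finally seconds */
}

int main(void)
{
        /* 1970-01-01 00:00:00 is the epoch ... */
        printf("%" PRId64 "\n", my_mktime64(1970, 1, 1, 0, 0, 0));   /* 0 */
        /* ... and the result keeps growing past 2106 instead of wrapping */
        printf("%" PRId64 "\n", my_mktime64(2106, 2, 7, 6, 28, 17)); /* > 2^32 */
        return 0;
}
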
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ec1791fae965..6a931852082f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -417,7 +417,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
417 */ 417 */
418static inline void tk_update_ktime_data(struct timekeeper *tk) 418static inline void tk_update_ktime_data(struct timekeeper *tk)
419{ 419{
420 s64 nsec; 420 u64 seconds;
421 u32 nsec;
421 422
422 /* 423 /*
423 * The xtime based monotonic readout is: 424 * The xtime based monotonic readout is:
@@ -426,13 +427,22 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
426 * nsec = base_mono + now(); 427 * nsec = base_mono + now();
427 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec 428 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
428 */ 429 */
429 nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); 430 seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
430 nsec *= NSEC_PER_SEC; 431 nsec = (u32) tk->wall_to_monotonic.tv_nsec;
431 nsec += tk->wall_to_monotonic.tv_nsec; 432 tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
432 tk->tkr.base_mono = ns_to_ktime(nsec);
433 433
434 /* Update the monotonic raw base */ 434 /* Update the monotonic raw base */
435 tk->base_raw = timespec64_to_ktime(tk->raw_time); 435 tk->base_raw = timespec64_to_ktime(tk->raw_time);
436
437 /*
438 * The sum of the nanoseconds portions of xtime and
439 * wall_to_monotonic can be greater/equal one second. Take
440 * this into account before updating tk->ktime_sec.
441 */
442 nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift);
443 if (nsec >= NSEC_PER_SEC)
444 seconds++;
445 tk->ktime_sec = seconds;
436} 446}
437 447
438/* must hold timekeeper_lock */ 448/* must hold timekeeper_lock */
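
tk_update_ktime_data() now also maintains tk->ktime_sec for the new ktime_get_seconds() fast path, and since the nanosecond parts of xtime and wall_to_monotonic can sum to a full second it has to carry one into the cached seconds value. A tiny standalone demonstration of that carry, with invented numbers:

/* The nanosecond parts may sum to >= 1s, so the cached seconds value gets
 * bumped by one in that case (base_mono itself keeps the un-carried sum). */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000U

int main(void)
{
        uint64_t seconds = 1000;                /* xtime_sec + wtm_sec */
        uint32_t wtm_nsec = 700000000U;         /* wall_to_monotonic part */
        uint32_t xtime_nsec = 600000000U;       /* xtime part, already shifted */

        uint32_t nsec = wtm_nsec + xtime_nsec;  /* 1.3s worth of nanoseconds */
        if (nsec >= NSEC_PER_SEC)
                seconds++;                      /* carry into ktime_sec */

        printf("ktime_sec = %" PRIu64 "\n", seconds);   /* 1001 */
        return 0;
}
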
@@ -519,9 +529,9 @@ EXPORT_SYMBOL(__getnstimeofday64);
519 529
520/** 530/**
521 * getnstimeofday64 - Returns the time of day in a timespec64. 531 * getnstimeofday64 - Returns the time of day in a timespec64.
522 * @ts: pointer to the timespec to be set 532 * @ts: pointer to the timespec64 to be set
523 * 533 *
524 * Returns the time of day in a timespec (WARN if suspended). 534 * Returns the time of day in a timespec64 (WARN if suspended).
525 */ 535 */
526void getnstimeofday64(struct timespec64 *ts) 536void getnstimeofday64(struct timespec64 *ts)
527{ 537{
@@ -623,7 +633,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw);
623 * 633 *
624 * The function calculates the monotonic clock from the realtime 634 * The function calculates the monotonic clock from the realtime
625 * clock and the wall_to_monotonic offset and stores the result 635 * clock and the wall_to_monotonic offset and stores the result
626 * in normalized timespec format in the variable pointed to by @ts. 636 * in normalized timespec64 format in the variable pointed to by @ts.
627 */ 637 */
628void ktime_get_ts64(struct timespec64 *ts) 638void ktime_get_ts64(struct timespec64 *ts)
629{ 639{
@@ -648,6 +658,54 @@ void ktime_get_ts64(struct timespec64 *ts)
648} 658}
649EXPORT_SYMBOL_GPL(ktime_get_ts64); 659EXPORT_SYMBOL_GPL(ktime_get_ts64);
650 660
661/**
662 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
663 *
664 * Returns the seconds portion of CLOCK_MONOTONIC with a single non
665 * serialized read. tk->ktime_sec is of type 'unsigned long' so this
666 * works on both 32 and 64 bit systems. On 32 bit systems the readout
667 * covers ~136 years of uptime which should be enough to prevent
668 * premature wrap arounds.
669 */
670time64_t ktime_get_seconds(void)
671{
672 struct timekeeper *tk = &tk_core.timekeeper;
673
674 WARN_ON(timekeeping_suspended);
675 return tk->ktime_sec;
676}
677EXPORT_SYMBOL_GPL(ktime_get_seconds);
678
679/**
680 * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
681 *
682 * Returns the wall clock seconds since 1970. This replaces the
683 * get_seconds() interface which is not y2038 safe on 32bit systems.
684 *
685 * For 64bit systems the fast access to tk->xtime_sec is preserved. On
686 * 32bit systems the access must be protected with the sequence
687 * counter to provide "atomic" access to the 64bit tk->xtime_sec
688 * value.
689 */
690time64_t ktime_get_real_seconds(void)
691{
692 struct timekeeper *tk = &tk_core.timekeeper;
693 time64_t seconds;
694 unsigned int seq;
695
696 if (IS_ENABLED(CONFIG_64BIT))
697 return tk->xtime_sec;
698
699 do {
700 seq = read_seqcount_begin(&tk_core.seq);
701 seconds = tk->xtime_sec;
702
703 } while (read_seqcount_retry(&tk_core.seq, seq));
704
705 return seconds;
706}
707EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
708
651#ifdef CONFIG_NTP_PPS 709#ifdef CONFIG_NTP_PPS
652 710
653/** 711/**
@@ -703,18 +761,18 @@ void do_gettimeofday(struct timeval *tv)
703EXPORT_SYMBOL(do_gettimeofday); 761EXPORT_SYMBOL(do_gettimeofday);
704 762
705/** 763/**
706 * do_settimeofday - Sets the time of day 764 * do_settimeofday64 - Sets the time of day.
707 * @tv: pointer to the timespec variable containing the new time 765 * @ts: pointer to the timespec64 variable containing the new time
708 * 766 *
709 * Sets the time of day to the new time and update NTP and notify hrtimers 767 * Sets the time of day to the new time and update NTP and notify hrtimers
710 */ 768 */
711int do_settimeofday(const struct timespec *tv) 769int do_settimeofday64(const struct timespec64 *ts)
712{ 770{
713 struct timekeeper *tk = &tk_core.timekeeper; 771 struct timekeeper *tk = &tk_core.timekeeper;
714 struct timespec64 ts_delta, xt, tmp; 772 struct timespec64 ts_delta, xt;
715 unsigned long flags; 773 unsigned long flags;
716 774
717 if (!timespec_valid_strict(tv)) 775 if (!timespec64_valid_strict(ts))
718 return -EINVAL; 776 return -EINVAL;
719 777
720 raw_spin_lock_irqsave(&timekeeper_lock, flags); 778 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -723,13 +781,12 @@ int do_settimeofday(const struct timespec *tv)
723 timekeeping_forward_now(tk); 781 timekeeping_forward_now(tk);
724 782
725 xt = tk_xtime(tk); 783 xt = tk_xtime(tk);
726 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 784 ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
727 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 785 ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
728 786
729 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); 787 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
730 788
731 tmp = timespec_to_timespec64(*tv); 789 tk_set_xtime(tk, ts);
732 tk_set_xtime(tk, &tmp);
733 790
734 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 791 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
735 792
@@ -741,7 +798,7 @@ int do_settimeofday(const struct timespec *tv)
741 798
742 return 0; 799 return 0;
743} 800}
744EXPORT_SYMBOL(do_settimeofday); 801EXPORT_SYMBOL(do_settimeofday64);
745 802
746/** 803/**
747 * timekeeping_inject_offset - Adds or subtracts from the current time. 804 * timekeeping_inject_offset - Adds or subtracts from the current time.
@@ -895,12 +952,12 @@ int timekeeping_notify(struct clocksource *clock)
895} 952}
896 953
897/** 954/**
898 * getrawmonotonic - Returns the raw monotonic time in a timespec 955 * getrawmonotonic64 - Returns the raw monotonic time in a timespec
899 * @ts: pointer to the timespec to be set 956 * @ts: pointer to the timespec64 to be set
900 * 957 *
901 * Returns the raw monotonic time (completely un-modified by ntp) 958 * Returns the raw monotonic time (completely un-modified by ntp)
902 */ 959 */
903void getrawmonotonic(struct timespec *ts) 960void getrawmonotonic64(struct timespec64 *ts)
904{ 961{
905 struct timekeeper *tk = &tk_core.timekeeper; 962 struct timekeeper *tk = &tk_core.timekeeper;
906 struct timespec64 ts64; 963 struct timespec64 ts64;
@@ -915,9 +972,10 @@ void getrawmonotonic(struct timespec *ts)
915 } while (read_seqcount_retry(&tk_core.seq, seq)); 972 } while (read_seqcount_retry(&tk_core.seq, seq));
916 973
917 timespec64_add_ns(&ts64, nsecs); 974 timespec64_add_ns(&ts64, nsecs);
918 *ts = timespec64_to_timespec(ts64); 975 *ts = ts64;
919} 976}
920EXPORT_SYMBOL(getrawmonotonic); 977EXPORT_SYMBOL(getrawmonotonic64);
978
921 979
922/** 980/**
923 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 981 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
@@ -1068,8 +1126,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
1068} 1126}
1069 1127
1070/** 1128/**
1071 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values 1129 * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
1072 * @delta: pointer to a timespec delta value 1130 * @delta: pointer to a timespec64 delta value
1073 * 1131 *
1074 * This hook is for architectures that cannot support read_persistent_clock 1132 * This hook is for architectures that cannot support read_persistent_clock
1075 * because their RTC/persistent clock is only accessible when irqs are enabled. 1133 * because their RTC/persistent clock is only accessible when irqs are enabled.
@@ -1077,10 +1135,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
1077 * This function should only be called by rtc_resume(), and allows 1135 * This function should only be called by rtc_resume(), and allows
1078 * a suspend offset to be injected into the timekeeping values. 1136 * a suspend offset to be injected into the timekeeping values.
1079 */ 1137 */
1080void timekeeping_inject_sleeptime(struct timespec *delta) 1138void timekeeping_inject_sleeptime64(struct timespec64 *delta)
1081{ 1139{
1082 struct timekeeper *tk = &tk_core.timekeeper; 1140 struct timekeeper *tk = &tk_core.timekeeper;
1083 struct timespec64 tmp;
1084 unsigned long flags; 1141 unsigned long flags;
1085 1142
1086 /* 1143 /*
@@ -1095,8 +1152,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
1095 1152
1096 timekeeping_forward_now(tk); 1153 timekeeping_forward_now(tk);
1097 1154
1098 tmp = timespec_to_timespec64(*delta); 1155 __timekeeping_inject_sleeptime(tk, delta);
1099 __timekeeping_inject_sleeptime(tk, &tmp);
1100 1156
1101 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 1157 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
1102 1158
@@ -1332,6 +1388,12 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
1332 * 1388 *
1333 * XXX - TODO: Doc ntp_error calculation. 1389 * XXX - TODO: Doc ntp_error calculation.
1334 */ 1390 */
1391 if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) {
1392 /* NTP adjustment caused clocksource mult overflow */
1393 WARN_ON_ONCE(1);
1394 return;
1395 }
1396
1335 tk->tkr.mult += mult_adj; 1397 tk->tkr.mult += mult_adj;
1336 tk->xtime_interval += interval; 1398 tk->xtime_interval += interval;
1337 tk->tkr.xtime_nsec -= offset; 1399 tk->tkr.xtime_nsec -= offset;
@@ -1397,7 +1459,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1397 } 1459 }
1398 1460
1399 if (unlikely(tk->tkr.clock->maxadj && 1461 if (unlikely(tk->tkr.clock->maxadj &&
1400 (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { 1462 (abs(tk->tkr.mult - tk->tkr.clock->mult)
1463 > tk->tkr.clock->maxadj))) {
1401 printk_once(KERN_WARNING 1464 printk_once(KERN_WARNING
1402 "Adjusting %s more than 11%% (%ld vs %ld)\n", 1465 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1403 tk->tkr.clock->name, (long)tk->tkr.mult, 1466 tk->tkr.clock->name, (long)tk->tkr.mult,
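
The two hunks above harden the NTP frequency adjustment: timekeeping_apply_adjustment() now refuses a positive mult_adj that would wrap the unsigned multiplier, and timekeeping_adjust() warns when mult drifts more than maxadj from the clocksource's nominal value in either direction rather than only upward. A standalone sketch of both tests, with invented constants:

/* (1) unsigned wrap detection and (2) symmetric maxadj comparison. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int would_wrap(uint32_t mult, int32_t adj)
{
        /* a + b < b (unsigned) holds exactly when the addition wrapped */
        return adj > 0 && mult + (uint32_t)adj < (uint32_t)adj;
}

static int past_maxadj(uint32_t mult, uint32_t nominal, uint32_t maxadj)
{
        /* more than maxadj away from nominal, in either direction */
        return (uint32_t)abs((int32_t)(mult - nominal)) > maxadj;
}

int main(void)
{
        printf("%d\n", would_wrap(0xfffffff0u, 0x20));              /* 1: wraps */
        printf("%d\n", would_wrap(0x01000000u, 0x20));              /* 0: fine */
        printf("%d\n", past_maxadj(0x1000200u, 0x1000000u, 0x100)); /* 1: above */
        printf("%d\n", past_maxadj(0x0fffe00u, 0x1000000u, 0x100)); /* 1: below */
        return 0;
}
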
@@ -1646,7 +1709,7 @@ struct timespec current_kernel_time(void)
1646} 1709}
1647EXPORT_SYMBOL(current_kernel_time); 1710EXPORT_SYMBOL(current_kernel_time);
1648 1711
1649struct timespec get_monotonic_coarse(void) 1712struct timespec64 get_monotonic_coarse64(void)
1650{ 1713{
1651 struct timekeeper *tk = &tk_core.timekeeper; 1714 struct timekeeper *tk = &tk_core.timekeeper;
1652 struct timespec64 now, mono; 1715 struct timespec64 now, mono;
@@ -1662,7 +1725,7 @@ struct timespec get_monotonic_coarse(void)
1662 set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, 1725 set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
1663 now.tv_nsec + mono.tv_nsec); 1726 now.tv_nsec + mono.tv_nsec);
1664 1727
1665 return timespec64_to_timespec(now); 1728 return now;
1666} 1729}
1667 1730
1668/* 1731/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3260ffdb368f..2d3f5c504939 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1377,12 +1377,11 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1377void update_process_times(int user_tick) 1377void update_process_times(int user_tick)
1378{ 1378{
1379 struct task_struct *p = current; 1379 struct task_struct *p = current;
1380 int cpu = smp_processor_id();
1381 1380
1382 /* Note: this timer irq context must be accounted for as well. */ 1381 /* Note: this timer irq context must be accounted for as well. */
1383 account_process_tick(p, user_tick); 1382 account_process_tick(p, user_tick);
1384 run_local_timers(); 1383 run_local_timers();
1385 rcu_check_callbacks(cpu, user_tick); 1384 rcu_check_callbacks(user_tick);
1386#ifdef CONFIG_IRQ_WORK 1385#ifdef CONFIG_IRQ_WORK
1387 if (in_irq()) 1386 if (in_irq())
1388 irq_work_tick(); 1387 irq_work_tick();
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 67d6369ddf83..979ccde26720 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_TRACEPOINTS) += power-traces.o 57obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM_RUNTIME),y) 58ifeq ($(CONFIG_PM),y)
59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o 59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
60endif 60endif
61ifeq ($(CONFIG_TRACING),y) 61ifeq ($(CONFIG_TRACING),y)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c1bd4ada2a04..483cecfa5c17 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent,
1142 r->sector_from = be64_to_cpu(sector_from); 1142 r->sector_from = be64_to_cpu(sector_from);
1143} 1143}
1144 1144
1145typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); 1145typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1146 1146
1147static int blk_log_action_classic(struct trace_iterator *iter, const char *act) 1147static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
1148{ 1148{
1149 char rwbs[RWBS_LEN]; 1149 char rwbs[RWBS_LEN];
1150 unsigned long long ts = iter->ts; 1150 unsigned long long ts = iter->ts;
@@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1154 1154
1155 fill_rwbs(rwbs, t); 1155 fill_rwbs(rwbs, t);
1156 1156
1157 return trace_seq_printf(&iter->seq, 1157 trace_seq_printf(&iter->seq,
1158 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", 1158 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1159 MAJOR(t->device), MINOR(t->device), iter->cpu, 1159 MAJOR(t->device), MINOR(t->device), iter->cpu,
1160 secs, nsec_rem, iter->ent->pid, act, rwbs); 1160 secs, nsec_rem, iter->ent->pid, act, rwbs);
1161} 1161}
1162 1162
1163static int blk_log_action(struct trace_iterator *iter, const char *act) 1163static void blk_log_action(struct trace_iterator *iter, const char *act)
1164{ 1164{
1165 char rwbs[RWBS_LEN]; 1165 char rwbs[RWBS_LEN];
1166 const struct blk_io_trace *t = te_blk_io_trace(iter->ent); 1166 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1167 1167
1168 fill_rwbs(rwbs, t); 1168 fill_rwbs(rwbs, t);
1169 return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", 1169 trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1170 MAJOR(t->device), MINOR(t->device), act, rwbs); 1170 MAJOR(t->device), MINOR(t->device), act, rwbs);
1171} 1171}
1172 1172
1173static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) 1173static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1174{ 1174{
1175 const unsigned char *pdu_buf; 1175 const unsigned char *pdu_buf;
1176 int pdu_len; 1176 int pdu_len;
1177 int i, end, ret; 1177 int i, end;
1178 1178
1179 pdu_buf = pdu_start(ent); 1179 pdu_buf = pdu_start(ent);
1180 pdu_len = te_blk_io_trace(ent)->pdu_len; 1180 pdu_len = te_blk_io_trace(ent)->pdu_len;
1181 1181
1182 if (!pdu_len) 1182 if (!pdu_len)
1183 return 1; 1183 return;
1184 1184
1185 /* find the last zero that needs to be printed */ 1185 /* find the last zero that needs to be printed */
1186 for (end = pdu_len - 1; end >= 0; end--) 1186 for (end = pdu_len - 1; end >= 0; end--)
@@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1188 break; 1188 break;
1189 end++; 1189 end++;
1190 1190
1191 if (!trace_seq_putc(s, '(')) 1191 trace_seq_putc(s, '(');
1192 return 0;
1193 1192
1194 for (i = 0; i < pdu_len; i++) { 1193 for (i = 0; i < pdu_len; i++) {
1195 1194
1196 ret = trace_seq_printf(s, "%s%02x", 1195 trace_seq_printf(s, "%s%02x",
1197 i == 0 ? "" : " ", pdu_buf[i]); 1196 i == 0 ? "" : " ", pdu_buf[i]);
1198 if (!ret)
1199 return ret;
1200 1197
1201 /* 1198 /*
1202 * stop when the rest is just zeroes and indicate so 1199 * stop when the rest is just zeroes and indicate so
1203 * with a ".." appended 1200 * with a ".." appended
1204 */ 1201 */
1205 if (i == end && end != pdu_len - 1) 1202 if (i == end && end != pdu_len - 1) {
1206 return trace_seq_puts(s, " ..) "); 1203 trace_seq_puts(s, " ..) ");
1204 return;
1205 }
1207 } 1206 }
1208 1207
1209 return trace_seq_puts(s, ") "); 1208 trace_seq_puts(s, ") ");
1210} 1209}
1211 1210
1212static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) 1211static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1213{ 1212{
1214 char cmd[TASK_COMM_LEN]; 1213 char cmd[TASK_COMM_LEN];
1215 1214
1216 trace_find_cmdline(ent->pid, cmd); 1215 trace_find_cmdline(ent->pid, cmd);
1217 1216
1218 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1217 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1219 int ret; 1218 trace_seq_printf(s, "%u ", t_bytes(ent));
1220 1219 blk_log_dump_pdu(s, ent);
1221 ret = trace_seq_printf(s, "%u ", t_bytes(ent)); 1220 trace_seq_printf(s, "[%s]\n", cmd);
1222 if (!ret)
1223 return 0;
1224 ret = blk_log_dump_pdu(s, ent);
1225 if (!ret)
1226 return 0;
1227 return trace_seq_printf(s, "[%s]\n", cmd);
1228 } else { 1221 } else {
1229 if (t_sec(ent)) 1222 if (t_sec(ent))
1230 return trace_seq_printf(s, "%llu + %u [%s]\n", 1223 trace_seq_printf(s, "%llu + %u [%s]\n",
1231 t_sector(ent), t_sec(ent), cmd); 1224 t_sector(ent), t_sec(ent), cmd);
1232 return trace_seq_printf(s, "[%s]\n", cmd); 1225 else
1226 trace_seq_printf(s, "[%s]\n", cmd);
1233 } 1227 }
1234} 1228}
1235 1229
1236static int blk_log_with_error(struct trace_seq *s, 1230static void blk_log_with_error(struct trace_seq *s,
1237 const struct trace_entry *ent) 1231 const struct trace_entry *ent)
1238{ 1232{
1239 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1233 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1240 int ret; 1234 blk_log_dump_pdu(s, ent);
1241 1235 trace_seq_printf(s, "[%d]\n", t_error(ent));
1242 ret = blk_log_dump_pdu(s, ent);
1243 if (ret)
1244 return trace_seq_printf(s, "[%d]\n", t_error(ent));
1245 return 0;
1246 } else { 1236 } else {
1247 if (t_sec(ent)) 1237 if (t_sec(ent))
1248 return trace_seq_printf(s, "%llu + %u [%d]\n", 1238 trace_seq_printf(s, "%llu + %u [%d]\n",
1249 t_sector(ent), 1239 t_sector(ent),
1250 t_sec(ent), t_error(ent)); 1240 t_sec(ent), t_error(ent));
1251 return trace_seq_printf(s, "%llu [%d]\n", 1241 else
1252 t_sector(ent), t_error(ent)); 1242 trace_seq_printf(s, "%llu [%d]\n",
1243 t_sector(ent), t_error(ent));
1253 } 1244 }
1254} 1245}
1255 1246
1256static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) 1247static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1257{ 1248{
1258 struct blk_io_trace_remap r = { .device_from = 0, }; 1249 struct blk_io_trace_remap r = { .device_from = 0, };
1259 1250
1260 get_pdu_remap(ent, &r); 1251 get_pdu_remap(ent, &r);
1261 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", 1252 trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1262 t_sector(ent), t_sec(ent), 1253 t_sector(ent), t_sec(ent),
1263 MAJOR(r.device_from), MINOR(r.device_from), 1254 MAJOR(r.device_from), MINOR(r.device_from),
1264 (unsigned long long)r.sector_from); 1255 (unsigned long long)r.sector_from);
1265} 1256}
1266 1257
1267static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) 1258static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1268{ 1259{
1269 char cmd[TASK_COMM_LEN]; 1260 char cmd[TASK_COMM_LEN];
1270 1261
1271 trace_find_cmdline(ent->pid, cmd); 1262 trace_find_cmdline(ent->pid, cmd);
1272 1263
1273 return trace_seq_printf(s, "[%s]\n", cmd); 1264 trace_seq_printf(s, "[%s]\n", cmd);
1274} 1265}
1275 1266
1276static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) 1267static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1277{ 1268{
1278 char cmd[TASK_COMM_LEN]; 1269 char cmd[TASK_COMM_LEN];
1279 1270
1280 trace_find_cmdline(ent->pid, cmd); 1271 trace_find_cmdline(ent->pid, cmd);
1281 1272
1282 return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); 1273 trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
1283} 1274}
1284 1275
1285static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) 1276static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1286{ 1277{
1287 char cmd[TASK_COMM_LEN]; 1278 char cmd[TASK_COMM_LEN];
1288 1279
1289 trace_find_cmdline(ent->pid, cmd); 1280 trace_find_cmdline(ent->pid, cmd);
1290 1281
1291 return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), 1282 trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1292 get_pdu_int(ent), cmd); 1283 get_pdu_int(ent), cmd);
1293} 1284}
1294 1285
1295static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) 1286static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
1296{ 1287{
1297 int ret;
1298 const struct blk_io_trace *t = te_blk_io_trace(ent); 1288 const struct blk_io_trace *t = te_blk_io_trace(ent);
1299 1289
1300 ret = trace_seq_putmem(s, t + 1, t->pdu_len); 1290 trace_seq_putmem(s, t + 1, t->pdu_len);
1301 if (ret) 1291 trace_seq_putc(s, '\n');
1302 return trace_seq_putc(s, '\n');
1303 return ret;
1304} 1292}
1305 1293
1306/* 1294/*
@@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr)
1339 1327
1340static const struct { 1328static const struct {
1341 const char *act[2]; 1329 const char *act[2];
1342 int (*print)(struct trace_seq *s, const struct trace_entry *ent); 1330 void (*print)(struct trace_seq *s, const struct trace_entry *ent);
1343} what2act[] = { 1331} what2act[] = {
1344 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, 1332 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
1345 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, 1333 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1364 struct trace_seq *s = &iter->seq; 1352 struct trace_seq *s = &iter->seq;
1365 const struct blk_io_trace *t; 1353 const struct blk_io_trace *t;
1366 u16 what; 1354 u16 what;
1367 int ret;
1368 bool long_act; 1355 bool long_act;
1369 blk_log_action_t *log_action; 1356 blk_log_action_t *log_action;
1370 1357
@@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1374 log_action = classic ? &blk_log_action_classic : &blk_log_action; 1361 log_action = classic ? &blk_log_action_classic : &blk_log_action;
1375 1362
1376 if (t->action == BLK_TN_MESSAGE) { 1363 if (t->action == BLK_TN_MESSAGE) {
1377 ret = log_action(iter, long_act ? "message" : "m"); 1364 log_action(iter, long_act ? "message" : "m");
1378 if (ret) 1365 blk_log_msg(s, iter->ent);
1379 ret = blk_log_msg(s, iter->ent);
1380 goto out;
1381 } 1366 }
1382 1367
1383 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) 1368 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1384 ret = trace_seq_printf(s, "Unknown action %x\n", what); 1369 trace_seq_printf(s, "Unknown action %x\n", what);
1385 else { 1370 else {
1386 ret = log_action(iter, what2act[what].act[long_act]); 1371 log_action(iter, what2act[what].act[long_act]);
1387 if (ret) 1372 what2act[what].print(s, iter->ent);
1388 ret = what2act[what].print(s, iter->ent);
1389 } 1373 }
1390out: 1374
1391 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1375 return trace_handle_return(s);
1392} 1376}
1393 1377
1394static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1378static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
@@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1397 return print_one_line(iter, false); 1381 return print_one_line(iter, false);
1398} 1382}
1399 1383
1400static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) 1384static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1401{ 1385{
1402 struct trace_seq *s = &iter->seq; 1386 struct trace_seq *s = &iter->seq;
1403 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; 1387 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
@@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1407 .time = iter->ts, 1391 .time = iter->ts,
1408 }; 1392 };
1409 1393
1410 if (!trace_seq_putmem(s, &old, offset)) 1394 trace_seq_putmem(s, &old, offset);
1411 return 0; 1395 trace_seq_putmem(s, &t->sector,
1412 return trace_seq_putmem(s, &t->sector, 1396 sizeof(old) - offset + t->pdu_len);
1413 sizeof(old) - offset + t->pdu_len);
1414} 1397}
1415 1398
1416static enum print_line_t 1399static enum print_line_t
1417blk_trace_event_print_binary(struct trace_iterator *iter, int flags, 1400blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1418 struct trace_event *event) 1401 struct trace_event *event)
1419{ 1402{
1420 return blk_trace_synthesize_old_trace(iter) ? 1403 blk_trace_synthesize_old_trace(iter);
1421 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1404
1405 return trace_handle_return(&iter->seq);
1422} 1406}
1423 1407
1424static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) 1408static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
@@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
1493 if (atomic_dec_and_test(&blk_probes_ref)) 1477 if (atomic_dec_and_test(&blk_probes_ref))
1494 blk_unregister_tracepoints(); 1478 blk_unregister_tracepoints();
1495 1479
1496 spin_lock_irq(&running_trace_lock);
1497 list_del(&bt->running_list);
1498 spin_unlock_irq(&running_trace_lock);
1499 blk_trace_free(bt); 1480 blk_trace_free(bt);
1500 return 0; 1481 return 0;
1501} 1482}
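
The blktrace output helpers above switch from returning int ("did it fit?") to void because the trace_seq layer now records overflow internally and trace_handle_return() inspects it once at the end of the line. A small userspace model of that API shape follows; seq_buf_t, seq_printf() and seq_handle_return() are invented names, not the kernel's trace_seq interface.

/* Append helpers return void, the buffer remembers that it overflowed,
 * and the caller checks exactly once at the end. */
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
        char buf[64];
        size_t len;
        bool full;
} seq_buf_t;

static void seq_printf(seq_buf_t *s, const char *fmt, ...)
{
        va_list ap;
        int ret;

        if (s->full)
                return;                 /* already overflowed, keep quiet */

        va_start(ap, fmt);
        ret = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
        va_end(ap);

        if (ret < 0 || (size_t)ret >= sizeof(s->buf) - s->len)
                s->full = true;         /* record overflow for later */
        else
                s->len += ret;
}

static const char *seq_handle_return(const seq_buf_t *s)
{
        /* the single spot where "did everything fit?" is decided */
        return s->full ? "TRACE_TYPE_PARTIAL_LINE" : "TRACE_TYPE_HANDLED";
}

int main(void)
{
        seq_buf_t s = { .len = 0, .full = false };

        /* callers can now simply chain appends with no per-call checks */
        seq_printf(&s, "%3d,%-3d ", 8, 0);
        seq_printf(&s, "%llu + %u [%s]\n", 123456ULL, 8U, "fio");

        printf("%s-> %s\n", s.buf, seq_handle_return(&s));
        return 0;
}
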
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 31c90fec4158..224e768bdc73 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -387,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
387 return ret; 387 return ret;
388} 388}
389 389
390static void ftrace_update_trampoline(struct ftrace_ops *ops);
391
390static int __register_ftrace_function(struct ftrace_ops *ops) 392static int __register_ftrace_function(struct ftrace_ops *ops)
391{ 393{
392 if (ops->flags & FTRACE_OPS_FL_DELETED) 394 if (ops->flags & FTRACE_OPS_FL_DELETED)
@@ -416,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
416 if (control_ops_alloc(ops)) 418 if (control_ops_alloc(ops))
417 return -ENOMEM; 419 return -ENOMEM;
418 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); 420 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
421 /* The control_ops needs the trampoline update */
422 ops = &control_ops;
419 } else 423 } else
420 add_ftrace_ops(&ftrace_ops_list, ops); 424 add_ftrace_ops(&ftrace_ops_list, ops);
421 425
426 ftrace_update_trampoline(ops);
427
422 if (ftrace_enabled) 428 if (ftrace_enabled)
423 update_ftrace_function(); 429 update_ftrace_function();
424 430
@@ -565,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2)
565static int function_stat_headers(struct seq_file *m) 571static int function_stat_headers(struct seq_file *m)
566{ 572{
567#ifdef CONFIG_FUNCTION_GRAPH_TRACER 573#ifdef CONFIG_FUNCTION_GRAPH_TRACER
568 seq_printf(m, " Function " 574 seq_puts(m, " Function "
569 "Hit Time Avg s^2\n" 575 "Hit Time Avg s^2\n"
570 " -------- " 576 " -------- "
571 "--- ---- --- ---\n"); 577 "--- ---- --- ---\n");
572#else 578#else
573 seq_printf(m, " Function Hit\n" 579 seq_puts(m, " Function Hit\n"
574 " -------- ---\n"); 580 " -------- ---\n");
575#endif 581#endif
576 return 0; 582 return 0;
577} 583}
@@ -598,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v)
598 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 604 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
599 605
600#ifdef CONFIG_FUNCTION_GRAPH_TRACER 606#ifdef CONFIG_FUNCTION_GRAPH_TRACER
601 seq_printf(m, " "); 607 seq_puts(m, " ");
602 avg = rec->time; 608 avg = rec->time;
603 do_div(avg, rec->counter); 609 do_div(avg, rec->counter);
604 610
@@ -1111,6 +1117,43 @@ static struct ftrace_ops global_ops = {
1111 FTRACE_OPS_FL_INITIALIZED, 1117 FTRACE_OPS_FL_INITIALIZED,
1112}; 1118};
1113 1119
1120/*
1121 * This is used by __kernel_text_address() to return true if the
1122 * address is on a dynamically allocated trampoline that would
1123 * not return true for either core_kernel_text() or
1124 * is_module_text_address().
1125 */
1126bool is_ftrace_trampoline(unsigned long addr)
1127{
1128 struct ftrace_ops *op;
1129 bool ret = false;
1130
1131 /*
1132 * Some of the ops may be dynamically allocated,
1133 * they are freed after a synchronize_sched().
1134 */
1135 preempt_disable_notrace();
1136
1137 do_for_each_ftrace_op(op, ftrace_ops_list) {
1138 /*
1139 * This is to check for dynamically allocated trampolines.
1140 * Trampolines that are in kernel text will have
1141 * core_kernel_text() return true.
1142 */
1143 if (op->trampoline && op->trampoline_size)
1144 if (addr >= op->trampoline &&
1145 addr < op->trampoline + op->trampoline_size) {
1146 ret = true;
1147 goto out;
1148 }
1149 } while_for_each_ftrace_op(op);
1150
1151 out:
1152 preempt_enable_notrace();
1153
1154 return ret;
1155}
1156
1114struct ftrace_page { 1157struct ftrace_page {
1115 struct ftrace_page *next; 1158 struct ftrace_page *next;
1116 struct dyn_ftrace *records; 1159 struct dyn_ftrace *records;
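A minimal sketch of the intended consumer of the new helper, assuming only that core_kernel_text() and is_module_text_address() keep their usual semantics; this is a model of an address-validity check (the real caller is __kernel_text_address() in kernel/extable.c), not the literal patched function:

#include <linux/ftrace.h>
#include <linux/kernel.h>
#include <linux/module.h>

/*
 * Model of an address-validity check: dynamically allocated trampolines
 * are executable but sit outside both core kernel text and module text,
 * so they need the new is_ftrace_trampoline() test.
 */
static bool addr_is_executable_kernel_text(unsigned long addr)
{
	if (core_kernel_text(addr))
		return true;
	if (is_module_text_address(addr))
		return true;
	return is_ftrace_trampoline(addr);
}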
@@ -1315,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
1315static void 1358static void
1316ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); 1359ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
1317 1360
1361static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
1362 struct ftrace_hash *new_hash);
1363
1318static int 1364static int
1319ftrace_hash_move(struct ftrace_ops *ops, int enable, 1365ftrace_hash_move(struct ftrace_ops *ops, int enable,
1320 struct ftrace_hash **dst, struct ftrace_hash *src) 1366 struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1325,8 +1371,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1325 struct ftrace_hash *new_hash; 1371 struct ftrace_hash *new_hash;
1326 int size = src->count; 1372 int size = src->count;
1327 int bits = 0; 1373 int bits = 0;
1374 int ret;
1328 int i; 1375 int i;
1329 1376
1377 /* Reject setting notrace hash on IPMODIFY ftrace_ops */
1378 if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
1379 return -EINVAL;
1380
1330 /* 1381 /*
1331 * If the new source is empty, just free dst and assign it 1382 * If the new source is empty, just free dst and assign it
1332 * the empty_hash. 1383 * the empty_hash.
@@ -1360,6 +1411,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1360 } 1411 }
1361 1412
1362update: 1413update:
1414 /* Make sure this can be applied if it is IPMODIFY ftrace_ops */
1415 if (enable) {
 1416 /* IPMODIFY should be updated only when the filter_hash is updated */
1417 ret = ftrace_hash_ipmodify_update(ops, new_hash);
1418 if (ret < 0) {
1419 free_ftrace_hash(new_hash);
1420 return ret;
1421 }
1422 }
1423
1363 /* 1424 /*
1364 * Remove the current set, update the hash and add 1425 * Remove the current set, update the hash and add
1365 * them back. 1426 * them back.
@@ -1724,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
1724 ftrace_hash_rec_update_modify(ops, filter_hash, 1); 1785 ftrace_hash_rec_update_modify(ops, filter_hash, 1);
1725} 1786}
1726 1787
1788/*
1789 * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
 1790 * or not needed to update, -EBUSY if it detects a conflict of the flag
 1791 * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
 1792 * Note that old_hash and new_hash have the following meanings:
1793 * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
1794 * - If the hash is EMPTY_HASH, it hits nothing
1795 * - Anything else hits the recs which match the hash entries.
1796 */
1797static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
1798 struct ftrace_hash *old_hash,
1799 struct ftrace_hash *new_hash)
1800{
1801 struct ftrace_page *pg;
1802 struct dyn_ftrace *rec, *end = NULL;
1803 int in_old, in_new;
1804
1805 /* Only update if the ops has been registered */
1806 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1807 return 0;
1808
1809 if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
1810 return 0;
1811
1812 /*
 1813 * Since IPMODIFY is a very address-sensitive action, we do not
 1814 * allow an ftrace_ops to set all functions to a new hash.
1815 */
1816 if (!new_hash || !old_hash)
1817 return -EINVAL;
1818
1819 /* Update rec->flags */
1820 do_for_each_ftrace_rec(pg, rec) {
1821 /* We need to update only differences of filter_hash */
1822 in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
1823 in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
1824 if (in_old == in_new)
1825 continue;
1826
1827 if (in_new) {
 1828 /* New entries must ensure no other ops is using this ip */
1829 if (rec->flags & FTRACE_FL_IPMODIFY)
1830 goto rollback;
1831 rec->flags |= FTRACE_FL_IPMODIFY;
1832 } else /* Removed entry */
1833 rec->flags &= ~FTRACE_FL_IPMODIFY;
1834 } while_for_each_ftrace_rec();
1835
1836 return 0;
1837
1838rollback:
1839 end = rec;
1840
1841 /* Roll back what we did above */
1842 do_for_each_ftrace_rec(pg, rec) {
1843 if (rec == end)
1844 goto err_out;
1845
1846 in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
1847 in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
1848 if (in_old == in_new)
1849 continue;
1850
1851 if (in_new)
1852 rec->flags &= ~FTRACE_FL_IPMODIFY;
1853 else
1854 rec->flags |= FTRACE_FL_IPMODIFY;
1855 } while_for_each_ftrace_rec();
1856
1857err_out:
1858 return -EBUSY;
1859}
1860
1861static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
1862{
1863 struct ftrace_hash *hash = ops->func_hash->filter_hash;
1864
1865 if (ftrace_hash_empty(hash))
1866 hash = NULL;
1867
1868 return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
1869}
1870
1871/* Disabling always succeeds */
1872static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
1873{
1874 struct ftrace_hash *hash = ops->func_hash->filter_hash;
1875
1876 if (ftrace_hash_empty(hash))
1877 hash = NULL;
1878
1879 __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
1880}
1881
1882static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
1883 struct ftrace_hash *new_hash)
1884{
1885 struct ftrace_hash *old_hash = ops->func_hash->filter_hash;
1886
1887 if (ftrace_hash_empty(old_hash))
1888 old_hash = NULL;
1889
1890 if (ftrace_hash_empty(new_hash))
1891 new_hash = NULL;
1892
1893 return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
1894}
1895
1727static void print_ip_ins(const char *fmt, unsigned char *p) 1896static void print_ip_ins(const char *fmt, unsigned char *p)
1728{ 1897{
1729 int i; 1898 int i;
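The IPMODIFY walk above only touches records whose membership differs between old_hash and new_hash, and rolls back on the first record that already carries the flag for another ops. A compact user-space model of that claim-or-rollback rule (toy code, not kernel code; the rec array and the in_new() predicate are stand-ins for the dyn_ftrace records and ftrace_lookup_ip()):

#include <stdbool.h>
#include <stddef.h>

#define FL_IPMODIFY 0x1UL

struct rec { unsigned long ip; unsigned long flags; };

/* Claim IPMODIFY on every record selected by in_new(); if any selected
 * record is already claimed, undo what was set and report the conflict. */
static bool claim_ipmodify(struct rec *recs, size_t n,
			   bool (*in_new)(unsigned long ip))
{
	size_t i, j;

	for (i = 0; i < n; i++) {
		if (!in_new(recs[i].ip))
			continue;
		if (recs[i].flags & FL_IPMODIFY)
			goto rollback;	/* another ops modifies this ip */
		recs[i].flags |= FL_IPMODIFY;
	}
	return true;

rollback:
	for (j = 0; j < i; j++)
		if (in_new(recs[j].ip))
			recs[j].flags &= ~FL_IPMODIFY;
	return false;
}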
@@ -1734,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1734 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1903 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1735} 1904}
1736 1905
1906static struct ftrace_ops *
1907ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
1908
1737/** 1909/**
1738 * ftrace_bug - report and shutdown function tracer 1910 * ftrace_bug - report and shutdown function tracer
1739 * @failed: The failed type (EFAULT, EINVAL, EPERM) 1911 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1740 * @ip: The address that failed 1912 * @rec: The record that failed
1741 * 1913 *
1742 * The arch code that enables or disables the function tracing 1914 * The arch code that enables or disables the function tracing
1743 * can call ftrace_bug() when it has detected a problem in 1915 * can call ftrace_bug() when it has detected a problem in
@@ -1746,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1746 * EINVAL - if what is read at @ip is not what was expected 1918 * EINVAL - if what is read at @ip is not what was expected
 1747 * EPERM - if the problem happens on writing to the @ip address 1919 * EPERM - if the problem happens on writing to the @ip address
1748 */ 1920 */
1749void ftrace_bug(int failed, unsigned long ip) 1921void ftrace_bug(int failed, struct dyn_ftrace *rec)
1750{ 1922{
1923 unsigned long ip = rec ? rec->ip : 0;
1924
1751 switch (failed) { 1925 switch (failed) {
1752 case -EFAULT: 1926 case -EFAULT:
1753 FTRACE_WARN_ON_ONCE(1); 1927 FTRACE_WARN_ON_ONCE(1);
@@ -1759,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip)
1759 pr_info("ftrace failed to modify "); 1933 pr_info("ftrace failed to modify ");
1760 print_ip_sym(ip); 1934 print_ip_sym(ip);
1761 print_ip_ins(" actual: ", (unsigned char *)ip); 1935 print_ip_ins(" actual: ", (unsigned char *)ip);
1762 printk(KERN_CONT "\n"); 1936 pr_cont("\n");
1763 break; 1937 break;
1764 case -EPERM: 1938 case -EPERM:
1765 FTRACE_WARN_ON_ONCE(1); 1939 FTRACE_WARN_ON_ONCE(1);
@@ -1771,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip)
1771 pr_info("ftrace faulted on unknown error "); 1945 pr_info("ftrace faulted on unknown error ");
1772 print_ip_sym(ip); 1946 print_ip_sym(ip);
1773 } 1947 }
1948 if (rec) {
1949 struct ftrace_ops *ops = NULL;
1950
1951 pr_info("ftrace record flags: %lx\n", rec->flags);
1952 pr_cont(" (%ld)%s", ftrace_rec_count(rec),
1953 rec->flags & FTRACE_FL_REGS ? " R" : " ");
1954 if (rec->flags & FTRACE_FL_TRAMP_EN) {
1955 ops = ftrace_find_tramp_ops_any(rec);
1956 if (ops)
1957 pr_cont("\ttramp: %pS",
1958 (void *)ops->trampoline);
1959 else
1960 pr_cont("\ttramp: ERROR!");
1961
1962 }
1963 ip = ftrace_get_addr_curr(rec);
1964 pr_cont(" expected tramp: %lx\n", ip);
1965 }
1774} 1966}
1775 1967
1776static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1968static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
@@ -2093,7 +2285,7 @@ void __weak ftrace_replace_code(int enable)
2093 do_for_each_ftrace_rec(pg, rec) { 2285 do_for_each_ftrace_rec(pg, rec) {
2094 failed = __ftrace_replace_code(rec, enable); 2286 failed = __ftrace_replace_code(rec, enable);
2095 if (failed) { 2287 if (failed) {
2096 ftrace_bug(failed, rec->ip); 2288 ftrace_bug(failed, rec);
2097 /* Stop processing */ 2289 /* Stop processing */
2098 return; 2290 return;
2099 } 2291 }
@@ -2175,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
2175static int 2367static int
2176ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 2368ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
2177{ 2369{
2178 unsigned long ip;
2179 int ret; 2370 int ret;
2180 2371
2181 ip = rec->ip;
2182
2183 if (unlikely(ftrace_disabled)) 2372 if (unlikely(ftrace_disabled))
2184 return 0; 2373 return 0;
2185 2374
2186 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 2375 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
2187 if (ret) { 2376 if (ret) {
2188 ftrace_bug(ret, ip); 2377 ftrace_bug(ret, rec);
2189 return 0; 2378 return 0;
2190 } 2379 }
2191 return 1; 2380 return 1;
@@ -2308,18 +2497,24 @@ static void ftrace_run_update_code(int command)
2308} 2497}
2309 2498
2310static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, 2499static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
2311 struct ftrace_hash *old_hash) 2500 struct ftrace_ops_hash *old_hash)
2312{ 2501{
2313 ops->flags |= FTRACE_OPS_FL_MODIFYING; 2502 ops->flags |= FTRACE_OPS_FL_MODIFYING;
2314 ops->old_hash.filter_hash = old_hash; 2503 ops->old_hash.filter_hash = old_hash->filter_hash;
2504 ops->old_hash.notrace_hash = old_hash->notrace_hash;
2315 ftrace_run_update_code(command); 2505 ftrace_run_update_code(command);
2316 ops->old_hash.filter_hash = NULL; 2506 ops->old_hash.filter_hash = NULL;
2507 ops->old_hash.notrace_hash = NULL;
2317 ops->flags &= ~FTRACE_OPS_FL_MODIFYING; 2508 ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
2318} 2509}
2319 2510
2320static ftrace_func_t saved_ftrace_func; 2511static ftrace_func_t saved_ftrace_func;
2321static int ftrace_start_up; 2512static int ftrace_start_up;
2322 2513
2514void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
2515{
2516}
2517
2323static void control_ops_free(struct ftrace_ops *ops) 2518static void control_ops_free(struct ftrace_ops *ops)
2324{ 2519{
2325 free_percpu(ops->disabled); 2520 free_percpu(ops->disabled);
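arch_ftrace_trampoline_free() is a new __weak hook, so an architecture opts in by overriding it. A hedged sketch of what such an override might look like; arch_tramp_free() is a placeholder for whatever allocator the architecture actually used, not a real kernel interface:

#include <linux/ftrace.h>

/* Placeholder: release the executable memory backing the trampoline. */
static void arch_tramp_free(void *tramp)
{
}

void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
	/* Only ops that own a dynamically allocated trampoline clean up. */
	if (!ops->trampoline || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
		return;

	arch_tramp_free((void *)ops->trampoline);
	ops->trampoline = 0;
	ops->trampoline_size = 0;
}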
@@ -2369,6 +2564,15 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2369 */ 2564 */
2370 ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; 2565 ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
2371 2566
2567 ret = ftrace_hash_ipmodify_enable(ops);
2568 if (ret < 0) {
2569 /* Rollback registration process */
2570 __unregister_ftrace_function(ops);
2571 ftrace_start_up--;
2572 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2573 return ret;
2574 }
2575
2372 ftrace_hash_rec_enable(ops, 1); 2576 ftrace_hash_rec_enable(ops, 1);
2373 2577
2374 ftrace_startup_enable(command); 2578 ftrace_startup_enable(command);
@@ -2397,6 +2601,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2397 */ 2601 */
2398 WARN_ON_ONCE(ftrace_start_up < 0); 2602 WARN_ON_ONCE(ftrace_start_up < 0);
2399 2603
2604 /* Disabling ipmodify never fails */
2605 ftrace_hash_ipmodify_disable(ops);
2400 ftrace_hash_rec_disable(ops, 1); 2606 ftrace_hash_rec_disable(ops, 1);
2401 2607
2402 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 2608 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -2471,6 +2677,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2471 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { 2677 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
2472 schedule_on_each_cpu(ftrace_sync); 2678 schedule_on_each_cpu(ftrace_sync);
2473 2679
2680 arch_ftrace_trampoline_free(ops);
2681
2474 if (ops->flags & FTRACE_OPS_FL_CONTROL) 2682 if (ops->flags & FTRACE_OPS_FL_CONTROL)
2475 control_ops_free(ops); 2683 control_ops_free(ops);
2476 } 2684 }
@@ -2623,7 +2831,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
2623 if (ftrace_start_up && cnt) { 2831 if (ftrace_start_up && cnt) {
2624 int failed = __ftrace_replace_code(p, 1); 2832 int failed = __ftrace_replace_code(p, 1);
2625 if (failed) 2833 if (failed)
2626 ftrace_bug(failed, p->ip); 2834 ftrace_bug(failed, p);
2627 } 2835 }
2628 } 2836 }
2629 } 2837 }
@@ -2948,6 +3156,22 @@ static void t_stop(struct seq_file *m, void *p)
2948 mutex_unlock(&ftrace_lock); 3156 mutex_unlock(&ftrace_lock);
2949} 3157}
2950 3158
3159void * __weak
3160arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
3161{
3162 return NULL;
3163}
3164
3165static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
3166 struct dyn_ftrace *rec)
3167{
3168 void *ptr;
3169
3170 ptr = arch_ftrace_trampoline_func(ops, rec);
3171 if (ptr)
3172 seq_printf(m, " ->%pS", ptr);
3173}
3174
2951static int t_show(struct seq_file *m, void *v) 3175static int t_show(struct seq_file *m, void *v)
2952{ 3176{
2953 struct ftrace_iterator *iter = m->private; 3177 struct ftrace_iterator *iter = m->private;
@@ -2958,9 +3182,9 @@ static int t_show(struct seq_file *m, void *v)
2958 3182
2959 if (iter->flags & FTRACE_ITER_PRINTALL) { 3183 if (iter->flags & FTRACE_ITER_PRINTALL) {
2960 if (iter->flags & FTRACE_ITER_NOTRACE) 3184 if (iter->flags & FTRACE_ITER_NOTRACE)
2961 seq_printf(m, "#### no functions disabled ####\n"); 3185 seq_puts(m, "#### no functions disabled ####\n");
2962 else 3186 else
2963 seq_printf(m, "#### all functions enabled ####\n"); 3187 seq_puts(m, "#### all functions enabled ####\n");
2964 return 0; 3188 return 0;
2965 } 3189 }
2966 3190
@@ -2971,22 +3195,25 @@ static int t_show(struct seq_file *m, void *v)
2971 3195
2972 seq_printf(m, "%ps", (void *)rec->ip); 3196 seq_printf(m, "%ps", (void *)rec->ip);
2973 if (iter->flags & FTRACE_ITER_ENABLED) { 3197 if (iter->flags & FTRACE_ITER_ENABLED) {
2974 seq_printf(m, " (%ld)%s", 3198 struct ftrace_ops *ops = NULL;
3199
3200 seq_printf(m, " (%ld)%s%s",
2975 ftrace_rec_count(rec), 3201 ftrace_rec_count(rec),
2976 rec->flags & FTRACE_FL_REGS ? " R" : " "); 3202 rec->flags & FTRACE_FL_REGS ? " R" : " ",
3203 rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
2977 if (rec->flags & FTRACE_FL_TRAMP_EN) { 3204 if (rec->flags & FTRACE_FL_TRAMP_EN) {
2978 struct ftrace_ops *ops;
2979
2980 ops = ftrace_find_tramp_ops_any(rec); 3205 ops = ftrace_find_tramp_ops_any(rec);
2981 if (ops) 3206 if (ops)
2982 seq_printf(m, "\ttramp: %pS", 3207 seq_printf(m, "\ttramp: %pS",
2983 (void *)ops->trampoline); 3208 (void *)ops->trampoline);
2984 else 3209 else
2985 seq_printf(m, "\ttramp: ERROR!"); 3210 seq_puts(m, "\ttramp: ERROR!");
3211
2986 } 3212 }
3213 add_trampoline_func(m, ops, rec);
2987 } 3214 }
2988 3215
2989 seq_printf(m, "\n"); 3216 seq_putc(m, '\n');
2990 3217
2991 return 0; 3218 return 0;
2992} 3219}
@@ -3020,9 +3247,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
3020{ 3247{
3021 struct ftrace_iterator *iter; 3248 struct ftrace_iterator *iter;
3022 3249
3023 if (unlikely(ftrace_disabled))
3024 return -ENODEV;
3025
3026 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 3250 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
3027 if (iter) { 3251 if (iter) {
3028 iter->pg = ftrace_pages_start; 3252 iter->pg = ftrace_pages_start;
@@ -3357,7 +3581,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
3357 3581
3358static int ftrace_probe_registered; 3582static int ftrace_probe_registered;
3359 3583
3360static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) 3584static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
3361{ 3585{
3362 int ret; 3586 int ret;
3363 int i; 3587 int i;
@@ -3415,6 +3639,7 @@ int
3415register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3639register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3416 void *data) 3640 void *data)
3417{ 3641{
3642 struct ftrace_ops_hash old_hash_ops;
3418 struct ftrace_func_probe *entry; 3643 struct ftrace_func_probe *entry;
3419 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; 3644 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
3420 struct ftrace_hash *old_hash = *orig_hash; 3645 struct ftrace_hash *old_hash = *orig_hash;
@@ -3436,6 +3661,10 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3436 3661
3437 mutex_lock(&trace_probe_ops.func_hash->regex_lock); 3662 mutex_lock(&trace_probe_ops.func_hash->regex_lock);
3438 3663
3664 old_hash_ops.filter_hash = old_hash;
3665 /* Probes only have filters */
3666 old_hash_ops.notrace_hash = NULL;
3667
3439 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); 3668 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
3440 if (!hash) { 3669 if (!hash) {
3441 count = -ENOMEM; 3670 count = -ENOMEM;
@@ -3496,7 +3725,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3496 3725
3497 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3726 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3498 3727
3499 __enable_ftrace_function_probe(old_hash); 3728 __enable_ftrace_function_probe(&old_hash_ops);
3500 3729
3501 if (!ret) 3730 if (!ret)
3502 free_ftrace_hash_rcu(old_hash); 3731 free_ftrace_hash_rcu(old_hash);
@@ -3784,10 +4013,34 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3784} 4013}
3785 4014
3786static void ftrace_ops_update_code(struct ftrace_ops *ops, 4015static void ftrace_ops_update_code(struct ftrace_ops *ops,
3787 struct ftrace_hash *old_hash) 4016 struct ftrace_ops_hash *old_hash)
3788{ 4017{
3789 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) 4018 struct ftrace_ops *op;
4019
4020 if (!ftrace_enabled)
4021 return;
4022
4023 if (ops->flags & FTRACE_OPS_FL_ENABLED) {
3790 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); 4024 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
4025 return;
4026 }
4027
4028 /*
4029 * If this is the shared global_ops filter, then we need to
 4030 * check if another ops that shares it is enabled.
4031 * If so, we still need to run the modify code.
4032 */
4033 if (ops->func_hash != &global_ops.local_hash)
4034 return;
4035
4036 do_for_each_ftrace_op(op, ftrace_ops_list) {
4037 if (op->func_hash == &global_ops.local_hash &&
4038 op->flags & FTRACE_OPS_FL_ENABLED) {
4039 ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
4040 /* Only need to do this once */
4041 return;
4042 }
4043 } while_for_each_ftrace_op(op);
3791} 4044}
3792 4045
3793static int 4046static int
@@ -3795,6 +4048,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3795 unsigned long ip, int remove, int reset, int enable) 4048 unsigned long ip, int remove, int reset, int enable)
3796{ 4049{
3797 struct ftrace_hash **orig_hash; 4050 struct ftrace_hash **orig_hash;
4051 struct ftrace_ops_hash old_hash_ops;
3798 struct ftrace_hash *old_hash; 4052 struct ftrace_hash *old_hash;
3799 struct ftrace_hash *hash; 4053 struct ftrace_hash *hash;
3800 int ret; 4054 int ret;
@@ -3831,9 +4085,11 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3831 4085
3832 mutex_lock(&ftrace_lock); 4086 mutex_lock(&ftrace_lock);
3833 old_hash = *orig_hash; 4087 old_hash = *orig_hash;
4088 old_hash_ops.filter_hash = ops->func_hash->filter_hash;
4089 old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
3834 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 4090 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3835 if (!ret) { 4091 if (!ret) {
3836 ftrace_ops_update_code(ops, old_hash); 4092 ftrace_ops_update_code(ops, &old_hash_ops);
3837 free_ftrace_hash_rcu(old_hash); 4093 free_ftrace_hash_rcu(old_hash);
3838 } 4094 }
3839 mutex_unlock(&ftrace_lock); 4095 mutex_unlock(&ftrace_lock);
@@ -3975,6 +4231,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3975static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 4231static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3976static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); 4232static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
3977 4233
4234static unsigned long save_global_trampoline;
4235static unsigned long save_global_flags;
4236
3978static int __init set_graph_function(char *str) 4237static int __init set_graph_function(char *str)
3979{ 4238{
3980 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 4239 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -4042,6 +4301,7 @@ static void __init set_ftrace_early_filters(void)
4042int ftrace_regex_release(struct inode *inode, struct file *file) 4301int ftrace_regex_release(struct inode *inode, struct file *file)
4043{ 4302{
4044 struct seq_file *m = (struct seq_file *)file->private_data; 4303 struct seq_file *m = (struct seq_file *)file->private_data;
4304 struct ftrace_ops_hash old_hash_ops;
4045 struct ftrace_iterator *iter; 4305 struct ftrace_iterator *iter;
4046 struct ftrace_hash **orig_hash; 4306 struct ftrace_hash **orig_hash;
4047 struct ftrace_hash *old_hash; 4307 struct ftrace_hash *old_hash;
@@ -4075,10 +4335,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4075 4335
4076 mutex_lock(&ftrace_lock); 4336 mutex_lock(&ftrace_lock);
4077 old_hash = *orig_hash; 4337 old_hash = *orig_hash;
4338 old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash;
4339 old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash;
4078 ret = ftrace_hash_move(iter->ops, filter_hash, 4340 ret = ftrace_hash_move(iter->ops, filter_hash,
4079 orig_hash, iter->hash); 4341 orig_hash, iter->hash);
4080 if (!ret) { 4342 if (!ret) {
4081 ftrace_ops_update_code(iter->ops, old_hash); 4343 ftrace_ops_update_code(iter->ops, &old_hash_ops);
4082 free_ftrace_hash_rcu(old_hash); 4344 free_ftrace_hash_rcu(old_hash);
4083 } 4345 }
4084 mutex_unlock(&ftrace_lock); 4346 mutex_unlock(&ftrace_lock);
@@ -4183,9 +4445,9 @@ static int g_show(struct seq_file *m, void *v)
4183 struct ftrace_graph_data *fgd = m->private; 4445 struct ftrace_graph_data *fgd = m->private;
4184 4446
4185 if (fgd->table == ftrace_graph_funcs) 4447 if (fgd->table == ftrace_graph_funcs)
4186 seq_printf(m, "#### all functions enabled ####\n"); 4448 seq_puts(m, "#### all functions enabled ####\n");
4187 else 4449 else
4188 seq_printf(m, "#### no functions disabled ####\n"); 4450 seq_puts(m, "#### no functions disabled ####\n");
4189 return 0; 4451 return 0;
4190 } 4452 }
4191 4453
@@ -4696,6 +4958,32 @@ void __init ftrace_init(void)
4696 ftrace_disabled = 1; 4958 ftrace_disabled = 1;
4697} 4959}
4698 4960
4961/* Do nothing if arch does not support this */
4962void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
4963{
4964}
4965
4966static void ftrace_update_trampoline(struct ftrace_ops *ops)
4967{
4968
4969/*
4970 * Currently there's no safe way to free a trampoline when the kernel
4971 * is configured with PREEMPT. That is because a task could be preempted
 4972 * while it is on the trampoline; it may stay preempted for a long time
4973 * depending on the system load, and currently there's no way to know
4974 * when it will be off the trampoline. If the trampoline is freed
4975 * too early, when the task runs again, it will be executing on freed
4976 * memory and crash.
4977 */
4978#ifdef CONFIG_PREEMPT
 4979 /* Currently, only non-dynamic ops can have a trampoline */
4980 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
4981 return;
4982#endif
4983
4984 arch_ftrace_update_trampoline(ops);
4985}
4986
4699#else 4987#else
4700 4988
4701static struct ftrace_ops global_ops = { 4989static struct ftrace_ops global_ops = {
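Registration now ends with ftrace_update_trampoline(), so a plain ftrace_ops registered by a module may receive its own trampoline (except for dynamic ops on PREEMPT kernels, per the guard above). A hedged sketch of such a client; my_ops, my_callback and the "schedule" filter are illustrative, and the callback signature shown is the four-argument form used where ARCH_SUPPORTS_FTRACE_OPS is set:

#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>

static void notrace my_callback(unsigned long ip, unsigned long parent_ip,
				struct ftrace_ops *op, struct pt_regs *regs)
{
	/* Runs at the entry of every function matching the filter. */
}

static struct ftrace_ops my_ops = {
	.func = my_callback,
};

static int __init my_tracer_init(void)
{
	ftrace_set_filter(&my_ops, "schedule", strlen("schedule"), 1);
	return register_ftrace_function(&my_ops);
}
module_init(my_tracer_init);
MODULE_LICENSE("GPL");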
@@ -4738,6 +5026,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4738 return 1; 5026 return 1;
4739} 5027}
4740 5028
5029static void ftrace_update_trampoline(struct ftrace_ops *ops)
5030{
5031}
5032
4741#endif /* CONFIG_DYNAMIC_FTRACE */ 5033#endif /* CONFIG_DYNAMIC_FTRACE */
4742 5034
4743__init void ftrace_init_global_array_ops(struct trace_array *tr) 5035__init void ftrace_init_global_array_ops(struct trace_array *tr)
@@ -5075,12 +5367,12 @@ static int fpid_show(struct seq_file *m, void *v)
5075 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); 5367 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
5076 5368
5077 if (v == (void *)1) { 5369 if (v == (void *)1) {
5078 seq_printf(m, "no pid\n"); 5370 seq_puts(m, "no pid\n");
5079 return 0; 5371 return 0;
5080 } 5372 }
5081 5373
5082 if (fpid->pid == ftrace_swapper_pid) 5374 if (fpid->pid == ftrace_swapper_pid)
5083 seq_printf(m, "swapper tasks\n"); 5375 seq_puts(m, "swapper tasks\n");
5084 else 5376 else
5085 seq_printf(m, "%u\n", pid_vnr(fpid->pid)); 5377 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
5086 5378
@@ -5293,6 +5585,7 @@ static struct ftrace_ops graph_ops = {
5293 FTRACE_OPS_FL_STUB, 5585 FTRACE_OPS_FL_STUB,
5294#ifdef FTRACE_GRAPH_TRAMP_ADDR 5586#ifdef FTRACE_GRAPH_TRAMP_ADDR
5295 .trampoline = FTRACE_GRAPH_TRAMP_ADDR, 5587 .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
5588 /* trampoline_size is only needed for dynamically allocated tramps */
5296#endif 5589#endif
5297 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) 5590 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
5298}; 5591};
@@ -5522,7 +5815,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5522 update_function_graph_func(); 5815 update_function_graph_func();
5523 5816
5524 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); 5817 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
5525
5526out: 5818out:
5527 mutex_unlock(&ftrace_lock); 5819 mutex_unlock(&ftrace_lock);
5528 return ret; 5820 return ret;
@@ -5543,6 +5835,17 @@ void unregister_ftrace_graph(void)
5543 unregister_pm_notifier(&ftrace_suspend_notifier); 5835 unregister_pm_notifier(&ftrace_suspend_notifier);
5544 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5836 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5545 5837
5838#ifdef CONFIG_DYNAMIC_FTRACE
5839 /*
5840 * Function graph does not allocate the trampoline, but
5841 * other global_ops do. We need to reset the ALLOC_TRAMP flag
5842 * if one was used.
5843 */
5844 global_ops.trampoline = save_global_trampoline;
5845 if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
5846 global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
5847#endif
5848
5546 out: 5849 out:
5547 mutex_unlock(&ftrace_lock); 5850 mutex_unlock(&ftrace_lock);
5548} 5851}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a56e07c8d15b..7a4104cb95cb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work);
34 */ 34 */
35int ring_buffer_print_entry_header(struct trace_seq *s) 35int ring_buffer_print_entry_header(struct trace_seq *s)
36{ 36{
37 int ret; 37 trace_seq_puts(s, "# compressed entry header\n");
38 38 trace_seq_puts(s, "\ttype_len : 5 bits\n");
39 ret = trace_seq_puts(s, "# compressed entry header\n"); 39 trace_seq_puts(s, "\ttime_delta : 27 bits\n");
40 ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); 40 trace_seq_puts(s, "\tarray : 32 bits\n");
41 ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); 41 trace_seq_putc(s, '\n');
42 ret = trace_seq_puts(s, "\tarray : 32 bits\n"); 42 trace_seq_printf(s, "\tpadding : type == %d\n",
43 ret = trace_seq_putc(s, '\n'); 43 RINGBUF_TYPE_PADDING);
44 ret = trace_seq_printf(s, "\tpadding : type == %d\n", 44 trace_seq_printf(s, "\ttime_extend : type == %d\n",
45 RINGBUF_TYPE_PADDING); 45 RINGBUF_TYPE_TIME_EXTEND);
46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", 46 trace_seq_printf(s, "\tdata max type_len == %d\n",
47 RINGBUF_TYPE_TIME_EXTEND); 47 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
48 ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
49 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
50 48
51 return ret; 49 return !trace_seq_has_overflowed(s);
52} 50}
53 51
54/* 52/*
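This rewrite shows the pattern the rest of the series follows: trace_seq_*() calls no longer return a per-call status, so a writer emits everything and checks trace_seq_has_overflowed() once at the end. A minimal sketch of the idiom (the header text is made up; only the trace_seq calls matter):

#include <linux/trace_seq.h>

static int print_example_header(struct trace_seq *s)
{
	trace_seq_puts(s, "# example header\n");
	trace_seq_printf(s, "# value : %d\n", 42);

	/* Non-zero still means "everything fit in the buffer". */
	return !trace_seq_has_overflowed(s);
}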
@@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta)
419int ring_buffer_print_page_header(struct trace_seq *s) 417int ring_buffer_print_page_header(struct trace_seq *s)
420{ 418{
421 struct buffer_data_page field; 419 struct buffer_data_page field;
422 int ret;
423
424 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
425 "offset:0;\tsize:%u;\tsigned:%u;\n",
426 (unsigned int)sizeof(field.time_stamp),
427 (unsigned int)is_signed_type(u64));
428
429 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
430 "offset:%u;\tsize:%u;\tsigned:%u;\n",
431 (unsigned int)offsetof(typeof(field), commit),
432 (unsigned int)sizeof(field.commit),
433 (unsigned int)is_signed_type(long));
434
435 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
436 "offset:%u;\tsize:%u;\tsigned:%u;\n",
437 (unsigned int)offsetof(typeof(field), commit),
438 1,
439 (unsigned int)is_signed_type(long));
440
441 ret = trace_seq_printf(s, "\tfield: char data;\t"
442 "offset:%u;\tsize:%u;\tsigned:%u;\n",
443 (unsigned int)offsetof(typeof(field), data),
444 (unsigned int)BUF_PAGE_SIZE,
445 (unsigned int)is_signed_type(char));
446 420
447 return ret; 421 trace_seq_printf(s, "\tfield: u64 timestamp;\t"
422 "offset:0;\tsize:%u;\tsigned:%u;\n",
423 (unsigned int)sizeof(field.time_stamp),
424 (unsigned int)is_signed_type(u64));
425
426 trace_seq_printf(s, "\tfield: local_t commit;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 (unsigned int)sizeof(field.commit),
430 (unsigned int)is_signed_type(long));
431
432 trace_seq_printf(s, "\tfield: int overwrite;\t"
433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
434 (unsigned int)offsetof(typeof(field), commit),
435 1,
436 (unsigned int)is_signed_type(long));
437
438 trace_seq_printf(s, "\tfield: char data;\t"
439 "offset:%u;\tsize:%u;\tsigned:%u;\n",
440 (unsigned int)offsetof(typeof(field), data),
441 (unsigned int)BUF_PAGE_SIZE,
442 (unsigned int)is_signed_type(char));
443
444 return !trace_seq_has_overflowed(s);
448} 445}
449 446
450struct rb_irq_work { 447struct rb_irq_work {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 92f4a6cee172..4a9079b9f082 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running;
63 */ 63 */
64bool __read_mostly tracing_selftest_disabled; 64bool __read_mostly tracing_selftest_disabled;
65 65
66/* Pipe tracepoints to printk */
67struct trace_iterator *tracepoint_print_iter;
68int tracepoint_printk;
69
66/* For tracers that don't implement custom flags */ 70/* For tracers that don't implement custom flags */
67static struct tracer_opt dummy_tracer_opt[] = { 71static struct tracer_opt dummy_tracer_opt[] = {
68 { } 72 { }
@@ -155,10 +159,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
155 159
156static int __init stop_trace_on_warning(char *str) 160static int __init stop_trace_on_warning(char *str)
157{ 161{
158 __disable_trace_on_warning = 1; 162 if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
163 __disable_trace_on_warning = 1;
159 return 1; 164 return 1;
160} 165}
161__setup("traceoff_on_warning=", stop_trace_on_warning); 166__setup("traceoff_on_warning", stop_trace_on_warning);
162 167
163static int __init boot_alloc_snapshot(char *str) 168static int __init boot_alloc_snapshot(char *str)
164{ 169{
@@ -192,6 +197,13 @@ static int __init set_trace_boot_clock(char *str)
192} 197}
193__setup("trace_clock=", set_trace_boot_clock); 198__setup("trace_clock=", set_trace_boot_clock);
194 199
200static int __init set_tracepoint_printk(char *str)
201{
202 if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
203 tracepoint_printk = 1;
204 return 1;
205}
206__setup("tp_printk", set_tracepoint_printk);
195 207
196unsigned long long ns2usecs(cycle_t nsec) 208unsigned long long ns2usecs(cycle_t nsec)
197{ 209{
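The two handlers above are registered without a trailing '=' so that a bare "traceoff_on_warning" or "tp_printk" enables the feature while an explicit "=0" or "=off" keeps it disabled; the handler sees only the text following the parameter name. A hedged sketch of the same idiom applied to a hypothetical parameter (my_feature and its variable are illustrative):

#include <linux/init.h>
#include <linux/string.h>

static int my_feature_enabled __initdata;

static int __init set_my_feature(char *str)
{
	/* str is what follows "my_feature" on the command line:
	 * "", "=1", "=0", "=off", ... */
	if (strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)
		my_feature_enabled = 1;
	return 1;
}
__setup("my_feature", set_my_feature);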
@@ -938,19 +950,20 @@ out:
938 return ret; 950 return ret;
939} 951}
940 952
953/* TODO add a seq_buf_to_buffer() */
941static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 954static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
942{ 955{
943 int len; 956 int len;
944 957
945 if (s->len <= s->readpos) 958 if (trace_seq_used(s) <= s->seq.readpos)
946 return -EBUSY; 959 return -EBUSY;
947 960
948 len = s->len - s->readpos; 961 len = trace_seq_used(s) - s->seq.readpos;
949 if (cnt > len) 962 if (cnt > len)
950 cnt = len; 963 cnt = len;
951 memcpy(buf, s->buffer + s->readpos, cnt); 964 memcpy(buf, s->buffer + s->seq.readpos, cnt);
952 965
953 s->readpos += cnt; 966 s->seq.readpos += cnt;
954 return cnt; 967 return cnt;
955} 968}
956 969
@@ -2029,7 +2042,7 @@ void trace_printk_init_buffers(void)
2029 pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); 2042 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
2030 pr_warning("** **\n"); 2043 pr_warning("** **\n");
2031 pr_warning("** This means that this is a DEBUG kernel and it is **\n"); 2044 pr_warning("** This means that this is a DEBUG kernel and it is **\n");
2032 pr_warning("** unsafe for produciton use. **\n"); 2045 pr_warning("** unsafe for production use. **\n");
2033 pr_warning("** **\n"); 2046 pr_warning("** **\n");
2034 pr_warning("** If you see this message and you are not debugging **\n"); 2047 pr_warning("** If you see this message and you are not debugging **\n");
2035 pr_warning("** the kernel, report this immediately to your vendor! **\n"); 2048 pr_warning("** the kernel, report this immediately to your vendor! **\n");
@@ -2158,9 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2158 goto out; 2171 goto out;
2159 } 2172 }
2160 2173
2161 len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); 2174 len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
2162 if (len > TRACE_BUF_SIZE)
2163 goto out;
2164 2175
2165 local_save_flags(flags); 2176 local_save_flags(flags);
2166 size = sizeof(*entry) + len + 1; 2177 size = sizeof(*entry) + len + 1;
@@ -2171,8 +2182,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2171 entry = ring_buffer_event_data(event); 2182 entry = ring_buffer_event_data(event);
2172 entry->ip = ip; 2183 entry->ip = ip;
2173 2184
2174 memcpy(&entry->buf, tbuffer, len); 2185 memcpy(&entry->buf, tbuffer, len + 1);
2175 entry->buf[len] = '\0';
2176 if (!call_filter_check_discard(call, entry, buffer, event)) { 2186 if (!call_filter_check_discard(call, entry, buffer, event)) {
2177 __buffer_unlock_commit(buffer, event); 2187 __buffer_unlock_commit(buffer, event);
2178 ftrace_trace_stack(buffer, flags, 6, pc); 2188 ftrace_trace_stack(buffer, flags, 6, pc);
@@ -2509,14 +2519,14 @@ get_total_entries(struct trace_buffer *buf,
2509 2519
2510static void print_lat_help_header(struct seq_file *m) 2520static void print_lat_help_header(struct seq_file *m)
2511{ 2521{
2512 seq_puts(m, "# _------=> CPU# \n"); 2522 seq_puts(m, "# _------=> CPU# \n"
2513 seq_puts(m, "# / _-----=> irqs-off \n"); 2523 "# / _-----=> irqs-off \n"
2514 seq_puts(m, "# | / _----=> need-resched \n"); 2524 "# | / _----=> need-resched \n"
2515 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 2525 "# || / _---=> hardirq/softirq \n"
2516 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 2526 "# ||| / _--=> preempt-depth \n"
2517 seq_puts(m, "# |||| / delay \n"); 2527 "# |||| / delay \n"
2518 seq_puts(m, "# cmd pid ||||| time | caller \n"); 2528 "# cmd pid ||||| time | caller \n"
2519 seq_puts(m, "# \\ / ||||| \\ | / \n"); 2529 "# \\ / ||||| \\ | / \n");
2520} 2530}
2521 2531
2522static void print_event_info(struct trace_buffer *buf, struct seq_file *m) 2532static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -2533,20 +2543,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
2533static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) 2543static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
2534{ 2544{
2535 print_event_info(buf, m); 2545 print_event_info(buf, m);
2536 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 2546 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
2537 seq_puts(m, "# | | | | |\n"); 2547 "# | | | | |\n");
2538} 2548}
2539 2549
2540static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) 2550static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
2541{ 2551{
2542 print_event_info(buf, m); 2552 print_event_info(buf, m);
2543 seq_puts(m, "# _-----=> irqs-off\n"); 2553 seq_puts(m, "# _-----=> irqs-off\n"
2544 seq_puts(m, "# / _----=> need-resched\n"); 2554 "# / _----=> need-resched\n"
2545 seq_puts(m, "# | / _---=> hardirq/softirq\n"); 2555 "# | / _---=> hardirq/softirq\n"
2546 seq_puts(m, "# || / _--=> preempt-depth\n"); 2556 "# || / _--=> preempt-depth\n"
2547 seq_puts(m, "# ||| / delay\n"); 2557 "# ||| / delay\n"
2548 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); 2558 "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
2549 seq_puts(m, "# | | | |||| | |\n"); 2559 "# | | | |||| | |\n");
2550} 2560}
2551 2561
2552void 2562void
@@ -2649,24 +2659,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
2649 event = ftrace_find_event(entry->type); 2659 event = ftrace_find_event(entry->type);
2650 2660
2651 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2661 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2652 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2662 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2653 if (!trace_print_lat_context(iter)) 2663 trace_print_lat_context(iter);
2654 goto partial; 2664 else
2655 } else { 2665 trace_print_context(iter);
2656 if (!trace_print_context(iter))
2657 goto partial;
2658 }
2659 } 2666 }
2660 2667
2668 if (trace_seq_has_overflowed(s))
2669 return TRACE_TYPE_PARTIAL_LINE;
2670
2661 if (event) 2671 if (event)
2662 return event->funcs->trace(iter, sym_flags, event); 2672 return event->funcs->trace(iter, sym_flags, event);
2663 2673
2664 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 2674 trace_seq_printf(s, "Unknown type %d\n", entry->type);
2665 goto partial;
2666 2675
2667 return TRACE_TYPE_HANDLED; 2676 return trace_handle_return(s);
2668partial:
2669 return TRACE_TYPE_PARTIAL_LINE;
2670} 2677}
2671 2678
2672static enum print_line_t print_raw_fmt(struct trace_iterator *iter) 2679static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
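The conversion above is representative of all the output callbacks in this series: print into iter->seq unconditionally and let trace_handle_return() translate the overflow state into HANDLED or PARTIAL_LINE. A hedged sketch of a converted callback (the fields printed are arbitrary):

#include <linux/ftrace_event.h>
#include <linux/trace_seq.h>

static enum print_line_t example_print_line(struct trace_iterator *iter)
{
	struct trace_seq *s = &iter->seq;

	trace_seq_printf(s, "cpu=%d ts=%llu\n",
			 iter->cpu, (unsigned long long)iter->ts);

	/* TRACE_TYPE_HANDLED unless the seq buffer overflowed. */
	return trace_handle_return(s);
}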
@@ -2677,22 +2684,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
2677 2684
2678 entry = iter->ent; 2685 entry = iter->ent;
2679 2686
2680 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2687 if (trace_flags & TRACE_ITER_CONTEXT_INFO)
2681 if (!trace_seq_printf(s, "%d %d %llu ", 2688 trace_seq_printf(s, "%d %d %llu ",
2682 entry->pid, iter->cpu, iter->ts)) 2689 entry->pid, iter->cpu, iter->ts);
2683 goto partial; 2690
2684 } 2691 if (trace_seq_has_overflowed(s))
2692 return TRACE_TYPE_PARTIAL_LINE;
2685 2693
2686 event = ftrace_find_event(entry->type); 2694 event = ftrace_find_event(entry->type);
2687 if (event) 2695 if (event)
2688 return event->funcs->raw(iter, 0, event); 2696 return event->funcs->raw(iter, 0, event);
2689 2697
2690 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 2698 trace_seq_printf(s, "%d ?\n", entry->type);
2691 goto partial;
2692 2699
2693 return TRACE_TYPE_HANDLED; 2700 return trace_handle_return(s);
2694partial:
2695 return TRACE_TYPE_PARTIAL_LINE;
2696} 2701}
2697 2702
2698static enum print_line_t print_hex_fmt(struct trace_iterator *iter) 2703static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
@@ -2705,9 +2710,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2705 entry = iter->ent; 2710 entry = iter->ent;
2706 2711
2707 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2712 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2708 SEQ_PUT_HEX_FIELD_RET(s, entry->pid); 2713 SEQ_PUT_HEX_FIELD(s, entry->pid);
2709 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); 2714 SEQ_PUT_HEX_FIELD(s, iter->cpu);
2710 SEQ_PUT_HEX_FIELD_RET(s, iter->ts); 2715 SEQ_PUT_HEX_FIELD(s, iter->ts);
2716 if (trace_seq_has_overflowed(s))
2717 return TRACE_TYPE_PARTIAL_LINE;
2711 } 2718 }
2712 2719
2713 event = ftrace_find_event(entry->type); 2720 event = ftrace_find_event(entry->type);
@@ -2717,9 +2724,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2717 return ret; 2724 return ret;
2718 } 2725 }
2719 2726
2720 SEQ_PUT_FIELD_RET(s, newline); 2727 SEQ_PUT_FIELD(s, newline);
2721 2728
2722 return TRACE_TYPE_HANDLED; 2729 return trace_handle_return(s);
2723} 2730}
2724 2731
2725static enum print_line_t print_bin_fmt(struct trace_iterator *iter) 2732static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
@@ -2731,9 +2738,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2731 entry = iter->ent; 2738 entry = iter->ent;
2732 2739
2733 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2740 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2734 SEQ_PUT_FIELD_RET(s, entry->pid); 2741 SEQ_PUT_FIELD(s, entry->pid);
2735 SEQ_PUT_FIELD_RET(s, iter->cpu); 2742 SEQ_PUT_FIELD(s, iter->cpu);
2736 SEQ_PUT_FIELD_RET(s, iter->ts); 2743 SEQ_PUT_FIELD(s, iter->ts);
2744 if (trace_seq_has_overflowed(s))
2745 return TRACE_TYPE_PARTIAL_LINE;
2737 } 2746 }
2738 2747
2739 event = ftrace_find_event(entry->type); 2748 event = ftrace_find_event(entry->type);
@@ -2779,10 +2788,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2779{ 2788{
2780 enum print_line_t ret; 2789 enum print_line_t ret;
2781 2790
2782 if (iter->lost_events && 2791 if (iter->lost_events) {
2783 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2792 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2784 iter->cpu, iter->lost_events)) 2793 iter->cpu, iter->lost_events);
2785 return TRACE_TYPE_PARTIAL_LINE; 2794 if (trace_seq_has_overflowed(&iter->seq))
2795 return TRACE_TYPE_PARTIAL_LINE;
2796 }
2786 2797
2787 if (iter->trace && iter->trace->print_line) { 2798 if (iter->trace && iter->trace->print_line) {
2788 ret = iter->trace->print_line(iter); 2799 ret = iter->trace->print_line(iter);
@@ -2860,44 +2871,44 @@ static void test_ftrace_alive(struct seq_file *m)
2860{ 2871{
2861 if (!ftrace_is_dead()) 2872 if (!ftrace_is_dead())
2862 return; 2873 return;
2863 seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); 2874 seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
2864 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); 2875 "# MAY BE MISSING FUNCTION EVENTS\n");
2865} 2876}
2866 2877
2867#ifdef CONFIG_TRACER_MAX_TRACE 2878#ifdef CONFIG_TRACER_MAX_TRACE
2868static void show_snapshot_main_help(struct seq_file *m) 2879static void show_snapshot_main_help(struct seq_file *m)
2869{ 2880{
2870 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); 2881 seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
2871 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2882 "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
2872 seq_printf(m, "# Takes a snapshot of the main buffer.\n"); 2883 "# Takes a snapshot of the main buffer.\n"
2873 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); 2884 "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
2874 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2885 "# (Doesn't have to be '2' works with any number that\n"
2875 seq_printf(m, "# is not a '0' or '1')\n"); 2886 "# is not a '0' or '1')\n");
2876} 2887}
2877 2888
2878static void show_snapshot_percpu_help(struct seq_file *m) 2889static void show_snapshot_percpu_help(struct seq_file *m)
2879{ 2890{
2880 seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); 2891 seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
2881#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP 2892#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2882 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2893 seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
2883 seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); 2894 "# Takes a snapshot of the main buffer for this cpu.\n");
2884#else 2895#else
2885 seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); 2896 seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
2886 seq_printf(m, "# Must use main snapshot file to allocate.\n"); 2897 "# Must use main snapshot file to allocate.\n");
2887#endif 2898#endif
2888 seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); 2899 seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
2889 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2900 "# (Doesn't have to be '2' works with any number that\n"
2890 seq_printf(m, "# is not a '0' or '1')\n"); 2901 "# is not a '0' or '1')\n");
2891} 2902}
2892 2903
2893static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) 2904static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2894{ 2905{
2895 if (iter->tr->allocated_snapshot) 2906 if (iter->tr->allocated_snapshot)
2896 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); 2907 seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
2897 else 2908 else
2898 seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); 2909 seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
2899 2910
2900 seq_printf(m, "# Snapshot commands:\n"); 2911 seq_puts(m, "# Snapshot commands:\n");
2901 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) 2912 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
2902 show_snapshot_main_help(m); 2913 show_snapshot_main_help(m);
2903 else 2914 else
@@ -3251,7 +3262,7 @@ static int t_show(struct seq_file *m, void *v)
3251 if (!t) 3262 if (!t)
3252 return 0; 3263 return 0;
3253 3264
3254 seq_printf(m, "%s", t->name); 3265 seq_puts(m, t->name);
3255 if (t->next) 3266 if (t->next)
3256 seq_putc(m, ' '); 3267 seq_putc(m, ' ');
3257 else 3268 else
@@ -4314,6 +4325,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4314 goto out; 4325 goto out;
4315 } 4326 }
4316 4327
4328 trace_seq_init(&iter->seq);
4329
4317 /* 4330 /*
4318 * We make a copy of the current tracer to avoid concurrent 4331 * We make a copy of the current tracer to avoid concurrent
4319 * changes on it while we are reading. 4332 * changes on it while we are reading.
@@ -4507,18 +4520,18 @@ waitagain:
4507 trace_access_lock(iter->cpu_file); 4520 trace_access_lock(iter->cpu_file);
4508 while (trace_find_next_entry_inc(iter) != NULL) { 4521 while (trace_find_next_entry_inc(iter) != NULL) {
4509 enum print_line_t ret; 4522 enum print_line_t ret;
4510 int len = iter->seq.len; 4523 int save_len = iter->seq.seq.len;
4511 4524
4512 ret = print_trace_line(iter); 4525 ret = print_trace_line(iter);
4513 if (ret == TRACE_TYPE_PARTIAL_LINE) { 4526 if (ret == TRACE_TYPE_PARTIAL_LINE) {
4514 /* don't print partial lines */ 4527 /* don't print partial lines */
4515 iter->seq.len = len; 4528 iter->seq.seq.len = save_len;
4516 break; 4529 break;
4517 } 4530 }
4518 if (ret != TRACE_TYPE_NO_CONSUME) 4531 if (ret != TRACE_TYPE_NO_CONSUME)
4519 trace_consume(iter); 4532 trace_consume(iter);
4520 4533
4521 if (iter->seq.len >= cnt) 4534 if (trace_seq_used(&iter->seq) >= cnt)
4522 break; 4535 break;
4523 4536
4524 /* 4537 /*
@@ -4534,7 +4547,7 @@ waitagain:
4534 4547
4535 /* Now copy what we have to the user */ 4548 /* Now copy what we have to the user */
4536 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 4549 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
4537 if (iter->seq.readpos >= iter->seq.len) 4550 if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
4538 trace_seq_init(&iter->seq); 4551 trace_seq_init(&iter->seq);
4539 4552
4540 /* 4553 /*
@@ -4568,20 +4581,33 @@ static size_t
4568tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) 4581tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
4569{ 4582{
4570 size_t count; 4583 size_t count;
4584 int save_len;
4571 int ret; 4585 int ret;
4572 4586
4573 /* Seq buffer is page-sized, exactly what we need. */ 4587 /* Seq buffer is page-sized, exactly what we need. */
4574 for (;;) { 4588 for (;;) {
4575 count = iter->seq.len; 4589 save_len = iter->seq.seq.len;
4576 ret = print_trace_line(iter); 4590 ret = print_trace_line(iter);
4577 count = iter->seq.len - count; 4591
4578 if (rem < count) { 4592 if (trace_seq_has_overflowed(&iter->seq)) {
4579 rem = 0; 4593 iter->seq.seq.len = save_len;
4580 iter->seq.len -= count;
4581 break; 4594 break;
4582 } 4595 }
4596
4597 /*
4598 * This should not be hit, because it should only
4599 * be set if the iter->seq overflowed. But check it
4600 * anyway to be safe.
4601 */
4583 if (ret == TRACE_TYPE_PARTIAL_LINE) { 4602 if (ret == TRACE_TYPE_PARTIAL_LINE) {
4584 iter->seq.len -= count; 4603 iter->seq.seq.len = save_len;
4604 break;
4605 }
4606
4607 count = trace_seq_used(&iter->seq) - save_len;
4608 if (rem < count) {
4609 rem = 0;
4610 iter->seq.seq.len = save_len;
4585 break; 4611 break;
4586 } 4612 }
4587 4613
@@ -4662,13 +4688,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4662 /* Copy the data into the page, so we can start over. */ 4688 /* Copy the data into the page, so we can start over. */
4663 ret = trace_seq_to_buffer(&iter->seq, 4689 ret = trace_seq_to_buffer(&iter->seq,
4664 page_address(spd.pages[i]), 4690 page_address(spd.pages[i]),
4665 iter->seq.len); 4691 trace_seq_used(&iter->seq));
4666 if (ret < 0) { 4692 if (ret < 0) {
4667 __free_page(spd.pages[i]); 4693 __free_page(spd.pages[i]);
4668 break; 4694 break;
4669 } 4695 }
4670 spd.partial[i].offset = 0; 4696 spd.partial[i].offset = 0;
4671 spd.partial[i].len = iter->seq.len; 4697 spd.partial[i].len = trace_seq_used(&iter->seq);
4672 4698
4673 trace_seq_init(&iter->seq); 4699 trace_seq_init(&iter->seq);
4674 } 4700 }
@@ -5668,7 +5694,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5668 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); 5694 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
5669 trace_seq_printf(s, "read events: %ld\n", cnt); 5695 trace_seq_printf(s, "read events: %ld\n", cnt);
5670 5696
5671 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 5697 count = simple_read_from_buffer(ubuf, count, ppos,
5698 s->buffer, trace_seq_used(s));
5672 5699
5673 kfree(s); 5700 kfree(s);
5674 5701
@@ -5749,10 +5776,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
5749 5776
5750 seq_printf(m, "%ps:", (void *)ip); 5777 seq_printf(m, "%ps:", (void *)ip);
5751 5778
5752 seq_printf(m, "snapshot"); 5779 seq_puts(m, "snapshot");
5753 5780
5754 if (count == -1) 5781 if (count == -1)
5755 seq_printf(m, ":unlimited\n"); 5782 seq_puts(m, ":unlimited\n");
5756 else 5783 else
5757 seq_printf(m, ":count=%ld\n", count); 5784 seq_printf(m, ":count=%ld\n", count);
5758 5785
@@ -6417,7 +6444,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m
6417 int ret; 6444 int ret;
6418 6445
6419 /* Paranoid: Make sure the parent is the "instances" directory */ 6446 /* Paranoid: Make sure the parent is the "instances" directory */
6420 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); 6447 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6421 if (WARN_ON_ONCE(parent != trace_instance_dir)) 6448 if (WARN_ON_ONCE(parent != trace_instance_dir))
6422 return -ENOENT; 6449 return -ENOENT;
6423 6450
@@ -6444,7 +6471,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry)
6444 int ret; 6471 int ret;
6445 6472
6446 /* Paranoid: Make sure the parent is the "instances" directory */ 6473 /* Paranoid: Make sure the parent is the "instances" directory */
6447 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); 6474 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6448 if (WARN_ON_ONCE(parent != trace_instance_dir)) 6475 if (WARN_ON_ONCE(parent != trace_instance_dir))
6449 return -ENOENT; 6476 return -ENOENT;
6450 6477
@@ -6631,11 +6658,19 @@ void
6631trace_printk_seq(struct trace_seq *s) 6658trace_printk_seq(struct trace_seq *s)
6632{ 6659{
6633 /* Probably should print a warning here. */ 6660 /* Probably should print a warning here. */
6634 if (s->len >= TRACE_MAX_PRINT) 6661 if (s->seq.len >= TRACE_MAX_PRINT)
6635 s->len = TRACE_MAX_PRINT; 6662 s->seq.len = TRACE_MAX_PRINT;
6663
6664 /*
6665 * More paranoid code. Although the buffer size is set to
6666 * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
6667 * an extra layer of protection.
6668 */
6669 if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
6670 s->seq.len = s->seq.size - 1;
6636 6671
6637 /* should be zero ended, but we are paranoid. */ 6672 /* should be zero ended, but we are paranoid. */
6638 s->buffer[s->len] = 0; 6673 s->buffer[s->seq.len] = 0;
6639 6674
6640 printk(KERN_TRACE "%s", s->buffer); 6675 printk(KERN_TRACE "%s", s->buffer);
6641 6676
@@ -6874,6 +6909,18 @@ out:
6874 return ret; 6909 return ret;
6875} 6910}
6876 6911
6912void __init trace_init(void)
6913{
6914 if (tracepoint_printk) {
6915 tracepoint_print_iter =
6916 kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
6917 if (WARN_ON(!tracepoint_print_iter))
6918 tracepoint_printk = 0;
6919 }
6920 tracer_alloc_buffers();
6921 trace_event_init();
6922}
6923
6877__init static int clear_boot_tracer(void) 6924__init static int clear_boot_tracer(void)
6878{ 6925{
6879 /* 6926 /*
@@ -6893,6 +6940,5 @@ __init static int clear_boot_tracer(void)
6893 return 0; 6940 return 0;
6894} 6941}
6895 6942
6896early_initcall(tracer_alloc_buffers);
6897fs_initcall(tracer_init_debugfs); 6943fs_initcall(tracer_init_debugfs);
6898late_initcall(clear_boot_tracer); 6944late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 385391fb1d3b..8de48bac1ce2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -14,6 +14,7 @@
14#include <linux/trace_seq.h> 14#include <linux/trace_seq.h>
15#include <linux/ftrace_event.h> 15#include <linux/ftrace_event.h>
16#include <linux/compiler.h> 16#include <linux/compiler.h>
17#include <linux/trace_seq.h>
17 18
18#ifdef CONFIG_FTRACE_SYSCALLS 19#ifdef CONFIG_FTRACE_SYSCALLS
19#include <asm/unistd.h> /* For NR_SYSCALLS */ 20#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
569 570
570void tracing_iter_reset(struct trace_iterator *iter, int cpu); 571void tracing_iter_reset(struct trace_iterator *iter, int cpu);
571 572
572void tracing_sched_switch_trace(struct trace_array *tr,
573 struct task_struct *prev,
574 struct task_struct *next,
575 unsigned long flags, int pc);
576
577void tracing_sched_wakeup_trace(struct trace_array *tr,
578 struct task_struct *wakee,
579 struct task_struct *cur,
580 unsigned long flags, int pc);
581void trace_function(struct trace_array *tr, 573void trace_function(struct trace_array *tr,
582 unsigned long ip, 574 unsigned long ip,
583 unsigned long parent_ip, 575 unsigned long parent_ip,
@@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr);
597 589
598void tracing_start_cmdline_record(void); 590void tracing_start_cmdline_record(void);
599void tracing_stop_cmdline_record(void); 591void tracing_stop_cmdline_record(void);
600void tracing_sched_switch_assign_trace(struct trace_array *tr);
601void tracing_stop_sched_switch_record(void);
602void tracing_start_sched_switch_record(void);
603int register_tracer(struct tracer *type); 592int register_tracer(struct tracer *type);
604int is_tracing_stopped(void); 593int is_tracing_stopped(void);
605 594
@@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
719 708
720extern unsigned long trace_flags; 709extern unsigned long trace_flags;
721 710
711extern char trace_find_mark(unsigned long long duration);
712
722/* Standard output formatting function used for function return traces */ 713/* Standard output formatting function used for function return traces */
723#ifdef CONFIG_FUNCTION_GRAPH_TRACER 714#ifdef CONFIG_FUNCTION_GRAPH_TRACER
724 715
@@ -737,7 +728,7 @@ extern unsigned long trace_flags;
737extern enum print_line_t 728extern enum print_line_t
738print_graph_function_flags(struct trace_iterator *iter, u32 flags); 729print_graph_function_flags(struct trace_iterator *iter, u32 flags);
739extern void print_graph_headers_flags(struct seq_file *s, u32 flags); 730extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
740extern enum print_line_t 731extern void
741trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 732trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
742extern void graph_trace_open(struct trace_iterator *iter); 733extern void graph_trace_open(struct trace_iterator *iter);
743extern void graph_trace_close(struct trace_iterator *iter); 734extern void graph_trace_close(struct trace_iterator *iter);
@@ -1310,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call,
1310#define perf_ftrace_event_register NULL 1301#define perf_ftrace_event_register NULL
1311#endif 1302#endif
1312 1303
1304#ifdef CONFIG_FTRACE_SYSCALLS
1305void init_ftrace_syscalls(void);
1306#else
1307static inline void init_ftrace_syscalls(void) { }
1308#endif
1309
1310#ifdef CONFIG_EVENT_TRACING
1311void trace_event_init(void);
1312#else
1313static inline void __init trace_event_init(void) { }
1314#endif
1315
1316extern struct trace_iterator *tracepoint_print_iter;
1317
1313#endif /* _LINUX_KERNEL_TRACE_H */ 1318#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 697fb9bac8f0..7d6e2afde669 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
151 151
152 trace_assign_type(field, iter->ent); 152 trace_assign_type(field, iter->ent);
153 153
154 if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", 154 trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
155 field->correct ? " ok " : " MISS ", 155 field->correct ? " ok " : " MISS ",
156 field->func, 156 field->func,
157 field->file, 157 field->file,
158 field->line)) 158 field->line);
159 return TRACE_TYPE_PARTIAL_LINE; 159
160 160 return trace_handle_return(&iter->seq);
161 return TRACE_TYPE_HANDLED;
162} 161}
163 162
164static void branch_print_header(struct seq_file *s) 163static void branch_print_header(struct seq_file *s)
165{ 164{
166 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" 165 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
167 " FUNC:FILE:LINE\n"); 166 " FUNC:FILE:LINE\n"
168 seq_puts(s, "# | | | | | " 167 "# | | | | | "
169 " |\n"); 168 " |\n");
170} 169}
171 170
172static struct trace_event_functions trace_branch_funcs = { 171static struct trace_event_functions trace_branch_funcs = {
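
This hunk shows the conversion pattern used throughout the series: the trace_seq_*() helpers no longer return success, the sequence records overflow internally, and a single trace_handle_return() at the end reports TRACE_TYPE_PARTIAL_LINE if anything overflowed. Below is a rough userspace model of that check-once-at-the-end style; myseq, seq_printf_m() and handle_return() are invented stand-ins for the trace_seq API, not the kernel interface itself:

```c
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

enum print_line_t { LINE_PARTIAL, LINE_HANDLED };

struct myseq {
	char   buf[32];
	size_t len;
	bool   overflowed;
};

/* Append formatted text; remember overflow instead of returning it. */
static void seq_printf_m(struct myseq *s, const char *fmt, ...)
{
	va_list ap;
	int n;

	if (s->overflowed)
		return;

	va_start(ap, fmt);
	n = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
	va_end(ap);

	if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len)
		s->overflowed = true;	/* the caller checks this once, later */
	else
		s->len += n;
}

/* One check at the end replaces a check after every append. */
static enum print_line_t handle_return(struct myseq *s)
{
	return s->overflowed ? LINE_PARTIAL : LINE_HANDLED;
}

int main(void)
{
	struct myseq s = { .len = 0, .overflowed = false };

	seq_printf_m(&s, "[%s] ", " ok ");
	seq_printf_m(&s, "%s:%d\n", "somefile.c", 42);

	printf("%s-> %s\n", s.buf,
	       handle_return(&s) == LINE_HANDLED ? "handled" : "partial");
	return 0;
}
```
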
@@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[];
233 232
234static int annotated_branch_stat_headers(struct seq_file *m) 233static int annotated_branch_stat_headers(struct seq_file *m)
235{ 234{
236 seq_printf(m, " correct incorrect %% "); 235 seq_puts(m, " correct incorrect % "
237 seq_printf(m, " Function " 236 " Function "
238 " File Line\n" 237 " File Line\n"
239 " ------- --------- - " 238 " ------- --------- - "
240 " -------- " 239 " -------- "
241 " ---- ----\n"); 240 " ---- ----\n");
242 return 0; 241 return 0;
243} 242}
244 243
@@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
274 273
275 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); 274 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
276 if (percent < 0) 275 if (percent < 0)
277 seq_printf(m, " X "); 276 seq_puts(m, " X ");
278 else 277 else
279 seq_printf(m, "%3ld ", percent); 278 seq_printf(m, "%3ld ", percent);
280 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); 279 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
@@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[];
362 361
363static int all_branch_stat_headers(struct seq_file *m) 362static int all_branch_stat_headers(struct seq_file *m)
364{ 363{
365 seq_printf(m, " miss hit %% "); 364 seq_puts(m, " miss hit % "
366 seq_printf(m, " Function " 365 " Function "
367 " File Line\n" 366 " File Line\n"
368 " ------- --------- - " 367 " ------- --------- - "
369 " -------- " 368 " -------- "
370 " ---- ----\n"); 369 " ---- ----\n");
371 return 0; 370 return 0;
372} 371}
373 372
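
Most of the changes above are mechanical seq_printf(m, "literal") -> seq_puts(m, "literal") conversions: with no arguments to format there is no reason to parse a format string, and as the "%%" -> "%" change shows, the literal no longer needs printf escaping. The same distinction in plain stdio, offered only as an analogy:

```c
#include <stdio.h>

int main(void)
{
	/* With a format string, '%' must be escaped as "%%". */
	printf(" correct incorrect %% \n");

	/* With a plain "put string" call, '%' is just a character. */
	fputs(" correct incorrect % \n", stdout);

	/* Formatting is still the right tool when there are arguments. */
	printf("%8lu %8lu\n", 3UL, 14UL);
	return 0;
}
```
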
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4b9c114ee9de..6fa484de2ba1 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags)
261} 261}
262 262
263void *perf_trace_buf_prepare(int size, unsigned short type, 263void *perf_trace_buf_prepare(int size, unsigned short type,
264 struct pt_regs *regs, int *rctxp) 264 struct pt_regs **regs, int *rctxp)
265{ 265{
266 struct trace_entry *entry; 266 struct trace_entry *entry;
267 unsigned long flags; 267 unsigned long flags;
@@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
280 if (*rctxp < 0) 280 if (*rctxp < 0)
281 return NULL; 281 return NULL;
282 282
283 if (regs)
284 *regs = this_cpu_ptr(&__perf_regs[*rctxp]);
283 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); 285 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
284 286
285 /* zero the dead bytes from align to not leak stack to user */ 287 /* zero the dead bytes from align to not leak stack to user */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0cc51edde3a8..b03a0ea77b99 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
212} 212}
213EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); 213EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
214 214
215static DEFINE_SPINLOCK(tracepoint_iter_lock);
216
217static void output_printk(struct ftrace_event_buffer *fbuffer)
218{
219 struct ftrace_event_call *event_call;
220 struct trace_event *event;
221 unsigned long flags;
222 struct trace_iterator *iter = tracepoint_print_iter;
223
224 if (!iter)
225 return;
226
227 event_call = fbuffer->ftrace_file->event_call;
228 if (!event_call || !event_call->event.funcs ||
229 !event_call->event.funcs->trace)
230 return;
231
232 event = &fbuffer->ftrace_file->event_call->event;
233
234 spin_lock_irqsave(&tracepoint_iter_lock, flags);
235 trace_seq_init(&iter->seq);
236 iter->ent = fbuffer->entry;
237 event_call->event.funcs->trace(iter, 0, event);
238 trace_seq_putc(&iter->seq, 0);
239 printk("%s", iter->seq.buffer);
240
241 spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
242}
243
215void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) 244void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
216{ 245{
246 if (tracepoint_printk)
247 output_printk(fbuffer);
248
217 event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, 249 event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
218 fbuffer->event, fbuffer->entry, 250 fbuffer->event, fbuffer->entry,
219 fbuffer->flags, fbuffer->pc); 251 fbuffer->flags, fbuffer->pc);
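
output_printk() formats every event through one shared, boot-allocated iterator, so both the formatting and the printk() happen under tracepoint_iter_lock with interrupts disabled; otherwise concurrent events would interleave in the shared seq buffer. A small userspace analogue of format-into-shared-storage-under-a-lock-then-emit, with a pthread mutex standing in for the irq-safe spinlock (build with -pthread; names are illustrative):

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t iter_lock = PTHREAD_MUTEX_INITIALIZER;
static char shared_buf[128];	/* plays the role of the shared iterator */

static void output_event(const char *name, int value)
{
	pthread_mutex_lock(&iter_lock);

	/* "init the seq", format the event, emit it -- all under the lock */
	snprintf(shared_buf, sizeof(shared_buf), "%s: value=%d\n", name, value);
	fputs(shared_buf, stdout);

	pthread_mutex_unlock(&iter_lock);
}

static void *worker(void *arg)
{
	for (int i = 0; i < 3; i++)
		output_event((const char *)arg, i);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, worker, "event_a");
	pthread_create(&b, NULL, worker, "event_b");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}
```
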
@@ -461,7 +493,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
461 493
462 if (dir) { 494 if (dir) {
463 spin_lock(&dir->d_lock); /* probably unneeded */ 495 spin_lock(&dir->d_lock); /* probably unneeded */
464 list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { 496 list_for_each_entry(child, &dir->d_subdirs, d_child) {
465 if (child->d_inode) /* probably unneeded */ 497 if (child->d_inode) /* probably unneeded */
466 child->d_inode->i_private = NULL; 498 child->d_inode->i_private = NULL;
467 } 499 }
@@ -918,7 +950,7 @@ static int f_show(struct seq_file *m, void *v)
918 case FORMAT_HEADER: 950 case FORMAT_HEADER:
919 seq_printf(m, "name: %s\n", ftrace_event_name(call)); 951 seq_printf(m, "name: %s\n", ftrace_event_name(call));
920 seq_printf(m, "ID: %d\n", call->event.type); 952 seq_printf(m, "ID: %d\n", call->event.type);
921 seq_printf(m, "format:\n"); 953 seq_puts(m, "format:\n");
922 return 0; 954 return 0;
923 955
924 case FORMAT_FIELD_SEPERATOR: 956 case FORMAT_FIELD_SEPERATOR:
@@ -1044,7 +1076,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1044 mutex_unlock(&event_mutex); 1076 mutex_unlock(&event_mutex);
1045 1077
1046 if (file) 1078 if (file)
1047 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1079 r = simple_read_from_buffer(ubuf, cnt, ppos,
1080 s->buffer, trace_seq_used(s));
1048 1081
1049 kfree(s); 1082 kfree(s);
1050 1083
@@ -1210,7 +1243,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1210 trace_seq_init(s); 1243 trace_seq_init(s);
1211 1244
1212 print_subsystem_event_filter(system, s); 1245 print_subsystem_event_filter(system, s);
1213 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1246 r = simple_read_from_buffer(ubuf, cnt, ppos,
1247 s->buffer, trace_seq_used(s));
1214 1248
1215 kfree(s); 1249 kfree(s);
1216 1250
@@ -1265,7 +1299,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
1265 trace_seq_init(s); 1299 trace_seq_init(s);
1266 1300
1267 func(s); 1301 func(s);
1268 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1302 r = simple_read_from_buffer(ubuf, cnt, ppos,
1303 s->buffer, trace_seq_used(s));
1269 1304
1270 kfree(s); 1305 kfree(s);
1271 1306
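
These reads now copy trace_seq_used(s) bytes instead of s->len: the accessor caps the reported length at the buffer capacity, so a seq buffer that overflowed cannot make the copy run past its storage. A tiny model of such a capped accessor (struct and names invented for illustration):

```c
#include <stdio.h>
#include <string.h>

struct seq_buf_model {
	char   buffer[16];
	size_t size;	/* capacity */
	size_t len;	/* may exceed size after an overflowing write */
};

/* Report how many bytes are actually safe to copy out. */
static size_t seq_used(const struct seq_buf_model *s)
{
	return s->len > s->size ? s->size : s->len;
}

int main(void)
{
	struct seq_buf_model s = { .size = sizeof(s.buffer) };

	/* Pretend a formatting helper overran and recorded a too-big len. */
	memset(s.buffer, 'A', sizeof(s.buffer));
	s.len = 100;

	printf("len=%zu but only %zu bytes are copied out\n",
	       s.len, seq_used(&s));
	return 0;
}
```
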
@@ -1988,7 +2023,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
1988 ftrace_event_name(data->file->event_call)); 2023 ftrace_event_name(data->file->event_call));
1989 2024
1990 if (data->count == -1) 2025 if (data->count == -1)
1991 seq_printf(m, ":unlimited\n"); 2026 seq_puts(m, ":unlimited\n");
1992 else 2027 else
1993 seq_printf(m, ":count=%ld\n", data->count); 2028 seq_printf(m, ":count=%ld\n", data->count);
1994 2029
@@ -2394,12 +2429,39 @@ static __init int event_trace_memsetup(void)
2394 return 0; 2429 return 0;
2395} 2430}
2396 2431
2432static __init void
2433early_enable_events(struct trace_array *tr, bool disable_first)
2434{
2435 char *buf = bootup_event_buf;
2436 char *token;
2437 int ret;
2438
2439 while (true) {
2440 token = strsep(&buf, ",");
2441
2442 if (!token)
2443 break;
2444 if (!*token)
2445 continue;
2446
2447 /* Restarting syscalls requires that we stop them first */
2448 if (disable_first)
2449 ftrace_set_clr_event(tr, token, 0);
2450
2451 ret = ftrace_set_clr_event(tr, token, 1);
2452 if (ret)
2453 pr_warn("Failed to enable trace event: %s\n", token);
2454
2455 /* Put back the comma to allow this to be called again */
2456 if (buf)
2457 *(buf - 1) = ',';
2458 }
2459}
2460
2397static __init int event_trace_enable(void) 2461static __init int event_trace_enable(void)
2398{ 2462{
2399 struct trace_array *tr = top_trace_array(); 2463 struct trace_array *tr = top_trace_array();
2400 struct ftrace_event_call **iter, *call; 2464 struct ftrace_event_call **iter, *call;
2401 char *buf = bootup_event_buf;
2402 char *token;
2403 int ret; 2465 int ret;
2404 2466
2405 if (!tr) 2467 if (!tr)
@@ -2421,18 +2483,7 @@ static __init int event_trace_enable(void)
2421 */ 2483 */
2422 __trace_early_add_events(tr); 2484 __trace_early_add_events(tr);
2423 2485
2424 while (true) { 2486 early_enable_events(tr, false);
2425 token = strsep(&buf, ",");
2426
2427 if (!token)
2428 break;
2429 if (!*token)
2430 continue;
2431
2432 ret = ftrace_set_clr_event(tr, token, 1);
2433 if (ret)
2434 pr_warn("Failed to enable trace event: %s\n", token);
2435 }
2436 2487
2437 trace_printk_start_comm(); 2488 trace_printk_start_comm();
2438 2489
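
early_enable_events() walks the boot-time event list destructively with strsep(), then writes each ',' back so the very same buffer can be re-parsed later by event_trace_enable_again(). A standalone sketch of that restore-the-separator trick; enable_one() and the event names are placeholders:

```c
#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

static void enable_one(const char *name, int enable)
{
	printf("%s %s\n", enable ? "enable " : "disable", name);
}

/* Parse a comma-separated list in place, then undo the damage. */
static void enable_events(char *list, int disable_first)
{
	char *buf = list;
	char *token;

	while (1) {
		token = strsep(&buf, ",");
		if (!token)
			break;
		if (!*token)
			continue;

		if (disable_first)
			enable_one(token, 0);
		enable_one(token, 1);

		/* strsep() wrote a '\0' here; put the ',' back so the
		 * buffer can be handed to this function a second time. */
		if (buf)
			*(buf - 1) = ',';
	}
}

int main(void)
{
	char cmdline[] = "sched:sched_switch,irq:irq_handler_entry";

	enable_events(cmdline, 0);	/* first pass, early in boot */
	enable_events(cmdline, 1);	/* second pass still sees the full list */
	return 0;
}
```
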
@@ -2443,6 +2494,31 @@ static __init int event_trace_enable(void)
2443 return 0; 2494 return 0;
2444} 2495}
2445 2496
2497/*
2498 * event_trace_enable() is called from trace_event_init() first to
2499 * initialize events and perhaps start any events that are on the
2500 * command line. Unfortunately, there are some events that will not
2501 * start this early, like the system call tracepoints that need
2502 * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable()
2503 * is called before pid 1 starts, and this flag is never set, making
2504 * the syscall tracepoint never get reached, but the event is enabled
2505 * regardless (and not doing anything).
2506 */
2507static __init int event_trace_enable_again(void)
2508{
2509 struct trace_array *tr;
2510
2511 tr = top_trace_array();
2512 if (!tr)
2513 return -ENODEV;
2514
2515 early_enable_events(tr, true);
2516
2517 return 0;
2518}
2519
2520early_initcall(event_trace_enable_again);
2521
2446static __init int event_trace_init(void) 2522static __init int event_trace_init(void)
2447{ 2523{
2448 struct trace_array *tr; 2524 struct trace_array *tr;
@@ -2477,8 +2553,14 @@ static __init int event_trace_init(void)
2477#endif 2553#endif
2478 return 0; 2554 return 0;
2479} 2555}
2480early_initcall(event_trace_memsetup); 2556
2481core_initcall(event_trace_enable); 2557void __init trace_event_init(void)
2558{
2559 event_trace_memsetup();
2560 init_ftrace_syscalls();
2561 event_trace_enable();
2562}
2563
2482fs_initcall(event_trace_init); 2564fs_initcall(event_trace_init);
2483 2565
2484#ifdef CONFIG_FTRACE_STARTUP_TEST 2566#ifdef CONFIG_FTRACE_STARTUP_TEST
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7a8c1528e141..ced69da0ff55 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -45,6 +45,7 @@ enum filter_op_ids
45 OP_GT, 45 OP_GT,
46 OP_GE, 46 OP_GE,
47 OP_BAND, 47 OP_BAND,
48 OP_NOT,
48 OP_NONE, 49 OP_NONE,
49 OP_OPEN_PAREN, 50 OP_OPEN_PAREN,
50}; 51};
@@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = {
67 { OP_GT, ">", 5 }, 68 { OP_GT, ">", 5 },
68 { OP_GE, ">=", 5 }, 69 { OP_GE, ">=", 5 },
69 { OP_BAND, "&", 6 }, 70 { OP_BAND, "&", 6 },
71 { OP_NOT, "!", 6 },
70 { OP_NONE, "OP_NONE", 0 }, 72 { OP_NONE, "OP_NONE", 0 },
71 { OP_OPEN_PAREN, "(", 0 }, 73 { OP_OPEN_PAREN, "(", 0 },
72}; 74};
@@ -85,6 +87,7 @@ enum {
85 FILT_ERR_MISSING_FIELD, 87 FILT_ERR_MISSING_FIELD,
86 FILT_ERR_INVALID_FILTER, 88 FILT_ERR_INVALID_FILTER,
87 FILT_ERR_IP_FIELD_ONLY, 89 FILT_ERR_IP_FIELD_ONLY,
90 FILT_ERR_ILLEGAL_NOT_OP,
88}; 91};
89 92
90static char *err_text[] = { 93static char *err_text[] = {
@@ -101,6 +104,7 @@ static char *err_text[] = {
101 "Missing field name and/or value", 104 "Missing field name and/or value",
102 "Meaningless filter expression", 105 "Meaningless filter expression",
103 "Only 'ip' field is supported for function trace", 106 "Only 'ip' field is supported for function trace",
107 "Illegal use of '!'",
104}; 108};
105 109
106struct opstack_op { 110struct opstack_op {
@@ -139,6 +143,7 @@ struct pred_stack {
139 int index; 143 int index;
140}; 144};
141 145
146/* If not of not match is equal to not of not, then it is a match */
142#define DEFINE_COMPARISON_PRED(type) \ 147#define DEFINE_COMPARISON_PRED(type) \
143static int filter_pred_##type(struct filter_pred *pred, void *event) \ 148static int filter_pred_##type(struct filter_pred *pred, void *event) \
144{ \ 149{ \
@@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
166 break; \ 171 break; \
167 } \ 172 } \
168 \ 173 \
169 return match; \ 174 return !!match == !pred->not; \
170} 175}
171 176
172#define DEFINE_EQUALITY_PRED(size) \ 177#define DEFINE_EQUALITY_PRED(size) \
@@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds,
484 if (!WARN_ON_ONCE(!pred->fn)) 489 if (!WARN_ON_ONCE(!pred->fn))
485 match = pred->fn(pred, rec); 490 match = pred->fn(pred, rec);
486 if (!!match == type) 491 if (!!match == type)
487 return match; 492 break;
488 } 493 }
489 return match; 494 /* If not of not match is equal to not of not, then it is a match */
495 return !!match == !op->not;
490} 496}
491 497
492struct filter_match_preds_data { 498struct filter_match_preds_data {
@@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter,
735 * then this op can be folded. 741 * then this op can be folded.
736 */ 742 */
737 if (left->index & FILTER_PRED_FOLD && 743 if (left->index & FILTER_PRED_FOLD &&
738 (left->op == dest->op || 744 ((left->op == dest->op && !left->not) ||
739 left->left == FILTER_PRED_INVALID) && 745 left->left == FILTER_PRED_INVALID) &&
740 right->index & FILTER_PRED_FOLD && 746 right->index & FILTER_PRED_FOLD &&
741 (right->op == dest->op || 747 ((right->op == dest->op && !right->not) ||
742 right->left == FILTER_PRED_INVALID)) 748 right->left == FILTER_PRED_INVALID))
743 dest->index |= FILTER_PRED_FOLD; 749 dest->index |= FILTER_PRED_FOLD;
744 750
@@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps,
1028 } 1034 }
1029 1035
1030 if (pred->op == OP_NE) 1036 if (pred->op == OP_NE)
1031 pred->not = 1; 1037 pred->not ^= 1;
1032 1038
1033 pred->fn = fn; 1039 pred->fn = fn;
1034 return 0; 1040 return 0;
@@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call,
1590 continue; 1596 continue;
1591 } 1597 }
1592 1598
1599 if (elt->op == OP_NOT) {
1600 if (!n_preds || operand1 || operand2) {
1601 parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
1602 err = -EINVAL;
1603 goto fail;
1604 }
1605 if (!dry_run)
1606 filter->preds[n_preds - 1].not ^= 1;
1607 continue;
1608 }
1609
1593 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { 1610 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1594 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1611 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1595 err = -ENOSPC; 1612 err = -ENOSPC;
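
The new '!' operator only flips the not bit of the predicate it precedes, and the comparison predicates then report a hit with "!!match == !pred->not": normalize the raw comparison to 0/1 and compare it with the inverted flag. A tiny table-driven check of that expression, outside any kernel context:

```c
#include <stdio.h>

/* Mirrors the predicate logic: raw comparison result plus a 'not' flag. */
static int pred_hit(long match, int not)
{
	return !!match == !not;
}

int main(void)
{
	struct { long match; int not; int hit; } cases[] = {
		{ 0, 0, 0 },	/* no match, no '!'   -> miss */
		{ 7, 0, 1 },	/* match,    no '!'   -> hit  */
		{ 0, 1, 1 },	/* no match, with '!' -> hit  */
		{ 7, 1, 0 },	/* match,    with '!' -> miss */
	};

	for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("match=%ld not=%d -> %d (expected %d)\n",
		       cases[i].match, cases[i].not,
		       pred_hit(cases[i].match, cases[i].not), cases[i].hit);
	return 0;
}
```
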
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4747b476a030..8712df9decb4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m,
373{ 373{
374 long count = (long)data; 374 long count = (long)data;
375 375
376 seq_printf(m, "%s", name); 376 seq_puts(m, name);
377 377
378 if (count == -1) 378 if (count == -1)
379 seq_puts(m, ":unlimited"); 379 seq_puts(m, ":unlimited");
@@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m,
383 if (filter_str) 383 if (filter_str)
384 seq_printf(m, " if %s\n", filter_str); 384 seq_printf(m, " if %s\n", filter_str);
385 else 385 else
386 seq_puts(m, "\n"); 386 seq_putc(m, '\n');
387 387
388 return 0; 388 return 0;
389} 389}
@@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
1105 if (data->filter_str) 1105 if (data->filter_str)
1106 seq_printf(m, " if %s\n", data->filter_str); 1106 seq_printf(m, " if %s\n", data->filter_str);
1107 else 1107 else
1108 seq_puts(m, "\n"); 1108 seq_putc(m, '\n');
1109 1109
1110 return 0; 1110 return 0;
1111} 1111}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 57f0ec962d2c..fcd41a166405 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data =
261}; 261};
262 262
263#ifdef CONFIG_DYNAMIC_FTRACE 263#ifdef CONFIG_DYNAMIC_FTRACE
264static int update_count(void **data) 264static void update_traceon_count(void **data, bool on)
265{ 265{
266 unsigned long *count = (long *)data; 266 long *count = (long *)data;
267 long old_count = *count;
267 268
268 if (!*count) 269 /*
269 return 0; 270 * Tracing gets disabled (or enabled) once per count.
271 * This function can be called at the same time on multiple CPUs.
272 * It is fine if both disable (or enable) tracing, as disabling
273 * (or enabling) the second time doesn't do anything as the
274 * state of the tracer is already disabled (or enabled).
275 * What needs to be synchronized in this case is that the count
276 * only gets decremented once, even if the tracer is disabled
277 * (or enabled) twice, as the second one is really a nop.
278 *
279 * The memory barriers guarantee that we only decrement the
280 * counter once. First the count is read to a local variable
281 * and a read barrier is used to make sure that it is loaded
282 * before checking if the tracer is in the state we want.
283 * If the tracer is not in the state we want, then the count
284 * is guaranteed to be the old count.
285 *
286 * Next the tracer is set to the state we want (disabled or enabled)
287 * then a write memory barrier is used to make sure that
288 * the new state is visible before changing the counter by
289 * one minus the old counter. This guarantees that another CPU
290 * executing this code will see the new state before seeing
291 * the new counter value, and would not do anything if the new
292 * counter is seen.
293 *
294 * Note, there is no synchronization between this and a user
295 * setting the tracing_on file. But we currently don't care
296 * about that.
297 */
298 if (!old_count)
299 return;
270 300
271 if (*count != -1) 301 /* Make sure we see count before checking tracing state */
272 (*count)--; 302 smp_rmb();
273 303
274 return 1; 304 if (on == !!tracing_is_on())
305 return;
306
307 if (on)
308 tracing_on();
309 else
310 tracing_off();
311
312 /* unlimited? */
313 if (old_count == -1)
314 return;
315
316 /* Make sure tracing state is visible before updating count */
317 smp_wmb();
318
319 *count = old_count - 1;
275} 320}
276 321
277static void 322static void
278ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) 323ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
279{ 324{
280 if (tracing_is_on()) 325 update_traceon_count(data, 1);
281 return;
282
283 if (update_count(data))
284 tracing_on();
285} 326}
286 327
287static void 328static void
288ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) 329ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
289{ 330{
290 if (!tracing_is_on()) 331 update_traceon_count(data, 0);
291 return;
292
293 if (update_count(data))
294 tracing_off();
295} 332}
296 333
297static void 334static void
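
The comment block above carries the whole argument: read the count, issue a read barrier before checking whether tracing is already in the requested state, let only the CPU that actually flips the state decrement the count, and use a write barrier so the new state is visible before the new count. The sketch below models that shape with C11 atomics and fences; it is an illustration of the ordering argument under those assumptions, not a drop-in or verified replacement, and tracing_state is an invented stand-in for tracing_is_on()/tracing_on()/tracing_off():

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long tracing_state = 1;	/* 1 = on, 0 = off (stand-in) */

/* Toggle tracing at most 'count' times, even if several CPUs race here. */
static void update_traceon_count(atomic_long *count, bool on)
{
	long old_count = atomic_load_explicit(count, memory_order_relaxed);

	if (!old_count)
		return;

	/* Make sure we see the count before checking the tracing state. */
	atomic_thread_fence(memory_order_seq_cst);	/* plays smp_rmb() */

	if (on == !!atomic_load_explicit(&tracing_state, memory_order_relaxed))
		return;			/* someone already switched it */

	atomic_store_explicit(&tracing_state, on, memory_order_relaxed);

	if (old_count == -1)		/* unlimited */
		return;

	/* Make the new state visible before publishing the new count. */
	atomic_thread_fence(memory_order_seq_cst);	/* plays smp_wmb() */

	atomic_store_explicit(count, old_count - 1, memory_order_relaxed);
}

int main(void)
{
	atomic_long count = 2;

	update_traceon_count(&count, false);	/* turns tracing off, count -> 1 */
	update_traceon_count(&count, false);	/* already off: count untouched  */
	printf("state=%ld count=%ld\n",
	       atomic_load(&tracing_state), atomic_load(&count));
	return 0;
}
```
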
@@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
330static void 367static void
331ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) 368ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
332{ 369{
333 if (!tracing_is_on()) 370 long *count = (long *)data;
334 return; 371 long old_count;
372 long new_count;
335 373
336 if (update_count(data)) 374 /*
337 trace_dump_stack(STACK_SKIP); 375 * Stack traces should only execute the number of times the
376 * user specified in the counter.
377 */
378 do {
379
380 if (!tracing_is_on())
381 return;
382
383 old_count = *count;
384
385 if (!old_count)
386 return;
387
388 /* unlimited? */
389 if (old_count == -1) {
390 trace_dump_stack(STACK_SKIP);
391 return;
392 }
393
394 new_count = old_count - 1;
395 new_count = cmpxchg(count, old_count, new_count);
396 if (new_count == old_count)
397 trace_dump_stack(STACK_SKIP);
398
399 } while (new_count != old_count);
400}
401
402static int update_count(void **data)
403{
404 unsigned long *count = (long *)data;
405
406 if (!*count)
407 return 0;
408
409 if (*count != -1)
410 (*count)--;
411
412 return 1;
338} 413}
339 414
340static void 415static void
@@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m,
361 seq_printf(m, "%ps:%s", (void *)ip, name); 436 seq_printf(m, "%ps:%s", (void *)ip, name);
362 437
363 if (count == -1) 438 if (count == -1)
364 seq_printf(m, ":unlimited\n"); 439 seq_puts(m, ":unlimited\n");
365 else 440 else
366 seq_printf(m, ":count=%ld\n", count); 441 seq_printf(m, ":count=%ld\n", count);
367 442
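
ftrace_stacktrace_count() now decrements its budget with a cmpxchg() loop, so across CPUs the stack is dumped exactly as many times as the counter allows, with -1 still meaning unlimited. A userspace equivalent built on C11 atomic_compare_exchange_strong(); dump() is a placeholder for trace_dump_stack():

```c
#include <stdatomic.h>
#include <stdio.h>

static void dump(void)
{
	puts("---- stack dump ----");	/* stands in for trace_dump_stack() */
}

static void stacktrace_count(atomic_long *count)
{
	long old_count, new_count;

	do {
		old_count = atomic_load(count);

		if (!old_count)
			return;			/* budget exhausted */

		if (old_count == -1) {		/* unlimited */
			dump();
			return;
		}

		new_count = old_count - 1;

		/* Only the caller that wins the decrement gets to dump. */
	} while (!atomic_compare_exchange_strong(count, &old_count, new_count));

	dump();
}

int main(void)
{
	atomic_long budget = 2;

	for (int i = 0; i < 4; i++)
		stacktrace_count(&budget);	/* dumps exactly twice */
	return 0;
}
```
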
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f0a0c982cde3..ba476009e5de 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -107,7 +107,7 @@ enum {
107 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, 107 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
108}; 108};
109 109
110static enum print_line_t 110static void
111print_graph_duration(unsigned long long duration, struct trace_seq *s, 111print_graph_duration(unsigned long long duration, struct trace_seq *s,
112 u32 flags); 112 u32 flags);
113 113
@@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr)
483 483
484static int max_bytes_for_cpu; 484static int max_bytes_for_cpu;
485 485
486static enum print_line_t 486static void print_graph_cpu(struct trace_seq *s, int cpu)
487print_graph_cpu(struct trace_seq *s, int cpu)
488{ 487{
489 int ret;
490
491 /* 488 /*
492 * Start with a space character - to make it stand out 489 * Start with a space character - to make it stand out
493 * to the right a bit when trace output is pasted into 490 * to the right a bit when trace output is pasted into
494 * email: 491 * email:
495 */ 492 */
496 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); 493 trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
497 if (!ret)
498 return TRACE_TYPE_PARTIAL_LINE;
499
500 return TRACE_TYPE_HANDLED;
501} 494}
502 495
503#define TRACE_GRAPH_PROCINFO_LENGTH 14 496#define TRACE_GRAPH_PROCINFO_LENGTH 14
504 497
505static enum print_line_t 498static void print_graph_proc(struct trace_seq *s, pid_t pid)
506print_graph_proc(struct trace_seq *s, pid_t pid)
507{ 499{
508 char comm[TASK_COMM_LEN]; 500 char comm[TASK_COMM_LEN];
509 /* sign + log10(MAX_INT) + '\0' */ 501 /* sign + log10(MAX_INT) + '\0' */
510 char pid_str[11]; 502 char pid_str[11];
511 int spaces = 0; 503 int spaces = 0;
512 int ret;
513 int len; 504 int len;
514 int i; 505 int i;
515 506
@@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
524 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; 515 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
525 516
526 /* First spaces to align center */ 517 /* First spaces to align center */
527 for (i = 0; i < spaces / 2; i++) { 518 for (i = 0; i < spaces / 2; i++)
528 ret = trace_seq_putc(s, ' '); 519 trace_seq_putc(s, ' ');
529 if (!ret)
530 return TRACE_TYPE_PARTIAL_LINE;
531 }
532 520
533 ret = trace_seq_printf(s, "%s-%s", comm, pid_str); 521 trace_seq_printf(s, "%s-%s", comm, pid_str);
534 if (!ret)
535 return TRACE_TYPE_PARTIAL_LINE;
536 522
537 /* Last spaces to align center */ 523 /* Last spaces to align center */
538 for (i = 0; i < spaces - (spaces / 2); i++) { 524 for (i = 0; i < spaces - (spaces / 2); i++)
539 ret = trace_seq_putc(s, ' '); 525 trace_seq_putc(s, ' ');
540 if (!ret)
541 return TRACE_TYPE_PARTIAL_LINE;
542 }
543 return TRACE_TYPE_HANDLED;
544} 526}
545 527
546 528
547static enum print_line_t 529static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
548print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
549{ 530{
550 if (!trace_seq_putc(s, ' ')) 531 trace_seq_putc(s, ' ');
551 return 0; 532 trace_print_lat_fmt(s, entry);
552
553 return trace_print_lat_fmt(s, entry);
554} 533}
555 534
556/* If the pid changed since the last trace, output this event */ 535/* If the pid changed since the last trace, output this event */
557static enum print_line_t 536static void
558verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 537verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
559{ 538{
560 pid_t prev_pid; 539 pid_t prev_pid;
561 pid_t *last_pid; 540 pid_t *last_pid;
562 int ret;
563 541
564 if (!data) 542 if (!data)
565 return TRACE_TYPE_HANDLED; 543 return;
566 544
567 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 545 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
568 546
569 if (*last_pid == pid) 547 if (*last_pid == pid)
570 return TRACE_TYPE_HANDLED; 548 return;
571 549
572 prev_pid = *last_pid; 550 prev_pid = *last_pid;
573 *last_pid = pid; 551 *last_pid = pid;
574 552
575 if (prev_pid == -1) 553 if (prev_pid == -1)
576 return TRACE_TYPE_HANDLED; 554 return;
577/* 555/*
578 * Context-switch trace line: 556 * Context-switch trace line:
579 557
@@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
582 ------------------------------------------ 560 ------------------------------------------
583 561
584 */ 562 */
585 ret = trace_seq_puts(s, 563 trace_seq_puts(s, " ------------------------------------------\n");
586 " ------------------------------------------\n"); 564 print_graph_cpu(s, cpu);
587 if (!ret) 565 print_graph_proc(s, prev_pid);
588 return TRACE_TYPE_PARTIAL_LINE; 566 trace_seq_puts(s, " => ");
589 567 print_graph_proc(s, pid);
590 ret = print_graph_cpu(s, cpu); 568 trace_seq_puts(s, "\n ------------------------------------------\n\n");
591 if (ret == TRACE_TYPE_PARTIAL_LINE)
592 return TRACE_TYPE_PARTIAL_LINE;
593
594 ret = print_graph_proc(s, prev_pid);
595 if (ret == TRACE_TYPE_PARTIAL_LINE)
596 return TRACE_TYPE_PARTIAL_LINE;
597
598 ret = trace_seq_puts(s, " => ");
599 if (!ret)
600 return TRACE_TYPE_PARTIAL_LINE;
601
602 ret = print_graph_proc(s, pid);
603 if (ret == TRACE_TYPE_PARTIAL_LINE)
604 return TRACE_TYPE_PARTIAL_LINE;
605
606 ret = trace_seq_puts(s,
607 "\n ------------------------------------------\n\n");
608 if (!ret)
609 return TRACE_TYPE_PARTIAL_LINE;
610
611 return TRACE_TYPE_HANDLED;
612} 569}
613 570
614static struct ftrace_graph_ret_entry * 571static struct ftrace_graph_ret_entry *
@@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter,
682 return next; 639 return next;
683} 640}
684 641
685static int print_graph_abs_time(u64 t, struct trace_seq *s) 642static void print_graph_abs_time(u64 t, struct trace_seq *s)
686{ 643{
687 unsigned long usecs_rem; 644 unsigned long usecs_rem;
688 645
689 usecs_rem = do_div(t, NSEC_PER_SEC); 646 usecs_rem = do_div(t, NSEC_PER_SEC);
690 usecs_rem /= 1000; 647 usecs_rem /= 1000;
691 648
692 return trace_seq_printf(s, "%5lu.%06lu | ", 649 trace_seq_printf(s, "%5lu.%06lu | ",
693 (unsigned long)t, usecs_rem); 650 (unsigned long)t, usecs_rem);
694} 651}
695 652
696static enum print_line_t 653static void
697print_graph_irq(struct trace_iterator *iter, unsigned long addr, 654print_graph_irq(struct trace_iterator *iter, unsigned long addr,
698 enum trace_type type, int cpu, pid_t pid, u32 flags) 655 enum trace_type type, int cpu, pid_t pid, u32 flags)
699{ 656{
700 int ret;
701 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
658 struct trace_entry *ent = iter->ent;
702 659
703 if (addr < (unsigned long)__irqentry_text_start || 660 if (addr < (unsigned long)__irqentry_text_start ||
704 addr >= (unsigned long)__irqentry_text_end) 661 addr >= (unsigned long)__irqentry_text_end)
705 return TRACE_TYPE_UNHANDLED; 662 return;
706 663
707 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 664 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
708 /* Absolute time */ 665 /* Absolute time */
709 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 666 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
710 ret = print_graph_abs_time(iter->ts, s); 667 print_graph_abs_time(iter->ts, s);
711 if (!ret)
712 return TRACE_TYPE_PARTIAL_LINE;
713 }
714 668
715 /* Cpu */ 669 /* Cpu */
716 if (flags & TRACE_GRAPH_PRINT_CPU) { 670 if (flags & TRACE_GRAPH_PRINT_CPU)
717 ret = print_graph_cpu(s, cpu); 671 print_graph_cpu(s, cpu);
718 if (ret == TRACE_TYPE_PARTIAL_LINE)
719 return TRACE_TYPE_PARTIAL_LINE;
720 }
721 672
722 /* Proc */ 673 /* Proc */
723 if (flags & TRACE_GRAPH_PRINT_PROC) { 674 if (flags & TRACE_GRAPH_PRINT_PROC) {
724 ret = print_graph_proc(s, pid); 675 print_graph_proc(s, pid);
725 if (ret == TRACE_TYPE_PARTIAL_LINE) 676 trace_seq_puts(s, " | ");
726 return TRACE_TYPE_PARTIAL_LINE;
727 ret = trace_seq_puts(s, " | ");
728 if (!ret)
729 return TRACE_TYPE_PARTIAL_LINE;
730 } 677 }
678
679 /* Latency format */
680 if (trace_flags & TRACE_ITER_LATENCY_FMT)
681 print_graph_lat_fmt(s, ent);
731 } 682 }
732 683
733 /* No overhead */ 684 /* No overhead */
734 ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); 685 print_graph_duration(0, s, flags | FLAGS_FILL_START);
735 if (ret != TRACE_TYPE_HANDLED)
736 return ret;
737 686
738 if (type == TRACE_GRAPH_ENT) 687 if (type == TRACE_GRAPH_ENT)
739 ret = trace_seq_puts(s, "==========>"); 688 trace_seq_puts(s, "==========>");
740 else 689 else
741 ret = trace_seq_puts(s, "<=========="); 690 trace_seq_puts(s, "<==========");
742
743 if (!ret)
744 return TRACE_TYPE_PARTIAL_LINE;
745
746 ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
747 if (ret != TRACE_TYPE_HANDLED)
748 return ret;
749
750 ret = trace_seq_putc(s, '\n');
751 691
752 if (!ret) 692 print_graph_duration(0, s, flags | FLAGS_FILL_END);
753 return TRACE_TYPE_PARTIAL_LINE; 693 trace_seq_putc(s, '\n');
754 return TRACE_TYPE_HANDLED;
755} 694}
756 695
757enum print_line_t 696void
758trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) 697trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
759{ 698{
760 unsigned long nsecs_rem = do_div(duration, 1000); 699 unsigned long nsecs_rem = do_div(duration, 1000);
761 /* log10(ULONG_MAX) + '\0' */ 700 /* log10(ULONG_MAX) + '\0' */
762 char msecs_str[21]; 701 char usecs_str[21];
763 char nsecs_str[5]; 702 char nsecs_str[5];
764 int ret, len; 703 int len;
765 int i; 704 int i;
766 705
767 sprintf(msecs_str, "%lu", (unsigned long) duration); 706 sprintf(usecs_str, "%lu", (unsigned long) duration);
768 707
769 /* Print msecs */ 708 /* Print msecs */
770 ret = trace_seq_printf(s, "%s", msecs_str); 709 trace_seq_printf(s, "%s", usecs_str);
771 if (!ret)
772 return TRACE_TYPE_PARTIAL_LINE;
773 710
774 len = strlen(msecs_str); 711 len = strlen(usecs_str);
775 712
776 /* Print nsecs (we don't want to exceed 7 numbers) */ 713 /* Print nsecs (we don't want to exceed 7 numbers) */
777 if (len < 7) { 714 if (len < 7) {
778 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); 715 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
779 716
780 snprintf(nsecs_str, slen, "%03lu", nsecs_rem); 717 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
781 ret = trace_seq_printf(s, ".%s", nsecs_str); 718 trace_seq_printf(s, ".%s", nsecs_str);
782 if (!ret)
783 return TRACE_TYPE_PARTIAL_LINE;
784 len += strlen(nsecs_str); 719 len += strlen(nsecs_str);
785 } 720 }
786 721
787 ret = trace_seq_puts(s, " us "); 722 trace_seq_puts(s, " us ");
788 if (!ret)
789 return TRACE_TYPE_PARTIAL_LINE;
790 723
791 /* Print remaining spaces to fit the row's width */ 724 /* Print remaining spaces to fit the row's width */
792 for (i = len; i < 7; i++) { 725 for (i = len; i < 7; i++)
793 ret = trace_seq_putc(s, ' '); 726 trace_seq_putc(s, ' ');
794 if (!ret)
795 return TRACE_TYPE_PARTIAL_LINE;
796 }
797 return TRACE_TYPE_HANDLED;
798} 727}
799 728
800static enum print_line_t 729static void
801print_graph_duration(unsigned long long duration, struct trace_seq *s, 730print_graph_duration(unsigned long long duration, struct trace_seq *s,
802 u32 flags) 731 u32 flags)
803{ 732{
804 int ret = -1;
805
806 if (!(flags & TRACE_GRAPH_PRINT_DURATION) || 733 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
807 !(trace_flags & TRACE_ITER_CONTEXT_INFO)) 734 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
808 return TRACE_TYPE_HANDLED; 735 return;
809 736
810 /* No real data, just filling the column with spaces */ 737 /* No real data, just filling the column with spaces */
811 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { 738 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
812 case FLAGS_FILL_FULL: 739 case FLAGS_FILL_FULL:
813 ret = trace_seq_puts(s, " | "); 740 trace_seq_puts(s, " | ");
814 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 741 return;
815 case FLAGS_FILL_START: 742 case FLAGS_FILL_START:
816 ret = trace_seq_puts(s, " "); 743 trace_seq_puts(s, " ");
817 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 744 return;
818 case FLAGS_FILL_END: 745 case FLAGS_FILL_END:
819 ret = trace_seq_puts(s, " |"); 746 trace_seq_puts(s, " |");
820 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 747 return;
821 } 748 }
822 749
823 /* Signal an overhead of time execution to the output */ 750 /* Signal an overhead of time execution to the output */
824 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { 751 if (flags & TRACE_GRAPH_PRINT_OVERHEAD)
825 /* Duration exceeded 100 msecs */ 752 trace_seq_printf(s, "%c ", trace_find_mark(duration));
826 if (duration > 100000ULL) 753 else
827 ret = trace_seq_puts(s, "! "); 754 trace_seq_puts(s, " ");
828 /* Duration exceeded 10 msecs */
829 else if (duration > 10000ULL)
830 ret = trace_seq_puts(s, "+ ");
831 }
832
833 /*
834 * The -1 means we either did not exceed the duration tresholds
835 * or we dont want to print out the overhead. Either way we need
836 * to fill out the space.
837 */
838 if (ret == -1)
839 ret = trace_seq_puts(s, " ");
840
841 /* Catching here any failure happenned above */
842 if (!ret)
843 return TRACE_TYPE_PARTIAL_LINE;
844
845 ret = trace_print_graph_duration(duration, s);
846 if (ret != TRACE_TYPE_HANDLED)
847 return ret;
848
849 ret = trace_seq_puts(s, "| ");
850 if (!ret)
851 return TRACE_TYPE_PARTIAL_LINE;
852 755
853 return TRACE_TYPE_HANDLED; 756 trace_print_graph_duration(duration, s);
757 trace_seq_puts(s, "| ");
854} 758}
855 759
856/* Case of a leaf function on its call entry */ 760/* Case of a leaf function on its call entry */
@@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
864 struct ftrace_graph_ret *graph_ret; 768 struct ftrace_graph_ret *graph_ret;
865 struct ftrace_graph_ent *call; 769 struct ftrace_graph_ent *call;
866 unsigned long long duration; 770 unsigned long long duration;
867 int ret;
868 int i; 771 int i;
869 772
870 graph_ret = &ret_entry->ret; 773 graph_ret = &ret_entry->ret;
@@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
890 } 793 }
891 794
892 /* Overhead and duration */ 795 /* Overhead and duration */
893 ret = print_graph_duration(duration, s, flags); 796 print_graph_duration(duration, s, flags);
894 if (ret == TRACE_TYPE_PARTIAL_LINE)
895 return TRACE_TYPE_PARTIAL_LINE;
896 797
897 /* Function */ 798 /* Function */
898 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 799 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
899 ret = trace_seq_putc(s, ' '); 800 trace_seq_putc(s, ' ');
900 if (!ret)
901 return TRACE_TYPE_PARTIAL_LINE;
902 }
903 801
904 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); 802 trace_seq_printf(s, "%ps();\n", (void *)call->func);
905 if (!ret)
906 return TRACE_TYPE_PARTIAL_LINE;
907 803
908 return TRACE_TYPE_HANDLED; 804 return trace_handle_return(s);
909} 805}
910 806
911static enum print_line_t 807static enum print_line_t
@@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter,
915{ 811{
916 struct ftrace_graph_ent *call = &entry->graph_ent; 812 struct ftrace_graph_ent *call = &entry->graph_ent;
917 struct fgraph_data *data = iter->private; 813 struct fgraph_data *data = iter->private;
918 int ret;
919 int i; 814 int i;
920 815
921 if (data) { 816 if (data) {
@@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
931 } 826 }
932 827
933 /* No time */ 828 /* No time */
934 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); 829 print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
935 if (ret != TRACE_TYPE_HANDLED)
936 return ret;
937 830
938 /* Function */ 831 /* Function */
939 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 832 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
940 ret = trace_seq_putc(s, ' '); 833 trace_seq_putc(s, ' ');
941 if (!ret) 834
942 return TRACE_TYPE_PARTIAL_LINE; 835 trace_seq_printf(s, "%ps() {\n", (void *)call->func);
943 }
944 836
945 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); 837 if (trace_seq_has_overflowed(s))
946 if (!ret)
947 return TRACE_TYPE_PARTIAL_LINE; 838 return TRACE_TYPE_PARTIAL_LINE;
948 839
949 /* 840 /*
@@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter,
953 return TRACE_TYPE_NO_CONSUME; 844 return TRACE_TYPE_NO_CONSUME;
954} 845}
955 846
956static enum print_line_t 847static void
957print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 848print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
958 int type, unsigned long addr, u32 flags) 849 int type, unsigned long addr, u32 flags)
959{ 850{
960 struct fgraph_data *data = iter->private; 851 struct fgraph_data *data = iter->private;
961 struct trace_entry *ent = iter->ent; 852 struct trace_entry *ent = iter->ent;
962 int cpu = iter->cpu; 853 int cpu = iter->cpu;
963 int ret;
964 854
965 /* Pid */ 855 /* Pid */
966 if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) 856 verif_pid(s, ent->pid, cpu, data);
967 return TRACE_TYPE_PARTIAL_LINE;
968 857
969 if (type) { 858 if (type)
970 /* Interrupt */ 859 /* Interrupt */
971 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); 860 print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
972 if (ret == TRACE_TYPE_PARTIAL_LINE)
973 return TRACE_TYPE_PARTIAL_LINE;
974 }
975 861
976 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) 862 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
977 return 0; 863 return;
978 864
979 /* Absolute time */ 865 /* Absolute time */
980 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 866 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
981 ret = print_graph_abs_time(iter->ts, s); 867 print_graph_abs_time(iter->ts, s);
982 if (!ret)
983 return TRACE_TYPE_PARTIAL_LINE;
984 }
985 868
986 /* Cpu */ 869 /* Cpu */
987 if (flags & TRACE_GRAPH_PRINT_CPU) { 870 if (flags & TRACE_GRAPH_PRINT_CPU)
988 ret = print_graph_cpu(s, cpu); 871 print_graph_cpu(s, cpu);
989 if (ret == TRACE_TYPE_PARTIAL_LINE)
990 return TRACE_TYPE_PARTIAL_LINE;
991 }
992 872
993 /* Proc */ 873 /* Proc */
994 if (flags & TRACE_GRAPH_PRINT_PROC) { 874 if (flags & TRACE_GRAPH_PRINT_PROC) {
995 ret = print_graph_proc(s, ent->pid); 875 print_graph_proc(s, ent->pid);
996 if (ret == TRACE_TYPE_PARTIAL_LINE) 876 trace_seq_puts(s, " | ");
997 return TRACE_TYPE_PARTIAL_LINE;
998
999 ret = trace_seq_puts(s, " | ");
1000 if (!ret)
1001 return TRACE_TYPE_PARTIAL_LINE;
1002 } 877 }
1003 878
1004 /* Latency format */ 879 /* Latency format */
1005 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 880 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1006 ret = print_graph_lat_fmt(s, ent); 881 print_graph_lat_fmt(s, ent);
1007 if (ret == TRACE_TYPE_PARTIAL_LINE)
1008 return TRACE_TYPE_PARTIAL_LINE;
1009 }
1010 882
1011 return 0; 883 return;
1012} 884}
1013 885
1014/* 886/*
@@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
1126 if (check_irq_entry(iter, flags, call->func, call->depth)) 998 if (check_irq_entry(iter, flags, call->func, call->depth))
1127 return TRACE_TYPE_HANDLED; 999 return TRACE_TYPE_HANDLED;
1128 1000
1129 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1001 print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
1130 return TRACE_TYPE_PARTIAL_LINE;
1131 1002
1132 leaf_ret = get_return_for_leaf(iter, field); 1003 leaf_ret = get_return_for_leaf(iter, field);
1133 if (leaf_ret) 1004 if (leaf_ret)
@@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1160 pid_t pid = ent->pid; 1031 pid_t pid = ent->pid;
1161 int cpu = iter->cpu; 1032 int cpu = iter->cpu;
1162 int func_match = 1; 1033 int func_match = 1;
1163 int ret;
1164 int i; 1034 int i;
1165 1035
1166 if (check_irq_return(iter, flags, trace->depth)) 1036 if (check_irq_return(iter, flags, trace->depth))
@@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1186 } 1056 }
1187 } 1057 }
1188 1058
1189 if (print_graph_prologue(iter, s, 0, 0, flags)) 1059 print_graph_prologue(iter, s, 0, 0, flags);
1190 return TRACE_TYPE_PARTIAL_LINE;
1191 1060
1192 /* Overhead and duration */ 1061 /* Overhead and duration */
1193 ret = print_graph_duration(duration, s, flags); 1062 print_graph_duration(duration, s, flags);
1194 if (ret == TRACE_TYPE_PARTIAL_LINE)
1195 return TRACE_TYPE_PARTIAL_LINE;
1196 1063
1197 /* Closing brace */ 1064 /* Closing brace */
1198 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1065 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
1199 ret = trace_seq_putc(s, ' '); 1066 trace_seq_putc(s, ' ');
1200 if (!ret)
1201 return TRACE_TYPE_PARTIAL_LINE;
1202 }
1203 1067
1204 /* 1068 /*
1205 * If the return function does not have a matching entry, 1069 * If the return function does not have a matching entry,
@@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1208 * belongs to, write out the function name. Always do 1072 * belongs to, write out the function name. Always do
1209 * that if the funcgraph-tail option is enabled. 1073 * that if the funcgraph-tail option is enabled.
1210 */ 1074 */
1211 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { 1075 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
1212 ret = trace_seq_puts(s, "}\n"); 1076 trace_seq_puts(s, "}\n");
1213 if (!ret) 1077 else
1214 return TRACE_TYPE_PARTIAL_LINE; 1078 trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
1215 } else {
1216 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
1217 if (!ret)
1218 return TRACE_TYPE_PARTIAL_LINE;
1219 }
1220 1079
1221 /* Overrun */ 1080 /* Overrun */
1222 if (flags & TRACE_GRAPH_PRINT_OVERRUN) { 1081 if (flags & TRACE_GRAPH_PRINT_OVERRUN)
1223 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 1082 trace_seq_printf(s, " (Overruns: %lu)\n",
1224 trace->overrun); 1083 trace->overrun);
1225 if (!ret)
1226 return TRACE_TYPE_PARTIAL_LINE;
1227 }
1228 1084
1229 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, 1085 print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
1230 cpu, pid, flags); 1086 cpu, pid, flags);
1231 if (ret == TRACE_TYPE_PARTIAL_LINE)
1232 return TRACE_TYPE_PARTIAL_LINE;
1233 1087
1234 return TRACE_TYPE_HANDLED; 1088 return trace_handle_return(s);
1235} 1089}
1236 1090
1237static enum print_line_t 1091static enum print_line_t
@@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1248 if (data) 1102 if (data)
1249 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 1103 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
1250 1104
1251 if (print_graph_prologue(iter, s, 0, 0, flags)) 1105 print_graph_prologue(iter, s, 0, 0, flags);
1252 return TRACE_TYPE_PARTIAL_LINE;
1253 1106
1254 /* No time */ 1107 /* No time */
1255 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); 1108 print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
1256 if (ret != TRACE_TYPE_HANDLED)
1257 return ret;
1258 1109
1259 /* Indentation */ 1110 /* Indentation */
1260 if (depth > 0) 1111 if (depth > 0)
1261 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { 1112 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++)
1262 ret = trace_seq_putc(s, ' '); 1113 trace_seq_putc(s, ' ');
1263 if (!ret)
1264 return TRACE_TYPE_PARTIAL_LINE;
1265 }
1266 1114
1267 /* The comment */ 1115 /* The comment */
1268 ret = trace_seq_puts(s, "/* "); 1116 trace_seq_puts(s, "/* ");
1269 if (!ret)
1270 return TRACE_TYPE_PARTIAL_LINE;
1271 1117
1272 switch (iter->ent->type) { 1118 switch (iter->ent->type) {
1273 case TRACE_BPRINT: 1119 case TRACE_BPRINT:
@@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1290 return ret; 1136 return ret;
1291 } 1137 }
1292 1138
1139 if (trace_seq_has_overflowed(s))
1140 goto out;
1141
1293 /* Strip ending newline */ 1142 /* Strip ending newline */
1294 if (s->buffer[s->len - 1] == '\n') { 1143 if (s->buffer[s->seq.len - 1] == '\n') {
1295 s->buffer[s->len - 1] = '\0'; 1144 s->buffer[s->seq.len - 1] = '\0';
1296 s->len--; 1145 s->seq.len--;
1297 } 1146 }
1298 1147
1299 ret = trace_seq_puts(s, " */\n"); 1148 trace_seq_puts(s, " */\n");
1300 if (!ret) 1149 out:
1301 return TRACE_TYPE_PARTIAL_LINE; 1150 return trace_handle_return(s);
1302
1303 return TRACE_TYPE_HANDLED;
1304} 1151}
1305 1152
1306 1153
@@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1407 print_lat_header(s, flags); 1254 print_lat_header(s, flags);
1408 1255
1409 /* 1st line */ 1256 /* 1st line */
1410 seq_printf(s, "#"); 1257 seq_putc(s, '#');
1411 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) 1258 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1412 seq_printf(s, " TIME "); 1259 seq_puts(s, " TIME ");
1413 if (flags & TRACE_GRAPH_PRINT_CPU) 1260 if (flags & TRACE_GRAPH_PRINT_CPU)
1414 seq_printf(s, " CPU"); 1261 seq_puts(s, " CPU");
1415 if (flags & TRACE_GRAPH_PRINT_PROC) 1262 if (flags & TRACE_GRAPH_PRINT_PROC)
1416 seq_printf(s, " TASK/PID "); 1263 seq_puts(s, " TASK/PID ");
1417 if (lat) 1264 if (lat)
1418 seq_printf(s, "||||"); 1265 seq_puts(s, "||||");
1419 if (flags & TRACE_GRAPH_PRINT_DURATION) 1266 if (flags & TRACE_GRAPH_PRINT_DURATION)
1420 seq_printf(s, " DURATION "); 1267 seq_puts(s, " DURATION ");
1421 seq_printf(s, " FUNCTION CALLS\n"); 1268 seq_puts(s, " FUNCTION CALLS\n");
1422 1269
1423 /* 2nd line */ 1270 /* 2nd line */
1424 seq_printf(s, "#"); 1271 seq_putc(s, '#');
1425 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) 1272 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1426 seq_printf(s, " | "); 1273 seq_puts(s, " | ");
1427 if (flags & TRACE_GRAPH_PRINT_CPU) 1274 if (flags & TRACE_GRAPH_PRINT_CPU)
1428 seq_printf(s, " | "); 1275 seq_puts(s, " | ");
1429 if (flags & TRACE_GRAPH_PRINT_PROC) 1276 if (flags & TRACE_GRAPH_PRINT_PROC)
1430 seq_printf(s, " | | "); 1277 seq_puts(s, " | | ");
1431 if (lat) 1278 if (lat)
1432 seq_printf(s, "||||"); 1279 seq_puts(s, "||||");
1433 if (flags & TRACE_GRAPH_PRINT_DURATION) 1280 if (flags & TRACE_GRAPH_PRINT_DURATION)
1434 seq_printf(s, " | | "); 1281 seq_puts(s, " | | ");
1435 seq_printf(s, " | | | |\n"); 1282 seq_puts(s, " | | | |\n");
1436} 1283}
1437 1284
1438static void print_graph_headers(struct seq_file *s) 1285static void print_graph_headers(struct seq_file *s)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index bd90e1b06088..3ccf5c2c1320 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
20{ 20{
21 /* use static because iter can be a bit big for the stack */ 21 /* use static because iter can be a bit big for the stack */
22 static struct trace_iterator iter; 22 static struct trace_iterator iter;
23 static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
23 unsigned int old_userobj; 24 unsigned int old_userobj;
24 int cnt = 0, cpu; 25 int cnt = 0, cpu;
25 26
26 trace_init_global_iter(&iter); 27 trace_init_global_iter(&iter);
28 iter.buffer_iter = buffer_iter;
27 29
28 for_each_tracing_cpu(cpu) { 30 for_each_tracing_cpu(cpu) {
29 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); 31 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
@@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]); 59 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file); 60 tracing_iter_reset(&iter, cpu_file);
59 } 61 }
60 if (!trace_empty(&iter)) 62
61 trace_find_next_entry_inc(&iter); 63 while (trace_find_next_entry_inc(&iter)) {
62 while (!trace_empty(&iter)) {
63 if (!cnt) 64 if (!cnt)
64 kdb_printf("---------------------------------\n"); 65 kdb_printf("---------------------------------\n");
65 cnt++; 66 cnt++;
66 67
67 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) 68 if (!skip_lines) {
68 print_trace_line(&iter); 69 print_trace_line(&iter);
69 if (!skip_lines)
70 trace_printk_seq(&iter.seq); 70 trace_printk_seq(&iter.seq);
71 else 71 } else {
72 skip_lines--; 72 skip_lines--;
73 }
74
73 if (KDB_FLAG(CMD_INTERRUPT)) 75 if (KDB_FLAG(CMD_INTERRUPT))
74 goto out; 76 goto out;
75 } 77 }
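
The kdb dump loop is restructured from check-empty-then-maybe-advance into the usual iterator idiom: advance and test in the loop condition, then act on the record just fetched, either printing it or consuming one line of the skip budget. In plain C the shape looks like this, with next() standing in for trace_find_next_entry_inc() and the record list entirely made up:

```c
#include <stdio.h>

struct iter {
	const char *const *items;
	int pos, count;
};

/* Advance to the next record; return it, or NULL when exhausted. */
static const char *next(struct iter *it)
{
	if (it->pos >= it->count)
		return NULL;
	return it->items[it->pos++];
}

int main(void)
{
	static const char *const lines[] = { "evt-1", "evt-2", "evt-3" };
	struct iter it = { lines, 0, 3 };
	const char *rec;
	int skip_lines = 1;

	/* Advance-and-test in one place, mirroring the new kdb loop. */
	while ((rec = next(&it)) != NULL) {
		if (!skip_lines)
			printf("%s\n", rec);
		else
			skip_lines--;
	}
	return 0;
}
```
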
@@ -86,9 +88,12 @@ out:
86 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); 88 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
87 } 89 }
88 90
89 for_each_tracing_cpu(cpu) 91 for_each_tracing_cpu(cpu) {
90 if (iter.buffer_iter[cpu]) 92 if (iter.buffer_iter[cpu]) {
91 ring_buffer_read_finish(iter.buffer_iter[cpu]); 93 ring_buffer_read_finish(iter.buffer_iter[cpu]);
94 iter.buffer_iter[cpu] = NULL;
95 }
96 }
92} 97}
93 98
94/* 99/*
@@ -127,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv)
127 132
128static __init int kdb_ftrace_register(void) 133static __init int kdb_ftrace_register(void)
129{ 134{
130 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", 135 kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
131 "Dump ftrace log", 0, KDB_REPEAT_NONE); 136 "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE);
132 return 0; 137 return 0;
133} 138}
134 139
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 282f6e4e5539..296079ae6583 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
826 struct trace_kprobe *tk = v; 826 struct trace_kprobe *tk = v;
827 int i; 827 int i;
828 828
829 seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); 829 seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
830 seq_printf(m, ":%s/%s", tk->tp.call.class->system, 830 seq_printf(m, ":%s/%s", tk->tp.call.class->system,
831 ftrace_event_name(&tk->tp.call)); 831 ftrace_event_name(&tk->tp.call));
832 832
@@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
840 840
841 for (i = 0; i < tk->tp.nr_args; i++) 841 for (i = 0; i < tk->tp.nr_args; i++)
842 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); 842 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
843 seq_printf(m, "\n"); 843 seq_putc(m, '\n');
844 844
845 return 0; 845 return 0;
846} 846}
@@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1024 field = (struct kprobe_trace_entry_head *)iter->ent; 1024 field = (struct kprobe_trace_entry_head *)iter->ent;
1025 tp = container_of(event, struct trace_probe, call.event); 1025 tp = container_of(event, struct trace_probe, call.event);
1026 1026
1027 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) 1027 trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
1028 goto partial;
1029 1028
1030 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) 1029 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1031 goto partial; 1030 goto out;
1032 1031
1033 if (!trace_seq_puts(s, ")")) 1032 trace_seq_putc(s, ')');
1034 goto partial;
1035 1033
1036 data = (u8 *)&field[1]; 1034 data = (u8 *)&field[1];
1037 for (i = 0; i < tp->nr_args; i++) 1035 for (i = 0; i < tp->nr_args; i++)
1038 if (!tp->args[i].type->print(s, tp->args[i].name, 1036 if (!tp->args[i].type->print(s, tp->args[i].name,
1039 data + tp->args[i].offset, field)) 1037 data + tp->args[i].offset, field))
1040 goto partial; 1038 goto out;
1041
1042 if (!trace_seq_puts(s, "\n"))
1043 goto partial;
1044 1039
1045 return TRACE_TYPE_HANDLED; 1040 trace_seq_putc(s, '\n');
1046partial: 1041 out:
1047 return TRACE_TYPE_PARTIAL_LINE; 1042 return trace_handle_return(s);
1048} 1043}
1049 1044
1050static enum print_line_t 1045static enum print_line_t
@@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1060 field = (struct kretprobe_trace_entry_head *)iter->ent; 1055 field = (struct kretprobe_trace_entry_head *)iter->ent;
1061 tp = container_of(event, struct trace_probe, call.event); 1056 tp = container_of(event, struct trace_probe, call.event);
1062 1057
1063 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) 1058 trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
1064 goto partial;
1065 1059
1066 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) 1060 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1067 goto partial; 1061 goto out;
1068 1062
1069 if (!trace_seq_puts(s, " <- ")) 1063 trace_seq_puts(s, " <- ");
1070 goto partial;
1071 1064
1072 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) 1065 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1073 goto partial; 1066 goto out;
1074 1067
1075 if (!trace_seq_puts(s, ")")) 1068 trace_seq_putc(s, ')');
1076 goto partial;
1077 1069
1078 data = (u8 *)&field[1]; 1070 data = (u8 *)&field[1];
1079 for (i = 0; i < tp->nr_args; i++) 1071 for (i = 0; i < tp->nr_args; i++)
1080 if (!tp->args[i].type->print(s, tp->args[i].name, 1072 if (!tp->args[i].type->print(s, tp->args[i].name,
1081 data + tp->args[i].offset, field)) 1073 data + tp->args[i].offset, field))
1082 goto partial; 1074 goto out;
1083 1075
1084 if (!trace_seq_puts(s, "\n")) 1076 trace_seq_putc(s, '\n');
1085 goto partial;
1086 1077
1087 return TRACE_TYPE_HANDLED; 1078 out:
1088partial: 1079 return trace_handle_return(s);
1089 return TRACE_TYPE_PARTIAL_LINE;
1090} 1080}
1091 1081
1092 1082
@@ -1158,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1158 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1148 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1159 size -= sizeof(u32); 1149 size -= sizeof(u32);
1160 1150
1161 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1151 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1162 if (!entry) 1152 if (!entry)
1163 return; 1153 return;
1164 1154
@@ -1189,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1189 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1179 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1190 size -= sizeof(u32); 1180 size -= sizeof(u32);
1191 1181
1192 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1182 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1193 if (!entry) 1183 if (!entry)
1194 return; 1184 return;
1195 1185
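The trace_kprobe.c hunks above show the shape every print callback in this series converges on: the trace_seq writers are now void and simply latch an overflow flag, so the callback emits everything unconditionally and reports the result once at the end through trace_handle_return(). A minimal sketch of that pattern, with a hypothetical callback name and assuming only the post-patch <linux/ftrace_event.h>/<linux/trace_seq.h> API:

	#include <linux/ftrace_event.h>
	#include <linux/trace_seq.h>

	static enum print_line_t
	example_print(struct trace_iterator *iter, int flags,
		      struct trace_event *event)
	{
		struct trace_seq *s = &iter->seq;

		/* Writers no longer return a count; an overflow sets s->full. */
		trace_seq_printf(s, "%s: (", "example_event");
		trace_seq_puts(s, "args");
		trace_seq_putc(s, ')');
		trace_seq_putc(s, '\n');

		/* Overflow maps to TRACE_TYPE_PARTIAL_LINE, otherwise HANDLED. */
		return trace_handle_return(s);
	}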
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0abd9b863474..7a9ba62e9fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr)
59 mmio_reset_data(tr); 59 mmio_reset_data(tr);
60} 60}
61 61
62static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) 62static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
63{ 63{
64 int ret = 0;
65 int i; 64 int i;
66 resource_size_t start, end; 65 resource_size_t start, end;
67 const struct pci_driver *drv = pci_dev_driver(dev); 66 const struct pci_driver *drv = pci_dev_driver(dev);
68 67
69 /* XXX: incomplete checks for trace_seq_printf() return value */ 68 trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
70 ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", 69 dev->bus->number, dev->devfn,
71 dev->bus->number, dev->devfn, 70 dev->vendor, dev->device, dev->irq);
72 dev->vendor, dev->device, dev->irq);
73 /* 71 /*
74 * XXX: is pci_resource_to_user() appropriate, since we are 72 * XXX: is pci_resource_to_user() appropriate, since we are
75 * supposed to interpret the __ioremap() phys_addr argument based on 73 * supposed to interpret the __ioremap() phys_addr argument based on
@@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
77 */ 75 */
78 for (i = 0; i < 7; i++) { 76 for (i = 0; i < 7; i++) {
79 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 77 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
80 ret += trace_seq_printf(s, " %llx", 78 trace_seq_printf(s, " %llx",
81 (unsigned long long)(start | 79 (unsigned long long)(start |
82 (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); 80 (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
83 } 81 }
84 for (i = 0; i < 7; i++) { 82 for (i = 0; i < 7; i++) {
85 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 83 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
86 ret += trace_seq_printf(s, " %llx", 84 trace_seq_printf(s, " %llx",
87 dev->resource[i].start < dev->resource[i].end ? 85 dev->resource[i].start < dev->resource[i].end ?
88 (unsigned long long)(end - start) + 1 : 0); 86 (unsigned long long)(end - start) + 1 : 0);
89 } 87 }
90 if (drv) 88 if (drv)
91 ret += trace_seq_printf(s, " %s\n", drv->name); 89 trace_seq_printf(s, " %s\n", drv->name);
92 else 90 else
93 ret += trace_seq_puts(s, " \n"); 91 trace_seq_puts(s, " \n");
94 return ret;
95} 92}
96 93
97static void destroy_header_iter(struct header_iter *hiter) 94static void destroy_header_iter(struct header_iter *hiter)
@@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
179 unsigned long long t = ns2usecs(iter->ts); 176 unsigned long long t = ns2usecs(iter->ts);
180 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 177 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
181 unsigned secs = (unsigned long)t; 178 unsigned secs = (unsigned long)t;
182 int ret = 1;
183 179
184 trace_assign_type(field, entry); 180 trace_assign_type(field, entry);
185 rw = &field->rw; 181 rw = &field->rw;
186 182
187 switch (rw->opcode) { 183 switch (rw->opcode) {
188 case MMIO_READ: 184 case MMIO_READ:
189 ret = trace_seq_printf(s, 185 trace_seq_printf(s,
190 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 186 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
191 rw->width, secs, usec_rem, rw->map_id, 187 rw->width, secs, usec_rem, rw->map_id,
192 (unsigned long long)rw->phys, 188 (unsigned long long)rw->phys,
193 rw->value, rw->pc, 0); 189 rw->value, rw->pc, 0);
194 break; 190 break;
195 case MMIO_WRITE: 191 case MMIO_WRITE:
196 ret = trace_seq_printf(s, 192 trace_seq_printf(s,
197 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 193 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
198 rw->width, secs, usec_rem, rw->map_id, 194 rw->width, secs, usec_rem, rw->map_id,
199 (unsigned long long)rw->phys, 195 (unsigned long long)rw->phys,
200 rw->value, rw->pc, 0); 196 rw->value, rw->pc, 0);
201 break; 197 break;
202 case MMIO_UNKNOWN_OP: 198 case MMIO_UNKNOWN_OP:
203 ret = trace_seq_printf(s, 199 trace_seq_printf(s,
204 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," 200 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
205 "%02lx 0x%lx %d\n", 201 "%02lx 0x%lx %d\n",
206 secs, usec_rem, rw->map_id, 202 secs, usec_rem, rw->map_id,
@@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
209 (rw->value >> 0) & 0xff, rw->pc, 0); 205 (rw->value >> 0) & 0xff, rw->pc, 0);
210 break; 206 break;
211 default: 207 default:
212 ret = trace_seq_puts(s, "rw what?\n"); 208 trace_seq_puts(s, "rw what?\n");
213 break; 209 break;
214 } 210 }
215 if (ret) 211
216 return TRACE_TYPE_HANDLED; 212 return trace_handle_return(s);
217 return TRACE_TYPE_PARTIAL_LINE;
218} 213}
219 214
220static enum print_line_t mmio_print_map(struct trace_iterator *iter) 215static enum print_line_t mmio_print_map(struct trace_iterator *iter)
@@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
226 unsigned long long t = ns2usecs(iter->ts); 221 unsigned long long t = ns2usecs(iter->ts);
227 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 222 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
228 unsigned secs = (unsigned long)t; 223 unsigned secs = (unsigned long)t;
229 int ret;
230 224
231 trace_assign_type(field, entry); 225 trace_assign_type(field, entry);
232 m = &field->map; 226 m = &field->map;
233 227
234 switch (m->opcode) { 228 switch (m->opcode) {
235 case MMIO_PROBE: 229 case MMIO_PROBE:
236 ret = trace_seq_printf(s, 230 trace_seq_printf(s,
237 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", 231 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
238 secs, usec_rem, m->map_id, 232 secs, usec_rem, m->map_id,
239 (unsigned long long)m->phys, m->virt, m->len, 233 (unsigned long long)m->phys, m->virt, m->len,
240 0UL, 0); 234 0UL, 0);
241 break; 235 break;
242 case MMIO_UNPROBE: 236 case MMIO_UNPROBE:
243 ret = trace_seq_printf(s, 237 trace_seq_printf(s,
244 "UNMAP %u.%06lu %d 0x%lx %d\n", 238 "UNMAP %u.%06lu %d 0x%lx %d\n",
245 secs, usec_rem, m->map_id, 0UL, 0); 239 secs, usec_rem, m->map_id, 0UL, 0);
246 break; 240 break;
247 default: 241 default:
248 ret = trace_seq_puts(s, "map what?\n"); 242 trace_seq_puts(s, "map what?\n");
249 break; 243 break;
250 } 244 }
251 if (ret) 245
252 return TRACE_TYPE_HANDLED; 246 return trace_handle_return(s);
253 return TRACE_TYPE_PARTIAL_LINE;
254} 247}
255 248
256static enum print_line_t mmio_print_mark(struct trace_iterator *iter) 249static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
@@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
262 unsigned long long t = ns2usecs(iter->ts); 255 unsigned long long t = ns2usecs(iter->ts);
263 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 256 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
264 unsigned secs = (unsigned long)t; 257 unsigned secs = (unsigned long)t;
265 int ret;
266 258
267 /* The trailing newline must be in the message. */ 259 /* The trailing newline must be in the message. */
268 ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); 260 trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
269 if (!ret)
270 return TRACE_TYPE_PARTIAL_LINE;
271 261
272 return TRACE_TYPE_HANDLED; 262 return trace_handle_return(s);
273} 263}
274 264
275static enum print_line_t mmio_print_line(struct trace_iterator *iter) 265static enum print_line_t mmio_print_line(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c6977d5a9b12..b77b9a697619 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
25 struct trace_seq *s = &iter->seq; 25 struct trace_seq *s = &iter->seq;
26 struct trace_entry *entry = iter->ent; 26 struct trace_entry *entry = iter->ent;
27 struct bputs_entry *field; 27 struct bputs_entry *field;
28 int ret;
29 28
30 trace_assign_type(field, entry); 29 trace_assign_type(field, entry);
31 30
32 ret = trace_seq_puts(s, field->str); 31 trace_seq_puts(s, field->str);
33 if (!ret)
34 return TRACE_TYPE_PARTIAL_LINE;
35 32
36 return TRACE_TYPE_HANDLED; 33 return trace_handle_return(s);
37} 34}
38 35
39enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 36enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
41 struct trace_seq *s = &iter->seq; 38 struct trace_seq *s = &iter->seq;
42 struct trace_entry *entry = iter->ent; 39 struct trace_entry *entry = iter->ent;
43 struct bprint_entry *field; 40 struct bprint_entry *field;
44 int ret;
45 41
46 trace_assign_type(field, entry); 42 trace_assign_type(field, entry);
47 43
48 ret = trace_seq_bprintf(s, field->fmt, field->buf); 44 trace_seq_bprintf(s, field->fmt, field->buf);
49 if (!ret)
50 return TRACE_TYPE_PARTIAL_LINE;
51 45
52 return TRACE_TYPE_HANDLED; 46 return trace_handle_return(s);
53} 47}
54 48
55enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) 49enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
@@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
57 struct trace_seq *s = &iter->seq; 51 struct trace_seq *s = &iter->seq;
58 struct trace_entry *entry = iter->ent; 52 struct trace_entry *entry = iter->ent;
59 struct print_entry *field; 53 struct print_entry *field;
60 int ret;
61 54
62 trace_assign_type(field, entry); 55 trace_assign_type(field, entry);
63 56
64 ret = trace_seq_puts(s, field->buf); 57 trace_seq_puts(s, field->buf);
65 if (!ret)
66 return TRACE_TYPE_PARTIAL_LINE;
67 58
68 return TRACE_TYPE_HANDLED; 59 return trace_handle_return(s);
69} 60}
70 61
71const char * 62const char *
@@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
124 115
125 if (ret == (const char *)(trace_seq_buffer_ptr(p))) 116 if (ret == (const char *)(trace_seq_buffer_ptr(p)))
126 trace_seq_printf(p, "0x%lx", val); 117 trace_seq_printf(p, "0x%lx", val);
127 118
128 trace_seq_putc(p, 0); 119 trace_seq_putc(p, 0);
129 120
130 return ret; 121 return ret;
@@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
193 struct trace_seq *s = &iter->seq; 184 struct trace_seq *s = &iter->seq;
194 struct trace_seq *p = &iter->tmp_seq; 185 struct trace_seq *p = &iter->tmp_seq;
195 struct trace_entry *entry; 186 struct trace_entry *entry;
196 int ret;
197 187
198 event = container_of(trace_event, struct ftrace_event_call, event); 188 event = container_of(trace_event, struct ftrace_event_call, event);
199 entry = iter->ent; 189 entry = iter->ent;
@@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
204 } 194 }
205 195
206 trace_seq_init(p); 196 trace_seq_init(p);
207 ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); 197 trace_seq_printf(s, "%s: ", ftrace_event_name(event));
208 if (!ret)
209 return TRACE_TYPE_PARTIAL_LINE;
210 198
211 return 0; 199 return trace_handle_return(s);
212} 200}
213EXPORT_SYMBOL(ftrace_raw_output_prep); 201EXPORT_SYMBOL(ftrace_raw_output_prep);
214 202
@@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
216 char *fmt, va_list ap) 204 char *fmt, va_list ap)
217{ 205{
218 struct trace_seq *s = &iter->seq; 206 struct trace_seq *s = &iter->seq;
219 int ret;
220
221 ret = trace_seq_printf(s, "%s: ", name);
222 if (!ret)
223 return TRACE_TYPE_PARTIAL_LINE;
224
225 ret = trace_seq_vprintf(s, fmt, ap);
226 207
227 if (!ret) 208 trace_seq_printf(s, "%s: ", name);
228 return TRACE_TYPE_PARTIAL_LINE; 209 trace_seq_vprintf(s, fmt, ap);
229 210
230 return TRACE_TYPE_HANDLED; 211 return trace_handle_return(s);
231} 212}
232 213
233int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) 214int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
@@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name)
260} 241}
261#endif /* CONFIG_KRETPROBES */ 242#endif /* CONFIG_KRETPROBES */
262 243
263static int 244static void
264seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) 245seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
265{ 246{
266#ifdef CONFIG_KALLSYMS 247#ifdef CONFIG_KALLSYMS
@@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
271 252
272 name = kretprobed(str); 253 name = kretprobed(str);
273 254
274 return trace_seq_printf(s, fmt, name); 255 trace_seq_printf(s, fmt, name);
275#endif 256#endif
276 return 1;
277} 257}
278 258
279static int 259static void
280seq_print_sym_offset(struct trace_seq *s, const char *fmt, 260seq_print_sym_offset(struct trace_seq *s, const char *fmt,
281 unsigned long address) 261 unsigned long address)
282{ 262{
@@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
287 sprint_symbol(str, address); 267 sprint_symbol(str, address);
288 name = kretprobed(str); 268 name = kretprobed(str);
289 269
290 return trace_seq_printf(s, fmt, name); 270 trace_seq_printf(s, fmt, name);
291#endif 271#endif
292 return 1;
293} 272}
294 273
295#ifndef CONFIG_64BIT 274#ifndef CONFIG_64BIT
@@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
320 if (file) { 299 if (file) {
321 ret = trace_seq_path(s, &file->f_path); 300 ret = trace_seq_path(s, &file->f_path);
322 if (ret) 301 if (ret)
323 ret = trace_seq_printf(s, "[+0x%lx]", 302 trace_seq_printf(s, "[+0x%lx]",
324 ip - vmstart); 303 ip - vmstart);
325 } 304 }
326 up_read(&mm->mmap_sem); 305 up_read(&mm->mmap_sem);
327 } 306 }
328 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) 307 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
329 ret = trace_seq_printf(s, " <" IP_FMT ">", ip); 308 trace_seq_printf(s, " <" IP_FMT ">", ip);
330 return ret; 309 return !trace_seq_has_overflowed(s);
331} 310}
332 311
333int 312int
@@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
335 unsigned long sym_flags) 314 unsigned long sym_flags)
336{ 315{
337 struct mm_struct *mm = NULL; 316 struct mm_struct *mm = NULL;
338 int ret = 1;
339 unsigned int i; 317 unsigned int i;
340 318
341 if (trace_flags & TRACE_ITER_SYM_USEROBJ) { 319 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
@@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
354 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 332 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
355 unsigned long ip = entry->caller[i]; 333 unsigned long ip = entry->caller[i];
356 334
357 if (ip == ULONG_MAX || !ret) 335 if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
358 break; 336 break;
359 if (ret) 337
360 ret = trace_seq_puts(s, " => "); 338 trace_seq_puts(s, " => ");
339
361 if (!ip) { 340 if (!ip) {
362 if (ret) 341 trace_seq_puts(s, "??");
363 ret = trace_seq_puts(s, "??"); 342 trace_seq_putc(s, '\n');
364 if (ret)
365 ret = trace_seq_putc(s, '\n');
366 continue; 343 continue;
367 } 344 }
368 if (!ret) 345
369 break; 346 seq_print_user_ip(s, mm, ip, sym_flags);
370 if (ret) 347 trace_seq_putc(s, '\n');
371 ret = seq_print_user_ip(s, mm, ip, sym_flags);
372 ret = trace_seq_putc(s, '\n');
373 } 348 }
374 349
375 if (mm) 350 if (mm)
376 mmput(mm); 351 mmput(mm);
377 return ret; 352
353 return !trace_seq_has_overflowed(s);
378} 354}
379 355
380int 356int
381seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) 357seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
382{ 358{
383 int ret; 359 if (!ip) {
384 360 trace_seq_putc(s, '0');
385 if (!ip) 361 goto out;
386 return trace_seq_putc(s, '0'); 362 }
387 363
388 if (sym_flags & TRACE_ITER_SYM_OFFSET) 364 if (sym_flags & TRACE_ITER_SYM_OFFSET)
389 ret = seq_print_sym_offset(s, "%s", ip); 365 seq_print_sym_offset(s, "%s", ip);
390 else 366 else
391 ret = seq_print_sym_short(s, "%s", ip); 367 seq_print_sym_short(s, "%s", ip);
392
393 if (!ret)
394 return 0;
395 368
396 if (sym_flags & TRACE_ITER_SYM_ADDR) 369 if (sym_flags & TRACE_ITER_SYM_ADDR)
397 ret = trace_seq_printf(s, " <" IP_FMT ">", ip); 370 trace_seq_printf(s, " <" IP_FMT ">", ip);
398 return ret; 371
372 out:
373 return !trace_seq_has_overflowed(s);
399} 374}
400 375
401/** 376/**
@@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
413 char irqs_off; 388 char irqs_off;
414 int hardirq; 389 int hardirq;
415 int softirq; 390 int softirq;
416 int ret;
417 391
418 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 392 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
419 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 393 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
445 softirq ? 's' : 419 softirq ? 's' :
446 '.'; 420 '.';
447 421
448 if (!trace_seq_printf(s, "%c%c%c", 422 trace_seq_printf(s, "%c%c%c",
449 irqs_off, need_resched, hardsoft_irq)) 423 irqs_off, need_resched, hardsoft_irq);
450 return 0;
451 424
452 if (entry->preempt_count) 425 if (entry->preempt_count)
453 ret = trace_seq_printf(s, "%x", entry->preempt_count); 426 trace_seq_printf(s, "%x", entry->preempt_count);
454 else 427 else
455 ret = trace_seq_putc(s, '.'); 428 trace_seq_putc(s, '.');
456 429
457 return ret; 430 return !trace_seq_has_overflowed(s);
458} 431}
459 432
460static int 433static int
@@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
464 437
465 trace_find_cmdline(entry->pid, comm); 438 trace_find_cmdline(entry->pid, comm);
466 439
467 if (!trace_seq_printf(s, "%8.8s-%-5d %3d", 440 trace_seq_printf(s, "%8.8s-%-5d %3d",
468 comm, entry->pid, cpu)) 441 comm, entry->pid, cpu);
469 return 0;
470 442
471 return trace_print_lat_fmt(s, entry); 443 return trace_print_lat_fmt(s, entry);
472} 444}
473 445
474static unsigned long preempt_mark_thresh_us = 100; 446#undef MARK
447#define MARK(v, s) {.val = v, .sym = s}
448/* trace overhead mark */
449static const struct trace_mark {
450 unsigned long long val; /* unit: nsec */
451 char sym;
452} mark[] = {
453 MARK(1000000000ULL , '$'), /* 1 sec */
454 MARK(1000000ULL , '#'), /* 1000 usecs */
455 MARK(100000ULL , '!'), /* 100 usecs */
456 MARK(10000ULL , '+'), /* 10 usecs */
457};
458#undef MARK
459
460char trace_find_mark(unsigned long long d)
461{
462 int i;
463 int size = ARRAY_SIZE(mark);
464
465 for (i = 0; i < size; i++) {
466 if (d >= mark[i].val)
467 break;
468 }
469
470 return (i == size) ? ' ' : mark[i].sym;
471}
475 472
476static int 473static int
477lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) 474lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
@@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
493 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); 490 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
494 unsigned long rel_msec = (unsigned long)rel_ts; 491 unsigned long rel_msec = (unsigned long)rel_ts;
495 492
496 return trace_seq_printf( 493 trace_seq_printf(
497 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", 494 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
498 ns2usecs(iter->ts), 495 ns2usecs(iter->ts),
499 abs_msec, abs_usec, 496 abs_msec, abs_usec,
500 rel_msec, rel_usec); 497 rel_msec, rel_usec);
498
501 } else if (verbose && !in_ns) { 499 } else if (verbose && !in_ns) {
502 return trace_seq_printf( 500 trace_seq_printf(
503 s, "[%016llx] %lld (+%lld): ", 501 s, "[%016llx] %lld (+%lld): ",
504 iter->ts, abs_ts, rel_ts); 502 iter->ts, abs_ts, rel_ts);
503
505 } else if (!verbose && in_ns) { 504 } else if (!verbose && in_ns) {
506 return trace_seq_printf( 505 trace_seq_printf(
507 s, " %4lldus%c: ", 506 s, " %4lldus%c: ",
508 abs_ts, 507 abs_ts,
509 rel_ts > preempt_mark_thresh_us ? '!' : 508 trace_find_mark(rel_ts * NSEC_PER_USEC));
510 rel_ts > 1 ? '+' : ' '); 509
511 } else { /* !verbose && !in_ns */ 510 } else { /* !verbose && !in_ns */
512 return trace_seq_printf(s, " %4lld: ", abs_ts); 511 trace_seq_printf(s, " %4lld: ", abs_ts);
513 } 512 }
513
514 return !trace_seq_has_overflowed(s);
514} 515}
515 516
516int trace_print_context(struct trace_iterator *iter) 517int trace_print_context(struct trace_iterator *iter)
@@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter)
520 unsigned long long t; 521 unsigned long long t;
521 unsigned long secs, usec_rem; 522 unsigned long secs, usec_rem;
522 char comm[TASK_COMM_LEN]; 523 char comm[TASK_COMM_LEN];
523 int ret;
524 524
525 trace_find_cmdline(entry->pid, comm); 525 trace_find_cmdline(entry->pid, comm);
526 526
527 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", 527 trace_seq_printf(s, "%16s-%-5d [%03d] ",
528 comm, entry->pid, iter->cpu); 528 comm, entry->pid, iter->cpu);
529 if (!ret)
530 return 0;
531 529
532 if (trace_flags & TRACE_ITER_IRQ_INFO) { 530 if (trace_flags & TRACE_ITER_IRQ_INFO)
533 ret = trace_print_lat_fmt(s, entry); 531 trace_print_lat_fmt(s, entry);
534 if (!ret)
535 return 0;
536 }
537 532
538 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { 533 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
539 t = ns2usecs(iter->ts); 534 t = ns2usecs(iter->ts);
540 usec_rem = do_div(t, USEC_PER_SEC); 535 usec_rem = do_div(t, USEC_PER_SEC);
541 secs = (unsigned long)t; 536 secs = (unsigned long)t;
542 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); 537 trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
543 } else 538 } else
544 return trace_seq_printf(s, " %12llu: ", iter->ts); 539 trace_seq_printf(s, " %12llu: ", iter->ts);
540
541 return !trace_seq_has_overflowed(s);
545} 542}
546 543
547int trace_print_lat_context(struct trace_iterator *iter) 544int trace_print_lat_context(struct trace_iterator *iter)
548{ 545{
549 u64 next_ts; 546 u64 next_ts;
550 int ret;
551 /* trace_find_next_entry will reset ent_size */ 547 /* trace_find_next_entry will reset ent_size */
552 int ent_size = iter->ent_size; 548 int ent_size = iter->ent_size;
553 struct trace_seq *s = &iter->seq; 549 struct trace_seq *s = &iter->seq;
@@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter)
567 563
568 trace_find_cmdline(entry->pid, comm); 564 trace_find_cmdline(entry->pid, comm);
569 565
570 ret = trace_seq_printf( 566 trace_seq_printf(
571 s, "%16s %5d %3d %d %08x %08lx ", 567 s, "%16s %5d %3d %d %08x %08lx ",
572 comm, entry->pid, iter->cpu, entry->flags, 568 comm, entry->pid, iter->cpu, entry->flags,
573 entry->preempt_count, iter->idx); 569 entry->preempt_count, iter->idx);
574 } else { 570 } else {
575 ret = lat_print_generic(s, entry, iter->cpu); 571 lat_print_generic(s, entry, iter->cpu);
576 } 572 }
577 573
578 if (ret) 574 lat_print_timestamp(iter, next_ts);
579 ret = lat_print_timestamp(iter, next_ts);
580 575
581 return ret; 576 return !trace_seq_has_overflowed(s);
582} 577}
583 578
584static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 579static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
@@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event)
692 goto out; 687 goto out;
693 688
694 } else { 689 } else {
695 690
696 event->type = next_event_type++; 691 event->type = next_event_type++;
697 list = &ftrace_event_list; 692 list = &ftrace_event_list;
698 } 693 }
@@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
764enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 759enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
765 struct trace_event *event) 760 struct trace_event *event)
766{ 761{
767 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) 762 trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type);
768 return TRACE_TYPE_PARTIAL_LINE;
769 763
770 return TRACE_TYPE_HANDLED; 764 return trace_handle_return(&iter->seq);
771} 765}
772 766
773/* TRACE_FN */ 767/* TRACE_FN */
@@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
779 773
780 trace_assign_type(field, iter->ent); 774 trace_assign_type(field, iter->ent);
781 775
782 if (!seq_print_ip_sym(s, field->ip, flags)) 776 seq_print_ip_sym(s, field->ip, flags);
783 goto partial;
784 777
785 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { 778 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
786 if (!trace_seq_puts(s, " <-")) 779 trace_seq_puts(s, " <-");
787 goto partial; 780 seq_print_ip_sym(s, field->parent_ip, flags);
788 if (!seq_print_ip_sym(s,
789 field->parent_ip,
790 flags))
791 goto partial;
792 } 781 }
793 if (!trace_seq_putc(s, '\n'))
794 goto partial;
795 782
796 return TRACE_TYPE_HANDLED; 783 trace_seq_putc(s, '\n');
797 784
798 partial: 785 return trace_handle_return(s);
799 return TRACE_TYPE_PARTIAL_LINE;
800} 786}
801 787
802static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, 788static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
@@ -806,12 +792,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
806 792
807 trace_assign_type(field, iter->ent); 793 trace_assign_type(field, iter->ent);
808 794
809 if (!trace_seq_printf(&iter->seq, "%lx %lx\n", 795 trace_seq_printf(&iter->seq, "%lx %lx\n",
810 field->ip, 796 field->ip,
811 field->parent_ip)) 797 field->parent_ip);
812 return TRACE_TYPE_PARTIAL_LINE;
813 798
814 return TRACE_TYPE_HANDLED; 799 return trace_handle_return(&iter->seq);
815} 800}
816 801
817static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, 802static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
@@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
822 807
823 trace_assign_type(field, iter->ent); 808 trace_assign_type(field, iter->ent);
824 809
825 SEQ_PUT_HEX_FIELD_RET(s, field->ip); 810 SEQ_PUT_HEX_FIELD(s, field->ip);
826 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); 811 SEQ_PUT_HEX_FIELD(s, field->parent_ip);
827 812
828 return TRACE_TYPE_HANDLED; 813 return trace_handle_return(s);
829} 814}
830 815
831static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, 816static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
@@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
836 821
837 trace_assign_type(field, iter->ent); 822 trace_assign_type(field, iter->ent);
838 823
839 SEQ_PUT_FIELD_RET(s, field->ip); 824 SEQ_PUT_FIELD(s, field->ip);
840 SEQ_PUT_FIELD_RET(s, field->parent_ip); 825 SEQ_PUT_FIELD(s, field->parent_ip);
841 826
842 return TRACE_TYPE_HANDLED; 827 return trace_handle_return(s);
843} 828}
844 829
845static struct trace_event_functions trace_fn_funcs = { 830static struct trace_event_functions trace_fn_funcs = {
@@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
868 T = task_state_char(field->next_state); 853 T = task_state_char(field->next_state);
869 S = task_state_char(field->prev_state); 854 S = task_state_char(field->prev_state);
870 trace_find_cmdline(field->next_pid, comm); 855 trace_find_cmdline(field->next_pid, comm);
871 if (!trace_seq_printf(&iter->seq, 856 trace_seq_printf(&iter->seq,
872 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", 857 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
873 field->prev_pid, 858 field->prev_pid,
874 field->prev_prio, 859 field->prev_prio,
875 S, delim, 860 S, delim,
876 field->next_cpu, 861 field->next_cpu,
877 field->next_pid, 862 field->next_pid,
878 field->next_prio, 863 field->next_prio,
879 T, comm)) 864 T, comm);
880 return TRACE_TYPE_PARTIAL_LINE; 865
881 866 return trace_handle_return(&iter->seq);
882 return TRACE_TYPE_HANDLED;
883} 867}
884 868
885static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, 869static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
@@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
904 if (!S) 888 if (!S)
905 S = task_state_char(field->prev_state); 889 S = task_state_char(field->prev_state);
906 T = task_state_char(field->next_state); 890 T = task_state_char(field->next_state);
907 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 891 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
908 field->prev_pid, 892 field->prev_pid,
909 field->prev_prio, 893 field->prev_prio,
910 S, 894 S,
911 field->next_cpu, 895 field->next_cpu,
912 field->next_pid, 896 field->next_pid,
913 field->next_prio, 897 field->next_prio,
914 T)) 898 T);
915 return TRACE_TYPE_PARTIAL_LINE; 899
916 900 return trace_handle_return(&iter->seq);
917 return TRACE_TYPE_HANDLED;
918} 901}
919 902
920static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, 903static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
@@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
942 S = task_state_char(field->prev_state); 925 S = task_state_char(field->prev_state);
943 T = task_state_char(field->next_state); 926 T = task_state_char(field->next_state);
944 927
945 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 928 SEQ_PUT_HEX_FIELD(s, field->prev_pid);
946 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); 929 SEQ_PUT_HEX_FIELD(s, field->prev_prio);
947 SEQ_PUT_HEX_FIELD_RET(s, S); 930 SEQ_PUT_HEX_FIELD(s, S);
948 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); 931 SEQ_PUT_HEX_FIELD(s, field->next_cpu);
949 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); 932 SEQ_PUT_HEX_FIELD(s, field->next_pid);
950 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); 933 SEQ_PUT_HEX_FIELD(s, field->next_prio);
951 SEQ_PUT_HEX_FIELD_RET(s, T); 934 SEQ_PUT_HEX_FIELD(s, T);
952 935
953 return TRACE_TYPE_HANDLED; 936 return trace_handle_return(s);
954} 937}
955 938
956static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, 939static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
@@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
973 956
974 trace_assign_type(field, iter->ent); 957 trace_assign_type(field, iter->ent);
975 958
976 SEQ_PUT_FIELD_RET(s, field->prev_pid); 959 SEQ_PUT_FIELD(s, field->prev_pid);
977 SEQ_PUT_FIELD_RET(s, field->prev_prio); 960 SEQ_PUT_FIELD(s, field->prev_prio);
978 SEQ_PUT_FIELD_RET(s, field->prev_state); 961 SEQ_PUT_FIELD(s, field->prev_state);
979 SEQ_PUT_FIELD_RET(s, field->next_pid); 962 SEQ_PUT_FIELD(s, field->next_cpu);
980 SEQ_PUT_FIELD_RET(s, field->next_prio); 963 SEQ_PUT_FIELD(s, field->next_pid);
981 SEQ_PUT_FIELD_RET(s, field->next_state); 964 SEQ_PUT_FIELD(s, field->next_prio);
965 SEQ_PUT_FIELD(s, field->next_state);
982 966
983 return TRACE_TYPE_HANDLED; 967 return trace_handle_return(s);
984} 968}
985 969
986static struct trace_event_functions trace_ctx_funcs = { 970static struct trace_event_functions trace_ctx_funcs = {
@@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1020 trace_assign_type(field, iter->ent); 1004 trace_assign_type(field, iter->ent);
1021 end = (unsigned long *)((long)iter->ent + iter->ent_size); 1005 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1022 1006
1023 if (!trace_seq_puts(s, "<stack trace>\n")) 1007 trace_seq_puts(s, "<stack trace>\n");
1024 goto partial;
1025 1008
1026 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { 1009 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1027 if (!trace_seq_puts(s, " => "))
1028 goto partial;
1029 1010
1030 if (!seq_print_ip_sym(s, *p, flags)) 1011 if (trace_seq_has_overflowed(s))
1031 goto partial; 1012 break;
1032 if (!trace_seq_putc(s, '\n'))
1033 goto partial;
1034 }
1035 1013
1036 return TRACE_TYPE_HANDLED; 1014 trace_seq_puts(s, " => ");
1015 seq_print_ip_sym(s, *p, flags);
1016 trace_seq_putc(s, '\n');
1017 }
1037 1018
1038 partial: 1019 return trace_handle_return(s);
1039 return TRACE_TYPE_PARTIAL_LINE;
1040} 1020}
1041 1021
1042static struct trace_event_functions trace_stack_funcs = { 1022static struct trace_event_functions trace_stack_funcs = {
@@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1057 1037
1058 trace_assign_type(field, iter->ent); 1038 trace_assign_type(field, iter->ent);
1059 1039
1060 if (!trace_seq_puts(s, "<user stack trace>\n")) 1040 trace_seq_puts(s, "<user stack trace>\n");
1061 goto partial; 1041 seq_print_userip_objs(field, s, flags);
1062
1063 if (!seq_print_userip_objs(field, s, flags))
1064 goto partial;
1065
1066 return TRACE_TYPE_HANDLED;
1067 1042
1068 partial: 1043 return trace_handle_return(s);
1069 return TRACE_TYPE_PARTIAL_LINE;
1070} 1044}
1071 1045
1072static struct trace_event_functions trace_user_stack_funcs = { 1046static struct trace_event_functions trace_user_stack_funcs = {
@@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags,
1089 1063
1090 trace_assign_type(field, entry); 1064 trace_assign_type(field, entry);
1091 1065
1092 if (!seq_print_ip_sym(s, field->ip, flags)) 1066 seq_print_ip_sym(s, field->ip, flags);
1093 goto partial; 1067 trace_seq_puts(s, ": ");
1068 trace_seq_puts(s, field->str);
1094 1069
1095 if (!trace_seq_puts(s, ": ")) 1070 return trace_handle_return(s);
1096 goto partial;
1097
1098 if (!trace_seq_puts(s, field->str))
1099 goto partial;
1100
1101 return TRACE_TYPE_HANDLED;
1102
1103 partial:
1104 return TRACE_TYPE_PARTIAL_LINE;
1105} 1071}
1106 1072
1107 1073
@@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags,
1114 1080
1115 trace_assign_type(field, iter->ent); 1081 trace_assign_type(field, iter->ent);
1116 1082
1117 if (!trace_seq_printf(s, ": %lx : ", field->ip)) 1083 trace_seq_printf(s, ": %lx : ", field->ip);
1118 goto partial; 1084 trace_seq_puts(s, field->str);
1119
1120 if (!trace_seq_puts(s, field->str))
1121 goto partial;
1122 1085
1123 return TRACE_TYPE_HANDLED; 1086 return trace_handle_return(s);
1124
1125 partial:
1126 return TRACE_TYPE_PARTIAL_LINE;
1127} 1087}
1128 1088
1129static struct trace_event_functions trace_bputs_funcs = { 1089static struct trace_event_functions trace_bputs_funcs = {
@@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags,
1147 1107
1148 trace_assign_type(field, entry); 1108 trace_assign_type(field, entry);
1149 1109
1150 if (!seq_print_ip_sym(s, field->ip, flags)) 1110 seq_print_ip_sym(s, field->ip, flags);
1151 goto partial; 1111 trace_seq_puts(s, ": ");
1152 1112 trace_seq_bprintf(s, field->fmt, field->buf);
1153 if (!trace_seq_puts(s, ": "))
1154 goto partial;
1155
1156 if (!trace_seq_bprintf(s, field->fmt, field->buf))
1157 goto partial;
1158 1113
1159 return TRACE_TYPE_HANDLED; 1114 return trace_handle_return(s);
1160
1161 partial:
1162 return TRACE_TYPE_PARTIAL_LINE;
1163} 1115}
1164 1116
1165 1117
@@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags,
1172 1124
1173 trace_assign_type(field, iter->ent); 1125 trace_assign_type(field, iter->ent);
1174 1126
1175 if (!trace_seq_printf(s, ": %lx : ", field->ip)) 1127 trace_seq_printf(s, ": %lx : ", field->ip);
1176 goto partial; 1128 trace_seq_bprintf(s, field->fmt, field->buf);
1177
1178 if (!trace_seq_bprintf(s, field->fmt, field->buf))
1179 goto partial;
1180 1129
1181 return TRACE_TYPE_HANDLED; 1130 return trace_handle_return(s);
1182
1183 partial:
1184 return TRACE_TYPE_PARTIAL_LINE;
1185} 1131}
1186 1132
1187static struct trace_event_functions trace_bprint_funcs = { 1133static struct trace_event_functions trace_bprint_funcs = {
@@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1203 1149
1204 trace_assign_type(field, iter->ent); 1150 trace_assign_type(field, iter->ent);
1205 1151
1206 if (!seq_print_ip_sym(s, field->ip, flags)) 1152 seq_print_ip_sym(s, field->ip, flags);
1207 goto partial; 1153 trace_seq_printf(s, ": %s", field->buf);
1208
1209 if (!trace_seq_printf(s, ": %s", field->buf))
1210 goto partial;
1211 1154
1212 return TRACE_TYPE_HANDLED; 1155 return trace_handle_return(s);
1213
1214 partial:
1215 return TRACE_TYPE_PARTIAL_LINE;
1216} 1156}
1217 1157
1218static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, 1158static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
@@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1222 1162
1223 trace_assign_type(field, iter->ent); 1163 trace_assign_type(field, iter->ent);
1224 1164
1225 if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) 1165 trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
1226 goto partial;
1227
1228 return TRACE_TYPE_HANDLED;
1229 1166
1230 partial: 1167 return trace_handle_return(&iter->seq);
1231 return TRACE_TYPE_PARTIAL_LINE;
1232} 1168}
1233 1169
1234static struct trace_event_functions trace_print_funcs = { 1170static struct trace_event_functions trace_print_funcs = {
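The loops reworked above (seq_print_userip_objs(), trace_stack_print()) no longer thread a return value through every write; they poll trace_seq_has_overflowed() once per iteration and stop early. A hedged sketch of that loop shape, using a made-up entries[] array and only the post-patch trace_seq API:

	#include <linux/trace_seq.h>

	static void example_dump_stack(struct trace_seq *s,
				       const unsigned long *entries, int nr)
	{
		int i;

		for (i = 0; i < nr; i++) {
			/* Stop once the page-sized buffer has overflowed. */
			if (trace_seq_has_overflowed(s))
				break;

			trace_seq_puts(s, " => ");
			trace_seq_printf(s, "%lx", entries[i]);
			trace_seq_putc(s, '\n');
		}
	}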
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 80b25b585a70..8ef2c40efb3c 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
35extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
36extern struct rw_semaphore trace_event_sem; 36extern struct rw_semaphore trace_event_sem;
37 37
38#define SEQ_PUT_FIELD_RET(s, x) \ 38#define SEQ_PUT_FIELD(s, x) \
39do { \ 39 trace_seq_putmem(s, &(x), sizeof(x))
40 if (!trace_seq_putmem(s, &(x), sizeof(x))) \ 40
41 return TRACE_TYPE_PARTIAL_LINE; \ 41#define SEQ_PUT_HEX_FIELD(s, x) \
42} while (0) 42 trace_seq_putmem_hex(s, &(x), sizeof(x))
43
44#define SEQ_PUT_HEX_FIELD_RET(s, x) \
45do { \
46 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
47 return TRACE_TYPE_PARTIAL_LINE; \
48} while (0)
49 43
50#endif 44#endif
51 45
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2900817ba65c..c4e70b6bd7fa 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v)
305 seq_puts(m, "\\t"); 305 seq_puts(m, "\\t");
306 break; 306 break;
307 case '\\': 307 case '\\':
308 seq_puts(m, "\\"); 308 seq_putc(m, '\\');
309 break; 309 break;
310 case '"': 310 case '"':
311 seq_puts(m, "\\\""); 311 seq_puts(m, "\\\"");
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d4b9fc22cd27..b983b2fd2ca1 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -40,7 +40,8 @@ const char *reserved_field_names[] = {
40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ 40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
41 void *data, void *ent) \ 41 void *data, void *ent) \
42{ \ 42{ \
43 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ 43 trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
44 return !trace_seq_has_overflowed(s); \
44} \ 45} \
45const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ 46const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
46NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); 47NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
@@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
61 int len = *(u32 *)data >> 16; 62 int len = *(u32 *)data >> 16;
62 63
63 if (!len) 64 if (!len)
64 return trace_seq_printf(s, " %s=(fault)", name); 65 trace_seq_printf(s, " %s=(fault)", name);
65 else 66 else
66 return trace_seq_printf(s, " %s=\"%s\"", name, 67 trace_seq_printf(s, " %s=\"%s\"", name,
67 (const char *)get_loc_data(data, ent)); 68 (const char *)get_loc_data(data, ent));
69 return !trace_seq_has_overflowed(s);
68} 70}
69NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); 71NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
70 72
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3f34dc9b40f3..2e293beb186e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -14,122 +14,26 @@
14 14
15#include "trace.h" 15#include "trace.h"
16 16
17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled;
19static int sched_ref; 17static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 18static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped;
22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->trace_buffer.buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!call_filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51 19
52static void 20static void
53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) 21probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54{ 22{
55 struct trace_array_cpu *data;
56 unsigned long flags;
57 int cpu;
58 int pc;
59
60 if (unlikely(!sched_ref)) 23 if (unlikely(!sched_ref))
61 return; 24 return;
62 25
63 tracing_record_cmdline(prev); 26 tracing_record_cmdline(prev);
64 tracing_record_cmdline(next); 27 tracing_record_cmdline(next);
65
66 if (!tracer_enabled || sched_stopped)
67 return;
68
69 pc = preempt_count();
70 local_irq_save(flags);
71 cpu = raw_smp_processor_id();
72 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
73
74 if (likely(!atomic_read(&data->disabled)))
75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
76
77 local_irq_restore(flags);
78}
79
80void
81tracing_sched_wakeup_trace(struct trace_array *tr,
82 struct task_struct *wakee,
83 struct task_struct *curr,
84 unsigned long flags, int pc)
85{
86 struct ftrace_event_call *call = &event_wakeup;
87 struct ring_buffer_event *event;
88 struct ctx_switch_entry *entry;
89 struct ring_buffer *buffer = tr->trace_buffer.buffer;
90
91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
92 sizeof(*entry), flags, pc);
93 if (!event)
94 return;
95 entry = ring_buffer_event_data(event);
96 entry->prev_pid = curr->pid;
97 entry->prev_prio = curr->prio;
98 entry->prev_state = curr->state;
99 entry->next_pid = wakee->pid;
100 entry->next_prio = wakee->prio;
101 entry->next_state = wakee->state;
102 entry->next_cpu = task_cpu(wakee);
103
104 if (!call_filter_check_discard(call, entry, buffer, event))
105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106} 28}
107 29
108static void 30static void
109probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) 31probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
110{ 32{
111 struct trace_array_cpu *data;
112 unsigned long flags;
113 int cpu, pc;
114
115 if (unlikely(!sched_ref)) 33 if (unlikely(!sched_ref))
116 return; 34 return;
117 35
118 tracing_record_cmdline(current); 36 tracing_record_cmdline(current);
119
120 if (!tracer_enabled || sched_stopped)
121 return;
122
123 pc = preempt_count();
124 local_irq_save(flags);
125 cpu = raw_smp_processor_id();
126 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
127
128 if (likely(!atomic_read(&data->disabled)))
129 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
130 flags, pc);
131
132 local_irq_restore(flags);
133} 37}
134 38
135static int tracing_sched_register(void) 39static int tracing_sched_register(void)
@@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void)
197{ 101{
198 tracing_stop_sched_switch(); 102 tracing_stop_sched_switch();
199} 103}
200
201/**
202 * tracing_start_sched_switch_record - start tracing context switches
203 *
204 * Turns on context switch tracing for a tracer.
205 */
206void tracing_start_sched_switch_record(void)
207{
208 if (unlikely(!ctx_trace)) {
209 WARN_ON(1);
210 return;
211 }
212
213 tracing_start_sched_switch();
214
215 mutex_lock(&sched_register_mutex);
216 tracer_enabled++;
217 mutex_unlock(&sched_register_mutex);
218}
219
220/**
221 * tracing_stop_sched_switch_record - start tracing context switches
222 *
223 * Turns off context switch tracing for a tracer.
224 */
225void tracing_stop_sched_switch_record(void)
226{
227 mutex_lock(&sched_register_mutex);
228 tracer_enabled--;
229 WARN_ON(tracer_enabled < 0);
230 mutex_unlock(&sched_register_mutex);
231
232 tracing_stop_sched_switch();
233}
234
235/**
236 * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
237 * @tr: trace array pointer to assign
238 *
239 * Some tracers might want to record the context switches in their
240 * trace. This function lets those tracers assign the trace array
241 * to use.
242 */
243void tracing_sched_switch_assign_trace(struct trace_array *tr)
244{
245 ctx_trace = tr;
246}
247
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 19bd8928ce94..8fb84b362816 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
365 wakeup_current_cpu = cpu; 365 wakeup_current_cpu = cpu;
366} 366}
367 367
368static void
369tracing_sched_switch_trace(struct trace_array *tr,
370 struct task_struct *prev,
371 struct task_struct *next,
372 unsigned long flags, int pc)
373{
374 struct ftrace_event_call *call = &event_context_switch;
375 struct ring_buffer *buffer = tr->trace_buffer.buffer;
376 struct ring_buffer_event *event;
377 struct ctx_switch_entry *entry;
378
379 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
380 sizeof(*entry), flags, pc);
381 if (!event)
382 return;
383 entry = ring_buffer_event_data(event);
384 entry->prev_pid = prev->pid;
385 entry->prev_prio = prev->prio;
386 entry->prev_state = prev->state;
387 entry->next_pid = next->pid;
388 entry->next_prio = next->prio;
389 entry->next_state = next->state;
390 entry->next_cpu = task_cpu(next);
391
392 if (!call_filter_check_discard(call, entry, buffer, event))
393 trace_buffer_unlock_commit(buffer, event, flags, pc);
394}
395
396static void
397tracing_sched_wakeup_trace(struct trace_array *tr,
398 struct task_struct *wakee,
399 struct task_struct *curr,
400 unsigned long flags, int pc)
401{
402 struct ftrace_event_call *call = &event_wakeup;
403 struct ring_buffer_event *event;
404 struct ctx_switch_entry *entry;
405 struct ring_buffer *buffer = tr->trace_buffer.buffer;
406
407 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
408 sizeof(*entry), flags, pc);
409 if (!event)
410 return;
411 entry = ring_buffer_event_data(event);
412 entry->prev_pid = curr->pid;
413 entry->prev_prio = curr->prio;
414 entry->prev_state = curr->state;
415 entry->next_pid = wakee->pid;
416 entry->next_prio = wakee->prio;
417 entry->next_state = wakee->state;
418 entry->next_cpu = task_cpu(wakee);
419
420 if (!call_filter_check_discard(call, entry, buffer, event))
421 trace_buffer_unlock_commit(buffer, event, flags, pc);
422}
423
368static void notrace 424static void notrace
369probe_wakeup_sched_switch(void *ignore, 425probe_wakeup_sched_switch(void *ignore,
370 struct task_struct *prev, struct task_struct *next) 426 struct task_struct *prev, struct task_struct *next)
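The trace_seq.c diff that follows rebuilds trace_seq on top of seq_buf, and every writer repeats one idiom: snapshot seq.len, let seq_buf do the formatting, and roll the length back on overflow so no partial write leaks into the output. A simplified sketch of that idiom (illustrative only; the real writers also lazily initialize the seq_buf via __trace_seq_init()):

	#include <linux/kernel.h>
	#include <linux/seq_buf.h>
	#include <linux/trace_seq.h>

	static void example_seq_printf(struct trace_seq *s, const char *fmt, ...)
	{
		unsigned int save_len = s->seq.len;	/* snapshot before writing */
		va_list ap;

		if (s->full)		/* a previous write already overflowed */
			return;

		va_start(ap, fmt);
		seq_buf_vprintf(&s->seq, fmt, ap);
		va_end(ap);

		/* If it did not all fit, pretend none of it was written. */
		if (seq_buf_has_overflowed(&s->seq)) {
			s->seq.len = save_len;
			s->full = 1;
		}
	}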
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 1f24ed99dca2..f8b45d8792f9 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -27,10 +27,19 @@
27#include <linux/trace_seq.h> 27#include <linux/trace_seq.h>
28 28
29/* How much buffer is left on the trace_seq? */ 29/* How much buffer is left on the trace_seq? */
30#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) 30#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq)
31 31
32/* How much buffer is written? */ 32/* How much buffer is written? */
33#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) 33#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq)
34
35/*
36 * trace_seq should work with being initialized with 0s.
37 */
38static inline void __trace_seq_init(struct trace_seq *s)
39{
40 if (unlikely(!s->seq.size))
41 trace_seq_init(s);
42}
34 43
35/** 44/**
36 * trace_print_seq - move the contents of trace_seq into a seq_file 45 * trace_print_seq - move the contents of trace_seq into a seq_file
@@ -43,10 +52,11 @@
43 */ 52 */
44int trace_print_seq(struct seq_file *m, struct trace_seq *s) 53int trace_print_seq(struct seq_file *m, struct trace_seq *s)
45{ 54{
46 unsigned int len = TRACE_SEQ_BUF_USED(s);
47 int ret; 55 int ret;
48 56
49 ret = seq_write(m, s->buffer, len); 57 __trace_seq_init(s);
58
59 ret = seq_buf_print_seq(m, &s->seq);
50 60
51 /* 61 /*
52 * Only reset this buffer if we successfully wrote to the 62 * Only reset this buffer if we successfully wrote to the
@@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
69 * trace_seq_printf() is used to store strings into a special 79 * trace_seq_printf() is used to store strings into a special
70 * buffer (@s). Then the output may be either used by 80 * buffer (@s). Then the output may be either used by
71 * the sequencer or pulled into another buffer. 81 * the sequencer or pulled into another buffer.
72 *
73 * Returns 1 if we successfully written all the contents to
74 * the buffer.
75 * Returns 0 if we the length to write is bigger than the
76 * reserved buffer space. In this case, nothing gets written.
77 */ 82 */
78int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 83void trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
79{ 84{
80 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 85 unsigned int save_len = s->seq.len;
81 va_list ap; 86 va_list ap;
82 int ret;
83 87
84 if (s->full || !len) 88 if (s->full)
85 return 0; 89 return;
90
91 __trace_seq_init(s);
86 92
87 va_start(ap, fmt); 93 va_start(ap, fmt);
88 ret = vsnprintf(s->buffer + s->len, len, fmt, ap); 94 seq_buf_vprintf(&s->seq, fmt, ap);
89 va_end(ap); 95 va_end(ap);
90 96
91 /* If we can't write it all, don't bother writing anything */ 97 /* If we can't write it all, don't bother writing anything */
92 if (ret >= len) { 98 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
99 s->seq.len = save_len;
93 s->full = 1; 100 s->full = 1;
94 return 0;
95 } 101 }
96
97 s->len += ret;
98
99 return 1;
100} 102}
101EXPORT_SYMBOL_GPL(trace_seq_printf); 103EXPORT_SYMBOL_GPL(trace_seq_printf);
102 104
@@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf);
107 * @nmaskbits: The number of bits that are valid in @maskp 109 * @nmaskbits: The number of bits that are valid in @maskp
108 * 110 *
109 * Writes a ASCII representation of a bitmask string into @s. 111 * Writes a ASCII representation of a bitmask string into @s.
110 *
111 * Returns 1 if we successfully written all the contents to
112 * the buffer.
113 * Returns 0 if we the length to write is bigger than the
114 * reserved buffer space. In this case, nothing gets written.
115 */ 112 */
116int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, 113void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
117 int nmaskbits) 114 int nmaskbits)
118{ 115{
119 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 116 unsigned int save_len = s->seq.len;
120 int ret;
121 117
122 if (s->full || !len) 118 if (s->full)
123 return 0; 119 return;
124 120
125 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); 121 __trace_seq_init(s);
126 s->len += ret;
127 122
128 return 1; 123 seq_buf_bitmask(&s->seq, maskp, nmaskbits);
124
125 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
126 s->seq.len = save_len;
127 s->full = 1;
128 }
129} 129}
130EXPORT_SYMBOL_GPL(trace_seq_bitmask); 130EXPORT_SYMBOL_GPL(trace_seq_bitmask);
131 131
@@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
139 * trace_seq_printf is used to store strings into a special 139 * trace_seq_printf is used to store strings into a special
140 * buffer (@s). Then the output may be either used by 140 * buffer (@s). Then the output may be either used by
141 * the sequencer or pulled into another buffer. 141 * the sequencer or pulled into another buffer.
142 *
143 * Returns how much it wrote to the buffer.
144 */ 142 */
145int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) 143void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
146{ 144{
147 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 145 unsigned int save_len = s->seq.len;
148 int ret;
149 146
150 if (s->full || !len) 147 if (s->full)
151 return 0; 148 return;
152 149
153 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 150 __trace_seq_init(s);
151
152 seq_buf_vprintf(&s->seq, fmt, args);
154 153
155 /* If we can't write it all, don't bother writing anything */ 154 /* If we can't write it all, don't bother writing anything */
156 if (ret >= len) { 155 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
156 s->seq.len = save_len;
157 s->full = 1; 157 s->full = 1;
158 return 0;
159 } 158 }
160
161 s->len += ret;
162
163 return len;
164} 159}
165EXPORT_SYMBOL_GPL(trace_seq_vprintf); 160EXPORT_SYMBOL_GPL(trace_seq_vprintf);
166 161
@@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf);
178 * 173 *
179 * This function will take the format and the binary array and finish 174 * This function will take the format and the binary array and finish
180 * the conversion into the ASCII string within the buffer. 175 * the conversion into the ASCII string within the buffer.
181 *
182 * Returns how much it wrote to the buffer.
183 */ 176 */
184int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) 177void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
185{ 178{
186 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 179 unsigned int save_len = s->seq.len;
187 int ret;
188 180
189 if (s->full || !len) 181 if (s->full)
190 return 0; 182 return;
183
184 __trace_seq_init(s);
191 185
192 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 186 seq_buf_bprintf(&s->seq, fmt, binary);
193 187
194 /* If we can't write it all, don't bother writing anything */ 188 /* If we can't write it all, don't bother writing anything */
195 if (ret >= len) { 189 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
190 s->seq.len = save_len;
196 s->full = 1; 191 s->full = 1;
197 return 0; 192 return;
198 } 193 }
199
200 s->len += ret;
201
202 return len;
203} 194}
204EXPORT_SYMBOL_GPL(trace_seq_bprintf); 195EXPORT_SYMBOL_GPL(trace_seq_bprintf);
205 196
@@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf);
212 * copy to user routines. This function records a simple string 203 * copy to user routines. This function records a simple string
213 * into a special buffer (@s) for later retrieval by a sequencer 204 * into a special buffer (@s) for later retrieval by a sequencer
214 * or other mechanism. 205 * or other mechanism.
215 *
216 * Returns how much it wrote to the buffer.
217 */ 206 */
218int trace_seq_puts(struct trace_seq *s, const char *str) 207void trace_seq_puts(struct trace_seq *s, const char *str)
219{ 208{
220 unsigned int len = strlen(str); 209 unsigned int len = strlen(str);
221 210
222 if (s->full) 211 if (s->full)
223 return 0; 212 return;
213
214 __trace_seq_init(s);
224 215
225 if (len > TRACE_SEQ_BUF_LEFT(s)) { 216 if (len > TRACE_SEQ_BUF_LEFT(s)) {
226 s->full = 1; 217 s->full = 1;
227 return 0; 218 return;
228 } 219 }
229 220
230 memcpy(s->buffer + s->len, str, len); 221 seq_buf_putmem(&s->seq, str, len);
231 s->len += len;
232
233 return len;
234} 222}
235EXPORT_SYMBOL_GPL(trace_seq_puts); 223EXPORT_SYMBOL_GPL(trace_seq_puts);
236 224
@@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
243 * copy to user routines. This function records a simple charater 231 * copy to user routines. This function records a simple charater
244 * into a special buffer (@s) for later retrieval by a sequencer 232 * into a special buffer (@s) for later retrieval by a sequencer
245 * or other mechanism. 233 * or other mechanism.
246 *
247 * Returns how much it wrote to the buffer.
248 */ 234 */
249int trace_seq_putc(struct trace_seq *s, unsigned char c) 235void trace_seq_putc(struct trace_seq *s, unsigned char c)
250{ 236{
251 if (s->full) 237 if (s->full)
252 return 0; 238 return;
239
240 __trace_seq_init(s);
253 241
254 if (TRACE_SEQ_BUF_LEFT(s) < 1) { 242 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
255 s->full = 1; 243 s->full = 1;
256 return 0; 244 return;
257 } 245 }
258 246
259 s->buffer[s->len++] = c; 247 seq_buf_putc(&s->seq, c);
260
261 return 1;
262} 248}
263EXPORT_SYMBOL_GPL(trace_seq_putc); 249EXPORT_SYMBOL_GPL(trace_seq_putc);
264 250
@@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc);
271 * There may be cases where raw memory needs to be written into the 257 * There may be cases where raw memory needs to be written into the
272 * buffer and a strcpy() would not work. Using this function allows 258 * buffer and a strcpy() would not work. Using this function allows
273 * for such cases. 259 * for such cases.
274 *
275 * Returns how much it wrote to the buffer.
276 */ 260 */
277int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) 261void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
278{ 262{
279 if (s->full) 263 if (s->full)
280 return 0; 264 return;
265
266 __trace_seq_init(s);
281 267
282 if (len > TRACE_SEQ_BUF_LEFT(s)) { 268 if (len > TRACE_SEQ_BUF_LEFT(s)) {
283 s->full = 1; 269 s->full = 1;
284 return 0; 270 return;
285 } 271 }
286 272
287 memcpy(s->buffer + s->len, mem, len); 273 seq_buf_putmem(&s->seq, mem, len);
288 s->len += len;
289
290 return len;
291} 274}
292EXPORT_SYMBOL_GPL(trace_seq_putmem); 275EXPORT_SYMBOL_GPL(trace_seq_putmem);
293 276
294#define MAX_MEMHEX_BYTES 8U
295#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
296
297/** 277/**
298 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex 278 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
299 * @s: trace sequence descriptor 279 * @s: trace sequence descriptor
@@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem);
303 * This is similar to trace_seq_putmem() except instead of just copying the 283 * This is similar to trace_seq_putmem() except instead of just copying the
304 * raw memory into the buffer it writes its ASCII representation of it 284 * raw memory into the buffer it writes its ASCII representation of it
305 * in hex characters. 285 * in hex characters.
306 *
307 * Returns how much it wrote to the buffer.
308 */ 286 */
309int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, 287void trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
310 unsigned int len) 288 unsigned int len)
311{ 289{
312 unsigned char hex[HEX_CHARS]; 290 unsigned int save_len = s->seq.len;
313 const unsigned char *data = mem;
314 unsigned int start_len;
315 int i, j;
316 int cnt = 0;
317 291
318 if (s->full) 292 if (s->full)
319 return 0; 293 return;
320 294
321 while (len) { 295 __trace_seq_init(s);
322 start_len = min(len, HEX_CHARS - 1); 296
323#ifdef __BIG_ENDIAN 297 /* Each byte is represented by two chars */
324 for (i = 0, j = 0; i < start_len; i++) { 298 if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) {
325#else 299 s->full = 1;
326 for (i = start_len-1, j = 0; i >= 0; i--) { 300 return;
327#endif 301 }
328 hex[j++] = hex_asc_hi(data[i]); 302
329 hex[j++] = hex_asc_lo(data[i]); 303 /* The added spaces can still cause an overflow */
330 } 304 seq_buf_putmem_hex(&s->seq, mem, len);
331 if (WARN_ON_ONCE(j == 0 || j/2 > len)) 305
332 break; 306 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
333 307 s->seq.len = save_len;
334 /* j increments twice per loop */ 308 s->full = 1;
335 len -= j / 2; 309 return;
336 hex[j++] = ' ';
337
338 cnt += trace_seq_putmem(s, hex, j);
339 } 310 }
340 return cnt;
341} 311}
342EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); 312EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
343 313
@@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
355 */ 325 */
356int trace_seq_path(struct trace_seq *s, const struct path *path) 326int trace_seq_path(struct trace_seq *s, const struct path *path)
357{ 327{
358 unsigned char *p; 328 unsigned int save_len = s->seq.len;
359 329
360 if (s->full) 330 if (s->full)
361 return 0; 331 return 0;
362 332
333 __trace_seq_init(s);
334
363 if (TRACE_SEQ_BUF_LEFT(s) < 1) { 335 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
364 s->full = 1; 336 s->full = 1;
365 return 0; 337 return 0;
366 } 338 }
367 339
368 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 340 seq_buf_path(&s->seq, path, "\n");
369 if (!IS_ERR(p)) { 341
370 p = mangle_path(s->buffer + s->len, p, "\n"); 342 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
371 if (p) { 343 s->seq.len = save_len;
372 s->len = p - s->buffer; 344 s->full = 1;
373 return 1; 345 return 0;
374 }
375 } else {
376 s->buffer[s->len++] = '?';
377 return 1;
378 } 346 }
379 347
380 s->full = 1; 348 return 1;
381 return 0;
382} 349}
383EXPORT_SYMBOL_GPL(trace_seq_path); 350EXPORT_SYMBOL_GPL(trace_seq_path);
384 351
@@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
404 */ 371 */
405int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) 372int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
406{ 373{
407 int len; 374 __trace_seq_init(s);
408 int ret; 375 return seq_buf_to_user(&s->seq, ubuf, cnt);
409
410 if (!cnt)
411 return 0;
412
413 if (s->len <= s->readpos)
414 return -EBUSY;
415
416 len = s->len - s->readpos;
417 if (cnt > len)
418 cnt = len;
419 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
420 if (ret == cnt)
421 return -EFAULT;
422
423 cnt -= ret;
424
425 s->readpos += cnt;
426 return cnt;
427} 376}
428EXPORT_SYMBOL_GPL(trace_seq_to_user); 377EXPORT_SYMBOL_GPL(trace_seq_to_user);
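The trace_seq changes above move the implementation onto seq_buf and drop the per-call byte counts: a write that would overflow restores the saved length and sets s->full, and every later write becomes a no-op. Callers can therefore emit a whole line without checking each call and test the overflow state once at the end. A minimal sketch of an output callback written against the new API (the "foo" event and its output are made up for illustration, not taken from this patch):

static enum print_line_t foo_trace_output(struct trace_iterator *iter, int flags,
					  struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;

	/* No per-call error checks: after an overflow, later writes are
	 * silently dropped and the failure stays recorded in the trace_seq. */
	trace_seq_puts(s, "foo event: ");
	trace_seq_printf(s, "pid=%d", iter->ent->pid);
	trace_seq_putc(s, '\n');

	/* One check at the end replaces the old TRACE_TYPE_PARTIAL_LINE
	 * plumbing; see trace_handle_return() in the syscall callbacks below. */
	return trace_handle_return(s);
}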
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 29228c4d5696..f97f6e3a676c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
114 struct trace_entry *ent = iter->ent; 114 struct trace_entry *ent = iter->ent;
115 struct syscall_trace_enter *trace; 115 struct syscall_trace_enter *trace;
116 struct syscall_metadata *entry; 116 struct syscall_metadata *entry;
117 int i, ret, syscall; 117 int i, syscall;
118 118
119 trace = (typeof(trace))ent; 119 trace = (typeof(trace))ent;
120 syscall = trace->nr; 120 syscall = trace->nr;
@@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
128 goto end; 128 goto end;
129 } 129 }
130 130
131 ret = trace_seq_printf(s, "%s(", entry->name); 131 trace_seq_printf(s, "%s(", entry->name);
132 if (!ret)
133 return TRACE_TYPE_PARTIAL_LINE;
134 132
135 for (i = 0; i < entry->nb_args; i++) { 133 for (i = 0; i < entry->nb_args; i++) {
134
135 if (trace_seq_has_overflowed(s))
136 goto end;
137
136 /* parameter types */ 138 /* parameter types */
137 if (trace_flags & TRACE_ITER_VERBOSE) { 139 if (trace_flags & TRACE_ITER_VERBOSE)
138 ret = trace_seq_printf(s, "%s ", entry->types[i]); 140 trace_seq_printf(s, "%s ", entry->types[i]);
139 if (!ret) 141
140 return TRACE_TYPE_PARTIAL_LINE;
141 }
142 /* parameter values */ 142 /* parameter values */
143 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], 143 trace_seq_printf(s, "%s: %lx%s", entry->args[i],
144 trace->args[i], 144 trace->args[i],
145 i == entry->nb_args - 1 ? "" : ", "); 145 i == entry->nb_args - 1 ? "" : ", ");
146 if (!ret)
147 return TRACE_TYPE_PARTIAL_LINE;
148 } 146 }
149 147
150 ret = trace_seq_putc(s, ')'); 148 trace_seq_putc(s, ')');
151 if (!ret)
152 return TRACE_TYPE_PARTIAL_LINE;
153
154end: 149end:
155 ret = trace_seq_putc(s, '\n'); 150 trace_seq_putc(s, '\n');
156 if (!ret)
157 return TRACE_TYPE_PARTIAL_LINE;
158 151
159 return TRACE_TYPE_HANDLED; 152 return trace_handle_return(s);
160} 153}
161 154
162static enum print_line_t 155static enum print_line_t
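trace_handle_return() itself is added by this series in a header outside this kernel/ diff; judging from how the converted callbacks use it, it presumably just folds the overflow test into the print_line_t return value, roughly:

static inline enum print_line_t trace_handle_return(struct trace_seq *s)
{
	return trace_seq_has_overflowed(s) ?
		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
}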
@@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
168 struct syscall_trace_exit *trace; 161 struct syscall_trace_exit *trace;
169 int syscall; 162 int syscall;
170 struct syscall_metadata *entry; 163 struct syscall_metadata *entry;
171 int ret;
172 164
173 trace = (typeof(trace))ent; 165 trace = (typeof(trace))ent;
174 syscall = trace->nr; 166 syscall = trace->nr;
@@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
176 168
177 if (!entry) { 169 if (!entry) {
178 trace_seq_putc(s, '\n'); 170 trace_seq_putc(s, '\n');
179 return TRACE_TYPE_HANDLED; 171 goto out;
180 } 172 }
181 173
182 if (entry->exit_event->event.type != ent->type) { 174 if (entry->exit_event->event.type != ent->type) {
@@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
184 return TRACE_TYPE_UNHANDLED; 176 return TRACE_TYPE_UNHANDLED;
185 } 177 }
186 178
187 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 179 trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
188 trace->ret); 180 trace->ret);
189 if (!ret)
190 return TRACE_TYPE_PARTIAL_LINE;
191 181
192 return TRACE_TYPE_HANDLED; 182 out:
183 return trace_handle_return(s);
193} 184}
194 185
195extern char *__bad_type_size(void); 186extern char *__bad_type_size(void);
@@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
523 return (unsigned long)sys_call_table[nr]; 514 return (unsigned long)sys_call_table[nr];
524} 515}
525 516
526static int __init init_ftrace_syscalls(void) 517void __init init_ftrace_syscalls(void)
527{ 518{
528 struct syscall_metadata *meta; 519 struct syscall_metadata *meta;
529 unsigned long addr; 520 unsigned long addr;
@@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void)
533 GFP_KERNEL); 524 GFP_KERNEL);
534 if (!syscalls_metadata) { 525 if (!syscalls_metadata) {
535 WARN_ON(1); 526 WARN_ON(1);
536 return -ENOMEM; 527 return;
537 } 528 }
538 529
539 for (i = 0; i < NR_syscalls; i++) { 530 for (i = 0; i < NR_syscalls; i++) {
@@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void)
545 meta->syscall_nr = i; 536 meta->syscall_nr = i;
546 syscalls_metadata[i] = meta; 537 syscalls_metadata[i] = meta;
547 } 538 }
548
549 return 0;
550} 539}
551early_initcall(init_ftrace_syscalls);
552 540
553#ifdef CONFIG_PERF_EVENTS 541#ifdef CONFIG_PERF_EVENTS
554 542
@@ -586,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
586 size -= sizeof(u32); 574 size -= sizeof(u32);
587 575
588 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 576 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
589 sys_data->enter_event->event.type, regs, &rctx); 577 sys_data->enter_event->event.type, NULL, &rctx);
590 if (!rec) 578 if (!rec)
591 return; 579 return;
592 580
@@ -659,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
659 size -= sizeof(u32); 647 size -= sizeof(u32);
660 648
661 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 649 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
662 sys_data->exit_event->event.type, regs, &rctx); 650 sys_data->exit_event->event.type, NULL, &rctx);
663 if (!rec) 651 if (!rec)
664 return; 652 return;
665 653
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 33ff6a24b802..b11441321e7a 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -552,8 +552,7 @@ error:
552 return ret; 552 return ret;
553 553
554fail_address_parse: 554fail_address_parse:
555 if (inode) 555 iput(inode);
556 iput(inode);
557 556
558 pr_info("Failed to parse address or file.\n"); 557 pr_info("Failed to parse address or file.\n");
559 558
@@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
606 for (i = 0; i < tu->tp.nr_args; i++) 605 for (i = 0; i < tu->tp.nr_args; i++)
607 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); 606 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
608 607
609 seq_printf(m, "\n"); 608 seq_putc(m, '\n');
610 return 0; 609 return 0;
611} 610}
612 611
@@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
852 tu = container_of(event, struct trace_uprobe, tp.call.event); 851 tu = container_of(event, struct trace_uprobe, tp.call.event);
853 852
854 if (is_ret_probe(tu)) { 853 if (is_ret_probe(tu)) {
855 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", 854 trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
856 ftrace_event_name(&tu->tp.call), 855 ftrace_event_name(&tu->tp.call),
857 entry->vaddr[1], entry->vaddr[0])) 856 entry->vaddr[1], entry->vaddr[0]);
858 goto partial;
859 data = DATAOF_TRACE_ENTRY(entry, true); 857 data = DATAOF_TRACE_ENTRY(entry, true);
860 } else { 858 } else {
861 if (!trace_seq_printf(s, "%s: (0x%lx)", 859 trace_seq_printf(s, "%s: (0x%lx)",
862 ftrace_event_name(&tu->tp.call), 860 ftrace_event_name(&tu->tp.call),
863 entry->vaddr[0])) 861 entry->vaddr[0]);
864 goto partial;
865 data = DATAOF_TRACE_ENTRY(entry, false); 862 data = DATAOF_TRACE_ENTRY(entry, false);
866 } 863 }
867 864
@@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
869 struct probe_arg *parg = &tu->tp.args[i]; 866 struct probe_arg *parg = &tu->tp.args[i];
870 867
871 if (!parg->type->print(s, parg->name, data + parg->offset, entry)) 868 if (!parg->type->print(s, parg->name, data + parg->offset, entry))
872 goto partial; 869 goto out;
873 } 870 }
874 871
875 if (trace_seq_puts(s, "\n")) 872 trace_seq_putc(s, '\n');
876 return TRACE_TYPE_HANDLED;
877 873
878partial: 874 out:
879 return TRACE_TYPE_PARTIAL_LINE; 875 return trace_handle_return(s);
880} 876}
881 877
882typedef bool (*filter_func_t)(struct uprobe_consumer *self, 878typedef bool (*filter_func_t)(struct uprobe_consumer *self,
@@ -1115,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
1115 if (hlist_empty(head)) 1111 if (hlist_empty(head))
1116 goto out; 1112 goto out;
1117 1113
1118 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1114 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1119 if (!entry) 1115 if (!entry)
1120 goto out; 1116 goto out;
1121 1117
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 602e5bbbceff..d58cc4d8f0d1 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
176 struct group_info *group_info; 176 struct group_info *group_info;
177 int retval; 177 int retval;
178 178
179 if (!ns_capable(current_user_ns(), CAP_SETGID)) 179 if (!may_setgroups())
180 return -EPERM; 180 return -EPERM;
181 if ((unsigned)gidsetsize > NGROUPS_MAX) 181 if ((unsigned)gidsetsize > NGROUPS_MAX)
182 return -EINVAL; 182 return -EINVAL;
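may_setgroups() lives in kernel/groups.c and its body is not part of this hunk; given the ns_capable() check it replaces and the userns_may_setgroups() helper added further down, it presumably combines the two, along these lines (a sketch, not the verbatim implementation):

bool may_setgroups(void)
{
	struct user_namespace *user_ns = current_user_ns();

	/* Keep the old CAP_SETGID test and add the new per-namespace gate,
	 * which refuses setgroups() until a gid mapping exists and while
	 * the namespace's setgroups knob is set to "deny". */
	return ns_capable(user_ns, CAP_SETGID) &&
	       userns_may_setgroups(user_ns);
}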
diff --git a/kernel/user.c b/kernel/user.c
index 4efa39350e44..b069ccbfb0b0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -50,7 +50,11 @@ struct user_namespace init_user_ns = {
50 .count = ATOMIC_INIT(3), 50 .count = ATOMIC_INIT(3),
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .ns.inum = PROC_USER_INIT_INO,
54#ifdef CONFIG_USER_NS
55 .ns.ops = &userns_operations,
56#endif
57 .flags = USERNS_INIT_FLAGS,
54#ifdef CONFIG_PERSISTENT_KEYRINGS 58#ifdef CONFIG_PERSISTENT_KEYRINGS
55 .persistent_keyring_register_sem = 59 .persistent_keyring_register_sem =
56 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), 60 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
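USERNS_INIT_FLAGS and the flag it carries are defined in include/linux/user_namespace.h, which is outside this kernel/ diffstat; from the way ns->flags is tested in kernel/user_namespace.c below, the definitions are presumably along the lines of:

/* Presumed header definitions (not shown in this diff): */
#define USERNS_SETGROUPS_ALLOWED	1UL

#define USERNS_INIT_FLAGS		USERNS_SETGROUPS_ALLOWED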
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index aa312b0dc3ec..4109f8320684 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -24,6 +24,7 @@
24#include <linux/fs_struct.h> 24#include <linux/fs_struct.h>
25 25
26static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
27static DEFINE_MUTEX(userns_state_mutex);
27 28
28static bool new_idmap_permitted(const struct file *file, 29static bool new_idmap_permitted(const struct file *file,
29 struct user_namespace *ns, int cap_setid, 30 struct user_namespace *ns, int cap_setid,
@@ -86,11 +87,12 @@ int create_user_ns(struct cred *new)
86 if (!ns) 87 if (!ns)
87 return -ENOMEM; 88 return -ENOMEM;
88 89
89 ret = proc_alloc_inum(&ns->proc_inum); 90 ret = ns_alloc_inum(&ns->ns);
90 if (ret) { 91 if (ret) {
91 kmem_cache_free(user_ns_cachep, ns); 92 kmem_cache_free(user_ns_cachep, ns);
92 return ret; 93 return ret;
93 } 94 }
95 ns->ns.ops = &userns_operations;
94 96
95 atomic_set(&ns->count, 1); 97 atomic_set(&ns->count, 1);
96 /* Leave the new->user_ns reference with the new user namespace. */ 98 /* Leave the new->user_ns reference with the new user namespace. */
@@ -99,6 +101,11 @@ int create_user_ns(struct cred *new)
99 ns->owner = owner; 101 ns->owner = owner;
100 ns->group = group; 102 ns->group = group;
101 103
104 /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
105 mutex_lock(&userns_state_mutex);
106 ns->flags = parent_ns->flags;
107 mutex_unlock(&userns_state_mutex);
108
102 set_cred_user_ns(new, ns); 109 set_cred_user_ns(new, ns);
103 110
104#ifdef CONFIG_PERSISTENT_KEYRINGS 111#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns)
136#ifdef CONFIG_PERSISTENT_KEYRINGS 143#ifdef CONFIG_PERSISTENT_KEYRINGS
137 key_put(ns->persistent_keyring_register); 144 key_put(ns->persistent_keyring_register);
138#endif 145#endif
139 proc_free_inum(ns->proc_inum); 146 ns_free_inum(&ns->ns);
140 kmem_cache_free(user_ns_cachep, ns); 147 kmem_cache_free(user_ns_cachep, ns);
141 ns = parent; 148 ns = parent;
142 } while (atomic_dec_and_test(&parent->count)); 149 } while (atomic_dec_and_test(&parent->count));
@@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
583 return false; 590 return false;
584} 591}
585 592
586
587static DEFINE_MUTEX(id_map_mutex);
588
589static ssize_t map_write(struct file *file, const char __user *buf, 593static ssize_t map_write(struct file *file, const char __user *buf,
590 size_t count, loff_t *ppos, 594 size_t count, loff_t *ppos,
591 int cap_setid, 595 int cap_setid,
@@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
602 ssize_t ret = -EINVAL; 606 ssize_t ret = -EINVAL;
603 607
604 /* 608 /*
605 * The id_map_mutex serializes all writes to any given map. 609 * The userns_state_mutex serializes all writes to any given map.
606 * 610 *
607 * Any map is only ever written once. 611 * Any map is only ever written once.
608 * 612 *
@@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
620 * order and smp_rmb() is guaranteed that we don't have crazy 624 * order and smp_rmb() is guaranteed that we don't have crazy
621 * architectures returning stale data. 625 * architectures returning stale data.
622 */ 626 */
623 mutex_lock(&id_map_mutex); 627 mutex_lock(&userns_state_mutex);
624 628
625 ret = -EPERM; 629 ret = -EPERM;
626 /* Only allow one successful write to the map */ 630 /* Only allow one successful write to the map */
@@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
640 if (!page) 644 if (!page)
641 goto out; 645 goto out;
642 646
643 /* Only allow <= page size writes at the beginning of the file */ 647 /* Only allow < page size writes at the beginning of the file */
644 ret = -EINVAL; 648 ret = -EINVAL;
645 if ((*ppos != 0) || (count >= PAGE_SIZE)) 649 if ((*ppos != 0) || (count >= PAGE_SIZE))
646 goto out; 650 goto out;
@@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
750 *ppos = count; 754 *ppos = count;
751 ret = count; 755 ret = count;
752out: 756out:
753 mutex_unlock(&id_map_mutex); 757 mutex_unlock(&userns_state_mutex);
754 if (page) 758 if (page)
755 free_page(page); 759 free_page(page);
756 return ret; 760 return ret;
@@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file,
812 struct user_namespace *ns, int cap_setid, 816 struct user_namespace *ns, int cap_setid,
813 struct uid_gid_map *new_map) 817 struct uid_gid_map *new_map)
814{ 818{
815 /* Allow mapping to your own filesystem ids */ 819 const struct cred *cred = file->f_cred;
816 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { 820 /* Don't allow mappings that would allow anything that wouldn't
821 * be allowed without the establishment of unprivileged mappings.
822 */
823 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
824 uid_eq(ns->owner, cred->euid)) {
817 u32 id = new_map->extent[0].lower_first; 825 u32 id = new_map->extent[0].lower_first;
818 if (cap_setid == CAP_SETUID) { 826 if (cap_setid == CAP_SETUID) {
819 kuid_t uid = make_kuid(ns->parent, id); 827 kuid_t uid = make_kuid(ns->parent, id);
820 if (uid_eq(uid, file->f_cred->fsuid)) 828 if (uid_eq(uid, cred->euid))
821 return true; 829 return true;
822 } else if (cap_setid == CAP_SETGID) { 830 } else if (cap_setid == CAP_SETGID) {
823 kgid_t gid = make_kgid(ns->parent, id); 831 kgid_t gid = make_kgid(ns->parent, id);
824 if (gid_eq(gid, file->f_cred->fsgid)) 832 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
833 gid_eq(gid, cred->egid))
825 return true; 834 return true;
826 } 835 }
827 } 836 }
@@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file,
841 return false; 850 return false;
842} 851}
843 852
844static void *userns_get(struct task_struct *task) 853int proc_setgroups_show(struct seq_file *seq, void *v)
854{
855 struct user_namespace *ns = seq->private;
856 unsigned long userns_flags = ACCESS_ONCE(ns->flags);
857
858 seq_printf(seq, "%s\n",
859 (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
860 "allow" : "deny");
861 return 0;
862}
863
864ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
865 size_t count, loff_t *ppos)
866{
867 struct seq_file *seq = file->private_data;
868 struct user_namespace *ns = seq->private;
869 char kbuf[8], *pos;
870 bool setgroups_allowed;
871 ssize_t ret;
872
873 /* Only allow a very narrow range of strings to be written */
874 ret = -EINVAL;
875 if ((*ppos != 0) || (count >= sizeof(kbuf)))
876 goto out;
877
878 /* What was written? */
879 ret = -EFAULT;
880 if (copy_from_user(kbuf, buf, count))
881 goto out;
882 kbuf[count] = '\0';
883 pos = kbuf;
884
885 /* What is being requested? */
886 ret = -EINVAL;
887 if (strncmp(pos, "allow", 5) == 0) {
888 pos += 5;
889 setgroups_allowed = true;
890 }
891 else if (strncmp(pos, "deny", 4) == 0) {
892 pos += 4;
893 setgroups_allowed = false;
894 }
895 else
896 goto out;
897
 898	/* Verify there is no trailing junk on the line */
899 pos = skip_spaces(pos);
900 if (*pos != '\0')
901 goto out;
902
903 ret = -EPERM;
904 mutex_lock(&userns_state_mutex);
905 if (setgroups_allowed) {
906 /* Enabling setgroups after setgroups has been disabled
907 * is not allowed.
908 */
909 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
910 goto out_unlock;
911 } else {
912 /* Permanently disabling setgroups after setgroups has
913 * been enabled by writing the gid_map is not allowed.
914 */
915 if (ns->gid_map.nr_extents != 0)
916 goto out_unlock;
917 ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
918 }
919 mutex_unlock(&userns_state_mutex);
920
921 /* Report a successful write */
922 *ppos = count;
923 ret = count;
924out:
925 return ret;
926out_unlock:
927 mutex_unlock(&userns_state_mutex);
928 goto out;
929}
930
931bool userns_may_setgroups(const struct user_namespace *ns)
932{
933 bool allowed;
934
935 mutex_lock(&userns_state_mutex);
936 /* It is not safe to use setgroups until a gid mapping in
937 * the user namespace has been established.
938 */
939 allowed = ns->gid_map.nr_extents != 0;
940 /* Is setgroups allowed? */
941 allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
942 mutex_unlock(&userns_state_mutex);
943
944 return allowed;
945}
946
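Taken together, the new proc_setgroups_write(), userns_may_setgroups() and the stricter new_idmap_permitted() impose a fixed ordering on an unprivileged user namespace: setgroups must be set to "deny" before a gid_map may be written, and once the gid_map exists the setting can no longer be changed. A self-contained userspace sketch of that sequence (the /proc/self/setgroups path comes from the proc side of this series and is an assumption here, as are the example id values; error handling is minimal):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static void write_file(const char *path, const char *line)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, line, strlen(line)) != (ssize_t)strlen(line))
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	char map[64];
	uid_t uid = geteuid();
	gid_t gid = getegid();

	if (unshare(CLONE_NEWUSER)) {
		perror("unshare");
		return 1;
	}

	/* Must happen before gid_map is written and cannot be flipped back
	 * to "allow" afterwards; without it the unprivileged gid_map write
	 * below is rejected by new_idmap_permitted(). */
	write_file("/proc/self/setgroups", "deny");

	/* Single-extent self-maps: in-namespace id 0 maps to the creator's
	 * effective ids, the only case still allowed without CAP_SETUID /
	 * CAP_SETGID in the parent namespace. */
	snprintf(map, sizeof(map), "0 %d 1", (int)uid);
	write_file("/proc/self/uid_map", map);
	snprintf(map, sizeof(map), "0 %d 1", (int)gid);
	write_file("/proc/self/gid_map", map);

	printf("uid inside the new namespace: %d\n", (int)getuid());
	return 0;
}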
947static inline struct user_namespace *to_user_ns(struct ns_common *ns)
948{
949 return container_of(ns, struct user_namespace, ns);
950}
951
952static struct ns_common *userns_get(struct task_struct *task)
845{ 953{
846 struct user_namespace *user_ns; 954 struct user_namespace *user_ns;
847 955
@@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task)
849 user_ns = get_user_ns(__task_cred(task)->user_ns); 957 user_ns = get_user_ns(__task_cred(task)->user_ns);
850 rcu_read_unlock(); 958 rcu_read_unlock();
851 959
852 return user_ns; 960 return user_ns ? &user_ns->ns : NULL;
853} 961}
854 962
855static void userns_put(void *ns) 963static void userns_put(struct ns_common *ns)
856{ 964{
857 put_user_ns(ns); 965 put_user_ns(to_user_ns(ns));
858} 966}
859 967
860static int userns_install(struct nsproxy *nsproxy, void *ns) 968static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
861{ 969{
862 struct user_namespace *user_ns = ns; 970 struct user_namespace *user_ns = to_user_ns(ns);
863 struct cred *cred; 971 struct cred *cred;
864 972
865 /* Don't allow gaining capabilities by reentering 973 /* Don't allow gaining capabilities by reentering
@@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
888 return commit_creds(cred); 996 return commit_creds(cred);
889} 997}
890 998
891static unsigned int userns_inum(void *ns)
892{
893 struct user_namespace *user_ns = ns;
894 return user_ns->proc_inum;
895}
896
897const struct proc_ns_operations userns_operations = { 999const struct proc_ns_operations userns_operations = {
898 .name = "user", 1000 .name = "user",
899 .type = CLONE_NEWUSER, 1001 .type = CLONE_NEWUSER,
900 .get = userns_get, 1002 .get = userns_get,
901 .put = userns_put, 1003 .put = userns_put,
902 .install = userns_install, 1004 .install = userns_install,
903 .inum = userns_inum,
904}; 1005};
905 1006
906static __init int user_namespaces_init(void) 1007static __init int user_namespaces_init(void)
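The userns_get()/userns_put()/userns_install() rework above is one instance of a conversion applied to every namespace type in this series (the uts namespace follows below): the per-type struct embeds a struct ns_common, the proc_ns_operations callbacks trade void pointers for that common part, and container_of() recovers the enclosing struct. A generic sketch of the pattern, using a hypothetical "foo" namespace rather than any real one:

static const struct proc_ns_operations foons_operations = {
	.name = "foo",
	/* .get, .put and .install would follow the userns_operations pattern above */
};

struct foo_namespace {
	struct ns_common ns;	/* common part: inode number plus ops pointer */
	/* type-specific state would follow here */
};

static inline struct foo_namespace *to_foo_ns(struct ns_common *ns)
{
	return container_of(ns, struct foo_namespace, ns);
}

static int foo_ns_init(struct foo_namespace *foo)
{
	int err = ns_alloc_inum(&foo->ns);	/* was proc_alloc_inum(&ns->proc_inum) */

	if (err)
		return err;
	foo->ns.ops = &foons_operations;	/* replaces the old .inum callback */
	return 0;
}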
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 883aaaa7de8a..831ea7108232 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
42 if (!ns) 42 if (!ns)
43 return ERR_PTR(-ENOMEM); 43 return ERR_PTR(-ENOMEM);
44 44
45 err = proc_alloc_inum(&ns->proc_inum); 45 err = ns_alloc_inum(&ns->ns);
46 if (err) { 46 if (err) {
47 kfree(ns); 47 kfree(ns);
48 return ERR_PTR(err); 48 return ERR_PTR(err);
49 } 49 }
50 50
51 ns->ns.ops = &utsns_operations;
52
51 down_read(&uts_sem); 53 down_read(&uts_sem);
52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 54 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
53 ns->user_ns = get_user_ns(user_ns); 55 ns->user_ns = get_user_ns(user_ns);
@@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref)
84 86
85 ns = container_of(kref, struct uts_namespace, kref); 87 ns = container_of(kref, struct uts_namespace, kref);
86 put_user_ns(ns->user_ns); 88 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum); 89 ns_free_inum(&ns->ns);
88 kfree(ns); 90 kfree(ns);
89} 91}
90 92
91static void *utsns_get(struct task_struct *task) 93static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
94{
95 return container_of(ns, struct uts_namespace, ns);
96}
97
98static struct ns_common *utsns_get(struct task_struct *task)
92{ 99{
93 struct uts_namespace *ns = NULL; 100 struct uts_namespace *ns = NULL;
94 struct nsproxy *nsproxy; 101 struct nsproxy *nsproxy;
@@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task)
101 } 108 }
102 task_unlock(task); 109 task_unlock(task);
103 110
104 return ns; 111 return ns ? &ns->ns : NULL;
105} 112}
106 113
107static void utsns_put(void *ns) 114static void utsns_put(struct ns_common *ns)
108{ 115{
109 put_uts_ns(ns); 116 put_uts_ns(to_uts_ns(ns));
110} 117}
111 118
112static int utsns_install(struct nsproxy *nsproxy, void *new) 119static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
113{ 120{
114 struct uts_namespace *ns = new; 121 struct uts_namespace *ns = to_uts_ns(new);
115 122
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || 123 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 124 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
123 return 0; 130 return 0;
124} 131}
125 132
126static unsigned int utsns_inum(void *vp)
127{
128 struct uts_namespace *ns = vp;
129
130 return ns->proc_inum;
131}
132
133const struct proc_ns_operations utsns_operations = { 133const struct proc_ns_operations utsns_operations = {
134 .name = "uts", 134 .name = "uts",
135 .type = CLONE_NEWUTS, 135 .type = CLONE_NEWUTS,
136 .get = utsns_get, 136 .get = utsns_get,
137 .put = utsns_put, 137 .put = utsns_put,
138 .install = utsns_install, 138 .install = utsns_install,
139 .inum = utsns_inum,
140}; 139};
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 09b685daee3d..beeeac9e0e3e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1804 struct worker_pool *pool = (void *)__pool; 1804 struct worker_pool *pool = (void *)__pool;
1805 struct work_struct *work; 1805 struct work_struct *work;
1806 1806
1807 spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ 1807 spin_lock_irq(&pool->lock);
1808 spin_lock(&pool->lock); 1808 spin_lock(&wq_mayday_lock); /* for wq->maydays */
1809 1809
1810 if (need_to_create_worker(pool)) { 1810 if (need_to_create_worker(pool)) {
1811 /* 1811 /*
@@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1818 send_mayday(work); 1818 send_mayday(work);
1819 } 1819 }
1820 1820
1821 spin_unlock(&pool->lock); 1821 spin_unlock(&wq_mayday_lock);
1822 spin_unlock_irq(&wq_mayday_lock); 1822 spin_unlock_irq(&pool->lock);
1823 1823
1824 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1824 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1825} 1825}
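This hunk only swaps the nesting of the two locks; the reason shows up in the rescuer changes below, where the rescuer now wants to move a pwq back onto wq->maydays while it is already holding pool->lock. After the change both paths presumably share the same nesting, with pool->lock outermost:

	spin_lock_irq(&pool->lock);	/* outer: per-pool lock, irqs disabled */
	spin_lock(&wq_mayday_lock);	/* inner: guards wq->maydays */
	/* add to or scan wq->maydays here */
	spin_unlock(&wq_mayday_lock);
	spin_unlock_irq(&pool->lock);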
@@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool)
1841 * spin_lock_irq(pool->lock) which may be released and regrabbed 1841 * spin_lock_irq(pool->lock) which may be released and regrabbed
1842 * multiple times. Does GFP_KERNEL allocations. Called only from 1842 * multiple times. Does GFP_KERNEL allocations. Called only from
1843 * manager. 1843 * manager.
1844 *
1845 * Return:
1846 * %false if no action was taken and pool->lock stayed locked, %true
1847 * otherwise.
1848 */ 1844 */
1849static bool maybe_create_worker(struct worker_pool *pool) 1845static void maybe_create_worker(struct worker_pool *pool)
1850__releases(&pool->lock) 1846__releases(&pool->lock)
1851__acquires(&pool->lock) 1847__acquires(&pool->lock)
1852{ 1848{
1853 if (!need_to_create_worker(pool))
1854 return false;
1855restart: 1849restart:
1856 spin_unlock_irq(&pool->lock); 1850 spin_unlock_irq(&pool->lock);
1857 1851
@@ -1877,7 +1871,6 @@ restart:
1877 */ 1871 */
1878 if (need_to_create_worker(pool)) 1872 if (need_to_create_worker(pool))
1879 goto restart; 1873 goto restart;
1880 return true;
1881} 1874}
1882 1875
1883/** 1876/**
@@ -1897,16 +1890,14 @@ restart:
1897 * multiple times. Does GFP_KERNEL allocations. 1890 * multiple times. Does GFP_KERNEL allocations.
1898 * 1891 *
1899 * Return: 1892 * Return:
1900 * %false if the pool don't need management and the caller can safely start 1893 * %false if the pool doesn't need management and the caller can safely
1901 * processing works, %true indicates that the function released pool->lock 1894 * start processing works, %true if management function was performed and
1902 * and reacquired it to perform some management function and that the 1895 * the conditions that the caller verified before calling the function may
1903 * conditions that the caller verified while holding the lock before 1896 * no longer be true.
1904 * calling the function might no longer be true.
1905 */ 1897 */
1906static bool manage_workers(struct worker *worker) 1898static bool manage_workers(struct worker *worker)
1907{ 1899{
1908 struct worker_pool *pool = worker->pool; 1900 struct worker_pool *pool = worker->pool;
1909 bool ret = false;
1910 1901
1911 /* 1902 /*
1912 * Anyone who successfully grabs manager_arb wins the arbitration 1903 * Anyone who successfully grabs manager_arb wins the arbitration
@@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker)
1919 * actual management, the pool may stall indefinitely. 1910 * actual management, the pool may stall indefinitely.
1920 */ 1911 */
1921 if (!mutex_trylock(&pool->manager_arb)) 1912 if (!mutex_trylock(&pool->manager_arb))
1922 return ret; 1913 return false;
1923 1914
1924 ret |= maybe_create_worker(pool); 1915 maybe_create_worker(pool);
1925 1916
1926 mutex_unlock(&pool->manager_arb); 1917 mutex_unlock(&pool->manager_arb);
1927 return ret; 1918 return true;
1928} 1919}
1929 1920
1930/** 1921/**
@@ -2248,12 +2239,30 @@ repeat:
2248 * Slurp in all works issued via this workqueue and 2239 * Slurp in all works issued via this workqueue and
2249 * process'em. 2240 * process'em.
2250 */ 2241 */
2251 WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); 2242 WARN_ON_ONCE(!list_empty(scheduled));
2252 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2243 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2253 if (get_work_pwq(work) == pwq) 2244 if (get_work_pwq(work) == pwq)
2254 move_linked_works(work, scheduled, &n); 2245 move_linked_works(work, scheduled, &n);
2255 2246
2256 process_scheduled_works(rescuer); 2247 if (!list_empty(scheduled)) {
2248 process_scheduled_works(rescuer);
2249
2250 /*
2251 * The above execution of rescued work items could
2252 * have created more to rescue through
2253 * pwq_activate_first_delayed() or chained
2254 * queueing. Let's put @pwq back on mayday list so
2255 * that such back-to-back work items, which may be
2256 * being used to relieve memory pressure, don't
 2257	 * incur MAYDAY_INTERVAL delay in between.
2258 */
2259 if (need_to_create_worker(pool)) {
2260 spin_lock(&wq_mayday_lock);
2261 get_pwq(pwq);
2262 list_move_tail(&pwq->mayday_node, &wq->maydays);
2263 spin_unlock(&wq_mayday_lock);
2264 }
2265 }
2257 2266
2258 /* 2267 /*
2259 * Put the reference grabbed by send_mayday(). @pool won't 2268 * Put the reference grabbed by send_mayday(). @pool won't