Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 12
-rw-r--r--  kernel/audit_tree.c | 16
-rw-r--r--  kernel/auditfilter.c | 23
-rw-r--r--  kernel/auditsc.c | 56
-rw-r--r--  kernel/bpf/Makefile | 2
-rw-r--r--  kernel/bpf/arraymap.c | 156
-rw-r--r--  kernel/bpf/core.c | 2
-rw-r--r--  kernel/bpf/hashtab.c | 367
-rw-r--r--  kernel/bpf/helpers.c | 89
-rw-r--r--  kernel/bpf/syscall.c | 6
-rw-r--r--  kernel/bpf/test_stub.c | 56
-rw-r--r--  kernel/bpf/verifier.c | 171
-rw-r--r--  kernel/cgroup.c | 175
-rw-r--r--  kernel/cpuset.c | 162
-rw-r--r--  kernel/debug/debug_core.c | 52
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 37
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 269
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 3
-rw-r--r--  kernel/events/core.c | 25
-rw-r--r--  kernel/events/uprobes.c | 8
-rw-r--r--  kernel/exit.c | 278
-rw-r--r--  kernel/extable.c | 7
-rw-r--r--  kernel/fork.c | 4
-rw-r--r--  kernel/gcov/Kconfig | 5
-rw-r--r--  kernel/groups.c | 11
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/irqdesc.c | 52
-rw-r--r--  kernel/irq/proc.c | 22
-rw-r--r--  kernel/irq_work.c | 4
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 43
-rw-r--r--  kernel/kprobes.c | 22
-rw-r--r--  kernel/locking/mutex-debug.c | 2
-rw-r--r--  kernel/module.c | 251
-rw-r--r--  kernel/nsproxy.c | 10
-rw-r--r--  kernel/panic.c | 13
-rw-r--r--  kernel/params.c | 100
-rw-r--r--  kernel/pid.c | 7
-rw-r--r--  kernel/pid_namespace.c | 57
-rw-r--r--  kernel/power/Kconfig | 21
-rw-r--r--  kernel/power/hibernate.c | 14
-rw-r--r--  kernel/power/power.h | 3
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/swap.c | 43
-rw-r--r--  kernel/printk/printk.c | 96
-rw-r--r--  kernel/ptrace.c | 23
-rw-r--r--  kernel/range.c | 10
-rw-r--r--  kernel/res_counter.c | 211
-rw-r--r--  kernel/sched/core.c | 17
-rw-r--r--  kernel/sched/deadline.c | 25
-rw-r--r--  kernel/sched/fair.c | 6
-rw-r--r--  kernel/stacktrace.c | 32
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 16
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/taskstats.c | 2
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/time/time.c | 1
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 151
-rw-r--r--  kernel/trace/ftrace.c | 383
-rw-r--r--  kernel/trace/ring_buffer.c | 75
-rw-r--r--  kernel/trace/trace.c | 254
-rw-r--r--  kernel/trace/trace.h | 31
-rw-r--r--  kernel/trace/trace_branch.c | 47
-rw-r--r--  kernel/trace/trace_events.c | 126
-rw-r--r--  kernel/trace/trace_events_filter.c | 29
-rw-r--r--  kernel/trace/trace_events_trigger.c | 6
-rw-r--r--  kernel/trace/trace_functions.c | 119
-rw-r--r--  kernel/trace/trace_functions_graph.c | 423
-rw-r--r--  kernel/trace/trace_kdb.c | 25
-rw-r--r--  kernel/trace/trace_kprobe.c | 46
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 52
-rw-r--r--  kernel/trace/trace_output.c | 446
-rw-r--r--  kernel/trace/trace_output.h | 16
-rw-r--r--  kernel/trace/trace_printk.c | 2
-rw-r--r--  kernel/trace/trace_probe.c | 10
-rw-r--r--  kernel/trace/trace_sched_switch.c | 144
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 56
-rw-r--r--  kernel/trace/trace_seq.c | 253
-rw-r--r--  kernel/trace/trace_syscalls.c | 54
-rw-r--r--  kernel/trace/trace_uprobe.c | 28
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/user.c | 6
-rw-r--r--  kernel/user_namespace.c | 153
-rw-r--r--  kernel/utsname.c | 31
-rw-r--r--  kernel/workqueue.c | 55
90 files changed, 3554 insertions(+), 2596 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 17ea6d4a9a24..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
-obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 1f37f15117e5..72ab759a0b43 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -429,7 +429,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
429 * This function doesn't consume an skb as might be expected since it has to 429 * This function doesn't consume an skb as might be expected since it has to
430 * copy it anyways. 430 * copy it anyways.
431 */ 431 */
432static void kauditd_send_multicast_skb(struct sk_buff *skb) 432static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
433{ 433{
434 struct sk_buff *copy; 434 struct sk_buff *copy;
435 struct audit_net *aunet = net_generic(&init_net, audit_net_id); 435 struct audit_net *aunet = net_generic(&init_net, audit_net_id);
@@ -448,11 +448,11 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb)
448 * no reason for new multicast clients to continue with this 448 * no reason for new multicast clients to continue with this
449 * non-compliance. 449 * non-compliance.
450 */ 450 */
451 copy = skb_copy(skb, GFP_KERNEL); 451 copy = skb_copy(skb, gfp_mask);
452 if (!copy) 452 if (!copy)
453 return; 453 return;
454 454
455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); 455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
456} 456}
457 457
458/* 458/*
@@ -833,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
833 s.backlog_limit = audit_backlog_limit; 833 s.backlog_limit = audit_backlog_limit;
834 s.lost = atomic_read(&audit_lost); 834 s.lost = atomic_read(&audit_lost);
835 s.backlog = skb_queue_len(&audit_skb_queue); 835 s.backlog = skb_queue_len(&audit_skb_queue);
836 s.version = AUDIT_VERSION_LATEST; 836 s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
837 s.backlog_wait_time = audit_backlog_wait_time; 837 s.backlog_wait_time = audit_backlog_wait_time;
838 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); 838 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
839 break; 839 break;
@@ -1100,7 +1100,7 @@ static void audit_receive(struct sk_buff *skb)
1100} 1100}
1101 1101
1102/* Run custom bind function on netlink socket group connect or bind requests. */ 1102/* Run custom bind function on netlink socket group connect or bind requests. */
1103static int audit_bind(int group) 1103static int audit_bind(struct net *net, int group)
1104{ 1104{
1105 if (!capable(CAP_AUDIT_READ)) 1105 if (!capable(CAP_AUDIT_READ))
1106 return -EPERM; 1106 return -EPERM;
@@ -1940,7 +1940,7 @@ void audit_log_end(struct audit_buffer *ab)
1940 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1940 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1941 1941
1942 nlh->nlmsg_len = ab->skb->len; 1942 nlh->nlmsg_len = ab->skb->len;
1943 kauditd_send_multicast_skb(ab->skb); 1943 kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
1944 1944
1945 /* 1945 /*
1946 * The original kaudit unicast socket sends up messages with 1946 * The original kaudit unicast socket sends up messages with
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 80f29e015570..2e0c97427b33 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk)
174 struct fsnotify_mark *entry = &chunk->mark; 174 struct fsnotify_mark *entry = &chunk->mark;
175 struct list_head *list; 175 struct list_head *list;
176 176
177 if (!entry->i.inode) 177 if (!entry->inode)
178 return; 178 return;
179 list = chunk_hash(entry->i.inode); 179 list = chunk_hash(entry->inode);
180 list_add_rcu(&chunk->hash, list); 180 list_add_rcu(&chunk->hash, list);
181} 181}
182 182
@@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
188 188
189 list_for_each_entry_rcu(p, list, hash) { 189 list_for_each_entry_rcu(p, list, hash) {
190 /* mark.inode may have gone NULL, but who cares? */ 190 /* mark.inode may have gone NULL, but who cares? */
191 if (p->mark.i.inode == inode) { 191 if (p->mark.inode == inode) {
192 atomic_long_inc(&p->refs); 192 atomic_long_inc(&p->refs);
193 return p; 193 return p;
194 } 194 }
@@ -231,7 +231,7 @@ static void untag_chunk(struct node *p)
231 new = alloc_chunk(size); 231 new = alloc_chunk(size);
232 232
233 spin_lock(&entry->lock); 233 spin_lock(&entry->lock);
234 if (chunk->dead || !entry->i.inode) { 234 if (chunk->dead || !entry->inode) {
235 spin_unlock(&entry->lock); 235 spin_unlock(&entry->lock);
236 if (new) 236 if (new)
237 free_chunk(new); 237 free_chunk(new);
@@ -258,7 +258,7 @@ static void untag_chunk(struct node *p)
258 goto Fallback; 258 goto Fallback;
259 259
260 fsnotify_duplicate_mark(&new->mark, entry); 260 fsnotify_duplicate_mark(&new->mark, entry);
261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
262 fsnotify_put_mark(&new->mark); 262 fsnotify_put_mark(&new->mark);
263 goto Fallback; 263 goto Fallback;
264 } 264 }
@@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
386 chunk_entry = &chunk->mark; 386 chunk_entry = &chunk->mark;
387 387
388 spin_lock(&old_entry->lock); 388 spin_lock(&old_entry->lock);
389 if (!old_entry->i.inode) { 389 if (!old_entry->inode) {
390 /* old_entry is being shot, lets just lie */ 390 /* old_entry is being shot, lets just lie */
391 spin_unlock(&old_entry->lock); 391 spin_unlock(&old_entry->lock);
392 fsnotify_put_mark(old_entry); 392 fsnotify_put_mark(old_entry);
@@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
395 } 395 }
396 396
397 fsnotify_duplicate_mark(chunk_entry, old_entry); 397 fsnotify_duplicate_mark(chunk_entry, old_entry);
398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { 398 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
399 spin_unlock(&old_entry->lock); 399 spin_unlock(&old_entry->lock);
400 fsnotify_put_mark(chunk_entry); 400 fsnotify_put_mark(chunk_entry);
401 fsnotify_put_mark(old_entry); 401 fsnotify_put_mark(old_entry);
@@ -611,7 +611,7 @@ void audit_trim_trees(void)
611 list_for_each_entry(node, &tree->chunks, list) { 611 list_for_each_entry(node, &tree->chunks, list) {
612 struct audit_chunk *chunk = find_chunk(node); 612 struct audit_chunk *chunk = find_chunk(node);
613 /* this could be NULL if the watch is dying else where... */ 613 /* this could be NULL if the watch is dying else where... */
614 struct inode *inode = chunk->mark.i.inode; 614 struct inode *inode = chunk->mark.inode;
615 node->index |= 1U<<31; 615 node->index |= 1U<<31;
616 if (iterate_mounts(compare_root, inode, root_mnt)) 616 if (iterate_mounts(compare_root, inode, root_mnt))
617 node->index &= ~(1U<<31); 617 node->index &= ~(1U<<31);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 3598e13f2a65..4f68a326d92e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -442,19 +442,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
442 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { 442 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
443 f->type = AUDIT_LOGINUID_SET; 443 f->type = AUDIT_LOGINUID_SET;
444 f->val = 0; 444 f->val = 0;
445 } 445 entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
446
447 if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) {
448 struct pid *pid;
449 rcu_read_lock();
450 pid = find_vpid(f->val);
451 if (!pid) {
452 rcu_read_unlock();
453 err = -ESRCH;
454 goto exit_free;
455 }
456 f->val = pid_nr(pid);
457 rcu_read_unlock();
458 } 446 }
459 447
460 err = audit_field_valid(entry, f); 448 err = audit_field_valid(entry, f);
@@ -630,6 +618,13 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
630 data->buflen += data->values[i] = 618 data->buflen += data->values[i] =
631 audit_pack_string(&bufp, krule->filterkey); 619 audit_pack_string(&bufp, krule->filterkey);
632 break; 620 break;
621 case AUDIT_LOGINUID_SET:
622 if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
623 data->fields[i] = AUDIT_LOGINUID;
624 data->values[i] = AUDIT_UID_UNSET;
625 break;
626 }
627 /* fallthrough if set */
633 default: 628 default:
634 data->values[i] = f->val; 629 data->values[i] = f->val;
635 } 630 }
@@ -646,6 +641,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
646 int i; 641 int i;
647 642
648 if (a->flags != b->flags || 643 if (a->flags != b->flags ||
644 a->pflags != b->pflags ||
649 a->listnr != b->listnr || 645 a->listnr != b->listnr ||
650 a->action != b->action || 646 a->action != b->action ||
651 a->field_count != b->field_count) 647 a->field_count != b->field_count)
@@ -764,6 +760,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
764 new = &entry->rule; 760 new = &entry->rule;
765 new->vers_ops = old->vers_ops; 761 new->vers_ops = old->vers_ops;
766 new->flags = old->flags; 762 new->flags = old->flags;
763 new->pflags = old->pflags;
767 new->listnr = old->listnr; 764 new->listnr = old->listnr;
768 new->action = old->action; 765 new->action = old->action;
769 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 766 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e420a0c41b5f..072566dd0caf 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -72,6 +72,8 @@
72#include <linux/fs_struct.h> 72#include <linux/fs_struct.h>
73#include <linux/compat.h> 73#include <linux/compat.h>
74#include <linux/ctype.h> 74#include <linux/ctype.h>
75#include <linux/string.h>
76#include <uapi/linux/limits.h>
75 77
76#include "audit.h" 78#include "audit.h"
77 79
@@ -1861,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1861 } 1863 }
1862 1864
1863 list_for_each_entry_reverse(n, &context->names_list, list) { 1865 list_for_each_entry_reverse(n, &context->names_list, list) {
1864 /* does the name pointer match? */ 1866 if (!n->name || strcmp(n->name->name, name->name))
1865 if (!n->name || n->name->name != name->name)
1866 continue; 1867 continue;
1867 1868
1868 /* match the correct record type */ 1869 /* match the correct record type */
@@ -1877,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1877 } 1878 }
1878 1879
1879out_alloc: 1880out_alloc:
1880 /* unable to find the name from a previous getname(). Allocate a new 1881 /* unable to find an entry with both a matching name and type */
1881 * anonymous entry. 1882 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
1882 */
1883 n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
1884 if (!n) 1883 if (!n)
1885 return; 1884 return;
1885 /* unfortunately, while we may have a path name to record with the
1886 * inode, we can't always rely on the string lasting until the end of
1887 * the syscall so we need to create our own copy, it may fail due to
1888 * memory allocation issues, but we do our best */
1889 if (name) {
1890 /* we can't use getname_kernel() due to size limits */
1891 size_t len = strlen(name->name) + 1;
1892 struct filename *new = __getname();
1893
1894 if (unlikely(!new))
1895 goto out;
1896
1897 if (len <= (PATH_MAX - sizeof(*new))) {
1898 new->name = (char *)(new) + sizeof(*new);
1899 new->separate = false;
1900 } else if (len <= PATH_MAX) {
1901 /* this looks odd, but is due to final_putname() */
1902 struct filename *new2;
1903
1904 new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
1905 if (unlikely(!new2)) {
1906 __putname(new);
1907 goto out;
1908 }
1909 new2->name = (char *)new;
1910 new2->separate = true;
1911 new = new2;
1912 } else {
1913 /* we should never get here, but let's be safe */
1914 __putname(new);
1915 goto out;
1916 }
1917 strlcpy((char *)new->name, name->name, len);
1918 new->uptr = NULL;
1919 new->aname = n;
1920 n->name = new;
1921 n->name_put = true;
1922 }
1886out: 1923out:
1887 if (parent) { 1924 if (parent) {
1888 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; 1925 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1897,6 +1934,11 @@ out:
1897 audit_copy_inode(n, dentry, inode); 1934 audit_copy_inode(n, dentry, inode);
1898} 1935}
1899 1936
1937void __audit_file(const struct file *file)
1938{
1939 __audit_inode(NULL, file->f_path.dentry, 0);
1940}
1941
1900/** 1942/**
1901 * __audit_inode_child - collect inode info for created/removed objects 1943 * __audit_inode_child - collect inode info for created/removed objects
1902 * @parent: inode of dentry parent 1944 * @parent: inode of dentry parent
@@ -2373,7 +2415,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2373 ax->d.next = context->aux; 2415 ax->d.next = context->aux;
2374 context->aux = (void *)ax; 2416 context->aux = (void *)ax;
2375 2417
2376 dentry = dget(bprm->file->f_dentry); 2418 dentry = dget(bprm->file->f_path.dentry);
2377 get_vfs_caps_from_disk(dentry, &vcaps); 2419 get_vfs_caps_from_disk(dentry, &vcaps);
2378 dput(dentry); 2420 dput(dentry);
2379 2421
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6ae7df..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..9eb4d8a7cd87
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/err.h>
14#include <linux/vmalloc.h>
15#include <linux/slab.h>
16#include <linux/mm.h>
17
18struct bpf_array {
19 struct bpf_map map;
20 u32 elem_size;
21 char value[0] __aligned(8);
22};
23
24/* Called from syscall */
25static struct bpf_map *array_map_alloc(union bpf_attr *attr)
26{
27 struct bpf_array *array;
28 u32 elem_size, array_size;
29
30 /* check sanity of attributes */
31 if (attr->max_entries == 0 || attr->key_size != 4 ||
32 attr->value_size == 0)
33 return ERR_PTR(-EINVAL);
34
35 elem_size = round_up(attr->value_size, 8);
36
37 /* check round_up into zero and u32 overflow */
38 if (elem_size == 0 ||
39 attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
40 return ERR_PTR(-ENOMEM);
41
42 array_size = sizeof(*array) + attr->max_entries * elem_size;
43
44 /* allocate all map elements and zero-initialize them */
45 array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
46 if (!array) {
47 array = vzalloc(array_size);
48 if (!array)
49 return ERR_PTR(-ENOMEM);
50 }
51
52 /* copy mandatory map attributes */
53 array->map.key_size = attr->key_size;
54 array->map.value_size = attr->value_size;
55 array->map.max_entries = attr->max_entries;
56
57 array->elem_size = elem_size;
58
59 return &array->map;
60}
61
62/* Called from syscall or from eBPF program */
63static void *array_map_lookup_elem(struct bpf_map *map, void *key)
64{
65 struct bpf_array *array = container_of(map, struct bpf_array, map);
66 u32 index = *(u32 *)key;
67
68 if (index >= array->map.max_entries)
69 return NULL;
70
71 return array->value + array->elem_size * index;
72}
73
74/* Called from syscall */
75static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
76{
77 struct bpf_array *array = container_of(map, struct bpf_array, map);
78 u32 index = *(u32 *)key;
79 u32 *next = (u32 *)next_key;
80
81 if (index >= array->map.max_entries) {
82 *next = 0;
83 return 0;
84 }
85
86 if (index == array->map.max_entries - 1)
87 return -ENOENT;
88
89 *next = index + 1;
90 return 0;
91}
92
93/* Called from syscall or from eBPF program */
94static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
95 u64 map_flags)
96{
97 struct bpf_array *array = container_of(map, struct bpf_array, map);
98 u32 index = *(u32 *)key;
99
100 if (map_flags > BPF_EXIST)
101 /* unknown flags */
102 return -EINVAL;
103
104 if (index >= array->map.max_entries)
105 /* all elements were pre-allocated, cannot insert a new one */
106 return -E2BIG;
107
108 if (map_flags == BPF_NOEXIST)
109 /* all elements already exist */
110 return -EEXIST;
111
112 memcpy(array->value + array->elem_size * index, value, array->elem_size);
113 return 0;
114}
115
116/* Called from syscall or from eBPF program */
117static int array_map_delete_elem(struct bpf_map *map, void *key)
118{
119 return -EINVAL;
120}
121
122/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
123static void array_map_free(struct bpf_map *map)
124{
125 struct bpf_array *array = container_of(map, struct bpf_array, map);
126
127 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
128 * so the programs (can be more than one that used this map) were
129 * disconnected from events. Wait for outstanding programs to complete
130 * and free the array
131 */
132 synchronize_rcu();
133
134 kvfree(array);
135}
136
137static struct bpf_map_ops array_ops = {
138 .map_alloc = array_map_alloc,
139 .map_free = array_map_free,
140 .map_get_next_key = array_map_get_next_key,
141 .map_lookup_elem = array_map_lookup_elem,
142 .map_update_elem = array_map_update_elem,
143 .map_delete_elem = array_map_delete_elem,
144};
145
146static struct bpf_map_type_list tl = {
147 .ops = &array_ops,
148 .type = BPF_MAP_TYPE_ARRAY,
149};
150
151static int __init register_array_map(void)
152{
153 bpf_register_map_type(&tl);
154 return 0;
155}
156late_initcall(register_array_map);
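
The array map above is driven entirely through the bpf(2) syscall. Below is a minimal user-space sketch (not part of this patch) that creates a BPF_MAP_TYPE_ARRAY, updates one slot and reads it back; it assumes the union bpf_attr layout and BPF_* constants from the uapi header this series provides, and the raw-syscall wrapper plus the absence of error reporting are illustrative shortcuts only.

/* Illustrative only -- not part of the patch. Uses the uapi <linux/bpf.h>
 * definitions (union bpf_attr, BPF_MAP_CREATE, BPF_ANY, ...) and a raw
 * syscall wrapper, since no user-space library existed at the time.
 */
#include <linux/bpf.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static int bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
        return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
        union bpf_attr attr;
        uint32_t key = 3;
        uint64_t val = 42, out = 0;
        int map_fd;

        /* array maps require key_size == 4 (see array_map_alloc above) */
        memset(&attr, 0, sizeof(attr));
        attr.map_type = BPF_MAP_TYPE_ARRAY;
        attr.key_size = sizeof(key);
        attr.value_size = sizeof(val);
        attr.max_entries = 16;
        map_fd = bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
        if (map_fd < 0)
                return 1;

        /* all array elements pre-exist, so BPF_NOEXIST would fail with -EEXIST */
        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        attr.key = (uint64_t)(unsigned long)&key;
        attr.value = (uint64_t)(unsigned long)&val;
        attr.flags = BPF_ANY;
        if (bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
                return 1;

        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        attr.key = (uint64_t)(unsigned long)&key;
        attr.value = (uint64_t)(unsigned long)&out;
        if (bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)))
                return 1;

        printf("value[%u] = %llu\n", key, (unsigned long long)out);
        close(map_fd);
        return 0;
}

The same syscall flow applies to the hash map added later in this series; only map_type (and the key size restriction) differs.
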
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d6594e457a25..a64e7a207d2b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,

 void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
-	module_free(NULL, hdr);
+	module_memfree(hdr);
 }
 #endif /* CONFIG_BPF_JIT */

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..b3ba43674310
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/jhash.h>
14#include <linux/filter.h>
15#include <linux/vmalloc.h>
16
17struct bpf_htab {
18 struct bpf_map map;
19 struct hlist_head *buckets;
20 spinlock_t lock;
21 u32 count; /* number of elements in this hashtable */
22 u32 n_buckets; /* number of hash buckets */
23 u32 elem_size; /* size of each element in bytes */
24};
25
26/* each htab element is struct htab_elem + key + value */
27struct htab_elem {
28 struct hlist_node hash_node;
29 struct rcu_head rcu;
30 u32 hash;
31 char key[0] __aligned(8);
32};
33
34/* Called from syscall */
35static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
36{
37 struct bpf_htab *htab;
38 int err, i;
39
40 htab = kzalloc(sizeof(*htab), GFP_USER);
41 if (!htab)
42 return ERR_PTR(-ENOMEM);
43
44 /* mandatory map attributes */
45 htab->map.key_size = attr->key_size;
46 htab->map.value_size = attr->value_size;
47 htab->map.max_entries = attr->max_entries;
48
49 /* check sanity of attributes.
50 * value_size == 0 may be allowed in the future to use map as a set
51 */
52 err = -EINVAL;
53 if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
54 htab->map.value_size == 0)
55 goto free_htab;
56
57 /* hash table size must be power of 2 */
58 htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
59
60 err = -E2BIG;
61 if (htab->map.key_size > MAX_BPF_STACK)
62 /* eBPF programs initialize keys on stack, so they cannot be
63 * larger than max stack size
64 */
65 goto free_htab;
66
67 err = -ENOMEM;
68 /* prevent zero size kmalloc and check for u32 overflow */
69 if (htab->n_buckets == 0 ||
70 htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
71 goto free_htab;
72
73 htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
74 GFP_USER | __GFP_NOWARN);
75
76 if (!htab->buckets) {
77 htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
78 if (!htab->buckets)
79 goto free_htab;
80 }
81
82 for (i = 0; i < htab->n_buckets; i++)
83 INIT_HLIST_HEAD(&htab->buckets[i]);
84
85 spin_lock_init(&htab->lock);
86 htab->count = 0;
87
88 htab->elem_size = sizeof(struct htab_elem) +
89 round_up(htab->map.key_size, 8) +
90 htab->map.value_size;
91 return &htab->map;
92
93free_htab:
94 kfree(htab);
95 return ERR_PTR(err);
96}
97
98static inline u32 htab_map_hash(const void *key, u32 key_len)
99{
100 return jhash(key, key_len, 0);
101}
102
103static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
104{
105 return &htab->buckets[hash & (htab->n_buckets - 1)];
106}
107
108static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
109 void *key, u32 key_size)
110{
111 struct htab_elem *l;
112
113 hlist_for_each_entry_rcu(l, head, hash_node)
114 if (l->hash == hash && !memcmp(&l->key, key, key_size))
115 return l;
116
117 return NULL;
118}
119
120/* Called from syscall or from eBPF program */
121static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
122{
123 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
124 struct hlist_head *head;
125 struct htab_elem *l;
126 u32 hash, key_size;
127
128 /* Must be called with rcu_read_lock. */
129 WARN_ON_ONCE(!rcu_read_lock_held());
130
131 key_size = map->key_size;
132
133 hash = htab_map_hash(key, key_size);
134
135 head = select_bucket(htab, hash);
136
137 l = lookup_elem_raw(head, hash, key, key_size);
138
139 if (l)
140 return l->key + round_up(map->key_size, 8);
141
142 return NULL;
143}
144
145/* Called from syscall */
146static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
147{
148 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
149 struct hlist_head *head;
150 struct htab_elem *l, *next_l;
151 u32 hash, key_size;
152 int i;
153
154 WARN_ON_ONCE(!rcu_read_lock_held());
155
156 key_size = map->key_size;
157
158 hash = htab_map_hash(key, key_size);
159
160 head = select_bucket(htab, hash);
161
162 /* lookup the key */
163 l = lookup_elem_raw(head, hash, key, key_size);
164
165 if (!l) {
166 i = 0;
167 goto find_first_elem;
168 }
169
170 /* key was found, get next key in the same bucket */
171 next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
172 struct htab_elem, hash_node);
173
174 if (next_l) {
175 /* if next elem in this hash list is non-zero, just return it */
176 memcpy(next_key, next_l->key, key_size);
177 return 0;
178 }
179
180 /* no more elements in this hash list, go to the next bucket */
181 i = hash & (htab->n_buckets - 1);
182 i++;
183
184find_first_elem:
185 /* iterate over buckets */
186 for (; i < htab->n_buckets; i++) {
187 head = select_bucket(htab, i);
188
189 /* pick first element in the bucket */
190 next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
191 struct htab_elem, hash_node);
192 if (next_l) {
193 /* if it's not empty, just return it */
194 memcpy(next_key, next_l->key, key_size);
195 return 0;
196 }
197 }
198
199 /* itereated over all buckets and all elements */
200 return -ENOENT;
201}
202
203/* Called from syscall or from eBPF program */
204static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
205 u64 map_flags)
206{
207 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
208 struct htab_elem *l_new, *l_old;
209 struct hlist_head *head;
210 unsigned long flags;
211 u32 key_size;
212 int ret;
213
214 if (map_flags > BPF_EXIST)
215 /* unknown flags */
216 return -EINVAL;
217
218 WARN_ON_ONCE(!rcu_read_lock_held());
219
220 /* allocate new element outside of lock */
221 l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
222 if (!l_new)
223 return -ENOMEM;
224
225 key_size = map->key_size;
226
227 memcpy(l_new->key, key, key_size);
228 memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
229
230 l_new->hash = htab_map_hash(l_new->key, key_size);
231
232 /* bpf_map_update_elem() can be called in_irq() */
233 spin_lock_irqsave(&htab->lock, flags);
234
235 head = select_bucket(htab, l_new->hash);
236
237 l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
238
239 if (!l_old && unlikely(htab->count >= map->max_entries)) {
240 /* if elem with this 'key' doesn't exist and we've reached
241 * max_entries limit, fail insertion of new elem
242 */
243 ret = -E2BIG;
244 goto err;
245 }
246
247 if (l_old && map_flags == BPF_NOEXIST) {
248 /* elem already exists */
249 ret = -EEXIST;
250 goto err;
251 }
252
253 if (!l_old && map_flags == BPF_EXIST) {
254 /* elem doesn't exist, cannot update it */
255 ret = -ENOENT;
256 goto err;
257 }
258
259 /* add new element to the head of the list, so that concurrent
260 * search will find it before old elem
261 */
262 hlist_add_head_rcu(&l_new->hash_node, head);
263 if (l_old) {
264 hlist_del_rcu(&l_old->hash_node);
265 kfree_rcu(l_old, rcu);
266 } else {
267 htab->count++;
268 }
269 spin_unlock_irqrestore(&htab->lock, flags);
270
271 return 0;
272err:
273 spin_unlock_irqrestore(&htab->lock, flags);
274 kfree(l_new);
275 return ret;
276}
277
278/* Called from syscall or from eBPF program */
279static int htab_map_delete_elem(struct bpf_map *map, void *key)
280{
281 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
282 struct hlist_head *head;
283 struct htab_elem *l;
284 unsigned long flags;
285 u32 hash, key_size;
286 int ret = -ENOENT;
287
288 WARN_ON_ONCE(!rcu_read_lock_held());
289
290 key_size = map->key_size;
291
292 hash = htab_map_hash(key, key_size);
293
294 spin_lock_irqsave(&htab->lock, flags);
295
296 head = select_bucket(htab, hash);
297
298 l = lookup_elem_raw(head, hash, key, key_size);
299
300 if (l) {
301 hlist_del_rcu(&l->hash_node);
302 htab->count--;
303 kfree_rcu(l, rcu);
304 ret = 0;
305 }
306
307 spin_unlock_irqrestore(&htab->lock, flags);
308 return ret;
309}
310
311static void delete_all_elements(struct bpf_htab *htab)
312{
313 int i;
314
315 for (i = 0; i < htab->n_buckets; i++) {
316 struct hlist_head *head = select_bucket(htab, i);
317 struct hlist_node *n;
318 struct htab_elem *l;
319
320 hlist_for_each_entry_safe(l, n, head, hash_node) {
321 hlist_del_rcu(&l->hash_node);
322 htab->count--;
323 kfree(l);
324 }
325 }
326}
327
328/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
329static void htab_map_free(struct bpf_map *map)
330{
331 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
332
333 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
334 * so the programs (can be more than one that used this map) were
335 * disconnected from events. Wait for outstanding critical sections in
336 * these programs to complete
337 */
338 synchronize_rcu();
339
340 /* some of kfree_rcu() callbacks for elements of this map may not have
341 * executed. It's ok. Proceed to free residual elements and map itself
342 */
343 delete_all_elements(htab);
344 kvfree(htab->buckets);
345 kfree(htab);
346}
347
348static struct bpf_map_ops htab_ops = {
349 .map_alloc = htab_map_alloc,
350 .map_free = htab_map_free,
351 .map_get_next_key = htab_map_get_next_key,
352 .map_lookup_elem = htab_map_lookup_elem,
353 .map_update_elem = htab_map_update_elem,
354 .map_delete_elem = htab_map_delete_elem,
355};
356
357static struct bpf_map_type_list tl = {
358 .ops = &htab_ops,
359 .type = BPF_MAP_TYPE_HASH,
360};
361
362static int __init register_htab_map(void)
363{
364 bpf_register_map_type(&tl);
365 return 0;
366}
367late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/rcupdate.h>
14
15/* If kernel subsystem is allowing eBPF programs to call this function,
16 * inside its own verifier_ops->get_func_proto() callback it should return
17 * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
18 *
19 * Different map implementations will rely on rcu in map methods
20 * lookup/update/delete, therefore eBPF programs must run under rcu lock
21 * if program is allowed to access maps, so check rcu_read_lock_held in
22 * all three functions.
23 */
24static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
25{
26 /* verifier checked that R1 contains a valid pointer to bpf_map
27 * and R2 points to a program stack and map->key_size bytes were
28 * initialized
29 */
30 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
31 void *key = (void *) (unsigned long) r2;
32 void *value;
33
34 WARN_ON_ONCE(!rcu_read_lock_held());
35
36 value = map->ops->map_lookup_elem(map, key);
37
38 /* lookup() returns either pointer to element value or NULL
39 * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
40 */
41 return (unsigned long) value;
42}
43
44struct bpf_func_proto bpf_map_lookup_elem_proto = {
45 .func = bpf_map_lookup_elem,
46 .gpl_only = false,
47 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
48 .arg1_type = ARG_CONST_MAP_PTR,
49 .arg2_type = ARG_PTR_TO_MAP_KEY,
50};
51
52static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
53{
54 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
55 void *key = (void *) (unsigned long) r2;
56 void *value = (void *) (unsigned long) r3;
57
58 WARN_ON_ONCE(!rcu_read_lock_held());
59
60 return map->ops->map_update_elem(map, key, value, r4);
61}
62
63struct bpf_func_proto bpf_map_update_elem_proto = {
64 .func = bpf_map_update_elem,
65 .gpl_only = false,
66 .ret_type = RET_INTEGER,
67 .arg1_type = ARG_CONST_MAP_PTR,
68 .arg2_type = ARG_PTR_TO_MAP_KEY,
69 .arg3_type = ARG_PTR_TO_MAP_VALUE,
70 .arg4_type = ARG_ANYTHING,
71};
72
73static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
74{
75 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
76 void *key = (void *) (unsigned long) r2;
77
78 WARN_ON_ONCE(!rcu_read_lock_held());
79
80 return map->ops->map_delete_elem(map, key);
81}
82
83struct bpf_func_proto bpf_map_delete_elem_proto = {
84 .func = bpf_map_delete_elem,
85 .gpl_only = false,
86 .ret_type = RET_INTEGER,
87 .arg1_type = ARG_CONST_MAP_PTR,
88 .arg2_type = ARG_PTR_TO_MAP_KEY,
89};
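
These three protos back the BPF_FUNC_map_* helper IDs that eBPF programs call. A rough program-side sketch in restricted C follows, using the function-pointer-from-helper-ID idiom of the era's in-tree samples; the section names, struct bpf_map_def layout, kernel-tree include path and loader wiring are assumptions of that convention, not anything defined by this patch.

/* Illustrative sketch, compiled with clang -target bpf inside a kernel
 * tree; everything below mirrors the samples/bpf conventions and is not
 * part of this patch.
 */
#include <uapi/linux/bpf.h>

static void *(*bpf_map_lookup_elem)(void *map, void *key) =
        (void *) BPF_FUNC_map_lookup_elem;
static int (*bpf_map_update_elem)(void *map, void *key, void *value,
                                  unsigned long long flags) =
        (void *) BPF_FUNC_map_update_elem;

struct bpf_map_def {
        unsigned int type;
        unsigned int key_size;
        unsigned int value_size;
        unsigned int max_entries;
};

/* the loader creates the map from this section and patches in the fd */
struct bpf_map_def counters __attribute__((section("maps"), used)) = {
        .type = BPF_MAP_TYPE_HASH,
        .key_size = sizeof(unsigned int),
        .value_size = sizeof(long),
        .max_entries = 1024,
};

__attribute__((section("socket1"), used))
int count_example(void *ctx)
{
        unsigned int key = 0;   /* e.g. a protocol number parsed from the packet */
        long one = 1, *value;

        /* attached programs run under rcu_read_lock(), as the comment above requires */
        value = bpf_map_lookup_elem(&counters, &key);
        if (value)
                __sync_fetch_and_add(value, 1);
        else
                bpf_map_update_elem(&counters, &key, &one, BPF_ANY);
        return 0;
}
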
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c16032..088ac0b1b106 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr)
169 if (copy_from_user(key, ukey, map->key_size) != 0) 169 if (copy_from_user(key, ukey, map->key_size) != 0)
170 goto free_key; 170 goto free_key;
171 171
172 err = -ESRCH; 172 err = -ENOENT;
173 rcu_read_lock(); 173 rcu_read_lock();
174 value = map->ops->map_lookup_elem(map, key); 174 value = map->ops->map_lookup_elem(map, key);
175 if (!value) 175 if (!value)
@@ -190,7 +190,7 @@ err_put:
190 return err; 190 return err;
191} 191}
192 192
193#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value 193#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
194 194
195static int map_update_elem(union bpf_attr *attr) 195static int map_update_elem(union bpf_attr *attr)
196{ 196{
@@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr)
231 * therefore all map accessors rely on this fact, so do the same here 231 * therefore all map accessors rely on this fact, so do the same here
232 */ 232 */
233 rcu_read_lock(); 233 rcu_read_lock();
234 err = map->ops->map_update_elem(map, key, value); 234 err = map->ops->map_update_elem(map, key, value, attr->flags);
235 rcu_read_unlock(); 235 rcu_read_unlock();
236 236
237free_value: 237free_value:
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff4003e..0ceae1e6e8b5 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
@@ -18,26 +18,18 @@ struct bpf_context {
18 u64 arg2; 18 u64 arg2;
19}; 19};
20 20
21static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
22{
23 return 0;
24}
25
26static struct bpf_func_proto test_funcs[] = {
27 [BPF_FUNC_unspec] = {
28 .func = test_func,
29 .gpl_only = true,
30 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
31 .arg1_type = ARG_CONST_MAP_PTR,
32 .arg2_type = ARG_PTR_TO_MAP_KEY,
33 },
34};
35
36static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) 21static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
37{ 22{
38 if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs)) 23 switch (func_id) {
24 case BPF_FUNC_map_lookup_elem:
25 return &bpf_map_lookup_elem_proto;
26 case BPF_FUNC_map_update_elem:
27 return &bpf_map_update_elem_proto;
28 case BPF_FUNC_map_delete_elem:
29 return &bpf_map_delete_elem_proto;
30 default:
39 return NULL; 31 return NULL;
40 return &test_funcs[func_id]; 32 }
41} 33}
42 34
43static const struct bpf_context_access { 35static const struct bpf_context_access {
@@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = {
78 .type = BPF_PROG_TYPE_UNSPEC, 70 .type = BPF_PROG_TYPE_UNSPEC,
79}; 71};
80 72
81static struct bpf_map *test_map_alloc(union bpf_attr *attr)
82{
83 struct bpf_map *map;
84
85 map = kzalloc(sizeof(*map), GFP_USER);
86 if (!map)
87 return ERR_PTR(-ENOMEM);
88
89 map->key_size = attr->key_size;
90 map->value_size = attr->value_size;
91 map->max_entries = attr->max_entries;
92 return map;
93}
94
95static void test_map_free(struct bpf_map *map)
96{
97 kfree(map);
98}
99
100static struct bpf_map_ops test_map_ops = {
101 .map_alloc = test_map_alloc,
102 .map_free = test_map_free,
103};
104
105static struct bpf_map_type_list tl_map = {
106 .ops = &test_map_ops,
107 .type = BPF_MAP_TYPE_UNSPEC,
108};
109
110static int __init register_test_ops(void) 73static int __init register_test_ops(void)
111{ 74{
112 bpf_register_map_type(&tl_map);
113 bpf_register_prog_type(&tl_prog); 75 bpf_register_prog_type(&tl_prog);
114 return 0; 76 return 0;
115} 77}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9f81818f2941..a28e09c7825d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,22 +153,19 @@ struct reg_state {
153 153
154enum bpf_stack_slot_type { 154enum bpf_stack_slot_type {
155 STACK_INVALID, /* nothing was stored in this stack slot */ 155 STACK_INVALID, /* nothing was stored in this stack slot */
156 STACK_SPILL, /* 1st byte of register spilled into stack */ 156 STACK_SPILL, /* register spilled into stack */
157 STACK_SPILL_PART, /* other 7 bytes of register spill */
158 STACK_MISC /* BPF program wrote some data into this slot */ 157 STACK_MISC /* BPF program wrote some data into this slot */
159}; 158};
160 159
161struct bpf_stack_slot { 160#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
162 enum bpf_stack_slot_type stype;
163 struct reg_state reg_st;
164};
165 161
166/* state of the program: 162/* state of the program:
167 * type of all registers and stack info 163 * type of all registers and stack info
168 */ 164 */
169struct verifier_state { 165struct verifier_state {
170 struct reg_state regs[MAX_BPF_REG]; 166 struct reg_state regs[MAX_BPF_REG];
171 struct bpf_stack_slot stack[MAX_BPF_STACK]; 167 u8 stack_slot_type[MAX_BPF_STACK];
168 struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
172}; 169};
173 170
174/* linked list of verifier states used to prune search */ 171/* linked list of verifier states used to prune search */
@@ -259,10 +256,10 @@ static void print_verifier_state(struct verifier_env *env)
259 env->cur_state.regs[i].map_ptr->key_size, 256 env->cur_state.regs[i].map_ptr->key_size,
260 env->cur_state.regs[i].map_ptr->value_size); 257 env->cur_state.regs[i].map_ptr->value_size);
261 } 258 }
262 for (i = 0; i < MAX_BPF_STACK; i++) { 259 for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
263 if (env->cur_state.stack[i].stype == STACK_SPILL) 260 if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
264 verbose(" fp%d=%s", -MAX_BPF_STACK + i, 261 verbose(" fp%d=%s", -MAX_BPF_STACK + i,
265 reg_type_str[env->cur_state.stack[i].reg_st.type]); 262 reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
266 } 263 }
267 verbose("\n"); 264 verbose("\n");
268} 265}
@@ -539,8 +536,10 @@ static int bpf_size_to_bytes(int bpf_size)
539static int check_stack_write(struct verifier_state *state, int off, int size, 536static int check_stack_write(struct verifier_state *state, int off, int size,
540 int value_regno) 537 int value_regno)
541{ 538{
542 struct bpf_stack_slot *slot;
543 int i; 539 int i;
540 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
541 * so it's aligned access and [off, off + size) are within stack limits
542 */
544 543
545 if (value_regno >= 0 && 544 if (value_regno >= 0 &&
546 (state->regs[value_regno].type == PTR_TO_MAP_VALUE || 545 (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
@@ -548,30 +547,24 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
548 state->regs[value_regno].type == PTR_TO_CTX)) { 547 state->regs[value_regno].type == PTR_TO_CTX)) {
549 548
550 /* register containing pointer is being spilled into stack */ 549 /* register containing pointer is being spilled into stack */
551 if (size != 8) { 550 if (size != BPF_REG_SIZE) {
552 verbose("invalid size of register spill\n"); 551 verbose("invalid size of register spill\n");
553 return -EACCES; 552 return -EACCES;
554 } 553 }
555 554
556 slot = &state->stack[MAX_BPF_STACK + off];
557 slot->stype = STACK_SPILL;
558 /* save register state */ 555 /* save register state */
559 slot->reg_st = state->regs[value_regno]; 556 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
560 for (i = 1; i < 8; i++) { 557 state->regs[value_regno];
561 slot = &state->stack[MAX_BPF_STACK + off + i];
562 slot->stype = STACK_SPILL_PART;
563 slot->reg_st.type = UNKNOWN_VALUE;
564 slot->reg_st.map_ptr = NULL;
565 }
566 } else {
567 558
559 for (i = 0; i < BPF_REG_SIZE; i++)
560 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
561 } else {
568 /* regular write of data into stack */ 562 /* regular write of data into stack */
569 for (i = 0; i < size; i++) { 563 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
570 slot = &state->stack[MAX_BPF_STACK + off + i]; 564 (struct reg_state) {};
571 slot->stype = STACK_MISC; 565
572 slot->reg_st.type = UNKNOWN_VALUE; 566 for (i = 0; i < size; i++)
573 slot->reg_st.map_ptr = NULL; 567 state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
574 }
575 } 568 }
576 return 0; 569 return 0;
577} 570}
@@ -579,19 +572,18 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
579static int check_stack_read(struct verifier_state *state, int off, int size, 572static int check_stack_read(struct verifier_state *state, int off, int size,
580 int value_regno) 573 int value_regno)
581{ 574{
575 u8 *slot_type;
582 int i; 576 int i;
583 struct bpf_stack_slot *slot;
584 577
585 slot = &state->stack[MAX_BPF_STACK + off]; 578 slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
586 579
587 if (slot->stype == STACK_SPILL) { 580 if (slot_type[0] == STACK_SPILL) {
588 if (size != 8) { 581 if (size != BPF_REG_SIZE) {
589 verbose("invalid size of register spill\n"); 582 verbose("invalid size of register spill\n");
590 return -EACCES; 583 return -EACCES;
591 } 584 }
592 for (i = 1; i < 8; i++) { 585 for (i = 1; i < BPF_REG_SIZE; i++) {
593 if (state->stack[MAX_BPF_STACK + off + i].stype != 586 if (slot_type[i] != STACK_SPILL) {
594 STACK_SPILL_PART) {
595 verbose("corrupted spill memory\n"); 587 verbose("corrupted spill memory\n");
596 return -EACCES; 588 return -EACCES;
597 } 589 }
@@ -599,12 +591,12 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
599 591
600 if (value_regno >= 0) 592 if (value_regno >= 0)
601 /* restore register state from stack */ 593 /* restore register state from stack */
602 state->regs[value_regno] = slot->reg_st; 594 state->regs[value_regno] =
595 state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
603 return 0; 596 return 0;
604 } else { 597 } else {
605 for (i = 0; i < size; i++) { 598 for (i = 0; i < size; i++) {
606 if (state->stack[MAX_BPF_STACK + off + i].stype != 599 if (slot_type[i] != STACK_MISC) {
607 STACK_MISC) {
608 verbose("invalid read from stack off %d+%d size %d\n", 600 verbose("invalid read from stack off %d+%d size %d\n",
609 off, i, size); 601 off, i, size);
610 return -EACCES; 602 return -EACCES;
@@ -747,7 +739,7 @@ static int check_stack_boundary(struct verifier_env *env,
747 } 739 }
748 740
749 for (i = 0; i < access_size; i++) { 741 for (i = 0; i < access_size; i++) {
750 if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) { 742 if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
751 verbose("invalid indirect read from stack off %d+%d size %d\n", 743 verbose("invalid indirect read from stack off %d+%d size %d\n",
752 off, i, access_size); 744 off, i, access_size);
753 return -EACCES; 745 return -EACCES;
@@ -1180,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
1180 return 0; 1172 return 0;
1181} 1173}
1182 1174
1175/* verify safety of LD_ABS|LD_IND instructions:
1176 * - they can only appear in the programs where ctx == skb
1177 * - since they are wrappers of function calls, they scratch R1-R5 registers,
1178 * preserve R6-R9, and store return value into R0
1179 *
1180 * Implicit input:
1181 * ctx == skb == R6 == CTX
1182 *
1183 * Explicit input:
1184 * SRC == any register
1185 * IMM == 32-bit immediate
1186 *
1187 * Output:
1188 * R0 - 8/16/32-bit skb data converted to cpu endianness
1189 */
1190static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
1191{
1192 struct reg_state *regs = env->cur_state.regs;
1193 u8 mode = BPF_MODE(insn->code);
1194 struct reg_state *reg;
1195 int i, err;
1196
1197 if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
1198 verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
1199 return -EINVAL;
1200 }
1201
1202 if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
1203 (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
1204 verbose("BPF_LD_ABS uses reserved fields\n");
1205 return -EINVAL;
1206 }
1207
1208 /* check whether implicit source operand (register R6) is readable */
1209 err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
1210 if (err)
1211 return err;
1212
1213 if (regs[BPF_REG_6].type != PTR_TO_CTX) {
1214 verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
1215 return -EINVAL;
1216 }
1217
1218 if (mode == BPF_IND) {
1219 /* check explicit source operand */
1220 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1221 if (err)
1222 return err;
1223 }
1224
1225 /* reset caller saved regs to unreadable */
1226 for (i = 0; i < CALLER_SAVED_REGS; i++) {
1227 reg = regs + caller_saved[i];
1228 reg->type = NOT_INIT;
1229 reg->imm = 0;
1230 }
1231
1232 /* mark destination R0 register as readable, since it contains
1233 * the value fetched from the packet
1234 */
1235 regs[BPF_REG_0].type = UNKNOWN_VALUE;
1236 return 0;
1237}
1238
1183/* non-recursive DFS pseudo code 1239/* non-recursive DFS pseudo code
1184 * 1 procedure DFS-iterative(G,v): 1240 * 1 procedure DFS-iterative(G,v):
1185 * 2 label v as discovered 1241 * 2 label v as discovered
@@ -1417,12 +1473,33 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
1417 } 1473 }
1418 1474
1419 for (i = 0; i < MAX_BPF_STACK; i++) { 1475 for (i = 0; i < MAX_BPF_STACK; i++) {
1420 if (memcmp(&old->stack[i], &cur->stack[i], 1476 if (old->stack_slot_type[i] == STACK_INVALID)
1421 sizeof(old->stack[0])) != 0) { 1477 continue;
1422 if (old->stack[i].stype == STACK_INVALID) 1478 if (old->stack_slot_type[i] != cur->stack_slot_type[i])
1423 continue; 1479 /* Ex: old explored (safe) state has STACK_SPILL in
1480 * this stack slot, but current has has STACK_MISC ->
1481 * this verifier states are not equivalent,
1482 * return false to continue verification of this path
1483 */
1424 return false; 1484 return false;
1425 } 1485 if (i % BPF_REG_SIZE)
1486 continue;
1487 if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
1488 &cur->spilled_regs[i / BPF_REG_SIZE],
1489 sizeof(old->spilled_regs[0])))
1490 /* when explored and current stack slot types are
1491 * the same, check that stored pointers types
1492 * are the same as well.
1493 * Ex: explored safe path could have stored
1494 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
1495 * but current path has stored:
1496 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
1497 * such verifier states are not equivalent.
1498 * return false to continue verification of this path
1499 */
1500 return false;
1501 else
1502 continue;
1426 } 1503 }
1427 return true; 1504 return true;
1428} 1505}
@@ -1664,8 +1741,10 @@ process_bpf_exit:
1664 u8 mode = BPF_MODE(insn->code); 1741 u8 mode = BPF_MODE(insn->code);
1665 1742
1666 if (mode == BPF_ABS || mode == BPF_IND) { 1743 if (mode == BPF_ABS || mode == BPF_IND) {
1667 verbose("LD_ABS is not supported yet\n"); 1744 err = check_ld_abs(env, insn);
1668 return -EINVAL; 1745 if (err)
1746 return err;
1747
1669 } else if (mode == BPF_IMM) { 1748 } else if (mode == BPF_IMM) {
1670 err = check_ld_imm(env, insn); 1749 err = check_ld_imm(env, insn);
1671 if (err) 1750 if (err)
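
To make the reworked spill tracking concrete, the fragment below (not from the patch) uses the BPF_* instruction macros from the kernel's <linux/filter.h>, in the style of lib/test_bpf, to spill a pointer register and fill it back. This is exactly the pattern the new stack_slot_type/spilled_regs arrays validate: the 8-byte store marks all eight slot bytes STACK_SPILL and records that the saved value was PTR_TO_CTX, while a store narrower than BPF_REG_SIZE would still be rejected with "invalid size of register spill".

#include <linux/filter.h>

/* hypothetical, verifier-accepted spill/fill sequence (kernel-side fragment) */
static const struct bpf_insn spill_fill_example[] = {
        BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),            /* R6 = ctx (PTR_TO_CTX) */
        BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), /* spill R6 to fp[-8], 8 bytes */
        BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -8), /* fill: R1 regains PTR_TO_CTX */
        BPF_MOV64_IMM(BPF_REG_0, 0),
        BPF_EXIT_INSN(),
};
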
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 136eceadeed1..bb263d0caab3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -277,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
277 if (!(cgrp->root->subsys_mask & (1 << ss->id))) 277 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
278 return NULL; 278 return NULL;
279 279
280 /*
281 * This function is used while updating css associations and thus
282 * can't test the csses directly. Use ->child_subsys_mask.
283 */
280 while (cgroup_parent(cgrp) && 284 while (cgroup_parent(cgrp) &&
281 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) 285 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
282 cgrp = cgroup_parent(cgrp); 286 cgrp = cgroup_parent(cgrp);
@@ -284,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
284 return cgroup_css(cgrp, ss); 288 return cgroup_css(cgrp, ss);
285} 289}
286 290
291/**
292 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
293 * @cgrp: the cgroup of interest
294 * @ss: the subsystem of interest
295 *
296 * Find and get the effective css of @cgrp for @ss. The effective css is
297 * defined as the matching css of the nearest ancestor including self which
298 * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
299 * the root css is returned, so this function always returns a valid css.
300 * The returned css must be put using css_put().
301 */
302struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
303 struct cgroup_subsys *ss)
304{
305 struct cgroup_subsys_state *css;
306
307 rcu_read_lock();
308
309 do {
310 css = cgroup_css(cgrp, ss);
311
312 if (css && css_tryget_online(css))
313 goto out_unlock;
314 cgrp = cgroup_parent(cgrp);
315 } while (cgrp);
316
317 css = init_css_set.subsys[ss->id];
318 css_get(css);
319out_unlock:
320 rcu_read_unlock();
321 return css;
322}
323
287/* convenient tests for these bits */ 324/* convenient tests for these bits */
288static inline bool cgroup_is_dead(const struct cgroup *cgrp) 325static inline bool cgroup_is_dead(const struct cgroup *cgrp)
289{ 326{
@@ -1019,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp)
1019} 1056}
1020 1057
1021/** 1058/**
1022 * cgroup_refresh_child_subsys_mask - update child_subsys_mask 1059 * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
1023 * @cgrp: the target cgroup 1060 * @cgrp: the target cgroup
1061 * @subtree_control: the new subtree_control mask to consider
1024 * 1062 *
1025 * On the default hierarchy, a subsystem may request other subsystems to be 1063 * On the default hierarchy, a subsystem may request other subsystems to be
1026 * enabled together through its ->depends_on mask. In such cases, more 1064 * enabled together through its ->depends_on mask. In such cases, more
1027 * subsystems than specified in "cgroup.subtree_control" may be enabled. 1065 * subsystems than specified in "cgroup.subtree_control" may be enabled.
1028 * 1066 *
1029 * This function determines which subsystems need to be enabled given the 1067 * This function calculates which subsystems need to be enabled if
1030 * current @cgrp->subtree_control and records it in 1068 * @subtree_control is to be applied to @cgrp. The returned mask is always
1031 * @cgrp->child_subsys_mask. The resulting mask is always a superset of 1069 * a superset of @subtree_control and follows the usual hierarchy rules.
1032 * @cgrp->subtree_control and follows the usual hierarchy rules.
1033 */ 1070 */
1034static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) 1071static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
1072 unsigned int subtree_control)
1035{ 1073{
1036 struct cgroup *parent = cgroup_parent(cgrp); 1074 struct cgroup *parent = cgroup_parent(cgrp);
1037 unsigned int cur_ss_mask = cgrp->subtree_control; 1075 unsigned int cur_ss_mask = subtree_control;
1038 struct cgroup_subsys *ss; 1076 struct cgroup_subsys *ss;
1039 int ssid; 1077 int ssid;
1040 1078
1041 lockdep_assert_held(&cgroup_mutex); 1079 lockdep_assert_held(&cgroup_mutex);
1042 1080
1043 if (!cgroup_on_dfl(cgrp)) { 1081 if (!cgroup_on_dfl(cgrp))
1044 cgrp->child_subsys_mask = cur_ss_mask; 1082 return cur_ss_mask;
1045 return;
1046 }
1047 1083
1048 while (true) { 1084 while (true) {
1049 unsigned int new_ss_mask = cur_ss_mask; 1085 unsigned int new_ss_mask = cur_ss_mask;
@@ -1067,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1067 cur_ss_mask = new_ss_mask; 1103 cur_ss_mask = new_ss_mask;
1068 } 1104 }
1069 1105
1070 cgrp->child_subsys_mask = cur_ss_mask; 1106 return cur_ss_mask;
1107}
1108
1109/**
1110 * cgroup_refresh_child_subsys_mask - update child_subsys_mask
1111 * @cgrp: the target cgroup
1112 *
1113 * Update @cgrp->child_subsys_mask according to the current
1114 * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
1115 */
1116static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1117{
1118 cgrp->child_subsys_mask =
1119 cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
1071} 1120}
1072 1121
1073/** 1122/**
@@ -2641,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2641 loff_t off) 2690 loff_t off)
2642{ 2691{
2643 unsigned int enable = 0, disable = 0; 2692 unsigned int enable = 0, disable = 0;
2644 unsigned int css_enable, css_disable, old_ctrl, new_ctrl; 2693 unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
2645 struct cgroup *cgrp, *child; 2694 struct cgroup *cgrp, *child;
2646 struct cgroup_subsys *ss; 2695 struct cgroup_subsys *ss;
2647 char *tok; 2696 char *tok;
@@ -2693,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2693 ret = -ENOENT; 2742 ret = -ENOENT;
2694 goto out_unlock; 2743 goto out_unlock;
2695 } 2744 }
2696
2697 /*
2698 * @ss is already enabled through dependency and
2699 * we'll just make it visible. Skip draining.
2700 */
2701 if (cgrp->child_subsys_mask & (1 << ssid))
2702 continue;
2703
2704 /*
2705 * Because css offlining is asynchronous, userland
2706 * might try to re-enable the same controller while
2707 * the previous instance is still around. In such
2708 * cases, wait till it's gone using offline_waitq.
2709 */
2710 cgroup_for_each_live_child(child, cgrp) {
2711 DEFINE_WAIT(wait);
2712
2713 if (!cgroup_css(child, ss))
2714 continue;
2715
2716 cgroup_get(child);
2717 prepare_to_wait(&child->offline_waitq, &wait,
2718 TASK_UNINTERRUPTIBLE);
2719 cgroup_kn_unlock(of->kn);
2720 schedule();
2721 finish_wait(&child->offline_waitq, &wait);
2722 cgroup_put(child);
2723
2724 return restart_syscall();
2725 }
2726 } else if (disable & (1 << ssid)) { 2745 } else if (disable & (1 << ssid)) {
2727 if (!(cgrp->subtree_control & (1 << ssid))) { 2746 if (!(cgrp->subtree_control & (1 << ssid))) {
2728 disable &= ~(1 << ssid); 2747 disable &= ~(1 << ssid);
@@ -2758,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2758 * subsystems than specified may need to be enabled or disabled 2777 * subsystems than specified may need to be enabled or disabled
2759 * depending on subsystem dependencies. 2778 * depending on subsystem dependencies.
2760 */ 2779 */
2761 cgrp->subtree_control |= enable; 2780 old_sc = cgrp->subtree_control;
2762 cgrp->subtree_control &= ~disable; 2781 old_ss = cgrp->child_subsys_mask;
2782 new_sc = (old_sc | enable) & ~disable;
2783 new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
2763 2784
2764 old_ctrl = cgrp->child_subsys_mask; 2785 css_enable = ~old_ss & new_ss;
2765 cgroup_refresh_child_subsys_mask(cgrp); 2786 css_disable = old_ss & ~new_ss;
2766 new_ctrl = cgrp->child_subsys_mask;
2767
2768 css_enable = ~old_ctrl & new_ctrl;
2769 css_disable = old_ctrl & ~new_ctrl;
2770 enable |= css_enable; 2787 enable |= css_enable;
2771 disable |= css_disable; 2788 disable |= css_disable;
2772 2789
2773 /* 2790 /*
2791 * Because css offlining is asynchronous, userland might try to
2792 * re-enable the same controller while the previous instance is
2793 * still around. In such cases, wait till it's gone using
2794 * offline_waitq.
2795 */
2796 for_each_subsys(ss, ssid) {
2797 if (!(css_enable & (1 << ssid)))
2798 continue;
2799
2800 cgroup_for_each_live_child(child, cgrp) {
2801 DEFINE_WAIT(wait);
2802
2803 if (!cgroup_css(child, ss))
2804 continue;
2805
2806 cgroup_get(child);
2807 prepare_to_wait(&child->offline_waitq, &wait,
2808 TASK_UNINTERRUPTIBLE);
2809 cgroup_kn_unlock(of->kn);
2810 schedule();
2811 finish_wait(&child->offline_waitq, &wait);
2812 cgroup_put(child);
2813
2814 return restart_syscall();
2815 }
2816 }
2817
2818 cgrp->subtree_control = new_sc;
2819 cgrp->child_subsys_mask = new_ss;
2820
2821 /*
2774 * Create new csses or make the existing ones visible. A css is 2822 * Create new csses or make the existing ones visible. A css is
2775 * created invisible if it's being implicitly enabled through 2823 * created invisible if it's being implicitly enabled through
2776 * dependency. An invisible css is made visible when the userland 2824 * dependency. An invisible css is made visible when the userland
@@ -2825,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2825 } 2873 }
2826 } 2874 }
2827 2875
2876 /*
2877 * The effective csses of all the descendants (excluding @cgrp) may
2878 * have changed. Subsystems can optionally subscribe to this event
2879 * by implementing ->css_e_css_changed() which is invoked if any of
2880 * the effective csses seen from the css's cgroup may have changed.
2881 */
2882 for_each_subsys(ss, ssid) {
2883 struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
2884 struct cgroup_subsys_state *css;
2885
2886 if (!ss->css_e_css_changed || !this_css)
2887 continue;
2888
2889 css_for_each_descendant_pre(css, this_css)
2890 if (css != this_css)
2891 ss->css_e_css_changed(css);
2892 }
2893
2828 kernfs_activate(cgrp->kn); 2894 kernfs_activate(cgrp->kn);
2829 ret = 0; 2895 ret = 0;
2830out_unlock: 2896out_unlock:
@@ -2832,9 +2898,8 @@ out_unlock:
2832 return ret ?: nbytes; 2898 return ret ?: nbytes;
2833 2899
2834err_undo_css: 2900err_undo_css:
2835 cgrp->subtree_control &= ~enable; 2901 cgrp->subtree_control = old_sc;
2836 cgrp->subtree_control |= disable; 2902 cgrp->child_subsys_mask = old_ss;
2837 cgroup_refresh_child_subsys_mask(cgrp);
2838 2903
2839 for_each_subsys(ss, ssid) { 2904 for_each_subsys(ss, ssid) {
2840 if (!(enable & (1 << ssid))) 2905 if (!(enable & (1 << ssid)))
@@ -4370,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work)
4370 if (ss) { 4435 if (ss) {
4371 /* css release path */ 4436 /* css release path */
4372 cgroup_idr_remove(&ss->css_idr, css->id); 4437 cgroup_idr_remove(&ss->css_idr, css->id);
4438 if (ss->css_released)
4439 ss->css_released(css);
4373 } else { 4440 } else {
4374 /* cgroup release path */ 4441 /* cgroup release path */
4375 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 4442 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
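
The kernel/cgroup.c change splits the mask computation out of cgroup_refresh_child_subsys_mask() into cgroup_calc_child_subsys_mask(), so cgroup_subtree_control_write() can compute the would-be child_subsys_mask for a proposed subtree_control before committing anything. The core of the helper is a fixpoint loop that keeps OR-ing in each enabled subsystem's ->depends_on mask until nothing new is added. Below is a minimal userspace sketch of that loop; the subsystem names and dependency table are invented for illustration, and the kernel version additionally clamps the result to what the parent cgroup can provide, which is omitted here.

#include <stdio.h>

/* Illustrative only: real dependency masks come from cgroup_subsys->depends_on
 * and real subsystem IDs from cgroup_subsys.h; these four are invented. */
enum { SS_CPU, SS_CPUACCT, SS_MEMORY, SS_HUGETLB, NR_SS };

static const unsigned int depends_on[NR_SS] = {
    [SS_CPU]    = 1u << SS_CPUACCT,   /* pretend "cpu" pulls in "cpuacct" */
    [SS_MEMORY] = 1u << SS_HUGETLB,   /* pretend "memory" pulls in "hugetlb" */
};

/* Mirrors the loop in cgroup_calc_child_subsys_mask(): keep OR-ing in the
 * dependencies of every enabled subsystem until the mask stops growing. */
static unsigned int calc_child_subsys_mask(unsigned int subtree_control)
{
    unsigned int cur = subtree_control;

    for (;;) {
        unsigned int next = cur;
        int ssid;

        for (ssid = 0; ssid < NR_SS; ssid++)
            if (cur & (1u << ssid))
                next |= depends_on[ssid];

        if (next == cur)
            return cur;               /* reached the fixpoint */
        cur = next;
    }
}

int main(void)
{
    unsigned int sc = 1u << SS_MEMORY;    /* user enabled only "memory" */

    printf("subtree_control=%#x -> child_subsys_mask=%#x\n",
           sc, calc_child_subsys_mask(sc));
    return 0;
}

Because each iteration can only add bits to a fixed-width mask, the loop terminates after at most one pass per subsystem, which is why the open-coded while (true) in the kernel is safe.
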
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 723cfc9d0ad7..64b257f6bca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
248 if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) 248 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
249 249
250/* 250/*
251 * There are two global mutexes guarding cpuset structures - cpuset_mutex 251 * There are two global locks guarding cpuset structures - cpuset_mutex and
252 * and callback_mutex. The latter may nest inside the former. We also 252 * callback_lock. We also require taking task_lock() when dereferencing a
253 * require taking task_lock() when dereferencing a task's cpuset pointer. 253 * task's cpuset pointer. See "The task_lock() exception", at the end of this
254 * See "The task_lock() exception", at the end of this comment. 254 * comment.
255 * 255 *
256 * A task must hold both mutexes to modify cpusets. If a task holds 256 * A task must hold both locks to modify cpusets. If a task holds
257 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it 257 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
258 * is the only task able to also acquire callback_mutex and be able to 258 * is the only task able to also acquire callback_lock and be able to
259 * modify cpusets. It can perform various checks on the cpuset structure 259 * modify cpusets. It can perform various checks on the cpuset structure
260 * first, knowing nothing will change. It can also allocate memory while 260 * first, knowing nothing will change. It can also allocate memory while
261 * just holding cpuset_mutex. While it is performing these checks, various 261 * just holding cpuset_mutex. While it is performing these checks, various
262 * callback routines can briefly acquire callback_mutex to query cpusets. 262 * callback routines can briefly acquire callback_lock to query cpusets.
263 * Once it is ready to make the changes, it takes callback_mutex, blocking 263 * Once it is ready to make the changes, it takes callback_lock, blocking
264 * everyone else. 264 * everyone else.
265 * 265 *
266 * Calls to the kernel memory allocator can not be made while holding 266 * Calls to the kernel memory allocator can not be made while holding
267 * callback_mutex, as that would risk double tripping on callback_mutex 267 * callback_lock, as that would risk double tripping on callback_lock
268 * from one of the callbacks into the cpuset code from within 268 * from one of the callbacks into the cpuset code from within
269 * __alloc_pages(). 269 * __alloc_pages().
270 * 270 *
271 * If a task is only holding callback_mutex, then it has read-only 271 * If a task is only holding callback_lock, then it has read-only
272 * access to cpusets. 272 * access to cpusets.
273 * 273 *
274 * Now, the task_struct fields mems_allowed and mempolicy may be changed 274 * Now, the task_struct fields mems_allowed and mempolicy may be changed
275 * by other task, we use alloc_lock in the task_struct fields to protect 275 * by other task, we use alloc_lock in the task_struct fields to protect
276 * them. 276 * them.
277 * 277 *
278 * The cpuset_common_file_read() handlers only hold callback_mutex across 278 * The cpuset_common_file_read() handlers only hold callback_lock across
279 * small pieces of code, such as when reading out possibly multi-word 279 * small pieces of code, such as when reading out possibly multi-word
280 * cpumasks and nodemasks. 280 * cpumasks and nodemasks.
281 * 281 *
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
284 */ 284 */
285 285
286static DEFINE_MUTEX(cpuset_mutex); 286static DEFINE_MUTEX(cpuset_mutex);
287static DEFINE_MUTEX(callback_mutex); 287static DEFINE_SPINLOCK(callback_lock);
288 288
289/* 289/*
290 * CPU / memory hotplug is handled asynchronously. 290 * CPU / memory hotplug is handled asynchronously.
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
329 * One way or another, we guarantee to return some non-empty subset 329 * One way or another, we guarantee to return some non-empty subset
330 * of cpu_online_mask. 330 * of cpu_online_mask.
331 * 331 *
332 * Call with callback_mutex held. 332 * Call with callback_lock or cpuset_mutex held.
333 */ 333 */
334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) 334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
335{ 335{
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
347 * One way or another, we guarantee to return some non-empty subset 347 * One way or another, we guarantee to return some non-empty subset
348 * of node_states[N_MEMORY]. 348 * of node_states[N_MEMORY].
349 * 349 *
350 * Call with callback_mutex held. 350 * Call with callback_lock or cpuset_mutex held.
351 */ 351 */
352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
353{ 353{
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
359/* 359/*
360 * update task's spread flag if cpuset's page/slab spread flag is set 360 * update task's spread flag if cpuset's page/slab spread flag is set
361 * 361 *
362 * Called with callback_mutex/cpuset_mutex held 362 * Call with callback_lock or cpuset_mutex held.
363 */ 363 */
364static void cpuset_update_task_spread_flag(struct cpuset *cs, 364static void cpuset_update_task_spread_flag(struct cpuset *cs,
365 struct task_struct *tsk) 365 struct task_struct *tsk)
@@ -886,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
886 continue; 886 continue;
887 rcu_read_unlock(); 887 rcu_read_unlock();
888 888
889 mutex_lock(&callback_mutex); 889 spin_lock_irq(&callback_lock);
890 cpumask_copy(cp->effective_cpus, new_cpus); 890 cpumask_copy(cp->effective_cpus, new_cpus);
891 mutex_unlock(&callback_mutex); 891 spin_unlock_irq(&callback_lock);
892 892
893 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 893 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
894 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 894 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -953,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
953 if (retval < 0) 953 if (retval < 0)
954 return retval; 954 return retval;
955 955
956 mutex_lock(&callback_mutex); 956 spin_lock_irq(&callback_lock);
957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
958 mutex_unlock(&callback_mutex); 958 spin_unlock_irq(&callback_lock);
959 959
960 /* use trialcs->cpus_allowed as a temp variable */ 960 /* use trialcs->cpus_allowed as a temp variable */
961 update_cpumasks_hier(cs, trialcs->cpus_allowed); 961 update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1142,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1142 continue; 1142 continue;
1143 rcu_read_unlock(); 1143 rcu_read_unlock();
1144 1144
1145 mutex_lock(&callback_mutex); 1145 spin_lock_irq(&callback_lock);
1146 cp->effective_mems = *new_mems; 1146 cp->effective_mems = *new_mems;
1147 mutex_unlock(&callback_mutex); 1147 spin_unlock_irq(&callback_lock);
1148 1148
1149 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 1149 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
1150 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 1150 !nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1165,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1165 * mempolicies and if the cpuset is marked 'memory_migrate', 1165 * mempolicies and if the cpuset is marked 'memory_migrate',
1166 * migrate the tasks pages to the new memory. 1166 * migrate the tasks pages to the new memory.
1167 * 1167 *
1168 * Call with cpuset_mutex held. May take callback_mutex during call. 1168 * Call with cpuset_mutex held. May take callback_lock during call.
1169 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1169 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1170 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1170 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1171 * their mempolicies to the cpusets new mems_allowed. 1171 * their mempolicies to the cpusets new mems_allowed.
@@ -1212,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1212 if (retval < 0) 1212 if (retval < 0)
1213 goto done; 1213 goto done;
1214 1214
1215 mutex_lock(&callback_mutex); 1215 spin_lock_irq(&callback_lock);
1216 cs->mems_allowed = trialcs->mems_allowed; 1216 cs->mems_allowed = trialcs->mems_allowed;
1217 mutex_unlock(&callback_mutex); 1217 spin_unlock_irq(&callback_lock);
1218 1218
1219 /* use trialcs->mems_allowed as a temp variable */ 1219 /* use trialcs->mems_allowed as a temp variable */
1220 update_nodemasks_hier(cs, &cs->mems_allowed); 1220 update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1305,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1305 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) 1305 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1306 || (is_spread_page(cs) != is_spread_page(trialcs))); 1306 || (is_spread_page(cs) != is_spread_page(trialcs)));
1307 1307
1308 mutex_lock(&callback_mutex); 1308 spin_lock_irq(&callback_lock);
1309 cs->flags = trialcs->flags; 1309 cs->flags = trialcs->flags;
1310 mutex_unlock(&callback_mutex); 1310 spin_unlock_irq(&callback_lock);
1311 1311
1312 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1312 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1313 rebuild_sched_domains_locked(); 1313 rebuild_sched_domains_locked();
@@ -1714,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1714 count = seq_get_buf(sf, &buf); 1714 count = seq_get_buf(sf, &buf);
1715 s = buf; 1715 s = buf;
1716 1716
1717 mutex_lock(&callback_mutex); 1717 spin_lock_irq(&callback_lock);
1718 1718
1719 switch (type) { 1719 switch (type) {
1720 case FILE_CPULIST: 1720 case FILE_CPULIST:
@@ -1741,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1741 seq_commit(sf, -1); 1741 seq_commit(sf, -1);
1742 } 1742 }
1743out_unlock: 1743out_unlock:
1744 mutex_unlock(&callback_mutex); 1744 spin_unlock_irq(&callback_lock);
1745 return ret; 1745 return ret;
1746} 1746}
1747 1747
@@ -1958,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1958 1958
1959 cpuset_inc(); 1959 cpuset_inc();
1960 1960
1961 mutex_lock(&callback_mutex); 1961 spin_lock_irq(&callback_lock);
1962 if (cgroup_on_dfl(cs->css.cgroup)) { 1962 if (cgroup_on_dfl(cs->css.cgroup)) {
1963 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1963 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1964 cs->effective_mems = parent->effective_mems; 1964 cs->effective_mems = parent->effective_mems;
1965 } 1965 }
1966 mutex_unlock(&callback_mutex); 1966 spin_unlock_irq(&callback_lock);
1967 1967
1968 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1968 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1969 goto out_unlock; 1969 goto out_unlock;
@@ -1990,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1990 } 1990 }
1991 rcu_read_unlock(); 1991 rcu_read_unlock();
1992 1992
1993 mutex_lock(&callback_mutex); 1993 spin_lock_irq(&callback_lock);
1994 cs->mems_allowed = parent->mems_allowed; 1994 cs->mems_allowed = parent->mems_allowed;
1995 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 1995 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1996 mutex_unlock(&callback_mutex); 1996 spin_unlock_irq(&callback_lock);
1997out_unlock: 1997out_unlock:
1998 mutex_unlock(&cpuset_mutex); 1998 mutex_unlock(&cpuset_mutex);
1999 return 0; 1999 return 0;
@@ -2032,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
2032static void cpuset_bind(struct cgroup_subsys_state *root_css) 2032static void cpuset_bind(struct cgroup_subsys_state *root_css)
2033{ 2033{
2034 mutex_lock(&cpuset_mutex); 2034 mutex_lock(&cpuset_mutex);
2035 mutex_lock(&callback_mutex); 2035 spin_lock_irq(&callback_lock);
2036 2036
2037 if (cgroup_on_dfl(root_css->cgroup)) { 2037 if (cgroup_on_dfl(root_css->cgroup)) {
2038 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 2038 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2043,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
2043 top_cpuset.mems_allowed = top_cpuset.effective_mems; 2043 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2044 } 2044 }
2045 2045
2046 mutex_unlock(&callback_mutex); 2046 spin_unlock_irq(&callback_lock);
2047 mutex_unlock(&cpuset_mutex); 2047 mutex_unlock(&cpuset_mutex);
2048} 2048}
2049 2049
@@ -2128,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
2128{ 2128{
2129 bool is_empty; 2129 bool is_empty;
2130 2130
2131 mutex_lock(&callback_mutex); 2131 spin_lock_irq(&callback_lock);
2132 cpumask_copy(cs->cpus_allowed, new_cpus); 2132 cpumask_copy(cs->cpus_allowed, new_cpus);
2133 cpumask_copy(cs->effective_cpus, new_cpus); 2133 cpumask_copy(cs->effective_cpus, new_cpus);
2134 cs->mems_allowed = *new_mems; 2134 cs->mems_allowed = *new_mems;
2135 cs->effective_mems = *new_mems; 2135 cs->effective_mems = *new_mems;
2136 mutex_unlock(&callback_mutex); 2136 spin_unlock_irq(&callback_lock);
2137 2137
2138 /* 2138 /*
2139 * Don't call update_tasks_cpumask() if the cpuset becomes empty, 2139 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2170,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs,
2170 if (nodes_empty(*new_mems)) 2170 if (nodes_empty(*new_mems))
2171 *new_mems = parent_cs(cs)->effective_mems; 2171 *new_mems = parent_cs(cs)->effective_mems;
2172 2172
2173 mutex_lock(&callback_mutex); 2173 spin_lock_irq(&callback_lock);
2174 cpumask_copy(cs->effective_cpus, new_cpus); 2174 cpumask_copy(cs->effective_cpus, new_cpus);
2175 cs->effective_mems = *new_mems; 2175 cs->effective_mems = *new_mems;
2176 mutex_unlock(&callback_mutex); 2176 spin_unlock_irq(&callback_lock);
2177 2177
2178 if (cpus_updated) 2178 if (cpus_updated)
2179 update_tasks_cpumask(cs); 2179 update_tasks_cpumask(cs);
@@ -2259,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2259 2259
2260 /* synchronize cpus_allowed to cpu_active_mask */ 2260 /* synchronize cpus_allowed to cpu_active_mask */
2261 if (cpus_updated) { 2261 if (cpus_updated) {
2262 mutex_lock(&callback_mutex); 2262 spin_lock_irq(&callback_lock);
2263 if (!on_dfl) 2263 if (!on_dfl)
2264 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 2264 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2265 cpumask_copy(top_cpuset.effective_cpus, &new_cpus); 2265 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2266 mutex_unlock(&callback_mutex); 2266 spin_unlock_irq(&callback_lock);
2267 /* we don't mess with cpumasks of tasks in top_cpuset */ 2267 /* we don't mess with cpumasks of tasks in top_cpuset */
2268 } 2268 }
2269 2269
2270 /* synchronize mems_allowed to N_MEMORY */ 2270 /* synchronize mems_allowed to N_MEMORY */
2271 if (mems_updated) { 2271 if (mems_updated) {
2272 mutex_lock(&callback_mutex); 2272 spin_lock_irq(&callback_lock);
2273 if (!on_dfl) 2273 if (!on_dfl)
2274 top_cpuset.mems_allowed = new_mems; 2274 top_cpuset.mems_allowed = new_mems;
2275 top_cpuset.effective_mems = new_mems; 2275 top_cpuset.effective_mems = new_mems;
2276 mutex_unlock(&callback_mutex); 2276 spin_unlock_irq(&callback_lock);
2277 update_tasks_nodemask(&top_cpuset); 2277 update_tasks_nodemask(&top_cpuset);
2278 } 2278 }
2279 2279
@@ -2366,11 +2366,13 @@ void __init cpuset_init_smp(void)
2366 2366
2367void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2367void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2368{ 2368{
2369 mutex_lock(&callback_mutex); 2369 unsigned long flags;
2370
2371 spin_lock_irqsave(&callback_lock, flags);
2370 rcu_read_lock(); 2372 rcu_read_lock();
2371 guarantee_online_cpus(task_cs(tsk), pmask); 2373 guarantee_online_cpus(task_cs(tsk), pmask);
2372 rcu_read_unlock(); 2374 rcu_read_unlock();
2373 mutex_unlock(&callback_mutex); 2375 spin_unlock_irqrestore(&callback_lock, flags);
2374} 2376}
2375 2377
2376void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2378void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2416,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void)
2416nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2418nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2417{ 2419{
2418 nodemask_t mask; 2420 nodemask_t mask;
2421 unsigned long flags;
2419 2422
2420 mutex_lock(&callback_mutex); 2423 spin_lock_irqsave(&callback_lock, flags);
2421 rcu_read_lock(); 2424 rcu_read_lock();
2422 guarantee_online_mems(task_cs(tsk), &mask); 2425 guarantee_online_mems(task_cs(tsk), &mask);
2423 rcu_read_unlock(); 2426 rcu_read_unlock();
2424 mutex_unlock(&callback_mutex); 2427 spin_unlock_irqrestore(&callback_lock, flags);
2425 2428
2426 return mask; 2429 return mask;
2427} 2430}
@@ -2440,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2440/* 2443/*
2441 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or 2444 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
2442 * mem_hardwall ancestor to the specified cpuset. Call holding 2445 * mem_hardwall ancestor to the specified cpuset. Call holding
2443 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2446 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
2444 * (an unusual configuration), then returns the root cpuset. 2447 * (an unusual configuration), then returns the root cpuset.
2445 */ 2448 */
2446static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) 2449static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2451,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2451} 2454}
2452 2455
2453/** 2456/**
2454 * cpuset_node_allowed_softwall - Can we allocate on a memory node? 2457 * cpuset_node_allowed - Can we allocate on a memory node?
2455 * @node: is this an allowed node? 2458 * @node: is this an allowed node?
2456 * @gfp_mask: memory allocation flags 2459 * @gfp_mask: memory allocation flags
2457 * 2460 *
@@ -2463,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2463 * flag, yes. 2466 * flag, yes.
2464 * Otherwise, no. 2467 * Otherwise, no.
2465 * 2468 *
2466 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
2467 * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
2468 * might sleep, and might allow a node from an enclosing cpuset.
2469 *
2470 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
2471 * cpusets, and never sleeps.
2472 *
2473 * The __GFP_THISNODE placement logic is really handled elsewhere, 2469 * The __GFP_THISNODE placement logic is really handled elsewhere,
2474 * by forcibly using a zonelist starting at a specified node, and by 2470 * by forcibly using a zonelist starting at a specified node, and by
2475 * (in get_page_from_freelist()) refusing to consider the zones for 2471 * (in get_page_from_freelist()) refusing to consider the zones for
@@ -2482,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2482 * GFP_KERNEL allocations are not so marked, so can escape to the 2478 * GFP_KERNEL allocations are not so marked, so can escape to the
2483 * nearest enclosing hardwalled ancestor cpuset. 2479 * nearest enclosing hardwalled ancestor cpuset.
2484 * 2480 *
2485 * Scanning up parent cpusets requires callback_mutex. The 2481 * Scanning up parent cpusets requires callback_lock. The
2486 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 2482 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
2487 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the 2483 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
2488 * current tasks mems_allowed came up empty on the first pass over 2484 * current tasks mems_allowed came up empty on the first pass over
2489 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the 2485 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
2490 * cpuset are short of memory, might require taking the callback_mutex 2486 * cpuset are short of memory, might require taking the callback_lock.
2491 * mutex.
2492 * 2487 *
2493 * The first call here from mm/page_alloc:get_page_from_freelist() 2488 * The first call here from mm/page_alloc:get_page_from_freelist()
2494 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, 2489 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2505,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2505 * TIF_MEMDIE - any node ok 2500 * TIF_MEMDIE - any node ok
2506 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok 2501 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
2507 * GFP_USER - only nodes in current tasks mems allowed ok. 2502 * GFP_USER - only nodes in current tasks mems allowed ok.
2508 *
2509 * Rule:
2510 * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
2511 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2512 * the code that might scan up ancestor cpusets and sleep.
2513 */ 2503 */
2514int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2504int __cpuset_node_allowed(int node, gfp_t gfp_mask)
2515{ 2505{
2516 struct cpuset *cs; /* current cpuset ancestors */ 2506 struct cpuset *cs; /* current cpuset ancestors */
2517 int allowed; /* is allocation in zone z allowed? */ 2507 int allowed; /* is allocation in zone z allowed? */
2508 unsigned long flags;
2518 2509
2519 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2510 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2520 return 1; 2511 return 1;
2521 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2522 if (node_isset(node, current->mems_allowed)) 2512 if (node_isset(node, current->mems_allowed))
2523 return 1; 2513 return 1;
2524 /* 2514 /*
@@ -2534,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2534 return 1; 2524 return 1;
2535 2525
2536 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2526 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2537 mutex_lock(&callback_mutex); 2527 spin_lock_irqsave(&callback_lock, flags);
2538 2528
2539 rcu_read_lock(); 2529 rcu_read_lock();
2540 cs = nearest_hardwall_ancestor(task_cs(current)); 2530 cs = nearest_hardwall_ancestor(task_cs(current));
2541 allowed = node_isset(node, cs->mems_allowed); 2531 allowed = node_isset(node, cs->mems_allowed);
2542 rcu_read_unlock(); 2532 rcu_read_unlock();
2543 2533
2544 mutex_unlock(&callback_mutex); 2534 spin_unlock_irqrestore(&callback_lock, flags);
2545 return allowed; 2535 return allowed;
2546} 2536}
2547 2537
2548/*
2549 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
2550 * @node: is this an allowed node?
2551 * @gfp_mask: memory allocation flags
2552 *
2553 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2554 * set, yes, we can always allocate. If node is in our task's mems_allowed,
2555 * yes. If the task has been OOM killed and has access to memory reserves as
2556 * specified by the TIF_MEMDIE flag, yes.
2557 * Otherwise, no.
2558 *
2559 * The __GFP_THISNODE placement logic is really handled elsewhere,
2560 * by forcibly using a zonelist starting at a specified node, and by
2561 * (in get_page_from_freelist()) refusing to consider the zones for
2562 * any node on the zonelist except the first. By the time any such
2563 * calls get to this routine, we should just shut up and say 'yes'.
2564 *
2565 * Unlike the cpuset_node_allowed_softwall() variant, above,
2566 * this variant requires that the node be in the current task's
2567 * mems_allowed or that we're in interrupt. It does not scan up the
2568 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2569 * It never sleeps.
2570 */
2571int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2572{
2573 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2574 return 1;
2575 if (node_isset(node, current->mems_allowed))
2576 return 1;
2577 /*
2578 * Allow tasks that have access to memory reserves because they have
2579 * been OOM killed to get memory anywhere.
2580 */
2581 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2582 return 1;
2583 return 0;
2584}
2585
2586/** 2538/**
2587 * cpuset_mem_spread_node() - On which node to begin search for a file page 2539 * cpuset_mem_spread_node() - On which node to begin search for a file page
2588 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2540 * cpuset_slab_spread_node() - On which node to begin search for a slab page
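
The kernel/cpuset.c hunks convert callback_mutex into callback_lock, a spinlock taken with interrupts disabled, so the short read-side queries (cpuset_cpus_allowed(), cpuset_mems_allowed(), __cpuset_node_allowed()) can run from contexts that must not sleep. The division of labour is unchanged: writers serialize on cpuset_mutex, where they may sleep and allocate, and take callback_lock only around the brief copy that publishes the result; readers take callback_lock alone. The following is a userspace analogue of that two-lock discipline using pthreads purely for illustration; the irqsave aspect has no userspace equivalent and the names merely mirror the kernel ones.

#include <pthread.h>
#include <stdio.h>

/* Userspace stand-ins: "cpuset_mutex" is the heavy lock held across a whole
 * update (sleeping and allocation allowed there); "callback_lock" is the
 * cheap lock held only around the copies readers must see atomically. */
static pthread_mutex_t cpuset_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_spinlock_t callback_lock;

static unsigned long effective_cpus;    /* models cs->effective_cpus */

/* Writer: validate under cpuset_mutex, publish under callback_lock. */
static void update_cpumask(unsigned long new_cpus)
{
    pthread_mutex_lock(&cpuset_mutex);
    /* ... validation and allocation would happen here and may block ... */
    pthread_spin_lock(&callback_lock);
    effective_cpus = new_cpus;          /* short, non-blocking update */
    pthread_spin_unlock(&callback_lock);
    pthread_mutex_unlock(&cpuset_mutex);
}

/* Reader: only needs the spinlock, mirroring cpuset_cpus_allowed(). */
static unsigned long read_effective_cpus(void)
{
    unsigned long val;

    pthread_spin_lock(&callback_lock);
    val = effective_cpus;
    pthread_spin_unlock(&callback_lock);
    return val;
}

int main(void)
{
    pthread_spin_init(&callback_lock, PTHREAD_PROCESS_PRIVATE);
    update_cpumask(0xf);
    printf("effective_cpus=%#lx\n", read_effective_cpus());
    pthread_spin_destroy(&callback_lock);
    return 0;
}

Build with cc -pthread. Keeping the spinlock's critical sections down to plain copies is what makes the mutex-to-spinlock conversion safe in the first place.
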
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1adf62b39b96..07ce18ca71e0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -27,6 +27,9 @@
27 * version 2. This program is licensed "as is" without any warranty of any 27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied. 28 * kind, whether express or implied.
29 */ 29 */
30
31#define pr_fmt(fmt) "KGDB: " fmt
32
30#include <linux/pid_namespace.h> 33#include <linux/pid_namespace.h>
31#include <linux/clocksource.h> 34#include <linux/clocksource.h>
32#include <linux/serial_core.h> 35#include <linux/serial_core.h>
@@ -196,8 +199,8 @@ int __weak kgdb_validate_break_address(unsigned long addr)
196 return err; 199 return err;
197 err = kgdb_arch_remove_breakpoint(&tmp); 200 err = kgdb_arch_remove_breakpoint(&tmp);
198 if (err) 201 if (err)
199 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " 202 pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n",
200 "memory destroyed at: %lx", addr); 203 addr);
201 return err; 204 return err;
202} 205}
203 206
@@ -256,8 +259,8 @@ int dbg_activate_sw_breakpoints(void)
256 error = kgdb_arch_set_breakpoint(&kgdb_break[i]); 259 error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
257 if (error) { 260 if (error) {
258 ret = error; 261 ret = error;
259 printk(KERN_INFO "KGDB: BP install failed: %lx", 262 pr_info("BP install failed: %lx\n",
260 kgdb_break[i].bpt_addr); 263 kgdb_break[i].bpt_addr);
261 continue; 264 continue;
262 } 265 }
263 266
@@ -319,8 +322,8 @@ int dbg_deactivate_sw_breakpoints(void)
319 continue; 322 continue;
320 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 323 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
321 if (error) { 324 if (error) {
322 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", 325 pr_info("BP remove failed: %lx\n",
323 kgdb_break[i].bpt_addr); 326 kgdb_break[i].bpt_addr);
324 ret = error; 327 ret = error;
325 } 328 }
326 329
@@ -367,7 +370,7 @@ int dbg_remove_all_break(void)
367 goto setundefined; 370 goto setundefined;
368 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 371 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
369 if (error) 372 if (error)
370 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", 373 pr_err("breakpoint remove failed: %lx\n",
371 kgdb_break[i].bpt_addr); 374 kgdb_break[i].bpt_addr);
372setundefined: 375setundefined:
373 kgdb_break[i].state = BP_UNDEFINED; 376 kgdb_break[i].state = BP_UNDEFINED;
@@ -400,9 +403,9 @@ static int kgdb_io_ready(int print_wait)
400 if (print_wait) { 403 if (print_wait) {
401#ifdef CONFIG_KGDB_KDB 404#ifdef CONFIG_KGDB_KDB
402 if (!dbg_kdb_mode) 405 if (!dbg_kdb_mode)
403 printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); 406 pr_crit("waiting... or $3#33 for KDB\n");
404#else 407#else
405 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); 408 pr_crit("Waiting for remote debugger\n");
406#endif 409#endif
407 } 410 }
408 return 1; 411 return 1;
@@ -430,8 +433,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
430 exception_level = 0; 433 exception_level = 0;
431 kgdb_skipexception(ks->ex_vector, ks->linux_regs); 434 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
432 dbg_activate_sw_breakpoints(); 435 dbg_activate_sw_breakpoints();
433 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", 436 pr_crit("re-enter error: breakpoint removed %lx\n", addr);
434 addr);
435 WARN_ON_ONCE(1); 437 WARN_ON_ONCE(1);
436 438
437 return 1; 439 return 1;
@@ -444,7 +446,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
444 panic("Recursive entry to debugger"); 446 panic("Recursive entry to debugger");
445 } 447 }
446 448
447 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); 449 pr_crit("re-enter exception: ALL breakpoints killed\n");
448#ifdef CONFIG_KGDB_KDB 450#ifdef CONFIG_KGDB_KDB
449 /* Allow kdb to debug itself one level */ 451 /* Allow kdb to debug itself one level */
450 return 0; 452 return 0;
@@ -471,6 +473,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
471 int cpu; 473 int cpu;
472 int trace_on = 0; 474 int trace_on = 0;
473 int online_cpus = num_online_cpus(); 475 int online_cpus = num_online_cpus();
476 u64 time_left;
474 477
475 kgdb_info[ks->cpu].enter_kgdb++; 478 kgdb_info[ks->cpu].enter_kgdb++;
476 kgdb_info[ks->cpu].exception_state |= exception_state; 479 kgdb_info[ks->cpu].exception_state |= exception_state;
@@ -595,9 +598,13 @@ return_normal:
595 /* 598 /*
596 * Wait for the other CPUs to be notified and be waiting for us: 599 * Wait for the other CPUs to be notified and be waiting for us:
597 */ 600 */
598 while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + 601 time_left = loops_per_jiffy * HZ;
599 atomic_read(&slaves_in_kgdb)) != online_cpus) 602 while (kgdb_do_roundup && --time_left &&
603 (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
604 online_cpus)
600 cpu_relax(); 605 cpu_relax();
606 if (!time_left)
607 pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
601 608
602 /* 609 /*
603 * At this point the primary processor is completely 610 * At this point the primary processor is completely
@@ -795,15 +802,15 @@ static struct console kgdbcons = {
795static void sysrq_handle_dbg(int key) 802static void sysrq_handle_dbg(int key)
796{ 803{
797 if (!dbg_io_ops) { 804 if (!dbg_io_ops) {
798 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); 805 pr_crit("ERROR: No KGDB I/O module available\n");
799 return; 806 return;
800 } 807 }
801 if (!kgdb_connected) { 808 if (!kgdb_connected) {
802#ifdef CONFIG_KGDB_KDB 809#ifdef CONFIG_KGDB_KDB
803 if (!dbg_kdb_mode) 810 if (!dbg_kdb_mode)
804 printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); 811 pr_crit("KGDB or $3#33 for KDB\n");
805#else 812#else
806 printk(KERN_CRIT "Entering KGDB\n"); 813 pr_crit("Entering KGDB\n");
807#endif 814#endif
808 } 815 }
809 816
@@ -945,7 +952,7 @@ static void kgdb_initial_breakpoint(void)
945{ 952{
946 kgdb_break_asap = 0; 953 kgdb_break_asap = 0;
947 954
948 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); 955 pr_crit("Waiting for connection from remote gdb...\n");
949 kgdb_breakpoint(); 956 kgdb_breakpoint();
950} 957}
951 958
@@ -964,8 +971,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
964 if (dbg_io_ops) { 971 if (dbg_io_ops) {
965 spin_unlock(&kgdb_registration_lock); 972 spin_unlock(&kgdb_registration_lock);
966 973
967 printk(KERN_ERR "kgdb: Another I/O driver is already " 974 pr_err("Another I/O driver is already registered with KGDB\n");
968 "registered with KGDB.\n");
969 return -EBUSY; 975 return -EBUSY;
970 } 976 }
971 977
@@ -981,8 +987,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
981 987
982 spin_unlock(&kgdb_registration_lock); 988 spin_unlock(&kgdb_registration_lock);
983 989
984 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", 990 pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
985 new_dbg_io_ops->name);
986 991
987 /* Arm KGDB now. */ 992 /* Arm KGDB now. */
988 kgdb_register_callbacks(); 993 kgdb_register_callbacks();
@@ -1017,8 +1022,7 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
1017 1022
1018 spin_unlock(&kgdb_registration_lock); 1023 spin_unlock(&kgdb_registration_lock);
1019 1024
1020 printk(KERN_INFO 1025 pr_info("Unregistered I/O driver %s, debugger disabled\n",
1021 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1022 old_dbg_io_ops->name); 1026 old_dbg_io_ops->name);
1023} 1027}
1024EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); 1028EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
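
Most of the kernel/debug/debug_core.c hunks replace open-coded printk(KERN_ERR "KGDB: ...") calls with pr_err()/pr_info()/pr_crit() after defining pr_fmt, so the "KGDB: " prefix is attached once by the preprocessor instead of being repeated in every format string. A stripped-down userspace model of how that macro composition works is below; the pr_err()/pr_info() stand-ins are simplified, whereas in the kernel they come from <linux/printk.h>, which supplies a no-op default pr_fmt (expanding to fmt) when a file does not define its own.

#include <stdio.h>

/* One prefix definition per file, following the kernel convention. */
#define pr_fmt(fmt) "KGDB: " fmt

/* Simplified stand-ins for the kernel's pr_err()/pr_info(); the real ones
 * forward to printk() with a KERN_* level string prepended. */
#define pr_err(fmt, ...)  fprintf(stderr, "<3> " pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info(fmt, ...) fprintf(stdout, "<6> " pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
    unsigned long addr = 0xc0ffee;

    /* Prints "<3> KGDB: BP install failed: c0ffee" even though the prefix
     * never appears in the format string at the call site. */
    pr_err("BP install failed: %lx\n", addr);
    pr_info("Registered I/O driver %s\n", "kgdboc");
    return 0;
}

The ##__VA_ARGS__ form is the same GNU extension the kernel relies on, so build this with gcc or clang.
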
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index b20d544f20c2..e1dbf4a2c69e 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -531,22 +531,29 @@ void __init kdb_initbptab(void)
531 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) 531 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
532 bp->bp_free = 1; 532 bp->bp_free = 1;
533 533
534 kdb_register_repeat("bp", kdb_bp, "[<vaddr>]", 534 kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
535 "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); 535 "Set/Display breakpoints", 0,
536 kdb_register_repeat("bl", kdb_bp, "[<vaddr>]", 536 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
537 "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); 537 kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
538 "Display breakpoints", 0,
539 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
538 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) 540 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
539 kdb_register_repeat("bph", kdb_bp, "[<vaddr>]", 541 kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
540 "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); 542 "[datar [length]|dataw [length]] Set hw brk", 0,
541 kdb_register_repeat("bc", kdb_bc, "<bpnum>", 543 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
542 "Clear Breakpoint", 0, KDB_REPEAT_NONE); 544 kdb_register_flags("bc", kdb_bc, "<bpnum>",
543 kdb_register_repeat("be", kdb_bc, "<bpnum>", 545 "Clear Breakpoint", 0,
544 "Enable Breakpoint", 0, KDB_REPEAT_NONE); 546 KDB_ENABLE_FLOW_CTRL);
545 kdb_register_repeat("bd", kdb_bc, "<bpnum>", 547 kdb_register_flags("be", kdb_bc, "<bpnum>",
546 "Disable Breakpoint", 0, KDB_REPEAT_NONE); 548 "Enable Breakpoint", 0,
547 549 KDB_ENABLE_FLOW_CTRL);
548 kdb_register_repeat("ss", kdb_ss, "", 550 kdb_register_flags("bd", kdb_bc, "<bpnum>",
549 "Single Step", 1, KDB_REPEAT_NO_ARGS); 551 "Disable Breakpoint", 0,
552 KDB_ENABLE_FLOW_CTRL);
553
554 kdb_register_flags("ss", kdb_ss, "",
555 "Single Step", 1,
556 KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
550 /* 557 /*
551 * Architecture dependent initialization. 558 * Architecture dependent initialization.
552 */ 559 */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8859ca34dcfe..15e1a7af5dd0 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks)
129 ks->pass_exception = 1; 129 ks->pass_exception = 1;
130 KDB_FLAG_SET(CATASTROPHIC); 130 KDB_FLAG_SET(CATASTROPHIC);
131 } 131 }
132 /* set CATASTROPHIC if the system contains unresponsive processors */
133 for_each_online_cpu(i)
134 if (!kgdb_info[i].enter_kgdb)
135 KDB_FLAG_SET(CATASTROPHIC);
132 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { 136 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
133 KDB_STATE_CLEAR(SSBPT); 137 KDB_STATE_CLEAR(SSBPT);
134 KDB_STATE_CLEAR(DOING_SS); 138 KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 379650b984f8..7b40c5f07dce 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -12,6 +12,7 @@
12 */ 12 */
13 13
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/types.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/kernel.h> 17#include <linux/kernel.h>
17#include <linux/kmsg_dump.h> 18#include <linux/kmsg_dump.h>
@@ -23,6 +24,7 @@
23#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
24#include <linux/atomic.h> 25#include <linux/atomic.h>
25#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/moduleparam.h>
26#include <linux/mm.h> 28#include <linux/mm.h>
27#include <linux/init.h> 29#include <linux/init.h>
28#include <linux/kallsyms.h> 30#include <linux/kallsyms.h>
@@ -42,6 +44,12 @@
42#include <linux/slab.h> 44#include <linux/slab.h>
43#include "kdb_private.h" 45#include "kdb_private.h"
44 46
47#undef MODULE_PARAM_PREFIX
48#define MODULE_PARAM_PREFIX "kdb."
49
50static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
51module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
52
45#define GREP_LEN 256 53#define GREP_LEN 256
46char kdb_grep_string[GREP_LEN]; 54char kdb_grep_string[GREP_LEN];
47int kdb_grepping_flag; 55int kdb_grepping_flag;
@@ -121,6 +129,7 @@ static kdbmsg_t kdbmsgs[] = {
121 KDBMSG(BADLENGTH, "Invalid length field"), 129 KDBMSG(BADLENGTH, "Invalid length field"),
122 KDBMSG(NOBP, "No Breakpoint exists"), 130 KDBMSG(NOBP, "No Breakpoint exists"),
123 KDBMSG(BADADDR, "Invalid address"), 131 KDBMSG(BADADDR, "Invalid address"),
132 KDBMSG(NOPERM, "Permission denied"),
124}; 133};
125#undef KDBMSG 134#undef KDBMSG
126 135
@@ -188,6 +197,26 @@ struct task_struct *kdb_curr_task(int cpu)
188} 197}
189 198
190/* 199/*
200 * Check whether the flags of the current command and the permissions
 201 * of the kdb console allow a command to be run.
202 */
203static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
204 bool no_args)
205{
206 /* permissions comes from userspace so needs massaging slightly */
207 permissions &= KDB_ENABLE_MASK;
208 permissions |= KDB_ENABLE_ALWAYS_SAFE;
209
210 /* some commands change group when launched with no arguments */
211 if (no_args)
212 permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT;
213
214 flags |= KDB_ENABLE_ALL;
215
216 return permissions & flags;
217}
218
219/*
191 * kdbgetenv - This function will return the character string value of 220 * kdbgetenv - This function will return the character string value of
192 * an environment variable. 221 * an environment variable.
193 * Parameters: 222 * Parameters:
@@ -476,6 +505,15 @@ int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
476 kdb_symtab_t symtab; 505 kdb_symtab_t symtab;
477 506
478 /* 507 /*
508 * If the enable flags prohibit both arbitrary memory access
509 * and flow control then there are no reasonable grounds to
510 * provide symbol lookup.
511 */
512 if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL,
513 kdb_cmd_enabled, false))
514 return KDB_NOPERM;
515
516 /*
479 * Process arguments which follow the following syntax: 517 * Process arguments which follow the following syntax:
480 * 518 *
481 * symbol | numeric-address [+/- numeric-offset] 519 * symbol | numeric-address [+/- numeric-offset]
@@ -641,8 +679,13 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0)
641 if (!s->count) 679 if (!s->count)
642 s->usable = 0; 680 s->usable = 0;
643 if (s->usable) 681 if (s->usable)
644 kdb_register(s->name, kdb_exec_defcmd, 682 /* macros are always safe because when executed each
645 s->usage, s->help, 0); 683 * internal command re-enters kdb_parse() and is
684 * safety checked individually.
685 */
686 kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
687 s->help, 0,
688 KDB_ENABLE_ALWAYS_SAFE);
646 return 0; 689 return 0;
647 } 690 }
648 if (!s->usable) 691 if (!s->usable)
@@ -1003,25 +1046,22 @@ int kdb_parse(const char *cmdstr)
1003 1046
1004 if (i < kdb_max_commands) { 1047 if (i < kdb_max_commands) {
1005 int result; 1048 int result;
1049
1050 if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
1051 return KDB_NOPERM;
1052
1006 KDB_STATE_SET(CMD); 1053 KDB_STATE_SET(CMD);
1007 result = (*tp->cmd_func)(argc-1, (const char **)argv); 1054 result = (*tp->cmd_func)(argc-1, (const char **)argv);
1008 if (result && ignore_errors && result > KDB_CMD_GO) 1055 if (result && ignore_errors && result > KDB_CMD_GO)
1009 result = 0; 1056 result = 0;
1010 KDB_STATE_CLEAR(CMD); 1057 KDB_STATE_CLEAR(CMD);
1011 switch (tp->cmd_repeat) { 1058
1012 case KDB_REPEAT_NONE: 1059 if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
1013 argc = 0; 1060 return result;
1014 if (argv[0]) 1061
1015 *(argv[0]) = '\0'; 1062 argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
1016 break; 1063 if (argv[argc])
1017 case KDB_REPEAT_NO_ARGS: 1064 *(argv[argc]) = '\0';
1018 argc = 1;
1019 if (argv[1])
1020 *(argv[1]) = '\0';
1021 break;
1022 case KDB_REPEAT_WITH_ARGS:
1023 break;
1024 }
1025 return result; 1065 return result;
1026 } 1066 }
1027 1067
@@ -1921,10 +1961,14 @@ static int kdb_rm(int argc, const char **argv)
1921 */ 1961 */
1922static int kdb_sr(int argc, const char **argv) 1962static int kdb_sr(int argc, const char **argv)
1923{ 1963{
1964 bool check_mask =
1965 !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false);
1966
1924 if (argc != 1) 1967 if (argc != 1)
1925 return KDB_ARGCOUNT; 1968 return KDB_ARGCOUNT;
1969
1926 kdb_trap_printk++; 1970 kdb_trap_printk++;
1927 __handle_sysrq(*argv[1], false); 1971 __handle_sysrq(*argv[1], check_mask);
1928 kdb_trap_printk--; 1972 kdb_trap_printk--;
1929 1973
1930 return 0; 1974 return 0;
@@ -1979,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv)
1979 kdb_printf("%-20s%8u 0x%p ", mod->name, 2023 kdb_printf("%-20s%8u 0x%p ", mod->name,
1980 mod->core_size, (void *)mod); 2024 mod->core_size, (void *)mod);
1981#ifdef CONFIG_MODULE_UNLOAD 2025#ifdef CONFIG_MODULE_UNLOAD
1982 kdb_printf("%4ld ", module_refcount(mod)); 2026 kdb_printf("%4d ", module_refcount(mod));
1983#endif 2027#endif
1984 if (mod->state == MODULE_STATE_GOING) 2028 if (mod->state == MODULE_STATE_GOING)
1985 kdb_printf(" (Unloading)"); 2029 kdb_printf(" (Unloading)");
@@ -2157,6 +2201,8 @@ static void kdb_cpu_status(void)
2157 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { 2201 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
2158 if (!cpu_online(i)) { 2202 if (!cpu_online(i)) {
2159 state = 'F'; /* cpu is offline */ 2203 state = 'F'; /* cpu is offline */
2204 } else if (!kgdb_info[i].enter_kgdb) {
2205 state = 'D'; /* cpu is online but unresponsive */
2160 } else { 2206 } else {
2161 state = ' '; /* cpu is responding to kdb */ 2207 state = ' '; /* cpu is responding to kdb */
2162 if (kdb_task_state_char(KDB_TSK(i)) == 'I') 2208 if (kdb_task_state_char(KDB_TSK(i)) == 'I')
@@ -2210,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
2210 /* 2256 /*
2211 * Validate cpunum 2257 * Validate cpunum
2212 */ 2258 */
2213 if ((cpunum > NR_CPUS) || !cpu_online(cpunum)) 2259 if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
2214 return KDB_BADCPUNUM; 2260 return KDB_BADCPUNUM;
2215 2261
2216 dbg_switch_cpu = cpunum; 2262 dbg_switch_cpu = cpunum;
@@ -2375,6 +2421,8 @@ static int kdb_help(int argc, const char **argv)
2375 return 0; 2421 return 0;
2376 if (!kt->cmd_name) 2422 if (!kt->cmd_name)
2377 continue; 2423 continue;
2424 if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
2425 continue;
2378 if (strlen(kt->cmd_usage) > 20) 2426 if (strlen(kt->cmd_usage) > 20)
2379 space = "\n "; 2427 space = "\n ";
2380 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, 2428 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
@@ -2629,7 +2677,7 @@ static int kdb_grep_help(int argc, const char **argv)
2629} 2677}
2630 2678
2631/* 2679/*
2632 * kdb_register_repeat - This function is used to register a kernel 2680 * kdb_register_flags - This function is used to register a kernel
2633 * debugger command. 2681 * debugger command.
2634 * Inputs: 2682 * Inputs:
2635 * cmd Command name 2683 * cmd Command name
@@ -2641,12 +2689,12 @@ static int kdb_grep_help(int argc, const char **argv)
2641 * zero for success, one if a duplicate command. 2689 * zero for success, one if a duplicate command.
2642 */ 2690 */
2643#define kdb_command_extend 50 /* arbitrary */ 2691#define kdb_command_extend 50 /* arbitrary */
2644int kdb_register_repeat(char *cmd, 2692int kdb_register_flags(char *cmd,
2645 kdb_func_t func, 2693 kdb_func_t func,
2646 char *usage, 2694 char *usage,
2647 char *help, 2695 char *help,
2648 short minlen, 2696 short minlen,
2649 kdb_repeat_t repeat) 2697 kdb_cmdflags_t flags)
2650{ 2698{
2651 int i; 2699 int i;
2652 kdbtab_t *kp; 2700 kdbtab_t *kp;
@@ -2694,19 +2742,18 @@ int kdb_register_repeat(char *cmd,
2694 kp->cmd_func = func; 2742 kp->cmd_func = func;
2695 kp->cmd_usage = usage; 2743 kp->cmd_usage = usage;
2696 kp->cmd_help = help; 2744 kp->cmd_help = help;
2697 kp->cmd_flags = 0;
2698 kp->cmd_minlen = minlen; 2745 kp->cmd_minlen = minlen;
2699 kp->cmd_repeat = repeat; 2746 kp->cmd_flags = flags;
2700 2747
2701 return 0; 2748 return 0;
2702} 2749}
2703EXPORT_SYMBOL_GPL(kdb_register_repeat); 2750EXPORT_SYMBOL_GPL(kdb_register_flags);
2704 2751
2705 2752
2706/* 2753/*
2707 * kdb_register - Compatibility register function for commands that do 2754 * kdb_register - Compatibility register function for commands that do
2708 * not need to specify a repeat state. Equivalent to 2755 * not need to specify a repeat state. Equivalent to
2709 * kdb_register_repeat with KDB_REPEAT_NONE. 2756 * kdb_register_flags with flags set to 0.
2710 * Inputs: 2757 * Inputs:
2711 * cmd Command name 2758 * cmd Command name
2712 * func Function to execute the command 2759 * func Function to execute the command
@@ -2721,8 +2768,7 @@ int kdb_register(char *cmd,
2721 char *help, 2768 char *help,
2722 short minlen) 2769 short minlen)
2723{ 2770{
2724 return kdb_register_repeat(cmd, func, usage, help, minlen, 2771 return kdb_register_flags(cmd, func, usage, help, minlen, 0);
2725 KDB_REPEAT_NONE);
2726} 2772}
2727EXPORT_SYMBOL_GPL(kdb_register); 2773EXPORT_SYMBOL_GPL(kdb_register);
2728 2774
@@ -2764,80 +2810,109 @@ static void __init kdb_inittab(void)
2764 for_each_kdbcmd(kp, i) 2810 for_each_kdbcmd(kp, i)
2765 kp->cmd_name = NULL; 2811 kp->cmd_name = NULL;
2766 2812
2767 kdb_register_repeat("md", kdb_md, "<vaddr>", 2813 kdb_register_flags("md", kdb_md, "<vaddr>",
2768 "Display Memory Contents, also mdWcN, e.g. md8c1", 1, 2814 "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
2769 KDB_REPEAT_NO_ARGS); 2815 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2770 kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>", 2816 kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
2771 "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); 2817 "Display Raw Memory", 0,
2772 kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>", 2818 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2773 "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); 2819 kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
2774 kdb_register_repeat("mds", kdb_md, "<vaddr>", 2820 "Display Physical Memory", 0,
2775 "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); 2821 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2776 kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>", 2822 kdb_register_flags("mds", kdb_md, "<vaddr>",
2777 "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); 2823 "Display Memory Symbolically", 0,
2778 kdb_register_repeat("go", kdb_go, "[<vaddr>]", 2824 KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
2779 "Continue Execution", 1, KDB_REPEAT_NONE); 2825 kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
2780 kdb_register_repeat("rd", kdb_rd, "", 2826 "Modify Memory Contents", 0,
2781 "Display Registers", 0, KDB_REPEAT_NONE); 2827 KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
2782 kdb_register_repeat("rm", kdb_rm, "<reg> <contents>", 2828 kdb_register_flags("go", kdb_go, "[<vaddr>]",
2783 "Modify Registers", 0, KDB_REPEAT_NONE); 2829 "Continue Execution", 1,
2784 kdb_register_repeat("ef", kdb_ef, "<vaddr>", 2830 KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
2785 "Display exception frame", 0, KDB_REPEAT_NONE); 2831 kdb_register_flags("rd", kdb_rd, "",
2786 kdb_register_repeat("bt", kdb_bt, "[<vaddr>]", 2832 "Display Registers", 0,
2787 "Stack traceback", 1, KDB_REPEAT_NONE); 2833 KDB_ENABLE_REG_READ);
2788 kdb_register_repeat("btp", kdb_bt, "<pid>", 2834 kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
2789 "Display stack for process <pid>", 0, KDB_REPEAT_NONE); 2835 "Modify Registers", 0,
2790 kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", 2836 KDB_ENABLE_REG_WRITE);
2791 "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); 2837 kdb_register_flags("ef", kdb_ef, "<vaddr>",
2792 kdb_register_repeat("btc", kdb_bt, "", 2838 "Display exception frame", 0,
2793 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); 2839 KDB_ENABLE_MEM_READ);
2794 kdb_register_repeat("btt", kdb_bt, "<vaddr>", 2840 kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
2841 "Stack traceback", 1,
2842 KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
2843 kdb_register_flags("btp", kdb_bt, "<pid>",
2844 "Display stack for process <pid>", 0,
2845 KDB_ENABLE_INSPECT);
2846 kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
2847 "Backtrace all processes matching state flag", 0,
2848 KDB_ENABLE_INSPECT);
2849 kdb_register_flags("btc", kdb_bt, "",
2850 "Backtrace current process on each cpu", 0,
2851 KDB_ENABLE_INSPECT);
2852 kdb_register_flags("btt", kdb_bt, "<vaddr>",
2795 "Backtrace process given its struct task address", 0, 2853 "Backtrace process given its struct task address", 0,
2796 KDB_REPEAT_NONE); 2854 KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
2797 kdb_register_repeat("env", kdb_env, "", 2855 kdb_register_flags("env", kdb_env, "",
2798 "Show environment variables", 0, KDB_REPEAT_NONE); 2856 "Show environment variables", 0,
2799 kdb_register_repeat("set", kdb_set, "", 2857 KDB_ENABLE_ALWAYS_SAFE);
2800 "Set environment variables", 0, KDB_REPEAT_NONE); 2858 kdb_register_flags("set", kdb_set, "",
2801 kdb_register_repeat("help", kdb_help, "", 2859 "Set environment variables", 0,
2802 "Display Help Message", 1, KDB_REPEAT_NONE); 2860 KDB_ENABLE_ALWAYS_SAFE);
2803 kdb_register_repeat("?", kdb_help, "", 2861 kdb_register_flags("help", kdb_help, "",
2804 "Display Help Message", 0, KDB_REPEAT_NONE); 2862 "Display Help Message", 1,
2805 kdb_register_repeat("cpu", kdb_cpu, "<cpunum>", 2863 KDB_ENABLE_ALWAYS_SAFE);
2806 "Switch to new cpu", 0, KDB_REPEAT_NONE); 2864 kdb_register_flags("?", kdb_help, "",
2807 kdb_register_repeat("kgdb", kdb_kgdb, "", 2865 "Display Help Message", 0,
2808 "Enter kgdb mode", 0, KDB_REPEAT_NONE); 2866 KDB_ENABLE_ALWAYS_SAFE);
2809 kdb_register_repeat("ps", kdb_ps, "[<flags>|A]", 2867 kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
2810 "Display active task list", 0, KDB_REPEAT_NONE); 2868 "Switch to new cpu", 0,
2811 kdb_register_repeat("pid", kdb_pid, "<pidnum>", 2869 KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
2812 "Switch to another task", 0, KDB_REPEAT_NONE); 2870 kdb_register_flags("kgdb", kdb_kgdb, "",
2813 kdb_register_repeat("reboot", kdb_reboot, "", 2871 "Enter kgdb mode", 0, 0);
2814 "Reboot the machine immediately", 0, KDB_REPEAT_NONE); 2872 kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
2873 "Display active task list", 0,
2874 KDB_ENABLE_INSPECT);
2875 kdb_register_flags("pid", kdb_pid, "<pidnum>",
2876 "Switch to another task", 0,
2877 KDB_ENABLE_INSPECT);
2878 kdb_register_flags("reboot", kdb_reboot, "",
2879 "Reboot the machine immediately", 0,
2880 KDB_ENABLE_REBOOT);
2815#if defined(CONFIG_MODULES) 2881#if defined(CONFIG_MODULES)
2816 kdb_register_repeat("lsmod", kdb_lsmod, "", 2882 kdb_register_flags("lsmod", kdb_lsmod, "",
2817 "List loaded kernel modules", 0, KDB_REPEAT_NONE); 2883 "List loaded kernel modules", 0,
2884 KDB_ENABLE_INSPECT);
2818#endif 2885#endif
2819#if defined(CONFIG_MAGIC_SYSRQ) 2886#if defined(CONFIG_MAGIC_SYSRQ)
2820 kdb_register_repeat("sr", kdb_sr, "<key>", 2887 kdb_register_flags("sr", kdb_sr, "<key>",
2821 "Magic SysRq key", 0, KDB_REPEAT_NONE); 2888 "Magic SysRq key", 0,
2889 KDB_ENABLE_ALWAYS_SAFE);
2822#endif 2890#endif
2823#if defined(CONFIG_PRINTK) 2891#if defined(CONFIG_PRINTK)
2824 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", 2892 kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
2825 "Display syslog buffer", 0, KDB_REPEAT_NONE); 2893 "Display syslog buffer", 0,
2894 KDB_ENABLE_ALWAYS_SAFE);
2826#endif 2895#endif
2827 if (arch_kgdb_ops.enable_nmi) { 2896 if (arch_kgdb_ops.enable_nmi) {
2828 kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", 2897 kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
2829 "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); 2898 "Disable NMI entry to KDB", 0,
2830 } 2899 KDB_ENABLE_ALWAYS_SAFE);
2831 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", 2900 }
2832 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); 2901 kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2833 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", 2902 "Define a set of commands, down to endefcmd", 0,
2834 "Send a signal to a process", 0, KDB_REPEAT_NONE); 2903 KDB_ENABLE_ALWAYS_SAFE);
2835 kdb_register_repeat("summary", kdb_summary, "", 2904 kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
2836 "Summarize the system", 4, KDB_REPEAT_NONE); 2905 "Send a signal to a process", 0,
2837 kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", 2906 KDB_ENABLE_SIGNAL);
2838 "Display per_cpu variables", 3, KDB_REPEAT_NONE); 2907 kdb_register_flags("summary", kdb_summary, "",
2839 kdb_register_repeat("grephelp", kdb_grep_help, "", 2908 "Summarize the system", 4,
2840 "Display help on | grep", 0, KDB_REPEAT_NONE); 2909 KDB_ENABLE_ALWAYS_SAFE);
2910 kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2911 "Display per_cpu variables", 3,
2912 KDB_ENABLE_MEM_READ);
2913 kdb_register_flags("grephelp", kdb_grep_help, "",
2914 "Display help on | grep", 0,
2915 KDB_ENABLE_ALWAYS_SAFE);
2841} 2916}
2842 2917
2843/* Execute any commands defined in kdb_cmds. */ 2918/* Execute any commands defined in kdb_cmds. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 7afd3c8c41d5..eaacd1693954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -172,10 +172,9 @@ typedef struct _kdbtab {
172 kdb_func_t cmd_func; /* Function to execute command */ 172 kdb_func_t cmd_func; /* Function to execute command */
173 char *cmd_usage; /* Usage String for this command */ 173 char *cmd_usage; /* Usage String for this command */
174 char *cmd_help; /* Help message for this command */ 174 char *cmd_help; /* Help message for this command */
175 short cmd_flags; /* Parsing flags */
176 short cmd_minlen; /* Minimum legal # command 175 short cmd_minlen; /* Minimum legal # command
177 * chars required */ 176 * chars required */
178 kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ 177 kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
179} kdbtab_t; 178} kdbtab_t;
180 179
181extern int kdb_bt(int, const char **); /* KDB display back trace */ 180extern int kdb_bt(int, const char **); /* KDB display back trace */
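Note: the kdb hunks above convert every kdb_register_repeat(..., KDB_REPEAT_NONE) call to kdb_register_flags() with a KDB_ENABLE_* permission mask, and kdbtab_t drops cmd_repeat in favour of a kdb_cmdflags_t cmd_flags field. A minimal sketch of registering a command against the new interface follows; kdb_register_flags(), kdb_printf() and KDB_ENABLE_MEM_READ are taken from the hunks above, while the "mycmd" command itself is invented for illustration.

        #include <linux/kdb.h>

        static int kdb_mycmd(int argc, const char **argv)
        {
                kdb_printf("mycmd: %d argument(s)\n", argc);
                return 0;
        }

        /* minlen 0; permission mask picked to match the read-only commands above */
        static int register_mycmd(void)
        {
                return kdb_register_flags("mycmd", kdb_mycmd, "[<arg>]",
                                          "Example command", 0,
                                          KDB_ENABLE_MEM_READ);
        }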
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e19d3ebc29c..882f835a0d85 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -614,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
614 if (!f.file) 614 if (!f.file)
615 return -EBADF; 615 return -EBADF;
616 616
617 css = css_tryget_online_from_dir(f.file->f_dentry, 617 css = css_tryget_online_from_dir(f.file->f_path.dentry,
618 &perf_event_cgrp_subsys); 618 &perf_event_cgrp_subsys);
619 if (IS_ERR(css)) { 619 if (IS_ERR(css)) {
620 ret = PTR_ERR(css); 620 ret = PTR_ERR(css);
@@ -4461,18 +4461,14 @@ perf_output_sample_regs(struct perf_output_handle *handle,
4461} 4461}
4462 4462
4463static void perf_sample_regs_user(struct perf_regs *regs_user, 4463static void perf_sample_regs_user(struct perf_regs *regs_user,
4464 struct pt_regs *regs) 4464 struct pt_regs *regs,
4465 struct pt_regs *regs_user_copy)
4465{ 4466{
4466 if (!user_mode(regs)) { 4467 if (user_mode(regs)) {
4467 if (current->mm) 4468 regs_user->abi = perf_reg_abi(current);
4468 regs = task_pt_regs(current);
4469 else
4470 regs = NULL;
4471 }
4472
4473 if (regs) {
4474 regs_user->abi = perf_reg_abi(current);
4475 regs_user->regs = regs; 4469 regs_user->regs = regs;
4470 } else if (current->mm) {
4471 perf_get_regs_user(regs_user, regs, regs_user_copy);
4476 } else { 4472 } else {
4477 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; 4473 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4478 regs_user->regs = NULL; 4474 regs_user->regs = NULL;
@@ -4951,7 +4947,8 @@ void perf_prepare_sample(struct perf_event_header *header,
4951 } 4947 }
4952 4948
4953 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) 4949 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
4954 perf_sample_regs_user(&data->regs_user, regs); 4950 perf_sample_regs_user(&data->regs_user, regs,
4951 &data->regs_user_copy);
4955 4952
4956 if (sample_type & PERF_SAMPLE_REGS_USER) { 4953 if (sample_type & PERF_SAMPLE_REGS_USER) {
4957 /* regs dump ABI info */ 4954 /* regs dump ABI info */
@@ -7477,11 +7474,11 @@ SYSCALL_DEFINE5(perf_event_open,
7477 7474
7478 if (move_group) { 7475 if (move_group) {
7479 synchronize_rcu(); 7476 synchronize_rcu();
7480 perf_install_in_context(ctx, group_leader, event->cpu); 7477 perf_install_in_context(ctx, group_leader, group_leader->cpu);
7481 get_ctx(ctx); 7478 get_ctx(ctx);
7482 list_for_each_entry(sibling, &group_leader->sibling_list, 7479 list_for_each_entry(sibling, &group_leader->sibling_list,
7483 group_entry) { 7480 group_entry) {
7484 perf_install_in_context(ctx, sibling, event->cpu); 7481 perf_install_in_context(ctx, sibling, sibling->cpu);
7485 get_ctx(ctx); 7482 get_ctx(ctx);
7486 } 7483 }
7487 } 7484 }
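Note: perf_sample_regs_user() above now defers kernel-mode samples to an architecture hook, perf_get_regs_user(), and hands it a scratch pt_regs (regs_user_copy) to fill in. The sketch below is only a guess at what a minimal implementation of that hook might look like (it ignores the scratch copy and reuses task_pt_regs(), as the old generic code did); it is not taken from this diff.

        #include <linux/perf_event.h>
        #include <linux/perf_regs.h>
        #include <linux/sched.h>

        void perf_get_regs_user(struct perf_regs *regs_user,
                                struct pt_regs *regs,
                                struct pt_regs *regs_user_copy)
        {
                /* assumption: the saved user registers are good enough as-is */
                regs_user->abi  = perf_reg_abi(current);
                regs_user->regs = task_pt_regs(current);
        }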
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ed8f2cde34c5..cb346f26a22d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
193 } 193 }
194 194
195 flush_cache_page(vma, addr, pte_pfn(*ptep)); 195 flush_cache_page(vma, addr, pte_pfn(*ptep));
196 ptep_clear_flush(vma, addr, ptep); 196 ptep_clear_flush_notify(vma, addr, ptep);
197 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 197 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
198 198
199 page_remove_rmap(page); 199 page_remove_rmap(page);
@@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
724 int more = 0; 724 int more = 0;
725 725
726 again: 726 again:
727 mutex_lock(&mapping->i_mmap_mutex); 727 i_mmap_lock_read(mapping);
728 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 728 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
729 if (!valid_vma(vma, is_register)) 729 if (!valid_vma(vma, is_register))
730 continue; 730 continue;
731 731
732 if (!prev && !more) { 732 if (!prev && !more) {
733 /* 733 /*
734 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through 734 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
735 * reclaim. This is optimistic, no harm done if it fails. 735 * reclaim. This is optimistic, no harm done if it fails.
736 */ 736 */
737 prev = kmalloc(sizeof(struct map_info), 737 prev = kmalloc(sizeof(struct map_info),
@@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
755 info->mm = vma->vm_mm; 755 info->mm = vma->vm_mm;
756 info->vaddr = offset_to_vaddr(vma, offset); 756 info->vaddr = offset_to_vaddr(vma, offset);
757 } 757 }
758 mutex_unlock(&mapping->i_mmap_mutex); 758 i_mmap_unlock_read(mapping);
759 759
760 if (!more) 760 if (!more)
761 goto out; 761 goto out;
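Note: the uprobes hunk above (and the fork.c hunk further down) switch from mutex_lock(&mapping->i_mmap_mutex) to the new i_mmap_lock_read()/i_mmap_lock_write() helpers, so interval-tree walkers take the mapping lock shared. A minimal sketch of the read-side pattern, assuming the helpers are plain rwsem wrappers (the walk body is illustrative only):

        #include <linux/fs.h>
        #include <linux/mm.h>

        static void walk_mappings(struct address_space *mapping, pgoff_t pgoff)
        {
                struct vm_area_struct *vma;

                i_mmap_lock_read(mapping);      /* shared: walkers may run in parallel */
                vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                        /* inspect vma; writers such as dup_mmap() take the write side */
                }
                i_mmap_unlock_read(mapping);
        }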
diff --git a/kernel/exit.c b/kernel/exit.c
index 232c4bc8bcc9..6806c55475ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk)
118 } 118 }
119 119
120 /* 120 /*
121 * Accumulate here the counters for all threads but the group leader 121 * Accumulate here the counters for all threads as they die. We could
122 * as they die, so they can be added into the process-wide totals 122 * skip the group leader because it is the last user of signal_struct,
123 * when those are taken. The group leader stays around as a zombie as 123 * but we want to avoid the race with thread_group_cputime() which can
124 * long as there are other threads. When it gets reaped, the exit.c 124 * see the empty ->thread_head list.
125 * code will add its counts into these totals. We won't ever get here
126 * for the group leader, since it will have been the last reference on
127 * the signal_struct.
128 */ 125 */
129 task_cputime(tsk, &utime, &stime); 126 task_cputime(tsk, &utime, &stime);
130 write_seqlock(&sig->stats_lock); 127 write_seqlock(&sig->stats_lock);
@@ -215,27 +212,6 @@ repeat:
215} 212}
216 213
217/* 214/*
218 * This checks not only the pgrp, but falls back on the pid if no
219 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
220 * without this...
221 *
222 * The caller must hold rcu lock or the tasklist lock.
223 */
224struct pid *session_of_pgrp(struct pid *pgrp)
225{
226 struct task_struct *p;
227 struct pid *sid = NULL;
228
229 p = pid_task(pgrp, PIDTYPE_PGID);
230 if (p == NULL)
231 p = pid_task(pgrp, PIDTYPE_PID);
232 if (p != NULL)
233 sid = task_session(p);
234
235 return sid;
236}
237
238/*
239 * Determine if a process group is "orphaned", according to the POSIX 215 * Determine if a process group is "orphaned", according to the POSIX
240 * definition in 2.2.2.52. Orphaned process groups are not to be affected 216 * definition in 2.2.2.52. Orphaned process groups are not to be affected
241 * by terminal-generated stop signals. Newly orphaned process groups are 217 * by terminal-generated stop signals. Newly orphaned process groups are
@@ -462,6 +438,44 @@ static void exit_mm(struct task_struct *tsk)
462 clear_thread_flag(TIF_MEMDIE); 438 clear_thread_flag(TIF_MEMDIE);
463} 439}
464 440
441static struct task_struct *find_alive_thread(struct task_struct *p)
442{
443 struct task_struct *t;
444
445 for_each_thread(p, t) {
446 if (!(t->flags & PF_EXITING))
447 return t;
448 }
449 return NULL;
450}
451
452static struct task_struct *find_child_reaper(struct task_struct *father)
453 __releases(&tasklist_lock)
454 __acquires(&tasklist_lock)
455{
456 struct pid_namespace *pid_ns = task_active_pid_ns(father);
457 struct task_struct *reaper = pid_ns->child_reaper;
458
459 if (likely(reaper != father))
460 return reaper;
461
462 reaper = find_alive_thread(father);
463 if (reaper) {
464 pid_ns->child_reaper = reaper;
465 return reaper;
466 }
467
468 write_unlock_irq(&tasklist_lock);
469 if (unlikely(pid_ns == &init_pid_ns)) {
470 panic("Attempted to kill init! exitcode=0x%08x\n",
471 father->signal->group_exit_code ?: father->exit_code);
472 }
473 zap_pid_ns_processes(pid_ns);
474 write_lock_irq(&tasklist_lock);
475
476 return father;
477}
478
465/* 479/*
466 * When we die, we re-parent all our children, and try to: 480 * When we die, we re-parent all our children, and try to:
467 * 1. give them to another thread in our thread group, if such a member exists 481 * 1. give them to another thread in our thread group, if such a member exists
@@ -469,58 +483,36 @@ static void exit_mm(struct task_struct *tsk)
469 * child_subreaper for its children (like a service manager) 483 * child_subreaper for its children (like a service manager)
470 * 3. give it to the init process (PID 1) in our pid namespace 484 * 3. give it to the init process (PID 1) in our pid namespace
471 */ 485 */
472static struct task_struct *find_new_reaper(struct task_struct *father) 486static struct task_struct *find_new_reaper(struct task_struct *father,
473 __releases(&tasklist_lock) 487 struct task_struct *child_reaper)
474 __acquires(&tasklist_lock)
475{ 488{
476 struct pid_namespace *pid_ns = task_active_pid_ns(father); 489 struct task_struct *thread, *reaper;
477 struct task_struct *thread;
478 490
479 thread = father; 491 thread = find_alive_thread(father);
480 while_each_thread(father, thread) { 492 if (thread)
481 if (thread->flags & PF_EXITING)
482 continue;
483 if (unlikely(pid_ns->child_reaper == father))
484 pid_ns->child_reaper = thread;
485 return thread; 493 return thread;
486 }
487
488 if (unlikely(pid_ns->child_reaper == father)) {
489 write_unlock_irq(&tasklist_lock);
490 if (unlikely(pid_ns == &init_pid_ns)) {
491 panic("Attempted to kill init! exitcode=0x%08x\n",
492 father->signal->group_exit_code ?:
493 father->exit_code);
494 }
495
496 zap_pid_ns_processes(pid_ns);
497 write_lock_irq(&tasklist_lock);
498 } else if (father->signal->has_child_subreaper) {
499 struct task_struct *reaper;
500 494
495 if (father->signal->has_child_subreaper) {
501 /* 496 /*
502 * Find the first ancestor marked as child_subreaper. 497 * Find the first ->is_child_subreaper ancestor in our pid_ns.
503 * Note that the code below checks same_thread_group(reaper, 498 * We start from father to ensure we can not look into another
504 * pid_ns->child_reaper). This is what we need to DTRT in a 499 * namespace, this is safe because all its threads are dead.
505 * PID namespace. However we still need the check above, see
506 * http://marc.info/?l=linux-kernel&m=131385460420380
507 */ 500 */
508 for (reaper = father->real_parent; 501 for (reaper = father;
509 reaper != &init_task; 502 !same_thread_group(reaper, child_reaper);
510 reaper = reaper->real_parent) { 503 reaper = reaper->real_parent) {
511 if (same_thread_group(reaper, pid_ns->child_reaper)) 504 /* call_usermodehelper() descendants need this check */
505 if (reaper == &init_task)
512 break; 506 break;
513 if (!reaper->signal->is_child_subreaper) 507 if (!reaper->signal->is_child_subreaper)
514 continue; 508 continue;
515 thread = reaper; 509 thread = find_alive_thread(reaper);
516 do { 510 if (thread)
517 if (!(thread->flags & PF_EXITING)) 511 return thread;
518 return reaper;
519 } while_each_thread(reaper, thread);
520 } 512 }
521 } 513 }
522 514
523 return pid_ns->child_reaper; 515 return child_reaper;
524} 516}
525 517
526/* 518/*
@@ -529,15 +521,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
529static void reparent_leader(struct task_struct *father, struct task_struct *p, 521static void reparent_leader(struct task_struct *father, struct task_struct *p,
530 struct list_head *dead) 522 struct list_head *dead)
531{ 523{
532 list_move_tail(&p->sibling, &p->real_parent->children); 524 if (unlikely(p->exit_state == EXIT_DEAD))
533
534 if (p->exit_state == EXIT_DEAD)
535 return;
536 /*
537 * If this is a threaded reparent there is no need to
538 * notify anyone anything has happened.
539 */
540 if (same_thread_group(p->real_parent, father))
541 return; 525 return;
542 526
543 /* We don't want people slaying init. */ 527 /* We don't want people slaying init. */
@@ -548,49 +532,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
548 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 532 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
549 if (do_notify_parent(p, p->exit_signal)) { 533 if (do_notify_parent(p, p->exit_signal)) {
550 p->exit_state = EXIT_DEAD; 534 p->exit_state = EXIT_DEAD;
551 list_move_tail(&p->sibling, dead); 535 list_add(&p->ptrace_entry, dead);
552 } 536 }
553 } 537 }
554 538
555 kill_orphaned_pgrp(p, father); 539 kill_orphaned_pgrp(p, father);
556} 540}
557 541
558static void forget_original_parent(struct task_struct *father) 542/*
543 * This does two things:
544 *
545 * A. Make init inherit all the child processes
546 * B. Check to see if any process groups have become orphaned
547 * as a result of our exiting, and if they have any stopped
548 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
549 */
550static void forget_original_parent(struct task_struct *father,
551 struct list_head *dead)
559{ 552{
560 struct task_struct *p, *n, *reaper; 553 struct task_struct *p, *t, *reaper;
561 LIST_HEAD(dead_children);
562 554
563 write_lock_irq(&tasklist_lock); 555 if (unlikely(!list_empty(&father->ptraced)))
564 /* 556 exit_ptrace(father, dead);
565 * Note that exit_ptrace() and find_new_reaper() might
566 * drop tasklist_lock and reacquire it.
567 */
568 exit_ptrace(father);
569 reaper = find_new_reaper(father);
570 557
571 list_for_each_entry_safe(p, n, &father->children, sibling) { 558 /* Can drop and reacquire tasklist_lock */
572 struct task_struct *t = p; 559 reaper = find_child_reaper(father);
560 if (list_empty(&father->children))
561 return;
573 562
574 do { 563 reaper = find_new_reaper(father, reaper);
564 list_for_each_entry(p, &father->children, sibling) {
565 for_each_thread(p, t) {
575 t->real_parent = reaper; 566 t->real_parent = reaper;
576 if (t->parent == father) { 567 BUG_ON((!t->ptrace) != (t->parent == father));
577 BUG_ON(t->ptrace); 568 if (likely(!t->ptrace))
578 t->parent = t->real_parent; 569 t->parent = t->real_parent;
579 }
580 if (t->pdeath_signal) 570 if (t->pdeath_signal)
581 group_send_sig_info(t->pdeath_signal, 571 group_send_sig_info(t->pdeath_signal,
582 SEND_SIG_NOINFO, t); 572 SEND_SIG_NOINFO, t);
583 } while_each_thread(p, t); 573 }
584 reparent_leader(father, p, &dead_children); 574 /*
585 } 575 * If this is a threaded reparent there is no need to
586 write_unlock_irq(&tasklist_lock); 576 * notify anyone anything has happened.
587 577 */
588 BUG_ON(!list_empty(&father->children)); 578 if (!same_thread_group(reaper, father))
589 579 reparent_leader(father, p, dead);
590 list_for_each_entry_safe(p, n, &dead_children, sibling) {
591 list_del_init(&p->sibling);
592 release_task(p);
593 } 580 }
581 list_splice_tail_init(&father->children, &reaper->children);
594} 582}
595 583
596/* 584/*
@@ -600,18 +588,12 @@ static void forget_original_parent(struct task_struct *father)
600static void exit_notify(struct task_struct *tsk, int group_dead) 588static void exit_notify(struct task_struct *tsk, int group_dead)
601{ 589{
602 bool autoreap; 590 bool autoreap;
603 591 struct task_struct *p, *n;
604 /* 592 LIST_HEAD(dead);
605 * This does two things:
606 *
607 * A. Make init inherit all the child processes
608 * B. Check to see if any process groups have become orphaned
609 * as a result of our exiting, and if they have any stopped
610 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
611 */
612 forget_original_parent(tsk);
613 593
614 write_lock_irq(&tasklist_lock); 594 write_lock_irq(&tasklist_lock);
595 forget_original_parent(tsk, &dead);
596
615 if (group_dead) 597 if (group_dead)
616 kill_orphaned_pgrp(tsk->group_leader, NULL); 598 kill_orphaned_pgrp(tsk->group_leader, NULL);
617 599
@@ -629,15 +611,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
629 } 611 }
630 612
631 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; 613 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
614 if (tsk->exit_state == EXIT_DEAD)
615 list_add(&tsk->ptrace_entry, &dead);
632 616
633 /* mt-exec, de_thread() is waiting for group leader */ 617 /* mt-exec, de_thread() is waiting for group leader */
634 if (unlikely(tsk->signal->notify_count < 0)) 618 if (unlikely(tsk->signal->notify_count < 0))
635 wake_up_process(tsk->signal->group_exit_task); 619 wake_up_process(tsk->signal->group_exit_task);
636 write_unlock_irq(&tasklist_lock); 620 write_unlock_irq(&tasklist_lock);
637 621
638 /* If the process is dead, release it - nobody will wait for it */ 622 list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
639 if (autoreap) 623 list_del_init(&p->ptrace_entry);
640 release_task(tsk); 624 release_task(p);
625 }
641} 626}
642 627
643#ifdef CONFIG_DEBUG_STACK_USAGE 628#ifdef CONFIG_DEBUG_STACK_USAGE
@@ -982,8 +967,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
982 */ 967 */
983static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) 968static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
984{ 969{
985 unsigned long state; 970 int state, retval, status;
986 int retval, status, traced;
987 pid_t pid = task_pid_vnr(p); 971 pid_t pid = task_pid_vnr(p);
988 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 972 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
989 struct siginfo __user *infop; 973 struct siginfo __user *infop;
@@ -1008,21 +992,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1008 } 992 }
1009 return wait_noreap_copyout(wo, p, pid, uid, why, status); 993 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1010 } 994 }
1011
1012 traced = ptrace_reparented(p);
1013 /* 995 /*
1014 * Move the task's state to DEAD/TRACE, only one thread can do this. 996 * Move the task's state to DEAD/TRACE, only one thread can do this.
1015 */ 997 */
1016 state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; 998 state = (ptrace_reparented(p) && thread_group_leader(p)) ?
999 EXIT_TRACE : EXIT_DEAD;
1017 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) 1000 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1018 return 0; 1001 return 0;
1019 /* 1002 /*
1020 * It can be ptraced but not reparented, check 1003 * We own this thread, nobody else can reap it.
1021 * thread_group_leader() to filter out sub-threads. 1004 */
1005 read_unlock(&tasklist_lock);
1006 sched_annotate_sleep();
1007
1008 /*
1009 * Check thread_group_leader() to exclude the traced sub-threads.
1022 */ 1010 */
1023 if (likely(!traced) && thread_group_leader(p)) { 1011 if (state == EXIT_DEAD && thread_group_leader(p)) {
1024 struct signal_struct *psig; 1012 struct signal_struct *sig = p->signal;
1025 struct signal_struct *sig; 1013 struct signal_struct *psig = current->signal;
1026 unsigned long maxrss; 1014 unsigned long maxrss;
1027 cputime_t tgutime, tgstime; 1015 cputime_t tgutime, tgstime;
1028 1016
@@ -1034,21 +1022,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1034 * accumulate in the parent's signal_struct c* fields. 1022 * accumulate in the parent's signal_struct c* fields.
1035 * 1023 *
1036 * We don't bother to take a lock here to protect these 1024 * We don't bother to take a lock here to protect these
1037 * p->signal fields, because they are only touched by 1025 * p->signal fields because the whole thread group is dead
1038 * __exit_signal, which runs with tasklist_lock 1026 * and nobody can change them.
1039 * write-locked anyway, and so is excluded here. We do 1027 *
1040 * need to protect the access to parent->signal fields, 1028 * psig->stats_lock also protects us from our sub-theads
1041 * as other threads in the parent group can be right 1029 * which can reap other children at the same time. Until
1042 * here reaping other children at the same time. 1030 * we change k_getrusage()-like users to rely on this lock
1031 * we have to take ->siglock as well.
1043 * 1032 *
1044 * We use thread_group_cputime_adjusted() to get times for 1033 * We use thread_group_cputime_adjusted() to get times for
1045 * the thread group, which consolidates times for all threads 1034 * the thread group, which consolidates times for all threads
1046 * in the group including the group leader. 1035 * in the group including the group leader.
1047 */ 1036 */
1048 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1037 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1049 spin_lock_irq(&p->real_parent->sighand->siglock); 1038 spin_lock_irq(&current->sighand->siglock);
1050 psig = p->real_parent->signal;
1051 sig = p->signal;
1052 write_seqlock(&psig->stats_lock); 1039 write_seqlock(&psig->stats_lock);
1053 psig->cutime += tgutime + sig->cutime; 1040 psig->cutime += tgutime + sig->cutime;
1054 psig->cstime += tgstime + sig->cstime; 1041 psig->cstime += tgstime + sig->cstime;
@@ -1073,16 +1060,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1073 task_io_accounting_add(&psig->ioac, &p->ioac); 1060 task_io_accounting_add(&psig->ioac, &p->ioac);
1074 task_io_accounting_add(&psig->ioac, &sig->ioac); 1061 task_io_accounting_add(&psig->ioac, &sig->ioac);
1075 write_sequnlock(&psig->stats_lock); 1062 write_sequnlock(&psig->stats_lock);
1076 spin_unlock_irq(&p->real_parent->sighand->siglock); 1063 spin_unlock_irq(&current->sighand->siglock);
1077 } 1064 }
1078 1065
1079 /*
1080 * Now we are sure this task is interesting, and no other
1081 * thread can reap it because we its state == DEAD/TRACE.
1082 */
1083 read_unlock(&tasklist_lock);
1084 sched_annotate_sleep();
1085
1086 retval = wo->wo_rusage 1066 retval = wo->wo_rusage
1087 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; 1067 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1088 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1068 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
@@ -1307,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1307static int wait_consider_task(struct wait_opts *wo, int ptrace, 1287static int wait_consider_task(struct wait_opts *wo, int ptrace,
1308 struct task_struct *p) 1288 struct task_struct *p)
1309{ 1289{
1290 /*
1291 * We can race with wait_task_zombie() from another thread.
1292 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1293 * can't confuse the checks below.
1294 */
1295 int exit_state = ACCESS_ONCE(p->exit_state);
1310 int ret; 1296 int ret;
1311 1297
1312 if (unlikely(p->exit_state == EXIT_DEAD)) 1298 if (unlikely(exit_state == EXIT_DEAD))
1313 return 0; 1299 return 0;
1314 1300
1315 ret = eligible_child(wo, p); 1301 ret = eligible_child(wo, p);
@@ -1330,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1330 return 0; 1316 return 0;
1331 } 1317 }
1332 1318
1333 if (unlikely(p->exit_state == EXIT_TRACE)) { 1319 if (unlikely(exit_state == EXIT_TRACE)) {
1334 /* 1320 /*
1335 * ptrace == 0 means we are the natural parent. In this case 1321 * ptrace == 0 means we are the natural parent. In this case
1336 * we should clear notask_error, debugger will notify us. 1322 * we should clear notask_error, debugger will notify us.
@@ -1357,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1357 } 1343 }
1358 1344
1359 /* slay zombie? */ 1345 /* slay zombie? */
1360 if (p->exit_state == EXIT_ZOMBIE) { 1346 if (exit_state == EXIT_ZOMBIE) {
1361 /* we don't reap group leaders with subthreads */ 1347 /* we don't reap group leaders with subthreads */
1362 if (!delay_group_leader(p)) { 1348 if (!delay_group_leader(p)) {
1363 /* 1349 /*
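Note: the exit.c hunks above make wait_task_zombie() claim a zombie with a single cmpxchg on ->exit_state, and wait_consider_task() samples the state once through ACCESS_ONCE() so the EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition cannot confuse its checks. A minimal sketch of that single-winner claim idiom (the helper name is invented):

        #include <linux/sched.h>

        static bool claim_zombie(struct task_struct *p, int new_state)
        {
                /* exactly one waiter sees the old value EXIT_ZOMBIE and wins */
                return cmpxchg(&p->exit_state, EXIT_ZOMBIE, new_state) == EXIT_ZOMBIE;
        }

Whichever thread wins the cmpxchg owns the reap, which is why the hunk can drop tasklist_lock immediately after the transition and do the accounting without further locking of p->signal.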
diff --git a/kernel/extable.c b/kernel/extable.c
index d8a6446adbcb..c98f926277a8 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,6 +18,7 @@
18#include <linux/ftrace.h> 18#include <linux/ftrace.h>
19#include <linux/memory.h> 19#include <linux/memory.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/ftrace.h>
21#include <linux/mutex.h> 22#include <linux/mutex.h>
22#include <linux/init.h> 23#include <linux/init.h>
23 24
@@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr)
102 return 1; 103 return 1;
103 if (is_module_text_address(addr)) 104 if (is_module_text_address(addr))
104 return 1; 105 return 1;
106 if (is_ftrace_trampoline(addr))
107 return 1;
105 /* 108 /*
106 * There might be init symbols in saved stacktraces. 109 * There might be init symbols in saved stacktraces.
107 * Give those symbols a chance to be printed in 110 * Give those symbols a chance to be printed in
@@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr)
119{ 122{
120 if (core_kernel_text(addr)) 123 if (core_kernel_text(addr))
121 return 1; 124 return 1;
122 return is_module_text_address(addr); 125 if (is_module_text_address(addr))
126 return 1;
127 return is_ftrace_trampoline(addr);
123} 128}
124 129
125/* 130/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 9ca84189cfc2..4dc2ddade9f1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
433 get_file(file); 433 get_file(file);
434 if (tmp->vm_flags & VM_DENYWRITE) 434 if (tmp->vm_flags & VM_DENYWRITE)
435 atomic_dec(&inode->i_writecount); 435 atomic_dec(&inode->i_writecount);
436 mutex_lock(&mapping->i_mmap_mutex); 436 i_mmap_lock_write(mapping);
437 if (tmp->vm_flags & VM_SHARED) 437 if (tmp->vm_flags & VM_SHARED)
438 atomic_inc(&mapping->i_mmap_writable); 438 atomic_inc(&mapping->i_mmap_writable);
439 flush_dcache_mmap_lock(mapping); 439 flush_dcache_mmap_lock(mapping);
@@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
445 vma_interval_tree_insert_after(tmp, mpnt, 445 vma_interval_tree_insert_after(tmp, mpnt,
446 &mapping->i_mmap); 446 &mapping->i_mmap);
447 flush_dcache_mmap_unlock(mapping); 447 flush_dcache_mmap_unlock(mapping);
448 mutex_unlock(&mapping->i_mmap_mutex); 448 i_mmap_unlock_write(mapping);
449 } 449 }
450 450
451 /* 451 /*
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3b7408759bdf..c92e44855ddd 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -32,10 +32,13 @@ config GCOV_KERNEL
32 Note that the debugfs filesystem has to be mounted to access 32 Note that the debugfs filesystem has to be mounted to access
33 profiling data. 33 profiling data.
34 34
35config ARCH_HAS_GCOV_PROFILE_ALL
36 def_bool n
37
35config GCOV_PROFILE_ALL 38config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 39 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 40 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 41 depends on ARCH_HAS_GCOV_PROFILE_ALL
39 default n 42 default n
40 ---help--- 43 ---help---
41 This options activates profiling for the entire kernel. 44 This options activates profiling for the entire kernel.
diff --git a/kernel/groups.c b/kernel/groups.c
index 451698f86cfa..664411f171b5 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -6,6 +6,7 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
9#include <linux/user_namespace.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
10 11
11/* init to 2 - one for init_task, one to ensure it is never freed */ 12/* init to 2 - one for init_task, one to ensure it is never freed */
@@ -213,6 +214,14 @@ out:
213 return i; 214 return i;
214} 215}
215 216
217bool may_setgroups(void)
218{
219 struct user_namespace *user_ns = current_user_ns();
220
221 return ns_capable(user_ns, CAP_SETGID) &&
222 userns_may_setgroups(user_ns);
223}
224
216/* 225/*
217 * SMP: Our groups are copy-on-write. We can set them safely 226 * SMP: Our groups are copy-on-write. We can set them safely
218 * without another task interfering. 227 * without another task interfering.
@@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
223 struct group_info *group_info; 232 struct group_info *group_info;
224 int retval; 233 int retval;
225 234
226 if (!ns_capable(current_user_ns(), CAP_SETGID)) 235 if (!may_setgroups())
227 return -EPERM; 236 return -EPERM;
228 if ((unsigned)gidsetsize > NGROUPS_MAX) 237 if ((unsigned)gidsetsize > NGROUPS_MAX)
229 return -EINVAL; 238 return -EINVAL;
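Note: setgroups(2) above is now gated on may_setgroups(), i.e. CAP_SETGID in the caller's user namespace plus userns_may_setgroups(). A sketch of how an in-kernel caller that changes the supplementary group list might apply the same gate; the function name is hypothetical and the header providing the may_setgroups() declaration is assumed.

        #include <linux/cred.h>

        static int my_set_supplementary_groups(struct group_info *gi)
        {
                if (!may_setgroups())
                        return -EPERM;  /* CAP_SETGID alone is no longer enough */
                return set_current_groups(gi);
        }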
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4332d766619d..df553b0af936 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -78,8 +78,12 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
78 78
79#ifdef CONFIG_SPARSE_IRQ 79#ifdef CONFIG_SPARSE_IRQ
80static inline void irq_mark_irq(unsigned int irq) { } 80static inline void irq_mark_irq(unsigned int irq) { }
81extern void irq_lock_sparse(void);
82extern void irq_unlock_sparse(void);
81#else 83#else
82extern void irq_mark_irq(unsigned int irq); 84extern void irq_mark_irq(unsigned int irq);
85static inline void irq_lock_sparse(void) { }
86static inline void irq_unlock_sparse(void) { }
83#endif 87#endif
84 88
85extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 89extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a1782f88f0af..99793b9b6d23 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -132,6 +132,16 @@ static void free_masks(struct irq_desc *desc)
132static inline void free_masks(struct irq_desc *desc) { } 132static inline void free_masks(struct irq_desc *desc) { }
133#endif 133#endif
134 134
135void irq_lock_sparse(void)
136{
137 mutex_lock(&sparse_irq_lock);
138}
139
140void irq_unlock_sparse(void)
141{
142 mutex_unlock(&sparse_irq_lock);
143}
144
135static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) 145static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
136{ 146{
137 struct irq_desc *desc; 147 struct irq_desc *desc;
@@ -168,6 +178,12 @@ static void free_desc(unsigned int irq)
168 178
169 unregister_irq_proc(irq, desc); 179 unregister_irq_proc(irq, desc);
170 180
181 /*
182 * sparse_irq_lock protects also show_interrupts() and
183 * kstat_irq_usr(). Once we deleted the descriptor from the
184 * sparse tree we can free it. Access in proc will fail to
185 * lookup the descriptor.
186 */
171 mutex_lock(&sparse_irq_lock); 187 mutex_lock(&sparse_irq_lock);
172 delete_irq_desc(irq); 188 delete_irq_desc(irq);
173 mutex_unlock(&sparse_irq_lock); 189 mutex_unlock(&sparse_irq_lock);
@@ -574,6 +590,15 @@ void kstat_incr_irq_this_cpu(unsigned int irq)
574 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); 590 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
575} 591}
576 592
593/**
594 * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu
595 * @irq: The interrupt number
596 * @cpu: The cpu number
597 *
598 * Returns the sum of interrupt counts on @cpu since boot for
599 * @irq. The caller must ensure that the interrupt is not removed
600 * concurrently.
601 */
577unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 602unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
578{ 603{
579 struct irq_desc *desc = irq_to_desc(irq); 604 struct irq_desc *desc = irq_to_desc(irq);
@@ -582,6 +607,14 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
582 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 607 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
583} 608}
584 609
610/**
611 * kstat_irqs - Get the statistics for an interrupt
612 * @irq: The interrupt number
613 *
614 * Returns the sum of interrupt counts on all cpus since boot for
615 * @irq. The caller must ensure that the interrupt is not removed
616 * concurrently.
617 */
585unsigned int kstat_irqs(unsigned int irq) 618unsigned int kstat_irqs(unsigned int irq)
586{ 619{
587 struct irq_desc *desc = irq_to_desc(irq); 620 struct irq_desc *desc = irq_to_desc(irq);
@@ -594,3 +627,22 @@ unsigned int kstat_irqs(unsigned int irq)
594 sum += *per_cpu_ptr(desc->kstat_irqs, cpu); 627 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
595 return sum; 628 return sum;
596} 629}
630
631/**
632 * kstat_irqs_usr - Get the statistics for an interrupt
633 * @irq: The interrupt number
634 *
635 * Returns the sum of interrupt counts on all cpus since boot for
636 * @irq. Contrary to kstat_irqs() this can be called from any
637 * preemptible context. It's protected against concurrent removal of
638 * an interrupt descriptor when sparse irqs are enabled.
639 */
640unsigned int kstat_irqs_usr(unsigned int irq)
641{
642 int sum;
643
644 irq_lock_sparse();
645 sum = kstat_irqs(irq);
646 irq_unlock_sparse();
647 return sum;
648}
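Note: kstat_irqs_usr() above wraps kstat_irqs() in irq_lock_sparse()/irq_unlock_sparse() so that preemptible contexts can sum an interrupt's counts without racing a concurrent free_desc(). A sketch of a reader using it; show_interrupts() in kernel/irq/proc.c is the in-tree user, the seq_file callback below is only illustrative.

        #include <linux/kernel_stat.h>
        #include <linux/seq_file.h>

        static int my_irq_count_show(struct seq_file *m, void *v)
        {
                unsigned int irq = *(unsigned int *)m->private;

                /* the sum is taken under sparse_irq_lock inside kstat_irqs_usr() */
                seq_printf(m, "irq %u: %u\n", irq, kstat_irqs_usr(irq));
                return 0;
        }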
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index ac1ba2f11032..9dc9bfd8a678 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -15,6 +15,23 @@
15 15
16#include "internals.h" 16#include "internals.h"
17 17
18/*
19 * Access rules:
20 *
21 * procfs protects read/write of /proc/irq/N/ files against a
22 * concurrent free of the interrupt descriptor. remove_proc_entry()
23 * immediately prevents new read/writes to happen and waits for
24 * already running read/write functions to complete.
25 *
26 * We remove the proc entries first and then delete the interrupt
27 * descriptor from the radix tree and free it. So it is guaranteed
28 * that irq_to_desc(N) is valid as long as the read/writes are
29 * permitted by procfs.
30 *
31 * The read from /proc/interrupts is a different problem because there
32 * is no protection. So the lookup and the access to irqdesc
33 * information must be protected by sparse_irq_lock.
34 */
18static struct proc_dir_entry *root_irq_dir; 35static struct proc_dir_entry *root_irq_dir;
19 36
20#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
@@ -437,9 +454,10 @@ int show_interrupts(struct seq_file *p, void *v)
437 seq_putc(p, '\n'); 454 seq_putc(p, '\n');
438 } 455 }
439 456
457 irq_lock_sparse();
440 desc = irq_to_desc(i); 458 desc = irq_to_desc(i);
441 if (!desc) 459 if (!desc)
442 return 0; 460 goto outsparse;
443 461
444 raw_spin_lock_irqsave(&desc->lock, flags); 462 raw_spin_lock_irqsave(&desc->lock, flags);
445 for_each_online_cpu(j) 463 for_each_online_cpu(j)
@@ -479,6 +497,8 @@ int show_interrupts(struct seq_file *p, void *v)
479 seq_putc(p, '\n'); 497 seq_putc(p, '\n');
480out: 498out:
481 raw_spin_unlock_irqrestore(&desc->lock, flags); 499 raw_spin_unlock_irqrestore(&desc->lock, flags);
500outsparse:
501 irq_unlock_sparse();
482 return 0; 502 return 0;
483} 503}
484#endif 504#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 3ab9048483fa..cbf9fb899d92 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
175 175
176void irq_work_tick(void) 176void irq_work_tick(void)
177{ 177{
178 struct llist_head *raised = &__get_cpu_var(raised_list); 178 struct llist_head *raised = this_cpu_ptr(&raised_list);
179 179
180 if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) 180 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
181 irq_work_run_list(raised); 181 irq_work_run_list(raised);
182 irq_work_run_list(&__get_cpu_var(lazy_list)); 182 irq_work_run_list(this_cpu_ptr(&lazy_list));
183} 183}
184 184
185/* 185/*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2abf9f6e9a61..9a8a01abbaed 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
600 if (!kexec_on_panic) { 600 if (!kexec_on_panic) {
601 image->swap_page = kimage_alloc_control_pages(image, 0); 601 image->swap_page = kimage_alloc_control_pages(image, 0);
602 if (!image->swap_page) { 602 if (!image->swap_page) {
603 pr_err(KERN_ERR "Could not allocate swap buffer\n"); 603 pr_err("Could not allocate swap buffer\n");
604 goto out_free_control_pages; 604 goto out_free_control_pages;
605 } 605 }
606 } 606 }
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 80f7a6d00519..2777f40a9c7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -47,13 +47,6 @@ extern int max_threads;
47 47
48static struct workqueue_struct *khelper_wq; 48static struct workqueue_struct *khelper_wq;
49 49
50/*
51 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
52 * locking to protect this global - it is private to the singleton khelper
53 * thread and should only ever be modified by that thread.
54 */
55static const struct task_struct *kmod_thread_locker;
56
57#define CAP_BSET (void *)1 50#define CAP_BSET (void *)1
58#define CAP_PI (void *)2 51#define CAP_PI (void *)2
59 52
@@ -223,7 +216,6 @@ static void umh_complete(struct subprocess_info *sub_info)
223static int ____call_usermodehelper(void *data) 216static int ____call_usermodehelper(void *data)
224{ 217{
225 struct subprocess_info *sub_info = data; 218 struct subprocess_info *sub_info = data;
226 int wait = sub_info->wait & ~UMH_KILLABLE;
227 struct cred *new; 219 struct cred *new;
228 int retval; 220 int retval;
229 221
@@ -267,20 +259,13 @@ static int ____call_usermodehelper(void *data)
267out: 259out:
268 sub_info->retval = retval; 260 sub_info->retval = retval;
269 /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ 261 /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
270 if (wait != UMH_WAIT_PROC) 262 if (!(sub_info->wait & UMH_WAIT_PROC))
271 umh_complete(sub_info); 263 umh_complete(sub_info);
272 if (!retval) 264 if (!retval)
273 return 0; 265 return 0;
274 do_exit(0); 266 do_exit(0);
275} 267}
276 268
277static int call_helper(void *data)
278{
279 /* Worker thread started blocking khelper thread. */
280 kmod_thread_locker = current;
281 return ____call_usermodehelper(data);
282}
283
284/* Keventd can't block, but this (a child) can. */ 269/* Keventd can't block, but this (a child) can. */
285static int wait_for_helper(void *data) 270static int wait_for_helper(void *data)
286{ 271{
@@ -323,21 +308,14 @@ static void __call_usermodehelper(struct work_struct *work)
323{ 308{
324 struct subprocess_info *sub_info = 309 struct subprocess_info *sub_info =
325 container_of(work, struct subprocess_info, work); 310 container_of(work, struct subprocess_info, work);
326 int wait = sub_info->wait & ~UMH_KILLABLE;
327 pid_t pid; 311 pid_t pid;
328 312
329 /* CLONE_VFORK: wait until the usermode helper has execve'd 313 if (sub_info->wait & UMH_WAIT_PROC)
330 * successfully We need the data structures to stay around
331 * until that is done. */
332 if (wait == UMH_WAIT_PROC)
333 pid = kernel_thread(wait_for_helper, sub_info, 314 pid = kernel_thread(wait_for_helper, sub_info,
334 CLONE_FS | CLONE_FILES | SIGCHLD); 315 CLONE_FS | CLONE_FILES | SIGCHLD);
335 else { 316 else
336 pid = kernel_thread(call_helper, sub_info, 317 pid = kernel_thread(____call_usermodehelper, sub_info,
337 CLONE_VFORK | SIGCHLD); 318 SIGCHLD);
338 /* Worker thread stopped blocking khelper thread. */
339 kmod_thread_locker = NULL;
340 }
341 319
342 if (pid < 0) { 320 if (pid < 0) {
343 sub_info->retval = pid; 321 sub_info->retval = pid;
@@ -571,17 +549,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
571 goto out; 549 goto out;
572 } 550 }
573 /* 551 /*
574 * Worker thread must not wait for khelper thread at below
575 * wait_for_completion() if the thread was created with CLONE_VFORK
576 * flag, for khelper thread is already waiting for the thread at
577 * wait_for_completion() in do_fork().
578 */
579 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
580 retval = -EBUSY;
581 goto out;
582 }
583
584 /*
585 * Set the completion pointer only if there is a waiter. 552 * Set the completion pointer only if there is a waiter.
586 * This makes it possible to use umh_complete to free 553 * This makes it possible to use umh_complete to free
587 * the data structure in case of UMH_NO_WAIT. 554 * the data structure in case of UMH_NO_WAIT.
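Note: with the kmod.c hunks above, UMH_KILLABLE is treated as an OR-ed modifier, so __call_usermodehelper() tests the UMH_WAIT_PROC bit instead of comparing the masked value. A tiny sketch of that flag test (the helper name is invented):

        #include <linux/kmod.h>

        static bool needs_child_waiter(int wait)
        {
                /* true for UMH_WAIT_PROC and UMH_WAIT_PROC | UMH_KILLABLE */
                return wait & UMH_WAIT_PROC;
        }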
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3995f546d0f3..ee619929cf90 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -127,7 +127,7 @@ static void *alloc_insn_page(void)
127 127
128static void free_insn_page(void *page) 128static void free_insn_page(void *page)
129{ 129{
130 module_free(NULL, page); 130 module_memfree(page);
131} 131}
132 132
133struct kprobe_insn_cache kprobe_insn_slots = { 133struct kprobe_insn_cache kprobe_insn_slots = {
@@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
915#ifdef CONFIG_KPROBES_ON_FTRACE 915#ifdef CONFIG_KPROBES_ON_FTRACE
916static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { 916static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
917 .func = kprobe_ftrace_handler, 917 .func = kprobe_ftrace_handler,
918 .flags = FTRACE_OPS_FL_SAVE_REGS, 918 .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
919}; 919};
920static int kprobe_ftrace_enabled; 920static int kprobe_ftrace_enabled;
921 921
@@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p)
1410 return ret; 1410 return ret;
1411} 1411}
1412 1412
1413static int check_kprobe_address_safe(struct kprobe *p, 1413int __weak arch_check_ftrace_location(struct kprobe *p)
1414 struct module **probed_mod)
1415{ 1414{
1416 int ret = 0;
1417 unsigned long ftrace_addr; 1415 unsigned long ftrace_addr;
1418 1416
1419 /*
1420 * If the address is located on a ftrace nop, set the
1421 * breakpoint to the following instruction.
1422 */
1423 ftrace_addr = ftrace_location((unsigned long)p->addr); 1417 ftrace_addr = ftrace_location((unsigned long)p->addr);
1424 if (ftrace_addr) { 1418 if (ftrace_addr) {
1425#ifdef CONFIG_KPROBES_ON_FTRACE 1419#ifdef CONFIG_KPROBES_ON_FTRACE
@@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p,
1431 return -EINVAL; 1425 return -EINVAL;
1432#endif 1426#endif
1433 } 1427 }
1428 return 0;
1429}
1434 1430
1431static int check_kprobe_address_safe(struct kprobe *p,
1432 struct module **probed_mod)
1433{
1434 int ret;
1435
1436 ret = arch_check_ftrace_location(p);
1437 if (ret)
1438 return ret;
1435 jump_label_lock(); 1439 jump_label_lock();
1436 preempt_disable(); 1440 preempt_disable();
1437 1441
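Note: the kprobes hunk above turns the ftrace-location check into a __weak hook, arch_check_ftrace_location(), so an architecture can veto or adjust probes that land on ftrace-patched sites. A sketch of an architecture override; the policy shown (rejecting such probes outright) is invented for illustration.

        #include <linux/ftrace.h>
        #include <linux/kprobes.h>

        int arch_check_ftrace_location(struct kprobe *p)
        {
                if (ftrace_location((unsigned long)p->addr))
                        return -EINVAL; /* this arch cannot probe ftrace sites */
                return 0;
        }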
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 5cf6731b98e9..3ef3736002d8 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -80,13 +80,13 @@ void debug_mutex_unlock(struct mutex *lock)
80 DEBUG_LOCKS_WARN_ON(lock->owner != current); 80 DEBUG_LOCKS_WARN_ON(lock->owner != current);
81 81
82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
83 mutex_clear_owner(lock);
84 } 83 }
85 84
86 /* 85 /*
87 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug 86 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
88 * mutexes so that we can do it here after we've verified state. 87 * mutexes so that we can do it here after we've verified state.
89 */ 88 */
89 mutex_clear_owner(lock);
90 atomic_set(&lock->count, 1); 90 atomic_set(&lock->count, 1);
91} 91}
92 92
diff --git a/kernel/module.c b/kernel/module.c
index e52a8739361a..d856e96a3cce 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -42,7 +42,6 @@
42#include <linux/vermagic.h> 42#include <linux/vermagic.h>
43#include <linux/notifier.h> 43#include <linux/notifier.h>
44#include <linux/sched.h> 44#include <linux/sched.h>
45#include <linux/stop_machine.h>
46#include <linux/device.h> 45#include <linux/device.h>
47#include <linux/string.h> 46#include <linux/string.h>
48#include <linux/mutex.h> 47#include <linux/mutex.h>
@@ -98,7 +97,7 @@
98 * 1) List of modules (also safely readable with preempt_disable), 97 * 1) List of modules (also safely readable with preempt_disable),
99 * 2) module_use links, 98 * 2) module_use links,
100 * 3) module_addr_min/module_addr_max. 99 * 3) module_addr_min/module_addr_max.
101 * (delete uses stop_machine/add uses RCU list operations). */ 100 * (delete and add uses RCU list operations). */
102DEFINE_MUTEX(module_mutex); 101DEFINE_MUTEX(module_mutex);
103EXPORT_SYMBOL_GPL(module_mutex); 102EXPORT_SYMBOL_GPL(module_mutex);
104static LIST_HEAD(modules); 103static LIST_HEAD(modules);
@@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
158 * Protected by module_mutex. */ 157 * Protected by module_mutex. */
159static unsigned long module_addr_min = -1UL, module_addr_max = 0; 158static unsigned long module_addr_min = -1UL, module_addr_max = 0;
160 159
161int register_module_notifier(struct notifier_block * nb) 160int register_module_notifier(struct notifier_block *nb)
162{ 161{
163 return blocking_notifier_chain_register(&module_notify_list, nb); 162 return blocking_notifier_chain_register(&module_notify_list, nb);
164} 163}
165EXPORT_SYMBOL(register_module_notifier); 164EXPORT_SYMBOL(register_module_notifier);
166 165
167int unregister_module_notifier(struct notifier_block * nb) 166int unregister_module_notifier(struct notifier_block *nb)
168{ 167{
169 return blocking_notifier_chain_unregister(&module_notify_list, nb); 168 return blocking_notifier_chain_unregister(&module_notify_list, nb);
170} 169}
@@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
628 627
629EXPORT_TRACEPOINT_SYMBOL(module_get); 628EXPORT_TRACEPOINT_SYMBOL(module_get);
630 629
630/* MODULE_REF_BASE is the base reference count by kmodule loader. */
631#define MODULE_REF_BASE 1
632
631/* Init the unload section of the module. */ 633/* Init the unload section of the module. */
632static int module_unload_init(struct module *mod) 634static int module_unload_init(struct module *mod)
633{ 635{
634 mod->refptr = alloc_percpu(struct module_ref); 636 /*
635 if (!mod->refptr) 637 * Initialize reference counter to MODULE_REF_BASE.
636 return -ENOMEM; 638 * refcnt == 0 means module is going.
639 */
640 atomic_set(&mod->refcnt, MODULE_REF_BASE);
637 641
638 INIT_LIST_HEAD(&mod->source_list); 642 INIT_LIST_HEAD(&mod->source_list);
639 INIT_LIST_HEAD(&mod->target_list); 643 INIT_LIST_HEAD(&mod->target_list);
640 644
641 /* Hold reference count during initialization. */ 645 /* Hold reference count during initialization. */
642 raw_cpu_write(mod->refptr->incs, 1); 646 atomic_inc(&mod->refcnt);
643 647
644 return 0; 648 return 0;
645} 649}
@@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod)
721 kfree(use); 725 kfree(use);
722 } 726 }
723 mutex_unlock(&module_mutex); 727 mutex_unlock(&module_mutex);
724
725 free_percpu(mod->refptr);
726} 728}
727 729
728#ifdef CONFIG_MODULE_FORCE_UNLOAD 730#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -740,60 +742,48 @@ static inline int try_force_unload(unsigned int flags)
740} 742}
741#endif /* CONFIG_MODULE_FORCE_UNLOAD */ 743#endif /* CONFIG_MODULE_FORCE_UNLOAD */
742 744
743struct stopref 745/* Try to release refcount of module, 0 means success. */
746static int try_release_module_ref(struct module *mod)
744{ 747{
745 struct module *mod; 748 int ret;
746 int flags;
747 int *forced;
748};
749 749
750/* Whole machine is stopped with interrupts off when this runs. */ 750 /* Try to decrement refcnt which we set at loading */
751static int __try_stop_module(void *_sref) 751 ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
752{ 752 BUG_ON(ret < 0);
753 struct stopref *sref = _sref; 753 if (ret)
754 /* Someone can put this right now, recover with checking */
755 ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
754 756
757 return ret;
758}
759
760static int try_stop_module(struct module *mod, int flags, int *forced)
761{
755 /* If it's not unused, quit unless we're forcing. */ 762 /* If it's not unused, quit unless we're forcing. */
756 if (module_refcount(sref->mod) != 0) { 763 if (try_release_module_ref(mod) != 0) {
757 if (!(*sref->forced = try_force_unload(sref->flags))) 764 *forced = try_force_unload(flags);
765 if (!(*forced))
758 return -EWOULDBLOCK; 766 return -EWOULDBLOCK;
759 } 767 }
760 768
761 /* Mark it as dying. */ 769 /* Mark it as dying. */
762 sref->mod->state = MODULE_STATE_GOING; 770 mod->state = MODULE_STATE_GOING;
763 return 0;
764}
765
766static int try_stop_module(struct module *mod, int flags, int *forced)
767{
768 struct stopref sref = { mod, flags, forced };
769 771
770 return stop_machine(__try_stop_module, &sref, NULL); 772 return 0;
771} 773}
772 774
773unsigned long module_refcount(struct module *mod) 775/**
776 * module_refcount - return the refcount or -1 if unloading
777 *
778 * @mod: the module we're checking
779 *
780 * Returns:
781 * -1 if the module is in the process of unloading
782 * otherwise the number of references in the kernel to the module
783 */
784int module_refcount(struct module *mod)
774{ 785{
775 unsigned long incs = 0, decs = 0; 786 return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
776 int cpu;
777
778 for_each_possible_cpu(cpu)
779 decs += per_cpu_ptr(mod->refptr, cpu)->decs;
780 /*
781 * ensure the incs are added up after the decs.
782 * module_put ensures incs are visible before decs with smp_wmb.
783 *
784 * This 2-count scheme avoids the situation where the refcount
785 * for CPU0 is read, then CPU0 increments the module refcount,
786 * then CPU1 drops that refcount, then the refcount for CPU1 is
787 * read. We would record a decrement but not its corresponding
788 * increment so we would see a low count (disaster).
789 *
790 * Rare situation? But module_refcount can be preempted, and we
791 * might be tallying up 4096+ CPUs. So it is not impossible.
792 */
793 smp_rmb();
794 for_each_possible_cpu(cpu)
795 incs += per_cpu_ptr(mod->refptr, cpu)->incs;
796 return incs - decs;
797} 787}
798EXPORT_SYMBOL(module_refcount); 788EXPORT_SYMBOL(module_refcount);
799 789
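Note: the module.c hunks above replace the per-cpu incs/decs counters with a single atomic_t biased by MODULE_REF_BASE: try_module_get() uses atomic_inc_not_zero(), module_put() uses atomic_dec_if_positive(), and unload drops the base reference and restores it if users remain. A toy model of that biased-refcount lifecycle, not tied to the module loader (all names below are invented):

        #include <linux/atomic.h>
        #include <linux/bug.h>

        #define REF_BASE 1

        static atomic_t refcnt = ATOMIC_INIT(REF_BASE);

        static bool obj_get(void)
        {
                /* fails once the owner has dropped the base reference */
                return atomic_inc_not_zero(&refcnt) != 0;
        }

        static void obj_put(void)
        {
                WARN_ON(atomic_dec_if_positive(&refcnt) < 0);
        }

        static bool obj_try_release(void)
        {
                /* drop the base ref; if users remain, put it back and fail */
                if (atomic_sub_return(REF_BASE, &refcnt) == 0)
                        return true;
                atomic_add_unless(&refcnt, REF_BASE, 0);
                return false;
        }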
@@ -875,10 +865,12 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
875 struct module_use *use; 865 struct module_use *use;
876 int printed_something = 0; 866 int printed_something = 0;
877 867
878 seq_printf(m, " %lu ", module_refcount(mod)); 868 seq_printf(m, " %i ", module_refcount(mod));
879 869
880 /* Always include a trailing , so userspace can differentiate 870 /*
881 between this and the old multi-field proc format. */ 871 * Always include a trailing , so userspace can differentiate
872 * between this and the old multi-field proc format.
873 */
882 list_for_each_entry(use, &mod->source_list, source_list) { 874 list_for_each_entry(use, &mod->source_list, source_list) {
883 printed_something = 1; 875 printed_something = 1;
884 seq_printf(m, "%s,", use->source->name); 876 seq_printf(m, "%s,", use->source->name);
@@ -886,11 +878,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
886 878
887 if (mod->init != NULL && mod->exit == NULL) { 879 if (mod->init != NULL && mod->exit == NULL) {
888 printed_something = 1; 880 printed_something = 1;
889 seq_printf(m, "[permanent],"); 881 seq_puts(m, "[permanent],");
890 } 882 }
891 883
892 if (!printed_something) 884 if (!printed_something)
893 seq_printf(m, "-"); 885 seq_puts(m, "-");
894} 886}
895 887
896void __symbol_put(const char *symbol) 888void __symbol_put(const char *symbol)
@@ -925,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
925static ssize_t show_refcnt(struct module_attribute *mattr, 917static ssize_t show_refcnt(struct module_attribute *mattr,
926 struct module_kobject *mk, char *buffer) 918 struct module_kobject *mk, char *buffer)
927{ 919{
928 return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); 920 return sprintf(buffer, "%i\n", module_refcount(mk->mod));
929} 921}
930 922
931static struct module_attribute modinfo_refcnt = 923static struct module_attribute modinfo_refcnt =
@@ -935,7 +927,7 @@ void __module_get(struct module *module)
935{ 927{
936 if (module) { 928 if (module) {
937 preempt_disable(); 929 preempt_disable();
938 __this_cpu_inc(module->refptr->incs); 930 atomic_inc(&module->refcnt);
939 trace_module_get(module, _RET_IP_); 931 trace_module_get(module, _RET_IP_);
940 preempt_enable(); 932 preempt_enable();
941 } 933 }
@@ -948,11 +940,11 @@ bool try_module_get(struct module *module)
948 940
949 if (module) { 941 if (module) {
950 preempt_disable(); 942 preempt_disable();
951 943 /* Note: here, we can fail to get a reference */
952 if (likely(module_is_live(module))) { 944 if (likely(module_is_live(module) &&
953 __this_cpu_inc(module->refptr->incs); 945 atomic_inc_not_zero(&module->refcnt) != 0))
954 trace_module_get(module, _RET_IP_); 946 trace_module_get(module, _RET_IP_);
955 } else 947 else
956 ret = false; 948 ret = false;
957 949
958 preempt_enable(); 950 preempt_enable();
@@ -963,11 +955,12 @@ EXPORT_SYMBOL(try_module_get);
963 955
964void module_put(struct module *module) 956void module_put(struct module *module)
965{ 957{
958 int ret;
959
966 if (module) { 960 if (module) {
967 preempt_disable(); 961 preempt_disable();
968 smp_wmb(); /* see comment in module_refcount */ 962 ret = atomic_dec_if_positive(&module->refcnt);
969 __this_cpu_inc(module->refptr->decs); 963 WARN_ON(ret < 0); /* Failed to put refcount */
970
971 trace_module_put(module, _RET_IP_); 964 trace_module_put(module, _RET_IP_);
972 preempt_enable(); 965 preempt_enable();
973 } 966 }
@@ -978,7 +971,7 @@ EXPORT_SYMBOL(module_put);
978static inline void print_unload_info(struct seq_file *m, struct module *mod) 971static inline void print_unload_info(struct seq_file *m, struct module *mod)
979{ 972{
980 /* We don't know the usage count, or what modules are using. */ 973 /* We don't know the usage count, or what modules are using. */
981 seq_printf(m, " - -"); 974 seq_puts(m, " - -");
982} 975}
983 976
984static inline void module_unload_free(struct module *mod) 977static inline void module_unload_free(struct module *mod)
@@ -1131,7 +1124,7 @@ static unsigned long maybe_relocated(unsigned long crc,
1131static int check_version(Elf_Shdr *sechdrs, 1124static int check_version(Elf_Shdr *sechdrs,
1132 unsigned int versindex, 1125 unsigned int versindex,
1133 const char *symname, 1126 const char *symname,
1134 struct module *mod, 1127 struct module *mod,
1135 const unsigned long *crc, 1128 const unsigned long *crc,
1136 const struct module *crc_owner) 1129 const struct module *crc_owner)
1137{ 1130{
@@ -1165,7 +1158,7 @@ static int check_version(Elf_Shdr *sechdrs,
1165 return 0; 1158 return 0;
1166 1159
1167bad_version: 1160bad_version:
1168 printk("%s: disagrees about version of symbol %s\n", 1161 pr_warn("%s: disagrees about version of symbol %s\n",
1169 mod->name, symname); 1162 mod->name, symname);
1170 return 0; 1163 return 0;
1171} 1164}
@@ -1200,7 +1193,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1200static inline int check_version(Elf_Shdr *sechdrs, 1193static inline int check_version(Elf_Shdr *sechdrs,
1201 unsigned int versindex, 1194 unsigned int versindex,
1202 const char *symname, 1195 const char *symname,
1203 struct module *mod, 1196 struct module *mod,
1204 const unsigned long *crc, 1197 const unsigned long *crc,
1205 const struct module *crc_owner) 1198 const struct module *crc_owner)
1206{ 1199{
@@ -1288,15 +1281,13 @@ static inline bool sect_empty(const Elf_Shdr *sect)
1288 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; 1281 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1289} 1282}
1290 1283
1291struct module_sect_attr 1284struct module_sect_attr {
1292{
1293 struct module_attribute mattr; 1285 struct module_attribute mattr;
1294 char *name; 1286 char *name;
1295 unsigned long address; 1287 unsigned long address;
1296}; 1288};
1297 1289
1298struct module_sect_attrs 1290struct module_sect_attrs {
1299{
1300 struct attribute_group grp; 1291 struct attribute_group grp;
1301 unsigned int nsections; 1292 unsigned int nsections;
1302 struct module_sect_attr attrs[0]; 1293 struct module_sect_attr attrs[0];
@@ -1550,7 +1541,8 @@ static int module_add_modinfo_attrs(struct module *mod)
1550 (attr->test && attr->test(mod))) { 1541 (attr->test && attr->test(mod))) {
1551 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1542 memcpy(temp_attr, attr, sizeof(*temp_attr));
1552 sysfs_attr_init(&temp_attr->attr); 1543 sysfs_attr_init(&temp_attr->attr);
1553 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1544 error = sysfs_create_file(&mod->mkobj.kobj,
1545 &temp_attr->attr);
1554 ++temp_attr; 1546 ++temp_attr;
1555 } 1547 }
1556 } 1548 }
@@ -1566,7 +1558,7 @@ static void module_remove_modinfo_attrs(struct module *mod)
1566 /* pick a field to test for end of list */ 1558 /* pick a field to test for end of list */
1567 if (!attr->attr.name) 1559 if (!attr->attr.name)
1568 break; 1560 break;
1569 sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); 1561 sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
1570 if (attr->free) 1562 if (attr->free)
1571 attr->free(mod); 1563 attr->free(mod);
1572 } 1564 }
@@ -1697,18 +1689,6 @@ static void mod_sysfs_teardown(struct module *mod)
1697 mod_sysfs_fini(mod); 1689 mod_sysfs_fini(mod);
1698} 1690}
1699 1691
1700/*
1701 * unlink the module with the whole machine is stopped with interrupts off
1702 * - this defends against kallsyms not taking locks
1703 */
1704static int __unlink_module(void *_mod)
1705{
1706 struct module *mod = _mod;
1707 list_del(&mod->list);
1708 module_bug_cleanup(mod);
1709 return 0;
1710}
1711
1712#ifdef CONFIG_DEBUG_SET_MODULE_RONX 1692#ifdef CONFIG_DEBUG_SET_MODULE_RONX
1713/* 1693/*
1714 * LKM RO/NX protection: protect module's text/ro-data 1694 * LKM RO/NX protection: protect module's text/ro-data
@@ -1824,7 +1804,7 @@ static void unset_module_core_ro_nx(struct module *mod) { }
1824static void unset_module_init_ro_nx(struct module *mod) { } 1804static void unset_module_init_ro_nx(struct module *mod) { }
1825#endif 1805#endif
1826 1806
1827void __weak module_free(struct module *mod, void *module_region) 1807void __weak module_memfree(void *module_region)
1828{ 1808{
1829 vfree(module_region); 1809 vfree(module_region);
1830} 1810}
@@ -1833,6 +1813,10 @@ void __weak module_arch_cleanup(struct module *mod)
1833{ 1813{
1834} 1814}
1835 1815
1816void __weak module_arch_freeing_init(struct module *mod)
1817{
1818}
1819
1836/* Free a module, remove from lists, etc. */ 1820/* Free a module, remove from lists, etc. */
1837static void free_module(struct module *mod) 1821static void free_module(struct module *mod)
1838{ 1822{
@@ -1860,12 +1844,18 @@ static void free_module(struct module *mod)
1860 1844
1861 /* Now we can delete it from the lists */ 1845 /* Now we can delete it from the lists */
1862 mutex_lock(&module_mutex); 1846 mutex_lock(&module_mutex);
1863 stop_machine(__unlink_module, mod, NULL); 1847 /* Unlink carefully: kallsyms could be walking list. */
1848 list_del_rcu(&mod->list);
1849 /* Remove this module from bug list, this uses list_del_rcu */
1850 module_bug_cleanup(mod);
1851 /* Wait for RCU synchronizing before releasing mod->list and buglist. */
1852 synchronize_rcu();
1864 mutex_unlock(&module_mutex); 1853 mutex_unlock(&module_mutex);
1865 1854
1866 /* This may be NULL, but that's OK */ 1855 /* This may be NULL, but that's OK */
1867 unset_module_init_ro_nx(mod); 1856 unset_module_init_ro_nx(mod);
1868 module_free(mod, mod->module_init); 1857 module_arch_freeing_init(mod);
1858 module_memfree(mod->module_init);
1869 kfree(mod->args); 1859 kfree(mod->args);
1870 percpu_modfree(mod); 1860 percpu_modfree(mod);
1871 1861
@@ -1874,7 +1864,7 @@ static void free_module(struct module *mod)
1874 1864
1875 /* Finally, free the core (containing the module structure) */ 1865 /* Finally, free the core (containing the module structure) */
1876 unset_module_core_ro_nx(mod); 1866 unset_module_core_ro_nx(mod);
1877 module_free(mod, mod->module_core); 1867 module_memfree(mod->module_core);
1878 1868
1879#ifdef CONFIG_MPU 1869#ifdef CONFIG_MPU
1880 update_protections(current->mm); 1870 update_protections(current->mm);
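
The hunk above drops stop_machine() in favour of list_del_rcu() plus synchronize_rcu(): unpublish the module from the list first, wait out a grace period so kallsyms walkers that already saw it can finish, only then free. A crude, single-threaded userspace sketch of that unlink-then-wait-then-free ordering; the reader counter is a stand-in for a real grace-period mechanism, not how RCU is implemented:

/* Sketch only: illustrates the ordering, not real RCU. */
#include <stdatomic.h>
#include <stdlib.h>

struct node {
        struct node *next;
        int payload;
};

static struct node *_Atomic head;
static atomic_int readers;

static int reader_sum(void)
{
        int sum = 0;

        atomic_fetch_add(&readers, 1);          /* "rcu_read_lock()" */
        for (struct node *n = atomic_load(&head); n; n = n->next)
                sum += n->payload;
        atomic_fetch_sub(&readers, 1);          /* "rcu_read_unlock()" */
        return sum;
}

static void remove_and_free_head(void)
{
        struct node *victim = atomic_load(&head);

        if (!victim)
                return;
        /* 1. Unpublish: new readers can no longer reach the node. */
        atomic_store(&head, victim->next);
        /* 2. Wait for pre-existing readers ("synchronize_rcu"). */
        while (atomic_load(&readers) != 0)
                ;
        /* 3. Only now is the memory safe to free. */
        free(victim);
}

int main(void)
{
        struct node *n = malloc(sizeof(*n));

        n->payload = 42;
        n->next = NULL;
        atomic_store(&head, n);
        (void)reader_sum();
        remove_and_free_head();
        return 0;
}
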
@@ -1955,7 +1945,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1955 /* We compiled with -fno-common. These are not 1945 /* We compiled with -fno-common. These are not
1956 supposed to happen. */ 1946 supposed to happen. */
1957 pr_debug("Common symbol: %s\n", name); 1947 pr_debug("Common symbol: %s\n", name);
1958 printk("%s: please compile with -fno-common\n", 1948 pr_warn("%s: please compile with -fno-common\n",
1959 mod->name); 1949 mod->name);
1960 ret = -ENOEXEC; 1950 ret = -ENOEXEC;
1961 break; 1951 break;
@@ -2259,7 +2249,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
2259} 2249}
2260 2250
2261static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, 2251static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2262 unsigned int shnum) 2252 unsigned int shnum)
2263{ 2253{
2264 const Elf_Shdr *sec; 2254 const Elf_Shdr *sec;
2265 2255
@@ -2735,7 +2725,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
2735 * This shouldn't happen with same compiler and binutils 2725 * This shouldn't happen with same compiler and binutils
2736 * building all parts of the module. 2726 * building all parts of the module.
2737 */ 2727 */
2738 printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", 2728 pr_warn("%s: has both .ctors and .init_array.\n",
2739 mod->name); 2729 mod->name);
2740 return -EINVAL; 2730 return -EINVAL;
2741 } 2731 }
@@ -2809,7 +2799,7 @@ static int move_module(struct module *mod, struct load_info *info)
2809 */ 2799 */
2810 kmemleak_ignore(ptr); 2800 kmemleak_ignore(ptr);
2811 if (!ptr) { 2801 if (!ptr) {
2812 module_free(mod, mod->module_core); 2802 module_memfree(mod->module_core);
2813 return -ENOMEM; 2803 return -ENOMEM;
2814 } 2804 }
2815 memset(ptr, 0, mod->init_size); 2805 memset(ptr, 0, mod->init_size);
@@ -2954,8 +2944,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2954static void module_deallocate(struct module *mod, struct load_info *info) 2944static void module_deallocate(struct module *mod, struct load_info *info)
2955{ 2945{
2956 percpu_modfree(mod); 2946 percpu_modfree(mod);
2957 module_free(mod, mod->module_init); 2947 module_arch_freeing_init(mod);
2958 module_free(mod, mod->module_core); 2948 module_memfree(mod->module_init);
2949 module_memfree(mod->module_core);
2959} 2950}
2960 2951
2961int __weak module_finalize(const Elf_Ehdr *hdr, 2952int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3007,10 +2998,31 @@ static void do_mod_ctors(struct module *mod)
3007#endif 2998#endif
3008} 2999}
3009 3000
3001/* For freeing module_init on success, in case kallsyms traversing */
3002struct mod_initfree {
3003 struct rcu_head rcu;
3004 void *module_init;
3005};
3006
3007static void do_free_init(struct rcu_head *head)
3008{
3009 struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
3010 module_memfree(m->module_init);
3011 kfree(m);
3012}
3013
3010/* This is where the real work happens */ 3014/* This is where the real work happens */
3011static int do_init_module(struct module *mod) 3015static int do_init_module(struct module *mod)
3012{ 3016{
3013 int ret = 0; 3017 int ret = 0;
3018 struct mod_initfree *freeinit;
3019
3020 freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
3021 if (!freeinit) {
3022 ret = -ENOMEM;
3023 goto fail;
3024 }
3025 freeinit->module_init = mod->module_init;
3014 3026
3015 /* 3027 /*
3016 * We want to find out whether @mod uses async during init. Clear 3028 * We want to find out whether @mod uses async during init. Clear
@@ -3023,16 +3035,7 @@ static int do_init_module(struct module *mod)
3023 if (mod->init != NULL) 3035 if (mod->init != NULL)
3024 ret = do_one_initcall(mod->init); 3036 ret = do_one_initcall(mod->init);
3025 if (ret < 0) { 3037 if (ret < 0) {
3026 /* Init routine failed: abort. Try to protect us from 3038 goto fail_free_freeinit;
3027 buggy refcounters. */
3028 mod->state = MODULE_STATE_GOING;
3029 synchronize_sched();
3030 module_put(mod);
3031 blocking_notifier_call_chain(&module_notify_list,
3032 MODULE_STATE_GOING, mod);
3033 free_module(mod);
3034 wake_up_all(&module_wq);
3035 return ret;
3036 } 3039 }
3037 if (ret > 0) { 3040 if (ret > 0) {
3038 pr_warn("%s: '%s'->init suspiciously returned %d, it should " 3041 pr_warn("%s: '%s'->init suspiciously returned %d, it should "
@@ -3077,15 +3080,35 @@ static int do_init_module(struct module *mod)
3077 mod->strtab = mod->core_strtab; 3080 mod->strtab = mod->core_strtab;
3078#endif 3081#endif
3079 unset_module_init_ro_nx(mod); 3082 unset_module_init_ro_nx(mod);
3080 module_free(mod, mod->module_init); 3083 module_arch_freeing_init(mod);
3081 mod->module_init = NULL; 3084 mod->module_init = NULL;
3082 mod->init_size = 0; 3085 mod->init_size = 0;
3083 mod->init_ro_size = 0; 3086 mod->init_ro_size = 0;
3084 mod->init_text_size = 0; 3087 mod->init_text_size = 0;
3088 /*
3089 * We want to free module_init, but be aware that kallsyms may be
3090 * walking this with preempt disabled. In all the failure paths,
3091 * we call synchronize_rcu/synchronize_sched, but we don't want
3092 * to slow down the success path, so use actual RCU here.
3093 */
3094 call_rcu(&freeinit->rcu, do_free_init);
3085 mutex_unlock(&module_mutex); 3095 mutex_unlock(&module_mutex);
3086 wake_up_all(&module_wq); 3096 wake_up_all(&module_wq);
3087 3097
3088 return 0; 3098 return 0;
3099
3100fail_free_freeinit:
3101 kfree(freeinit);
3102fail:
3103 /* Try to protect us from buggy refcounters. */
3104 mod->state = MODULE_STATE_GOING;
3105 synchronize_sched();
3106 module_put(mod);
3107 blocking_notifier_call_chain(&module_notify_list,
3108 MODULE_STATE_GOING, mod);
3109 free_module(mod);
3110 wake_up_all(&module_wq);
3111 return ret;
3089} 3112}
3090 3113
3091static int may_init_module(void) 3114static int may_init_module(void)
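
On the success path above, the init region is handed to call_rcu() via a small wrapper, and do_free_init() recovers that wrapper from the embedded rcu_head with container_of(). A small userspace illustration of that recover-the-enclosing-struct pattern; container_of, callback_head and initfree here are simplified stand-ins, not the kernel definitions:

/* Sketch only: the container_of() idiom used by do_free_init(). */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct callback_head {
        void (*func)(struct callback_head *head);
};

struct initfree {
        struct callback_head head;      /* embedded, like struct rcu_head */
        void *init_region;
};

static void free_init_cb(struct callback_head *head)
{
        struct initfree *m = container_of(head, struct initfree, head);

        printf("freeing init region at %p\n", m->init_region);
}

int main(void)
{
        struct initfree m = {
                .head = { .func = free_init_cb },
                .init_region = (void *)0x1000,
        };

        m.head.func(&m.head);   /* stands in for the call_rcu() callback */
        return 0;
}
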
@@ -3202,7 +3225,7 @@ out:
3202 3225
3203static int unknown_module_param_cb(char *param, char *val, const char *modname) 3226static int unknown_module_param_cb(char *param, char *val, const char *modname)
3204{ 3227{
3205 /* Check for magic 'dyndbg' arg */ 3228 /* Check for magic 'dyndbg' arg */
3206 int ret = ddebug_dyndbg_module_param_cb(param, val, modname); 3229 int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
3207 if (ret != 0) 3230 if (ret != 0)
3208 pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); 3231 pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
@@ -3352,6 +3375,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
3352 /* Unlink carefully: kallsyms could be walking list. */ 3375 /* Unlink carefully: kallsyms could be walking list. */
3353 list_del_rcu(&mod->list); 3376 list_del_rcu(&mod->list);
3354 wake_up_all(&module_wq); 3377 wake_up_all(&module_wq);
3378 /* Wait for RCU synchronizing before releasing mod->list. */
3379 synchronize_rcu();
3355 mutex_unlock(&module_mutex); 3380 mutex_unlock(&module_mutex);
3356 free_module: 3381 free_module:
3357 module_deallocate(mod, info); 3382 module_deallocate(mod, info);
@@ -3685,8 +3710,8 @@ static int m_show(struct seq_file *m, void *p)
3685 3710
3686 /* Informative for users. */ 3711 /* Informative for users. */
3687 seq_printf(m, " %s", 3712 seq_printf(m, " %s",
3688 mod->state == MODULE_STATE_GOING ? "Unloading": 3713 mod->state == MODULE_STATE_GOING ? "Unloading" :
3689 mod->state == MODULE_STATE_COMING ? "Loading": 3714 mod->state == MODULE_STATE_COMING ? "Loading" :
3690 "Live"); 3715 "Live");
3691 /* Used by oprofile and other similar tools. */ 3716 /* Used by oprofile and other similar tools. */
3692 seq_printf(m, " 0x%pK", mod->module_core); 3717 seq_printf(m, " 0x%pK", mod->module_core);
@@ -3695,7 +3720,7 @@ static int m_show(struct seq_file *m, void *p)
3695 if (mod->taints) 3720 if (mod->taints)
3696 seq_printf(m, " %s", module_flags(mod, buf)); 3721 seq_printf(m, " %s", module_flags(mod, buf));
3697 3722
3698 seq_printf(m, "\n"); 3723 seq_puts(m, "\n");
3699 return 0; 3724 return 0;
3700} 3725}
3701 3726
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index ef42d0ab3115..49746c81ad8d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p)
220 220
221SYSCALL_DEFINE2(setns, int, fd, int, nstype) 221SYSCALL_DEFINE2(setns, int, fd, int, nstype)
222{ 222{
223 const struct proc_ns_operations *ops;
224 struct task_struct *tsk = current; 223 struct task_struct *tsk = current;
225 struct nsproxy *new_nsproxy; 224 struct nsproxy *new_nsproxy;
226 struct proc_ns *ei;
227 struct file *file; 225 struct file *file;
226 struct ns_common *ns;
228 int err; 227 int err;
229 228
230 file = proc_ns_fget(fd); 229 file = proc_ns_fget(fd);
@@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
232 return PTR_ERR(file); 231 return PTR_ERR(file);
233 232
234 err = -EINVAL; 233 err = -EINVAL;
235 ei = get_proc_ns(file_inode(file)); 234 ns = get_proc_ns(file_inode(file));
236 ops = ei->ns_ops; 235 if (nstype && (ns->ops->type != nstype))
237 if (nstype && (ops->type != nstype))
238 goto out; 236 goto out;
239 237
240 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); 238 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
@@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
243 goto out; 241 goto out;
244 } 242 }
245 243
246 err = ops->install(new_nsproxy, ei->ns); 244 err = ns->ops->install(new_nsproxy, ns);
247 if (err) { 245 if (err) {
248 free_nsproxy(new_nsproxy); 246 free_nsproxy(new_nsproxy);
249 goto out; 247 goto out;
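
The nsproxy.c hunks rework the kernel side of setns(2) around struct ns_common; nothing changes for callers. For reference, a minimal userspace sketch of the syscall this path serves, joining another task's pid namespace; the target path /proc/1/ns/pid is an example and the call needs privilege:

/* Sketch only: userspace view of setns(2). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/1/ns/pid", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* CLONE_NEWPID makes the kernel verify the fd really refers to
         * a pid namespace; passing 0 accepts any namespace type. */
        if (setns(fd, CLONE_NEWPID) < 0) {
                perror("setns");
                close(fd);
                return 1;
        }
        close(fd);
        /* Children forked from here start in the joined namespace. */
        return 0;
}
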
diff --git a/kernel/panic.c b/kernel/panic.c
index cf80672b7924..4d8d6f906dec 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,6 +33,7 @@ static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35static bool crash_kexec_post_notifiers; 35static bool crash_kexec_post_notifiers;
36int panic_on_warn __read_mostly;
36 37
37int panic_timeout = CONFIG_PANIC_TIMEOUT; 38int panic_timeout = CONFIG_PANIC_TIMEOUT;
38EXPORT_SYMBOL_GPL(panic_timeout); 39EXPORT_SYMBOL_GPL(panic_timeout);
@@ -428,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
428 if (args) 429 if (args)
429 vprintk(args->fmt, args->args); 430 vprintk(args->fmt, args->args);
430 431
432 if (panic_on_warn) {
433 /*
434 * This thread may hit another WARN() in the panic path.
435 * Resetting this prevents additional WARN() from panicking the
436 * system on this thread. Other threads are blocked by the
437 * panic_mutex in panic().
438 */
439 panic_on_warn = 0;
440 panic("panic_on_warn set ...\n");
441 }
442
431 print_modules(); 443 print_modules();
432 dump_stack(); 444 dump_stack();
433 print_oops_end_marker(); 445 print_oops_end_marker();
@@ -485,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
485 497
486core_param(panic, panic_timeout, int, 0644); 498core_param(panic, panic_timeout, int, 0644);
487core_param(pause_on_oops, pause_on_oops, int, 0644); 499core_param(pause_on_oops, pause_on_oops, int, 0644);
500core_param(panic_on_warn, panic_on_warn, int, 0644);
488 501
489static int __init setup_crash_kexec_post_notifiers(char *s) 502static int __init setup_crash_kexec_post_notifiers(char *s)
490{ 503{
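
panic_on_warn is exposed as a core_param, and warn_slowpath_common() clears it before calling panic() so a WARN hit on the panic path cannot re-enter. The same disarm-before-escalating idiom, reduced to a plain C sketch with made-up names:

/* Sketch only: one-shot flag cleared before taking the fatal action. */
#include <stdio.h>
#include <stdlib.h>

static int fatal_on_warn = 1;

static void warn(const char *msg)
{
        fprintf(stderr, "WARNING: %s\n", msg);
        if (fatal_on_warn) {
                fatal_on_warn = 0;      /* disarm before escalating */
                warn("nested warning on the fatal path");  /* harmless now */
                abort();
        }
}

int main(void)
{
        warn("first warning");          /* escalates exactly once */
        return 0;
}
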
diff --git a/kernel/params.c b/kernel/params.c
index db97b791390f..728e05b167de 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -603,74 +603,70 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
603 const struct kernel_param *kp, 603 const struct kernel_param *kp,
604 const char *name) 604 const char *name)
605{ 605{
606 struct module_param_attrs *new; 606 struct module_param_attrs *new_mp;
607 struct attribute **attrs; 607 struct attribute **new_attrs;
608 int err, num; 608 unsigned int i;
609 609
610 /* We don't bother calling this with invisible parameters. */ 610 /* We don't bother calling this with invisible parameters. */
611 BUG_ON(!kp->perm); 611 BUG_ON(!kp->perm);
612 612
613 if (!mk->mp) { 613 if (!mk->mp) {
614 num = 0; 614 /* First allocation. */
615 attrs = NULL; 615 mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL);
616 } else { 616 if (!mk->mp)
617 num = mk->mp->num; 617 return -ENOMEM;
618 attrs = mk->mp->grp.attrs; 618 mk->mp->grp.name = "parameters";
619 /* NULL-terminated attribute array. */
620 mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]),
621 GFP_KERNEL);
622 /* Caller will cleanup via free_module_param_attrs */
623 if (!mk->mp->grp.attrs)
624 return -ENOMEM;
619 } 625 }
620 626
621 /* Enlarge. */ 627 /* Enlarge allocations. */
622 new = krealloc(mk->mp, 628 new_mp = krealloc(mk->mp,
623 sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), 629 sizeof(*mk->mp) +
624 GFP_KERNEL); 630 sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
625 if (!new) { 631 GFP_KERNEL);
626 kfree(attrs); 632 if (!new_mp)
627 err = -ENOMEM; 633 return -ENOMEM;
628 goto fail; 634 mk->mp = new_mp;
629 }
630 /* Despite looking like the typical realloc() bug, this is safe.
631 * We *want* the old 'attrs' to be freed either way, and we'll store
632 * the new one in the success case. */
633 attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
634 if (!attrs) {
635 err = -ENOMEM;
636 goto fail_free_new;
637 }
638 635
639 /* Sysfs wants everything zeroed. */ 636 /* Extra pointer for NULL terminator */
640 memset(new, 0, sizeof(*new)); 637 new_attrs = krealloc(mk->mp->grp.attrs,
641 memset(&new->attrs[num], 0, sizeof(new->attrs[num])); 638 sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
642 memset(&attrs[num], 0, sizeof(attrs[num])); 639 GFP_KERNEL);
643 new->grp.name = "parameters"; 640 if (!new_attrs)
644 new->grp.attrs = attrs; 641 return -ENOMEM;
642 mk->mp->grp.attrs = new_attrs;
645 643
646 /* Tack new one on the end. */ 644 /* Tack new one on the end. */
647 sysfs_attr_init(&new->attrs[num].mattr.attr); 645 memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0]));
648 new->attrs[num].param = kp; 646 sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
649 new->attrs[num].mattr.show = param_attr_show; 647 mk->mp->attrs[mk->mp->num].param = kp;
650 new->attrs[num].mattr.store = param_attr_store; 648 mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
651 new->attrs[num].mattr.attr.name = (char *)name; 649 /* Do not allow runtime DAC changes to make param writable. */
652 new->attrs[num].mattr.attr.mode = kp->perm; 650 if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
653 new->num = num+1; 651 mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
652 else
653 mk->mp->attrs[mk->mp->num].mattr.store = NULL;
654 mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
655 mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
656 mk->mp->num++;
654 657
655 /* Fix up all the pointers, since krealloc can move us */ 658 /* Fix up all the pointers, since krealloc can move us */
656 for (num = 0; num < new->num; num++) 659 for (i = 0; i < mk->mp->num; i++)
657 new->grp.attrs[num] = &new->attrs[num].mattr.attr; 660 mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr;
658 new->grp.attrs[num] = NULL; 661 mk->mp->grp.attrs[mk->mp->num] = NULL;
659
660 mk->mp = new;
661 return 0; 662 return 0;
662
663fail_free_new:
664 kfree(new);
665fail:
666 mk->mp = NULL;
667 return err;
668} 663}
669 664
670#ifdef CONFIG_MODULES 665#ifdef CONFIG_MODULES
671static void free_module_param_attrs(struct module_kobject *mk) 666static void free_module_param_attrs(struct module_kobject *mk)
672{ 667{
673 kfree(mk->mp->grp.attrs); 668 if (mk->mp)
669 kfree(mk->mp->grp.attrs);
674 kfree(mk->mp); 670 kfree(mk->mp);
675 mk->mp = NULL; 671 mk->mp = NULL;
676} 672}
@@ -695,8 +691,10 @@ int module_param_sysfs_setup(struct module *mod,
695 if (kparam[i].perm == 0) 691 if (kparam[i].perm == 0)
696 continue; 692 continue;
697 err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); 693 err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
698 if (err) 694 if (err) {
695 free_module_param_attrs(&mod->mkobj);
699 return err; 696 return err;
697 }
700 params = true; 698 params = true;
701 } 699 }
702 700
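
The rewritten add_sysfs_param() commits each successful (re)allocation straight back into mk->mp, so a later failure leaves a consistent, partially grown structure that free_module_param_attrs() can tear down, and the attribute pointer array always keeps one spare slot for the NULL terminator sysfs expects. A small userspace sketch of that grow-and-stay-consistent pattern; strvec and its helpers are invented for the example:

/* Sketch only: NULL-terminated array grown with realloc(). */
#include <stdlib.h>
#include <string.h>

struct strvec {
        char **items;           /* NULL-terminated, like grp.attrs */
        size_t num;
};

static int strvec_push(struct strvec *v, const char *s)
{
        char **items;
        char *copy;

        /* One extra slot for the NULL terminator. */
        items = realloc(v->items, (v->num + 2) * sizeof(*items));
        if (!items)
                return -1;
        v->items = items;       /* commit before the next step */

        copy = strdup(s);
        if (!copy)
                return -1;      /* teardown still sees a valid state */

        v->items[v->num++] = copy;
        v->items[v->num] = NULL;
        return 0;
}

static void strvec_free(struct strvec *v)
{
        for (size_t i = 0; i < v->num; i++)
                free(v->items[i]);
        free(v->items);
        v->items = NULL;
        v->num = 0;
}

int main(void)
{
        struct strvec v = { NULL, 0 };

        if (strvec_push(&v, "parameters") == 0)
                strvec_push(&v, "attrs");
        strvec_free(&v);
        return 0;
}
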
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a26698144..cd36a5e0d173 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = {
79 .level = 0, 79 .level = 0,
80 .child_reaper = &init_task, 80 .child_reaper = &init_task,
81 .user_ns = &init_user_ns, 81 .user_ns = &init_user_ns,
82 .proc_inum = PROC_PID_INIT_INO, 82 .ns.inum = PROC_PID_INIT_INO,
83#ifdef CONFIG_PID_NS
84 .ns.ops = &pidns_operations,
85#endif
83}; 86};
84EXPORT_SYMBOL_GPL(init_pid_ns); 87EXPORT_SYMBOL_GPL(init_pid_ns);
85 88
@@ -341,6 +344,8 @@ out:
341 344
342out_unlock: 345out_unlock:
343 spin_unlock_irq(&pidmap_lock); 346 spin_unlock_irq(&pidmap_lock);
347 put_pid_ns(ns);
348
344out_free: 349out_free:
345 while (++i <= ns->level) 350 while (++i <= ns->level)
346 free_pidmap(pid->numbers + i); 351 free_pidmap(pid->numbers + i);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8eb761b..a65ba137fd15 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
105 if (ns->pid_cachep == NULL) 105 if (ns->pid_cachep == NULL)
106 goto out_free_map; 106 goto out_free_map;
107 107
108 err = proc_alloc_inum(&ns->proc_inum); 108 err = ns_alloc_inum(&ns->ns);
109 if (err) 109 if (err)
110 goto out_free_map; 110 goto out_free_map;
111 ns->ns.ops = &pidns_operations;
111 112
112 kref_init(&ns->kref); 113 kref_init(&ns->kref);
113 ns->level = level; 114 ns->level = level;
@@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
142{ 143{
143 int i; 144 int i;
144 145
145 proc_free_inum(ns->proc_inum); 146 ns_free_inum(&ns->ns);
146 for (i = 0; i < PIDMAP_ENTRIES; i++) 147 for (i = 0; i < PIDMAP_ENTRIES; i++)
147 kfree(ns->pidmap[i].page); 148 kfree(ns->pidmap[i].page);
148 put_user_ns(ns->user_ns); 149 put_user_ns(ns->user_ns);
@@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
190 /* Don't allow any more processes into the pid namespace */ 191 /* Don't allow any more processes into the pid namespace */
191 disable_pid_allocation(pid_ns); 192 disable_pid_allocation(pid_ns);
192 193
193 /* Ignore SIGCHLD causing any terminated children to autoreap */ 194 /*
195 * Ignore SIGCHLD causing any terminated children to autoreap.
196 * This speeds up the namespace shutdown, plus see the comment
197 * below.
198 */
194 spin_lock_irq(&me->sighand->siglock); 199 spin_lock_irq(&me->sighand->siglock);
195 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; 200 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
196 spin_unlock_irq(&me->sighand->siglock); 201 spin_unlock_irq(&me->sighand->siglock);
@@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
223 } 228 }
224 read_unlock(&tasklist_lock); 229 read_unlock(&tasklist_lock);
225 230
226 /* Firstly reap the EXIT_ZOMBIE children we may have. */ 231 /*
232 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
233 * sys_wait4() will also block until our children traced from the
234 * parent namespace are detached and become EXIT_DEAD.
235 */
227 do { 236 do {
228 clear_thread_flag(TIF_SIGPENDING); 237 clear_thread_flag(TIF_SIGPENDING);
229 rc = sys_wait4(-1, NULL, __WALL, NULL); 238 rc = sys_wait4(-1, NULL, __WALL, NULL);
230 } while (rc != -ECHILD); 239 } while (rc != -ECHILD);
231 240
232 /* 241 /*
233 * sys_wait4() above can't reap the TASK_DEAD children. 242 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
234 * Make sure they all go away, see free_pid(). 243 * really care, we could reparent them to the global init. We could
244 * exit and reap ->child_reaper even if it is not the last thread in
245 * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
246 * pid_ns can not go away until proc_kill_sb() drops the reference.
247 *
248 * But this ns can also have other tasks injected by setns()+fork().
249 * Again, ignoring the user visible semantics we do not really need
250 * to wait until they are all reaped, but they can be reparented to
251 * us and thus we need to ensure that pid->child_reaper stays valid
252 * until they all go away. See free_pid()->wake_up_process().
253 *
254 * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
255 * if reparented.
235 */ 256 */
236 for (;;) { 257 for (;;) {
237 set_current_state(TASK_UNINTERRUPTIBLE); 258 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
313 return 0; 334 return 0;
314} 335}
315 336
316static void *pidns_get(struct task_struct *task) 337static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
338{
339 return container_of(ns, struct pid_namespace, ns);
340}
341
342static struct ns_common *pidns_get(struct task_struct *task)
317{ 343{
318 struct pid_namespace *ns; 344 struct pid_namespace *ns;
319 345
@@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task)
323 get_pid_ns(ns); 349 get_pid_ns(ns);
324 rcu_read_unlock(); 350 rcu_read_unlock();
325 351
326 return ns; 352 return ns ? &ns->ns : NULL;
327} 353}
328 354
329static void pidns_put(void *ns) 355static void pidns_put(struct ns_common *ns)
330{ 356{
331 put_pid_ns(ns); 357 put_pid_ns(to_pid_ns(ns));
332} 358}
333 359
334static int pidns_install(struct nsproxy *nsproxy, void *ns) 360static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
335{ 361{
336 struct pid_namespace *active = task_active_pid_ns(current); 362 struct pid_namespace *active = task_active_pid_ns(current);
337 struct pid_namespace *ancestor, *new = ns; 363 struct pid_namespace *ancestor, *new = to_pid_ns(ns);
338 364
339 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || 365 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
340 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 366 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
362 return 0; 388 return 0;
363} 389}
364 390
365static unsigned int pidns_inum(void *ns)
366{
367 struct pid_namespace *pid_ns = ns;
368 return pid_ns->proc_inum;
369}
370
371const struct proc_ns_operations pidns_operations = { 391const struct proc_ns_operations pidns_operations = {
372 .name = "pid", 392 .name = "pid",
373 .type = CLONE_NEWPID, 393 .type = CLONE_NEWPID,
374 .get = pidns_get, 394 .get = pidns_get,
375 .put = pidns_put, 395 .put = pidns_put,
376 .install = pidns_install, 396 .install = pidns_install,
377 .inum = pidns_inum,
378}; 397};
379 398
380static __init int pid_namespaces_init(void) 399static __init int pid_namespaces_init(void)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index bbef57f5bdfd..48b28d387c7f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -94,6 +94,7 @@ config PM_STD_PARTITION
94config PM_SLEEP 94config PM_SLEEP
95 def_bool y 95 def_bool y
96 depends on SUSPEND || HIBERNATE_CALLBACKS 96 depends on SUSPEND || HIBERNATE_CALLBACKS
97 select PM
97 98
98config PM_SLEEP_SMP 99config PM_SLEEP_SMP
99 def_bool y 100 def_bool y
@@ -129,24 +130,19 @@ config PM_WAKELOCKS_GC
129 depends on PM_WAKELOCKS 130 depends on PM_WAKELOCKS
130 default y 131 default y
131 132
132config PM_RUNTIME 133config PM
133 bool "Run-time PM core functionality" 134 bool "Device power management core functionality"
134 depends on !IA64_HP_SIM
135 ---help--- 135 ---help---
136 Enable functionality allowing I/O devices to be put into energy-saving 136 Enable functionality allowing I/O devices to be put into energy-saving
137 (low power) states at run time (or autosuspended) after a specified 137 (low power) states, for example after a specified period of inactivity
138 period of inactivity and woken up in response to a hardware-generated 138 (autosuspended), and woken up in response to a hardware-generated
139 wake-up event or a driver's request. 139 wake-up event or a driver's request.
140 140
141 Hardware support is generally required for this functionality to work 141 Hardware support is generally required for this functionality to work
142 and the bus type drivers of the buses the devices are on are 142 and the bus type drivers of the buses the devices are on are
143 responsible for the actual handling of the autosuspend requests and 143 responsible for the actual handling of device suspend requests and
144 wake-up events. 144 wake-up events.
145 145
146config PM
147 def_bool y
148 depends on PM_SLEEP || PM_RUNTIME
149
150config PM_DEBUG 146config PM_DEBUG
151 bool "Power Management Debug Support" 147 bool "Power Management Debug Support"
152 depends on PM 148 depends on PM
@@ -298,14 +294,9 @@ config PM_GENERIC_DOMAINS_SLEEP
298 def_bool y 294 def_bool y
299 depends on PM_SLEEP && PM_GENERIC_DOMAINS 295 depends on PM_SLEEP && PM_GENERIC_DOMAINS
300 296
301config PM_GENERIC_DOMAINS_RUNTIME
302 def_bool y
303 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
304
305config PM_GENERIC_DOMAINS_OF 297config PM_GENERIC_DOMAINS_OF
306 def_bool y 298 def_bool y
307 depends on PM_GENERIC_DOMAINS && OF 299 depends on PM_GENERIC_DOMAINS && OF
308 300
309config CPU_PM 301config CPU_PM
310 bool 302 bool
311 depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1f35a3478f3c..2329daae5255 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,6 +28,7 @@
28#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/genhd.h> 30#include <linux/genhd.h>
31#include <linux/ktime.h>
31#include <trace/events/power.h> 32#include <trace/events/power.h>
32 33
33#include "power.h" 34#include "power.h"
@@ -232,20 +233,17 @@ static void platform_recover(int platform_mode)
232 * @nr_pages: Number of memory pages processed between @start and @stop. 233 * @nr_pages: Number of memory pages processed between @start and @stop.
233 * @msg: Additional diagnostic message to print. 234 * @msg: Additional diagnostic message to print.
234 */ 235 */
235void swsusp_show_speed(struct timeval *start, struct timeval *stop, 236void swsusp_show_speed(ktime_t start, ktime_t stop,
236 unsigned nr_pages, char *msg) 237 unsigned nr_pages, char *msg)
237{ 238{
239 ktime_t diff;
238 u64 elapsed_centisecs64; 240 u64 elapsed_centisecs64;
239 unsigned int centisecs; 241 unsigned int centisecs;
240 unsigned int k; 242 unsigned int k;
241 unsigned int kps; 243 unsigned int kps;
242 244
243 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); 245 diff = ktime_sub(stop, start);
244 /* 246 elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC);
245 * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
246 * it is obvious enough for what went wrong.
247 */
248 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
249 centisecs = elapsed_centisecs64; 247 centisecs = elapsed_centisecs64;
250 if (centisecs == 0) 248 if (centisecs == 0)
251 centisecs = 1; /* avoid div-by-zero */ 249 centisecs = 1; /* avoid div-by-zero */
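
swsusp_show_speed() now takes ktime_t values and gets elapsed centiseconds from ktime_sub()/ktime_divns() instead of timeval arithmetic. A userspace equivalent of that arithmetic using monotonic timestamps; the page count and 4K page size are example values:

/* Sketch only: centiseconds and throughput from two timestamps. */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC    1000000000LL
#define NSEC_PER_MSEC   1000000LL

static int64_t to_ns(const struct timespec *ts)
{
        return (int64_t)ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec;
}

int main(void)
{
        struct timespec start, stop;
        unsigned long nr_pages = 25000; /* example page count */
        int64_t centisecs;
        unsigned long k, kps;

        clock_gettime(CLOCK_MONOTONIC, &start);
        /* ... work being timed would run here ... */
        clock_gettime(CLOCK_MONOTONIC, &stop);

        /* ktime_sub() + ktime_divns(diff, 10 * NSEC_PER_MSEC) */
        centisecs = (to_ns(&stop) - to_ns(&start)) / (10 * NSEC_PER_MSEC);
        if (centisecs == 0)
                centisecs = 1;          /* avoid div-by-zero */

        k = nr_pages * (4096 / 1024);   /* pages -> KiB, assuming 4K pages */
        kps = (k * 100) / centisecs;
        printf("%lu kbytes in %lld.%02lld seconds (%lu.%02lu MB/s)\n",
               k, (long long)(centisecs / 100), (long long)(centisecs % 100),
               kps / 1000, (kps % 1000) / 10);
        return 0;
}
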
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2df883a9d3cb..ce9b8328a689 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain);
174 174
175struct timeval; 175struct timeval;
176/* kernel/power/swsusp.c */ 176/* kernel/power/swsusp.c */
177extern void swsusp_show_speed(struct timeval *, struct timeval *, 177extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
178 unsigned int, char *);
179 178
180#ifdef CONFIG_SUSPEND 179#ifdef CONFIG_SUSPEND
181/* kernel/power/suspend.c */ 180/* kernel/power/suspend.c */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 791a61892bb5..0c40c16174b4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -28,6 +28,7 @@
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/ktime.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -1576,11 +1577,11 @@ int hibernate_preallocate_memory(void)
1576 struct zone *zone; 1577 struct zone *zone;
1577 unsigned long saveable, size, max_size, count, highmem, pages = 0; 1578 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1578 unsigned long alloc, save_highmem, pages_highmem, avail_normal; 1579 unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1579 struct timeval start, stop; 1580 ktime_t start, stop;
1580 int error; 1581 int error;
1581 1582
1582 printk(KERN_INFO "PM: Preallocating image memory... "); 1583 printk(KERN_INFO "PM: Preallocating image memory... ");
1583 do_gettimeofday(&start); 1584 start = ktime_get();
1584 1585
1585 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); 1586 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1586 if (error) 1587 if (error)
@@ -1709,9 +1710,9 @@ int hibernate_preallocate_memory(void)
1709 free_unnecessary_pages(); 1710 free_unnecessary_pages();
1710 1711
1711 out: 1712 out:
1712 do_gettimeofday(&stop); 1713 stop = ktime_get();
1713 printk(KERN_CONT "done (allocated %lu pages)\n", pages); 1714 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1714 swsusp_show_speed(&start, &stop, pages, "Allocated"); 1715 swsusp_show_speed(start, stop, pages, "Allocated");
1715 1716
1716 return 0; 1717 return 0;
1717 1718
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index aaa3261dea5d..570aff817543 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,6 +30,7 @@
30#include <linux/atomic.h> 30#include <linux/atomic.h>
31#include <linux/kthread.h> 31#include <linux/kthread.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/ktime.h>
33 34
34#include "power.h" 35#include "power.h"
35 36
@@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle,
445 int nr_pages; 446 int nr_pages;
446 int err2; 447 int err2;
447 struct bio *bio; 448 struct bio *bio;
448 struct timeval start; 449 ktime_t start;
449 struct timeval stop; 450 ktime_t stop;
450 451
451 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", 452 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
452 nr_to_write); 453 nr_to_write);
@@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle,
455 m = 1; 456 m = 1;
456 nr_pages = 0; 457 nr_pages = 0;
457 bio = NULL; 458 bio = NULL;
458 do_gettimeofday(&start); 459 start = ktime_get();
459 while (1) { 460 while (1) {
460 ret = snapshot_read_next(snapshot); 461 ret = snapshot_read_next(snapshot);
461 if (ret <= 0) 462 if (ret <= 0)
@@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle,
469 nr_pages++; 470 nr_pages++;
470 } 471 }
471 err2 = hib_wait_on_bio_chain(&bio); 472 err2 = hib_wait_on_bio_chain(&bio);
472 do_gettimeofday(&stop); 473 stop = ktime_get();
473 if (!ret) 474 if (!ret)
474 ret = err2; 475 ret = err2;
475 if (!ret) 476 if (!ret)
476 printk(KERN_INFO "PM: Image saving done.\n"); 477 printk(KERN_INFO "PM: Image saving done.\n");
477 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 478 swsusp_show_speed(start, stop, nr_to_write, "Wrote");
478 return ret; 479 return ret;
479} 480}
480 481
@@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
580 int nr_pages; 581 int nr_pages;
581 int err2; 582 int err2;
582 struct bio *bio; 583 struct bio *bio;
583 struct timeval start; 584 ktime_t start;
584 struct timeval stop; 585 ktime_t stop;
585 size_t off; 586 size_t off;
586 unsigned thr, run_threads, nr_threads; 587 unsigned thr, run_threads, nr_threads;
587 unsigned char *page = NULL; 588 unsigned char *page = NULL;
@@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
674 m = 1; 675 m = 1;
675 nr_pages = 0; 676 nr_pages = 0;
676 bio = NULL; 677 bio = NULL;
677 do_gettimeofday(&start); 678 start = ktime_get();
678 for (;;) { 679 for (;;) {
679 for (thr = 0; thr < nr_threads; thr++) { 680 for (thr = 0; thr < nr_threads; thr++) {
680 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 681 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
@@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
759 760
760out_finish: 761out_finish:
761 err2 = hib_wait_on_bio_chain(&bio); 762 err2 = hib_wait_on_bio_chain(&bio);
762 do_gettimeofday(&stop); 763 stop = ktime_get();
763 if (!ret) 764 if (!ret)
764 ret = err2; 765 ret = err2;
765 if (!ret) 766 if (!ret)
766 printk(KERN_INFO "PM: Image saving done.\n"); 767 printk(KERN_INFO "PM: Image saving done.\n");
767 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 768 swsusp_show_speed(start, stop, nr_to_write, "Wrote");
768out_clean: 769out_clean:
769 if (crc) { 770 if (crc) {
770 if (crc->thr) 771 if (crc->thr)
@@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle,
965{ 966{
966 unsigned int m; 967 unsigned int m;
967 int ret = 0; 968 int ret = 0;
968 struct timeval start; 969 ktime_t start;
969 struct timeval stop; 970 ktime_t stop;
970 struct bio *bio; 971 struct bio *bio;
971 int err2; 972 int err2;
972 unsigned nr_pages; 973 unsigned nr_pages;
@@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle,
978 m = 1; 979 m = 1;
979 nr_pages = 0; 980 nr_pages = 0;
980 bio = NULL; 981 bio = NULL;
981 do_gettimeofday(&start); 982 start = ktime_get();
982 for ( ; ; ) { 983 for ( ; ; ) {
983 ret = snapshot_write_next(snapshot); 984 ret = snapshot_write_next(snapshot);
984 if (ret <= 0) 985 if (ret <= 0)
@@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle,
996 nr_pages++; 997 nr_pages++;
997 } 998 }
998 err2 = hib_wait_on_bio_chain(&bio); 999 err2 = hib_wait_on_bio_chain(&bio);
999 do_gettimeofday(&stop); 1000 stop = ktime_get();
1000 if (!ret) 1001 if (!ret)
1001 ret = err2; 1002 ret = err2;
1002 if (!ret) { 1003 if (!ret) {
@@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle,
1005 if (!snapshot_image_loaded(snapshot)) 1006 if (!snapshot_image_loaded(snapshot))
1006 ret = -ENODATA; 1007 ret = -ENODATA;
1007 } 1008 }
1008 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1009 swsusp_show_speed(start, stop, nr_to_read, "Read");
1009 return ret; 1010 return ret;
1010} 1011}
1011 1012
@@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
1067 int ret = 0; 1068 int ret = 0;
1068 int eof = 0; 1069 int eof = 0;
1069 struct bio *bio; 1070 struct bio *bio;
1070 struct timeval start; 1071 ktime_t start;
1071 struct timeval stop; 1072 ktime_t stop;
1072 unsigned nr_pages; 1073 unsigned nr_pages;
1073 size_t off; 1074 size_t off;
1074 unsigned i, thr, run_threads, nr_threads; 1075 unsigned i, thr, run_threads, nr_threads;
@@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
1190 m = 1; 1191 m = 1;
1191 nr_pages = 0; 1192 nr_pages = 0;
1192 bio = NULL; 1193 bio = NULL;
1193 do_gettimeofday(&start); 1194 start = ktime_get();
1194 1195
1195 ret = snapshot_write_next(snapshot); 1196 ret = snapshot_write_next(snapshot);
1196 if (ret <= 0) 1197 if (ret <= 0)
@@ -1343,7 +1344,7 @@ out_finish:
1343 wait_event(crc->done, atomic_read(&crc->stop)); 1344 wait_event(crc->done, atomic_read(&crc->stop));
1344 atomic_set(&crc->stop, 0); 1345 atomic_set(&crc->stop, 0);
1345 } 1346 }
1346 do_gettimeofday(&stop); 1347 stop = ktime_get();
1347 if (!ret) { 1348 if (!ret) {
1348 printk(KERN_INFO "PM: Image loading done.\n"); 1349 printk(KERN_INFO "PM: Image loading done.\n");
1349 snapshot_write_finalize(snapshot); 1350 snapshot_write_finalize(snapshot);
@@ -1359,7 +1360,7 @@ out_finish:
1359 } 1360 }
1360 } 1361 }
1361 } 1362 }
1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1363 swsusp_show_speed(start, stop, nr_to_read, "Read");
1363out_clean: 1364out_clean:
1364 for (i = 0; i < ring_size; i++) 1365 for (i = 0; i < ring_size; i++)
1365 free_page((unsigned long)page[i]); 1366 free_page((unsigned long)page[i]);
@@ -1374,7 +1375,7 @@ out_clean:
1374 kthread_stop(data[thr].thr); 1375 kthread_stop(data[thr].thr);
1375 vfree(data); 1376 vfree(data);
1376 } 1377 }
1377 if (page) vfree(page); 1378 vfree(page);
1378 1379
1379 return ret; 1380 return ret;
1380} 1381}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ced2b84b1cb7..02d6b6d28796 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -62,9 +62,6 @@ int console_printk[4] = {
62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ 62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
63}; 63};
64 64
65/* Deferred messaged from sched code are marked by this special level */
66#define SCHED_MESSAGE_LOGLEVEL -2
67
68/* 65/*
69 * Low level drivers may need that to know if they can schedule in 66 * Low level drivers may need that to know if they can schedule in
70 * their unblank() callback or not. So let's export it. 67 * their unblank() callback or not. So let's export it.
@@ -480,7 +477,7 @@ static int syslog_action_restricted(int type)
480 type != SYSLOG_ACTION_SIZE_BUFFER; 477 type != SYSLOG_ACTION_SIZE_BUFFER;
481} 478}
482 479
483static int check_syslog_permissions(int type, bool from_file) 480int check_syslog_permissions(int type, bool from_file)
484{ 481{
485 /* 482 /*
486 * If this is from /proc/kmsg and we've already opened it, then we've 483 * If this is from /proc/kmsg and we've already opened it, then we've
@@ -1259,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1259int do_syslog(int type, char __user *buf, int len, bool from_file) 1256int do_syslog(int type, char __user *buf, int len, bool from_file)
1260{ 1257{
1261 bool clear = false; 1258 bool clear = false;
1262 static int saved_console_loglevel = -1; 1259 static int saved_console_loglevel = LOGLEVEL_DEFAULT;
1263 int error; 1260 int error;
1264 1261
1265 error = check_syslog_permissions(type, from_file); 1262 error = check_syslog_permissions(type, from_file);
@@ -1316,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1316 break; 1313 break;
1317 /* Disable logging to console */ 1314 /* Disable logging to console */
1318 case SYSLOG_ACTION_CONSOLE_OFF: 1315 case SYSLOG_ACTION_CONSOLE_OFF:
1319 if (saved_console_loglevel == -1) 1316 if (saved_console_loglevel == LOGLEVEL_DEFAULT)
1320 saved_console_loglevel = console_loglevel; 1317 saved_console_loglevel = console_loglevel;
1321 console_loglevel = minimum_console_loglevel; 1318 console_loglevel = minimum_console_loglevel;
1322 break; 1319 break;
1323 /* Enable logging to console */ 1320 /* Enable logging to console */
1324 case SYSLOG_ACTION_CONSOLE_ON: 1321 case SYSLOG_ACTION_CONSOLE_ON:
1325 if (saved_console_loglevel != -1) { 1322 if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
1326 console_loglevel = saved_console_loglevel; 1323 console_loglevel = saved_console_loglevel;
1327 saved_console_loglevel = -1; 1324 saved_console_loglevel = LOGLEVEL_DEFAULT;
1328 } 1325 }
1329 break; 1326 break;
1330 /* Set level of messages printed to console */ 1327 /* Set level of messages printed to console */
@@ -1336,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1336 len = minimum_console_loglevel; 1333 len = minimum_console_loglevel;
1337 console_loglevel = len; 1334 console_loglevel = len;
1338 /* Implicitly re-enable logging to console */ 1335 /* Implicitly re-enable logging to console */
1339 saved_console_loglevel = -1; 1336 saved_console_loglevel = LOGLEVEL_DEFAULT;
1340 error = 0; 1337 error = 0;
1341 break; 1338 break;
1342 /* Number of chars in the log buffer */ 1339 /* Number of chars in the log buffer */
@@ -1627,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level,
1627 int printed_len = 0; 1624 int printed_len = 0;
1628 bool in_sched = false; 1625 bool in_sched = false;
1629 /* cpu currently holding logbuf_lock in this function */ 1626 /* cpu currently holding logbuf_lock in this function */
1630 static volatile unsigned int logbuf_cpu = UINT_MAX; 1627 static unsigned int logbuf_cpu = UINT_MAX;
1631 1628
1632 if (level == SCHED_MESSAGE_LOGLEVEL) { 1629 if (level == LOGLEVEL_SCHED) {
1633 level = -1; 1630 level = LOGLEVEL_DEFAULT;
1634 in_sched = true; 1631 in_sched = true;
1635 } 1632 }
1636 1633
@@ -1695,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level,
1695 const char *end_of_header = printk_skip_level(text); 1692 const char *end_of_header = printk_skip_level(text);
1696 switch (kern_level) { 1693 switch (kern_level) {
1697 case '0' ... '7': 1694 case '0' ... '7':
1698 if (level == -1) 1695 if (level == LOGLEVEL_DEFAULT)
1699 level = kern_level - '0'; 1696 level = kern_level - '0';
1697 /* fallthrough */
1700 case 'd': /* KERN_DEFAULT */ 1698 case 'd': /* KERN_DEFAULT */
1701 lflags |= LOG_PREFIX; 1699 lflags |= LOG_PREFIX;
1702 } 1700 }
@@ -1710,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1710 } 1708 }
1711 } 1709 }
1712 1710
1713 if (level == -1) 1711 if (level == LOGLEVEL_DEFAULT)
1714 level = default_message_loglevel; 1712 level = default_message_loglevel;
1715 1713
1716 if (dict) 1714 if (dict)
@@ -1788,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit);
1788 1786
1789asmlinkage int vprintk(const char *fmt, va_list args) 1787asmlinkage int vprintk(const char *fmt, va_list args)
1790{ 1788{
1791 return vprintk_emit(0, -1, NULL, 0, fmt, args); 1789 return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1792} 1790}
1793EXPORT_SYMBOL(vprintk); 1791EXPORT_SYMBOL(vprintk);
1794 1792
@@ -1807,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level,
1807} 1805}
1808EXPORT_SYMBOL(printk_emit); 1806EXPORT_SYMBOL(printk_emit);
1809 1807
1808int vprintk_default(const char *fmt, va_list args)
1809{
1810 int r;
1811
1812#ifdef CONFIG_KGDB_KDB
1813 if (unlikely(kdb_trap_printk)) {
1814 r = vkdb_printf(fmt, args);
1815 return r;
1816 }
1817#endif
1818 r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
1819
1820 return r;
1821}
1822EXPORT_SYMBOL_GPL(vprintk_default);
1823
1824/*
1825 * This allows printk to be diverted to another function per cpu.
1826 * This is useful for calling printk functions from within NMI
1827 * without worrying about race conditions that can lock up the
1828 * box.
1829 */
1830DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
1831
1810/** 1832/**
1811 * printk - print a kernel message 1833 * printk - print a kernel message
1812 * @fmt: format string 1834 * @fmt: format string
@@ -1830,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit);
1830 */ 1852 */
1831asmlinkage __visible int printk(const char *fmt, ...) 1853asmlinkage __visible int printk(const char *fmt, ...)
1832{ 1854{
1855 printk_func_t vprintk_func;
1833 va_list args; 1856 va_list args;
1834 int r; 1857 int r;
1835 1858
1836#ifdef CONFIG_KGDB_KDB
1837 if (unlikely(kdb_trap_printk)) {
1838 va_start(args, fmt);
1839 r = vkdb_printf(fmt, args);
1840 va_end(args);
1841 return r;
1842 }
1843#endif
1844 va_start(args, fmt); 1859 va_start(args, fmt);
1845 r = vprintk_emit(0, -1, NULL, 0, fmt, args); 1860
1861 /*
1862 * If a caller overrides the per_cpu printk_func, then it needs
1863 * to disable preemption when calling printk(). Otherwise
1864 * the printk_func should be set to the default. No need to
1865 * disable preemption here.
1866 */
1867 vprintk_func = this_cpu_read(printk_func);
1868 r = vprintk_func(fmt, args);
1869
1846 va_end(args); 1870 va_end(args);
1847 1871
1848 return r; 1872 return r;
@@ -1876,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
1876 bool syslog, char *buf, size_t size) { return 0; } 1900 bool syslog, char *buf, size_t size) { return 0; }
1877static size_t cont_print_text(char *text, size_t size) { return 0; } 1901static size_t cont_print_text(char *text, size_t size) { return 0; }
1878 1902
1903/* Still needs to be defined for users */
1904DEFINE_PER_CPU(printk_func_t, printk_func);
1905
1879#endif /* CONFIG_PRINTK */ 1906#endif /* CONFIG_PRINTK */
1880 1907
1881#ifdef CONFIG_EARLY_PRINTK 1908#ifdef CONFIG_EARLY_PRINTK
1882struct console *early_console; 1909struct console *early_console;
1883 1910
1884void early_vprintk(const char *fmt, va_list ap)
1885{
1886 if (early_console) {
1887 char buf[512];
1888 int n = vscnprintf(buf, sizeof(buf), fmt, ap);
1889
1890 early_console->write(early_console, buf, n);
1891 }
1892}
1893
1894asmlinkage __visible void early_printk(const char *fmt, ...) 1911asmlinkage __visible void early_printk(const char *fmt, ...)
1895{ 1912{
1896 va_list ap; 1913 va_list ap;
1914 char buf[512];
1915 int n;
1916
1917 if (!early_console)
1918 return;
1897 1919
1898 va_start(ap, fmt); 1920 va_start(ap, fmt);
1899 early_vprintk(fmt, ap); 1921 n = vscnprintf(buf, sizeof(buf), fmt, ap);
1900 va_end(ap); 1922 va_end(ap);
1923
1924 early_console->write(early_console, buf, n);
1901} 1925}
1902#endif 1926#endif
1903 1927
@@ -2634,7 +2658,7 @@ int printk_deferred(const char *fmt, ...)
2634 2658
2635 preempt_disable(); 2659 preempt_disable();
2636 va_start(args, fmt); 2660 va_start(args, fmt);
2637 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); 2661 r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
2638 va_end(args); 2662 va_end(args);
2639 2663
2640 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); 2664 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
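
printk() now dispatches through a per-CPU function pointer that defaults to vprintk_default(), so special contexts (kdb, NMI) can temporarily divert output. A rough userspace analogue using a thread-local function pointer that normally points at vprintf; this is the shape of the indirection only, not the kernel API:

/* Sketch only: dispatch through a swappable per-context pointer. */
#include <stdarg.h>
#include <stdio.h>

typedef int (*printk_func_t)(const char *fmt, va_list args);

static int vprint_default(const char *fmt, va_list args)
{
        return vprintf(fmt, args);
}

static _Thread_local printk_func_t printk_func = vprint_default;

static int print(const char *fmt, ...)
{
        va_list args;
        int r;

        va_start(args, fmt);
        r = printk_func(fmt, args);     /* dispatch via the pointer */
        va_end(args);
        return r;
}

static int vprint_stderr(const char *fmt, va_list args)
{
        return vfprintf(stderr, fmt, args);
}

int main(void)
{
        print("goes to the default sink\n");
        printk_func = vprint_stderr;    /* divert for a special context */
        print("goes to the diverted sink\n");
        printk_func = vprint_default;
        return 0;
}
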
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54e75226c2c4..1eb9d90c3af9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
485 485
486/* 486/*
487 * Detach all tasks we were using ptrace on. Called with tasklist held 487 * Detach all tasks we were using ptrace on. Called with tasklist held
488 * for writing, and returns with it held too. But note it can release 488 * for writing.
489 * and reacquire the lock.
490 */ 489 */
491void exit_ptrace(struct task_struct *tracer) 490void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
492 __releases(&tasklist_lock)
493 __acquires(&tasklist_lock)
494{ 491{
495 struct task_struct *p, *n; 492 struct task_struct *p, *n;
496 LIST_HEAD(ptrace_dead);
497
498 if (likely(list_empty(&tracer->ptraced)))
499 return;
500 493
501 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 494 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
502 if (unlikely(p->ptrace & PT_EXITKILL)) 495 if (unlikely(p->ptrace & PT_EXITKILL))
503 send_sig_info(SIGKILL, SEND_SIG_FORCED, p); 496 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
504 497
505 if (__ptrace_detach(tracer, p)) 498 if (__ptrace_detach(tracer, p))
506 list_add(&p->ptrace_entry, &ptrace_dead); 499 list_add(&p->ptrace_entry, dead);
507 }
508
509 write_unlock_irq(&tasklist_lock);
510 BUG_ON(!list_empty(&tracer->ptraced));
511
512 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
513 list_del_init(&p->ptrace_entry);
514 release_task(p);
515 } 500 }
516
517 write_lock_irq(&tasklist_lock);
518} 501}
519 502
520int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 503int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
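
exit_ptrace() no longer drops and retakes tasklist_lock itself; it only moves detached tracees onto a caller-supplied list, and the caller does the release_task() work after unlocking. A simplified sketch of that collect-under-lock, release-after-unlock pattern with an invented tracee list:

/* Sketch only: gather under the lock, release outside it. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct tracee {
        struct tracee *next;
        int id;
};

static pthread_mutex_t tasklist_lock = PTHREAD_MUTEX_INITIALIZER;
static struct tracee *traced;           /* protected by tasklist_lock */

static void detach_all(struct tracee **dead)
{
        /* Called with tasklist_lock held: list manipulation only. */
        while (traced) {
                struct tracee *p = traced;

                traced = p->next;
                p->next = *dead;
                *dead = p;
        }
}

int main(void)
{
        struct tracee *dead = NULL;

        for (int i = 0; i < 3; i++) {
                struct tracee *p = malloc(sizeof(*p));

                p->id = i;
                p->next = traced;
                traced = p;
        }

        pthread_mutex_lock(&tasklist_lock);
        detach_all(&dead);
        pthread_mutex_unlock(&tasklist_lock);

        /* The expensive release (release_task() in the kernel) runs
         * with the lock already dropped. */
        while (dead) {
                struct tracee *p = dead;

                dead = p->next;
                printf("releasing tracee %d\n", p->id);
                free(p);
        }
        return 0;
}
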
diff --git a/kernel/range.c b/kernel/range.c
index 322ea8e93e4b..82cfc285b046 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -113,12 +113,12 @@ static int cmp_range(const void *x1, const void *x2)
113{ 113{
114 const struct range *r1 = x1; 114 const struct range *r1 = x1;
115 const struct range *r2 = x2; 115 const struct range *r2 = x2;
116 s64 start1, start2;
117 116
118 start1 = r1->start; 117 if (r1->start < r2->start)
119 start2 = r2->start; 118 return -1;
120 119 if (r1->start > r2->start)
121 return start1 - start2; 120 return 1;
121 return 0;
122} 122}
123 123
124int clean_sort_range(struct range *range, int az) 124int clean_sort_range(struct range *range, int az)
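
The old cmp_range() returned the s64 difference of the two starts as an int, so the result could be truncated and report the wrong order for starts more than 2^31 apart; the replacement does an explicit three-way compare. The same bug and fix in a standalone userspace comparator; the test values are chosen to trigger the truncation:

/* Sketch only: difference-as-int misorders distant 64-bit values. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int cmp_u64_broken(const void *x1, const void *x2)
{
        const uint64_t *a = x1, *b = x2;

        return (int)(*a - *b);          /* truncates: may flip the sign */
}

static int cmp_u64(const void *x1, const void *x2)
{
        const uint64_t *a = x1, *b = x2;

        if (*a < *b)
                return -1;
        if (*a > *b)
                return 1;
        return 0;
}

int main(void)
{
        uint64_t v[] = { 0x100000000ULL, 1 };   /* differ by 2^32 - 1 */

        printf("broken: %d, correct: %d\n",
               cmp_u64_broken(&v[0], &v[1]), cmp_u64(&v[0], &v[1]));
        qsort(v, 2, sizeof(v[0]), cmp_u64);
        printf("sorted: %llu %llu\n",
               (unsigned long long)v[0], (unsigned long long)v[1]);
        return 0;
}
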
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
deleted file mode 100644
index e791130f85a7..000000000000
--- a/kernel/res_counter.c
+++ /dev/null
@@ -1,211 +0,0 @@
1/*
2 * resource cgroups
3 *
4 * Copyright 2007 OpenVZ SWsoft Inc
5 *
6 * Author: Pavel Emelianov <xemul@openvz.org>
7 *
8 */
9
10#include <linux/types.h>
11#include <linux/parser.h>
12#include <linux/fs.h>
13#include <linux/res_counter.h>
14#include <linux/uaccess.h>
15#include <linux/mm.h>
16
17void res_counter_init(struct res_counter *counter, struct res_counter *parent)
18{
19 spin_lock_init(&counter->lock);
20 counter->limit = RES_COUNTER_MAX;
21 counter->soft_limit = RES_COUNTER_MAX;
22 counter->parent = parent;
23}
24
25static u64 res_counter_uncharge_locked(struct res_counter *counter,
26 unsigned long val)
27{
28 if (WARN_ON(counter->usage < val))
29 val = counter->usage;
30
31 counter->usage -= val;
32 return counter->usage;
33}
34
35static int res_counter_charge_locked(struct res_counter *counter,
36 unsigned long val, bool force)
37{
38 int ret = 0;
39
40 if (counter->usage + val > counter->limit) {
41 counter->failcnt++;
42 ret = -ENOMEM;
43 if (!force)
44 return ret;
45 }
46
47 counter->usage += val;
48 if (counter->usage > counter->max_usage)
49 counter->max_usage = counter->usage;
50 return ret;
51}
52
53static int __res_counter_charge(struct res_counter *counter, unsigned long val,
54 struct res_counter **limit_fail_at, bool force)
55{
56 int ret, r;
57 unsigned long flags;
58 struct res_counter *c, *u;
59
60 r = ret = 0;
61 *limit_fail_at = NULL;
62 local_irq_save(flags);
63 for (c = counter; c != NULL; c = c->parent) {
64 spin_lock(&c->lock);
65 r = res_counter_charge_locked(c, val, force);
66 spin_unlock(&c->lock);
67 if (r < 0 && !ret) {
68 ret = r;
69 *limit_fail_at = c;
70 if (!force)
71 break;
72 }
73 }
74
75 if (ret < 0 && !force) {
76 for (u = counter; u != c; u = u->parent) {
77 spin_lock(&u->lock);
78 res_counter_uncharge_locked(u, val);
79 spin_unlock(&u->lock);
80 }
81 }
82 local_irq_restore(flags);
83
84 return ret;
85}
86
87int res_counter_charge(struct res_counter *counter, unsigned long val,
88 struct res_counter **limit_fail_at)
89{
90 return __res_counter_charge(counter, val, limit_fail_at, false);
91}
92
93int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
94 struct res_counter **limit_fail_at)
95{
96 return __res_counter_charge(counter, val, limit_fail_at, true);
97}
98
99u64 res_counter_uncharge_until(struct res_counter *counter,
100 struct res_counter *top,
101 unsigned long val)
102{
103 unsigned long flags;
104 struct res_counter *c;
105 u64 ret = 0;
106
107 local_irq_save(flags);
108 for (c = counter; c != top; c = c->parent) {
109 u64 r;
110 spin_lock(&c->lock);
111 r = res_counter_uncharge_locked(c, val);
112 if (c == counter)
113 ret = r;
114 spin_unlock(&c->lock);
115 }
116 local_irq_restore(flags);
117 return ret;
118}
119
120u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
121{
122 return res_counter_uncharge_until(counter, NULL, val);
123}
124
125static inline unsigned long long *
126res_counter_member(struct res_counter *counter, int member)
127{
128 switch (member) {
129 case RES_USAGE:
130 return &counter->usage;
131 case RES_MAX_USAGE:
132 return &counter->max_usage;
133 case RES_LIMIT:
134 return &counter->limit;
135 case RES_FAILCNT:
136 return &counter->failcnt;
137 case RES_SOFT_LIMIT:
138 return &counter->soft_limit;
139 };
140
141 BUG();
142 return NULL;
143}
144
145ssize_t res_counter_read(struct res_counter *counter, int member,
146 const char __user *userbuf, size_t nbytes, loff_t *pos,
147 int (*read_strategy)(unsigned long long val, char *st_buf))
148{
149 unsigned long long *val;
150 char buf[64], *s;
151
152 s = buf;
153 val = res_counter_member(counter, member);
154 if (read_strategy)
155 s += read_strategy(*val, s);
156 else
157 s += sprintf(s, "%llu\n", *val);
158 return simple_read_from_buffer((void __user *)userbuf, nbytes,
159 pos, buf, s - buf);
160}
161
162#if BITS_PER_LONG == 32
163u64 res_counter_read_u64(struct res_counter *counter, int member)
164{
165 unsigned long flags;
166 u64 ret;
167
168 spin_lock_irqsave(&counter->lock, flags);
169 ret = *res_counter_member(counter, member);
170 spin_unlock_irqrestore(&counter->lock, flags);
171
172 return ret;
173}
174#else
175u64 res_counter_read_u64(struct res_counter *counter, int member)
176{
177 return *res_counter_member(counter, member);
178}
179#endif
180
181int res_counter_memparse_write_strategy(const char *buf,
182 unsigned long long *resp)
183{
184 char *end;
185 unsigned long long res;
186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') {
189 int rc = kstrtoull(buf + 1, 10, &res);
190
191 if (rc)
192 return rc;
193 if (res != 1)
194 return -EINVAL;
195 *resp = RES_COUNTER_MAX;
196 return 0;
197 }
198
199 res = memparse(buf, &end);
200 if (*end != '\0')
201 return -EINVAL;
202
203 if (PAGE_ALIGN(res) >= res)
204 res = PAGE_ALIGN(res);
205 else
206 res = RES_COUNTER_MAX;
207
208 *resp = res;
209
210 return 0;
211}
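
The deleted res_counter code above implemented hierarchical accounting: a charge walks up the parent chain, and when an ancestor is over its limit the levels already charged are unwound. A stripped-down, single-threaded user-space sketch of that charge-with-rollback scheme — the spinlocks, the force/nofail variant and the kernel types are left out, and every name below is illustrative rather than a kernel API:

#include <stdint.h>
#include <stdio.h>

struct counter {
        uint64_t usage, limit, failcnt;
        struct counter *parent;
};

static int charge(struct counter *c, uint64_t val, struct counter **fail_at)
{
        struct counter *p, *u;

        *fail_at = NULL;
        for (p = c; p; p = p->parent) {
                if (p->usage + val > p->limit) {
                        p->failcnt++;
                        *fail_at = p;
                        /* roll back the levels already charged */
                        for (u = c; u != p; u = u->parent)
                                u->usage -= val;
                        return -1;
                }
                p->usage += val;
        }
        return 0;
}

int main(void)
{
        struct counter root  = { .limit = 100 };
        struct counter child = { .limit = 1000, .parent = &root };
        struct counter *fail = NULL;
        int ret;

        ret = charge(&child, 80, &fail);        /* fits at every level */
        printf("first charge:  %d\n", ret);

        ret = charge(&child, 80, &fail);        /* root would exceed 100 */
        printf("second charge: %d, failed at %s\n",
               ret, fail == &root ? "root" : "child");
        printf("child usage after rollback: %llu\n",
               (unsigned long long)child.usage);
        return 0;
}
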
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bb398c0c5f08..c0accc00566e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4527,8 +4527,10 @@ void sched_show_task(struct task_struct *p)
4527#ifdef CONFIG_DEBUG_STACK_USAGE 4527#ifdef CONFIG_DEBUG_STACK_USAGE
4528 free = stack_not_used(p); 4528 free = stack_not_used(p);
4529#endif 4529#endif
4530 ppid = 0;
4530 rcu_read_lock(); 4531 rcu_read_lock();
4531 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4532 if (pid_alive(p))
4533 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4532 rcu_read_unlock(); 4534 rcu_read_unlock();
4533 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4535 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4534 task_pid_nr(p), ppid, 4536 task_pid_nr(p), ppid,
@@ -7111,9 +7113,6 @@ void __init sched_init(void)
7111#ifdef CONFIG_RT_GROUP_SCHED 7113#ifdef CONFIG_RT_GROUP_SCHED
7112 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7114 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7113#endif 7115#endif
7114#ifdef CONFIG_CPUMASK_OFFSTACK
7115 alloc_size += num_possible_cpus() * cpumask_size();
7116#endif
7117 if (alloc_size) { 7116 if (alloc_size) {
7118 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7117 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7119 7118
@@ -7133,13 +7132,13 @@ void __init sched_init(void)
7133 ptr += nr_cpu_ids * sizeof(void **); 7132 ptr += nr_cpu_ids * sizeof(void **);
7134 7133
7135#endif /* CONFIG_RT_GROUP_SCHED */ 7134#endif /* CONFIG_RT_GROUP_SCHED */
7135 }
7136#ifdef CONFIG_CPUMASK_OFFSTACK 7136#ifdef CONFIG_CPUMASK_OFFSTACK
7137 for_each_possible_cpu(i) { 7137 for_each_possible_cpu(i) {
7138 per_cpu(load_balance_mask, i) = (void *)ptr; 7138 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
7139 ptr += cpumask_size(); 7139 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7140 }
7141#endif /* CONFIG_CPUMASK_OFFSTACK */
7142 } 7140 }
7141#endif /* CONFIG_CPUMASK_OFFSTACK */
7143 7142
7144 init_rt_bandwidth(&def_rt_bandwidth, 7143 init_rt_bandwidth(&def_rt_bandwidth,
7145 global_rt_period(), global_rt_runtime()); 7144 global_rt_period(), global_rt_runtime());
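
Two independent fixes in the sched/core.c hunks above: sched_show_task() now reads p->real_parent only when pid_alive(p) confirms the task's pid structures still exist (ppid falls back to 0 otherwise), and the per-CPU load_balance_mask buffers are no longer carved out of the single boot-time block but allocated individually with kzalloc_node(), so each mask sits on its CPU's own NUMA node. A small user-space contrast of the two allocation styles, with plain calloc() standing in for the kernel allocators and made-up sizes:

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS    4
#define MASK_BYTES 128                  /* stand-in for cpumask_size() */

int main(void)
{
        unsigned char *mask[NR_CPUS];

        /* old style: one big allocation, per-CPU pointers carved out of it */
        unsigned char *block = calloc(NR_CPUS, MASK_BYTES);
        if (!block)
                return 1;
        for (int i = 0; i < NR_CPUS; i++)
                mask[i] = block + i * MASK_BYTES;
        printf("carved:  cpu0=%p cpu1=%p (fixed stride)\n",
               (void *)mask[0], (void *)mask[1]);
        free(block);

        /* new style: one allocation per CPU; in the kernel each one can be
         * placed on cpu_to_node(i) */
        for (int i = 0; i < NR_CPUS; i++) {
                mask[i] = calloc(1, MASK_BYTES);
                if (!mask[i])
                        return 1;
        }
        printf("per-cpu: cpu0=%p cpu1=%p (independent)\n",
               (void *)mask[0], (void *)mask[1]);
        for (int i = 0; i < NR_CPUS; i++)
                free(mask[i]);
        return 0;
}
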
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e5db8c6feebd..b52092f2636d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -570,24 +570,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
570static 570static
571int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) 571int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
572{ 572{
573 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); 573 return (dl_se->runtime <= 0);
574 int rorun = dl_se->runtime <= 0;
575
576 if (!rorun && !dmiss)
577 return 0;
578
579 /*
580 * If we are beyond our current deadline and we are still
581 * executing, then we have already used some of the runtime of
582 * the next instance. Thus, if we do not account that, we are
583 * stealing bandwidth from the system at each deadline miss!
584 */
585 if (dmiss) {
586 dl_se->runtime = rorun ? dl_se->runtime : 0;
587 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
588 }
589
590 return 1;
591} 574}
592 575
593extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); 576extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
@@ -826,10 +809,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
826 * parameters of the task might need updating. Otherwise, 809 * parameters of the task might need updating. Otherwise,
827 * we want a replenishment of its runtime. 810 * we want a replenishment of its runtime.
828 */ 811 */
829 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) 812 if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
830 replenish_dl_entity(dl_se, pi_se);
831 else
832 update_dl_entity(dl_se, pi_se); 813 update_dl_entity(dl_se, pi_se);
814 else if (flags & ENQUEUE_REPLENISH)
815 replenish_dl_entity(dl_se, pi_se);
833 816
834 __enqueue_dl_entity(dl_se); 817 __enqueue_dl_entity(dl_se);
835} 818}
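
With this change dl_runtime_exceeded() only reports an exhausted budget; pushing the deadline forward and refilling the runtime is left to the replenishment path, which enqueue_dl_entity() now selects through the ENQUEUE_WAKEUP/ENQUEUE_REPLENISH flags. A toy user-space sketch of that split — the numbers are invented, and the clock handling, priority-inheritance parameters and overflow checks of the real code are omitted:

#include <stdbool.h>
#include <stdio.h>

struct dl_entity {
        long long runtime, deadline;            /* current budget, absolute deadline */
        long long dl_runtime, dl_period;        /* reservation parameters */
};

static bool dl_runtime_exceeded(const struct dl_entity *dl)
{
        return dl->runtime <= 0;        /* the whole check after the patch */
}

/* replenish: grant dl_runtime per dl_period until the budget is positive */
static void replenish(struct dl_entity *dl)
{
        while (dl->runtime <= 0) {
                dl->deadline += dl->dl_period;
                dl->runtime  += dl->dl_runtime;
        }
}

int main(void)
{
        struct dl_entity dl = {
                .runtime = -5, .deadline = 100,
                .dl_runtime = 30, .dl_period = 100,
        };

        if (dl_runtime_exceeded(&dl))
                replenish(&dl);
        printf("runtime=%lld deadline=%lld\n", dl.runtime, dl.deadline);
        return 0;
}
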
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df2cdf77f899..40667cbf371b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4005,6 +4005,10 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
4005 4005
4006static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 4006static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4007{ 4007{
4008 /* init_cfs_bandwidth() was not called */
4009 if (!cfs_b->throttled_cfs_rq.next)
4010 return;
4011
4008 hrtimer_cancel(&cfs_b->period_timer); 4012 hrtimer_cancel(&cfs_b->period_timer);
4009 hrtimer_cancel(&cfs_b->slack_timer); 4013 hrtimer_cancel(&cfs_b->slack_timer);
4010} 4014}
@@ -4424,7 +4428,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4424 * wl = S * s'_i; see (2) 4428 * wl = S * s'_i; see (2)
4425 */ 4429 */
4426 if (W > 0 && w < W) 4430 if (W > 0 && w < W)
4427 wl = (w * tg->shares) / W; 4431 wl = (w * (long)tg->shares) / W;
4428 else 4432 else
4429 wl = tg->shares; 4433 wl = tg->shares;
4430 4434
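
Besides the guard for an uninitialised cfs_bandwidth timer, the effective_load() hunk adds a (long) cast because tg->shares is an unsigned long: without it a negative weight delta w is converted to unsigned, and the quotient becomes a huge positive number instead of a proportionally negative load. A self-contained demonstration (assuming an LP64 ABI; the values are arbitrary):

#include <stdio.h>

int main(void)
{
        long w = -512;                  /* weight being removed */
        unsigned long shares = 1024;    /* like tg->shares */
        long W = 4096;

        long bad  = (w * shares) / W;          /* arithmetic goes unsigned */
        long good = (w * (long)shares) / W;    /* signed, as in the patch */

        printf("without cast: %ld\n", bad);    /* huge positive value */
        printf("with cast:    %ld\n", good);   /* -128, as intended */
        return 0;
}
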
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 00fe55cc5a82..b6e4c16377c7 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
25} 25}
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28int snprint_stack_trace(char *buf, size_t size,
29 struct stack_trace *trace, int spaces)
30{
31 int i;
32 unsigned long ip;
33 int generated;
34 int total = 0;
35
36 if (WARN_ON(!trace->entries))
37 return 0;
38
39 for (i = 0; i < trace->nr_entries; i++) {
40 ip = trace->entries[i];
41 generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
42 1 + spaces, ' ', (void *) ip, (void *) ip);
43
44 total += generated;
45
46 /* Assume that generated isn't a negative number */
47 if (generated >= size) {
48 buf += size;
49 size = 0;
50 } else {
51 buf += generated;
52 size -= generated;
53 }
54 }
55
56 return total;
57}
58EXPORT_SYMBOL_GPL(snprint_stack_trace);
59
28/* 60/*
29 * Architectures that do not implement save_stack_trace_tsk or 61 * Architectures that do not implement save_stack_trace_tsk or
30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning 62 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
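
The new snprint_stack_trace() relies on the usual snprintf() contract: the return value is the length the output would have needed, so the loop can keep accumulating a total even after the buffer runs out, while advancing buf/size only by what actually fits. A self-contained analogue of that accumulation loop, printing made-up items instead of stack entries:

#include <stdio.h>

static int snprint_items(char *buf, size_t size,
                         const char *const *items, int n)
{
        int total = 0;

        for (int i = 0; i < n; i++) {
                int generated = snprintf(buf, size, " [%d] %s\n", i, items[i]);

                total += generated;
                if ((size_t)generated >= size) {        /* output truncated */
                        buf += size;
                        size = 0;
                } else {
                        buf += generated;
                        size -= generated;
                }
        }
        return total;   /* length the full output needs, like snprintf() */
}

int main(void)
{
        const char *items[] = { "alpha", "beta", "gamma" };
        char small[24];
        int needed = snprint_items(small, sizeof(small), items, 3);

        printf("buffer holds \"%s\"\nfull output needs %d bytes\n",
               small, needed);
        return 0;
}
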
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 02aa4185b17e..5adcb0ae3a58 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -169,6 +169,8 @@ cond_syscall(ppc_rtas);
169cond_syscall(sys_spu_run); 169cond_syscall(sys_spu_run);
170cond_syscall(sys_spu_create); 170cond_syscall(sys_spu_create);
171cond_syscall(sys_subpage_prot); 171cond_syscall(sys_subpage_prot);
172cond_syscall(sys_s390_pci_mmio_read);
173cond_syscall(sys_s390_pci_mmio_write);
172 174
173/* mmu depending weak syscall entries */ 175/* mmu depending weak syscall entries */
174cond_syscall(sys_mprotect); 176cond_syscall(sys_mprotect);
@@ -224,3 +226,6 @@ cond_syscall(sys_seccomp);
224 226
225/* access BPF programs and maps */ 227/* access BPF programs and maps */
226cond_syscall(sys_bpf); 228cond_syscall(sys_bpf);
229
230/* execveat */
231cond_syscall(sys_execveat);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 15f2511a1b7c..137c7f69b264 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -623,6 +623,13 @@ static struct ctl_table kern_table[] = {
623 .mode = 0644, 623 .mode = 0644,
624 .proc_handler = proc_dointvec, 624 .proc_handler = proc_dointvec,
625 }, 625 },
626 {
627 .procname = "tracepoint_printk",
628 .data = &tracepoint_printk,
629 .maxlen = sizeof(tracepoint_printk),
630 .mode = 0644,
631 .proc_handler = proc_dointvec,
632 },
626#endif 633#endif
627#ifdef CONFIG_KEXEC 634#ifdef CONFIG_KEXEC
628 { 635 {
@@ -1104,6 +1111,15 @@ static struct ctl_table kern_table[] = {
1104 .proc_handler = proc_dointvec, 1111 .proc_handler = proc_dointvec,
1105 }, 1112 },
1106#endif 1113#endif
1114 {
1115 .procname = "panic_on_warn",
1116 .data = &panic_on_warn,
1117 .maxlen = sizeof(int),
1118 .mode = 0644,
1119 .proc_handler = proc_dointvec_minmax,
1120 .extra1 = &zero,
1121 .extra2 = &one,
1122 },
1107 { } 1123 { }
1108}; 1124};
1109 1125
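
The two new entries appear as /proc/sys/kernel/tracepoint_printk and /proc/sys/kernel/panic_on_warn; using proc_dointvec_minmax with extra1/extra2 pointing at zero and one means panic_on_warn accepts only 0 or 1. A trivial user-space read of the new knob — it simply reports an error if the file is absent, e.g. on a kernel without this patch:

#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/kernel/panic_on_warn";
        char val[16] = "";
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);           /* older kernel or no procfs */
                return 1;
        }
        if (fgets(val, sizeof(val), f))
                printf("panic_on_warn = %s", val);
        fclose(f);
        return 0;
}
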
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 9a4f750a2963..7e7746a42a62 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, 137 { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, 138 { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, 139 { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
140 { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
140 {} 141 {}
141}; 142};
142 143
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b312fcc73024..670fff88a961 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
459 stats = nla_data(na); 459 stats = nla_data(na);
460 memset(stats, 0, sizeof(*stats)); 460 memset(stats, 0, sizeof(*stats));
461 461
462 rc = cgroupstats_build(stats, f.file->f_dentry); 462 rc = cgroupstats_build(stats, f.file->f_path.dentry);
463 if (rc < 0) { 463 if (rc < 0) {
464 nlmsg_free(rep_skb); 464 nlmsg_free(rep_skb);
465 goto err; 465 goto err;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2e949cc9c9f1..b79f39bda7e1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
792 /* Initialize mult/shift and max_idle_ns */ 792 /* Initialize mult/shift and max_idle_ns */
793 __clocksource_updatefreq_scale(cs, scale, freq); 793 __clocksource_updatefreq_scale(cs, scale, freq);
794 794
795 /* Add clocksource to the clcoksource list */ 795 /* Add clocksource to the clocksource list */
796 mutex_lock(&clocksource_mutex); 796 mutex_lock(&clocksource_mutex);
797 clocksource_enqueue(cs); 797 clocksource_enqueue(cs);
798 clocksource_enqueue_watchdog(cs); 798 clocksource_enqueue_watchdog(cs);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ff3ec34702e8..1363d58f07e9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -235,7 +235,7 @@ void tick_nohz_full_kick(void)
235 if (!tick_nohz_full_cpu(smp_processor_id())) 235 if (!tick_nohz_full_cpu(smp_processor_id()))
236 return; 236 return;
237 237
238 irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); 238 irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
239} 239}
240 240
241/* 241/*
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 887e7d505974..2c85b7724af4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -745,6 +745,7 @@ u64 nsecs_to_jiffies64(u64 n)
745 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); 745 return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
746#endif 746#endif
747} 747}
748EXPORT_SYMBOL(nsecs_to_jiffies64);
748 749
749/** 750/**
750 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 751 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 67d6369ddf83..979ccde26720 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_TRACEPOINTS) += power-traces.o 57obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM_RUNTIME),y) 58ifeq ($(CONFIG_PM),y)
59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o 59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
60endif 60endif
61ifeq ($(CONFIG_TRACING),y) 61ifeq ($(CONFIG_TRACING),y)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c1bd4ada2a04..483cecfa5c17 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent,
1142 r->sector_from = be64_to_cpu(sector_from); 1142 r->sector_from = be64_to_cpu(sector_from);
1143} 1143}
1144 1144
1145typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); 1145typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1146 1146
1147static int blk_log_action_classic(struct trace_iterator *iter, const char *act) 1147static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
1148{ 1148{
1149 char rwbs[RWBS_LEN]; 1149 char rwbs[RWBS_LEN];
1150 unsigned long long ts = iter->ts; 1150 unsigned long long ts = iter->ts;
@@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1154 1154
1155 fill_rwbs(rwbs, t); 1155 fill_rwbs(rwbs, t);
1156 1156
1157 return trace_seq_printf(&iter->seq, 1157 trace_seq_printf(&iter->seq,
1158 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", 1158 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1159 MAJOR(t->device), MINOR(t->device), iter->cpu, 1159 MAJOR(t->device), MINOR(t->device), iter->cpu,
1160 secs, nsec_rem, iter->ent->pid, act, rwbs); 1160 secs, nsec_rem, iter->ent->pid, act, rwbs);
1161} 1161}
1162 1162
1163static int blk_log_action(struct trace_iterator *iter, const char *act) 1163static void blk_log_action(struct trace_iterator *iter, const char *act)
1164{ 1164{
1165 char rwbs[RWBS_LEN]; 1165 char rwbs[RWBS_LEN];
1166 const struct blk_io_trace *t = te_blk_io_trace(iter->ent); 1166 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1167 1167
1168 fill_rwbs(rwbs, t); 1168 fill_rwbs(rwbs, t);
1169 return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", 1169 trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1170 MAJOR(t->device), MINOR(t->device), act, rwbs); 1170 MAJOR(t->device), MINOR(t->device), act, rwbs);
1171} 1171}
1172 1172
1173static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) 1173static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1174{ 1174{
1175 const unsigned char *pdu_buf; 1175 const unsigned char *pdu_buf;
1176 int pdu_len; 1176 int pdu_len;
1177 int i, end, ret; 1177 int i, end;
1178 1178
1179 pdu_buf = pdu_start(ent); 1179 pdu_buf = pdu_start(ent);
1180 pdu_len = te_blk_io_trace(ent)->pdu_len; 1180 pdu_len = te_blk_io_trace(ent)->pdu_len;
1181 1181
1182 if (!pdu_len) 1182 if (!pdu_len)
1183 return 1; 1183 return;
1184 1184
1185 /* find the last zero that needs to be printed */ 1185 /* find the last zero that needs to be printed */
1186 for (end = pdu_len - 1; end >= 0; end--) 1186 for (end = pdu_len - 1; end >= 0; end--)
@@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1188 break; 1188 break;
1189 end++; 1189 end++;
1190 1190
1191 if (!trace_seq_putc(s, '(')) 1191 trace_seq_putc(s, '(');
1192 return 0;
1193 1192
1194 for (i = 0; i < pdu_len; i++) { 1193 for (i = 0; i < pdu_len; i++) {
1195 1194
1196 ret = trace_seq_printf(s, "%s%02x", 1195 trace_seq_printf(s, "%s%02x",
1197 i == 0 ? "" : " ", pdu_buf[i]); 1196 i == 0 ? "" : " ", pdu_buf[i]);
1198 if (!ret)
1199 return ret;
1200 1197
1201 /* 1198 /*
1202 * stop when the rest is just zeroes and indicate so 1199 * stop when the rest is just zeroes and indicate so
1203 * with a ".." appended 1200 * with a ".." appended
1204 */ 1201 */
1205 if (i == end && end != pdu_len - 1) 1202 if (i == end && end != pdu_len - 1) {
1206 return trace_seq_puts(s, " ..) "); 1203 trace_seq_puts(s, " ..) ");
1204 return;
1205 }
1207 } 1206 }
1208 1207
1209 return trace_seq_puts(s, ") "); 1208 trace_seq_puts(s, ") ");
1210} 1209}
1211 1210
1212static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) 1211static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1213{ 1212{
1214 char cmd[TASK_COMM_LEN]; 1213 char cmd[TASK_COMM_LEN];
1215 1214
1216 trace_find_cmdline(ent->pid, cmd); 1215 trace_find_cmdline(ent->pid, cmd);
1217 1216
1218 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1217 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1219 int ret; 1218 trace_seq_printf(s, "%u ", t_bytes(ent));
1220 1219 blk_log_dump_pdu(s, ent);
1221 ret = trace_seq_printf(s, "%u ", t_bytes(ent)); 1220 trace_seq_printf(s, "[%s]\n", cmd);
1222 if (!ret)
1223 return 0;
1224 ret = blk_log_dump_pdu(s, ent);
1225 if (!ret)
1226 return 0;
1227 return trace_seq_printf(s, "[%s]\n", cmd);
1228 } else { 1221 } else {
1229 if (t_sec(ent)) 1222 if (t_sec(ent))
1230 return trace_seq_printf(s, "%llu + %u [%s]\n", 1223 trace_seq_printf(s, "%llu + %u [%s]\n",
1231 t_sector(ent), t_sec(ent), cmd); 1224 t_sector(ent), t_sec(ent), cmd);
1232 return trace_seq_printf(s, "[%s]\n", cmd); 1225 else
1226 trace_seq_printf(s, "[%s]\n", cmd);
1233 } 1227 }
1234} 1228}
1235 1229
1236static int blk_log_with_error(struct trace_seq *s, 1230static void blk_log_with_error(struct trace_seq *s,
1237 const struct trace_entry *ent) 1231 const struct trace_entry *ent)
1238{ 1232{
1239 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1233 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1240 int ret; 1234 blk_log_dump_pdu(s, ent);
1241 1235 trace_seq_printf(s, "[%d]\n", t_error(ent));
1242 ret = blk_log_dump_pdu(s, ent);
1243 if (ret)
1244 return trace_seq_printf(s, "[%d]\n", t_error(ent));
1245 return 0;
1246 } else { 1236 } else {
1247 if (t_sec(ent)) 1237 if (t_sec(ent))
1248 return trace_seq_printf(s, "%llu + %u [%d]\n", 1238 trace_seq_printf(s, "%llu + %u [%d]\n",
1249 t_sector(ent), 1239 t_sector(ent),
1250 t_sec(ent), t_error(ent)); 1240 t_sec(ent), t_error(ent));
1251 return trace_seq_printf(s, "%llu [%d]\n", 1241 else
1252 t_sector(ent), t_error(ent)); 1242 trace_seq_printf(s, "%llu [%d]\n",
1243 t_sector(ent), t_error(ent));
1253 } 1244 }
1254} 1245}
1255 1246
1256static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) 1247static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1257{ 1248{
1258 struct blk_io_trace_remap r = { .device_from = 0, }; 1249 struct blk_io_trace_remap r = { .device_from = 0, };
1259 1250
1260 get_pdu_remap(ent, &r); 1251 get_pdu_remap(ent, &r);
1261 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", 1252 trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1262 t_sector(ent), t_sec(ent), 1253 t_sector(ent), t_sec(ent),
1263 MAJOR(r.device_from), MINOR(r.device_from), 1254 MAJOR(r.device_from), MINOR(r.device_from),
1264 (unsigned long long)r.sector_from); 1255 (unsigned long long)r.sector_from);
1265} 1256}
1266 1257
1267static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) 1258static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1268{ 1259{
1269 char cmd[TASK_COMM_LEN]; 1260 char cmd[TASK_COMM_LEN];
1270 1261
1271 trace_find_cmdline(ent->pid, cmd); 1262 trace_find_cmdline(ent->pid, cmd);
1272 1263
1273 return trace_seq_printf(s, "[%s]\n", cmd); 1264 trace_seq_printf(s, "[%s]\n", cmd);
1274} 1265}
1275 1266
1276static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) 1267static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1277{ 1268{
1278 char cmd[TASK_COMM_LEN]; 1269 char cmd[TASK_COMM_LEN];
1279 1270
1280 trace_find_cmdline(ent->pid, cmd); 1271 trace_find_cmdline(ent->pid, cmd);
1281 1272
1282 return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); 1273 trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
1283} 1274}
1284 1275
1285static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) 1276static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1286{ 1277{
1287 char cmd[TASK_COMM_LEN]; 1278 char cmd[TASK_COMM_LEN];
1288 1279
1289 trace_find_cmdline(ent->pid, cmd); 1280 trace_find_cmdline(ent->pid, cmd);
1290 1281
1291 return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), 1282 trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1292 get_pdu_int(ent), cmd); 1283 get_pdu_int(ent), cmd);
1293} 1284}
1294 1285
1295static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) 1286static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
1296{ 1287{
1297 int ret;
1298 const struct blk_io_trace *t = te_blk_io_trace(ent); 1288 const struct blk_io_trace *t = te_blk_io_trace(ent);
1299 1289
1300 ret = trace_seq_putmem(s, t + 1, t->pdu_len); 1290 trace_seq_putmem(s, t + 1, t->pdu_len);
1301 if (ret) 1291 trace_seq_putc(s, '\n');
1302 return trace_seq_putc(s, '\n');
1303 return ret;
1304} 1292}
1305 1293
1306/* 1294/*
@@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr)
1339 1327
1340static const struct { 1328static const struct {
1341 const char *act[2]; 1329 const char *act[2];
1342 int (*print)(struct trace_seq *s, const struct trace_entry *ent); 1330 void (*print)(struct trace_seq *s, const struct trace_entry *ent);
1343} what2act[] = { 1331} what2act[] = {
1344 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, 1332 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
1345 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, 1333 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1364 struct trace_seq *s = &iter->seq; 1352 struct trace_seq *s = &iter->seq;
1365 const struct blk_io_trace *t; 1353 const struct blk_io_trace *t;
1366 u16 what; 1354 u16 what;
1367 int ret;
1368 bool long_act; 1355 bool long_act;
1369 blk_log_action_t *log_action; 1356 blk_log_action_t *log_action;
1370 1357
@@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1374 log_action = classic ? &blk_log_action_classic : &blk_log_action; 1361 log_action = classic ? &blk_log_action_classic : &blk_log_action;
1375 1362
1376 if (t->action == BLK_TN_MESSAGE) { 1363 if (t->action == BLK_TN_MESSAGE) {
1377 ret = log_action(iter, long_act ? "message" : "m"); 1364 log_action(iter, long_act ? "message" : "m");
1378 if (ret) 1365 blk_log_msg(s, iter->ent);
1379 ret = blk_log_msg(s, iter->ent);
1380 goto out;
1381 } 1366 }
1382 1367
1383 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) 1368 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1384 ret = trace_seq_printf(s, "Unknown action %x\n", what); 1369 trace_seq_printf(s, "Unknown action %x\n", what);
1385 else { 1370 else {
1386 ret = log_action(iter, what2act[what].act[long_act]); 1371 log_action(iter, what2act[what].act[long_act]);
1387 if (ret) 1372 what2act[what].print(s, iter->ent);
1388 ret = what2act[what].print(s, iter->ent);
1389 } 1373 }
1390out: 1374
1391 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1375 return trace_handle_return(s);
1392} 1376}
1393 1377
1394static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1378static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
@@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1397 return print_one_line(iter, false); 1381 return print_one_line(iter, false);
1398} 1382}
1399 1383
1400static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) 1384static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1401{ 1385{
1402 struct trace_seq *s = &iter->seq; 1386 struct trace_seq *s = &iter->seq;
1403 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; 1387 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
@@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1407 .time = iter->ts, 1391 .time = iter->ts,
1408 }; 1392 };
1409 1393
1410 if (!trace_seq_putmem(s, &old, offset)) 1394 trace_seq_putmem(s, &old, offset);
1411 return 0; 1395 trace_seq_putmem(s, &t->sector,
1412 return trace_seq_putmem(s, &t->sector, 1396 sizeof(old) - offset + t->pdu_len);
1413 sizeof(old) - offset + t->pdu_len);
1414} 1397}
1415 1398
1416static enum print_line_t 1399static enum print_line_t
1417blk_trace_event_print_binary(struct trace_iterator *iter, int flags, 1400blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1418 struct trace_event *event) 1401 struct trace_event *event)
1419{ 1402{
1420 return blk_trace_synthesize_old_trace(iter) ? 1403 blk_trace_synthesize_old_trace(iter);
1421 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1404
1405 return trace_handle_return(&iter->seq);
1422} 1406}
1423 1407
1424static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) 1408static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
@@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
1493 if (atomic_dec_and_test(&blk_probes_ref)) 1477 if (atomic_dec_and_test(&blk_probes_ref))
1494 blk_unregister_tracepoints(); 1478 blk_unregister_tracepoints();
1495 1479
1496 spin_lock_irq(&running_trace_lock);
1497 list_del(&bt->running_list);
1498 spin_unlock_irq(&running_trace_lock);
1499 blk_trace_free(bt); 1480 blk_trace_free(bt);
1500 return 0; 1481 return 0;
1501} 1482}
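
The blktrace output path above is converted wholesale from the old int-returning trace_seq helpers to the void API: callers just append, the sequence records overflow internally, and the status is checked once at the end via trace_seq_has_overflowed() (wrapped by trace_handle_return()). A self-contained user-space analogue of that model, with a tiny seq type standing in for struct trace_seq:

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

struct seq {
        char buf[64];
        size_t len;
        bool overflow;
};

static void seq_printf(struct seq *s, const char *fmt, ...)
{
        va_list ap;
        int n;

        if (s->overflow)
                return;
        va_start(ap, fmt);
        n = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
        va_end(ap);
        if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len) {
                s->overflow = true;             /* remember, don't report per call */
                s->buf[s->len] = '\0';          /* drop the truncated tail */
        } else {
                s->len += n;
        }
}

static bool seq_has_overflowed(const struct seq *s)
{
        return s->overflow;
}

int main(void)
{
        struct seq s = { .len = 0 };

        for (int i = 0; i < 20; i++)
                seq_printf(&s, "%3d,%-3d ", i, i * 2);  /* no return checks */

        printf("%s\n%s\n", s.buf,
               seq_has_overflowed(&s) ? "PARTIAL_LINE" : "HANDLED");
        return 0;
}
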
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 31c90fec4158..224e768bdc73 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -387,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
387 return ret; 387 return ret;
388} 388}
389 389
390static void ftrace_update_trampoline(struct ftrace_ops *ops);
391
390static int __register_ftrace_function(struct ftrace_ops *ops) 392static int __register_ftrace_function(struct ftrace_ops *ops)
391{ 393{
392 if (ops->flags & FTRACE_OPS_FL_DELETED) 394 if (ops->flags & FTRACE_OPS_FL_DELETED)
@@ -416,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
416 if (control_ops_alloc(ops)) 418 if (control_ops_alloc(ops))
417 return -ENOMEM; 419 return -ENOMEM;
418 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); 420 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
421 /* The control_ops needs the trampoline update */
422 ops = &control_ops;
419 } else 423 } else
420 add_ftrace_ops(&ftrace_ops_list, ops); 424 add_ftrace_ops(&ftrace_ops_list, ops);
421 425
426 ftrace_update_trampoline(ops);
427
422 if (ftrace_enabled) 428 if (ftrace_enabled)
423 update_ftrace_function(); 429 update_ftrace_function();
424 430
@@ -565,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2)
565static int function_stat_headers(struct seq_file *m) 571static int function_stat_headers(struct seq_file *m)
566{ 572{
567#ifdef CONFIG_FUNCTION_GRAPH_TRACER 573#ifdef CONFIG_FUNCTION_GRAPH_TRACER
568 seq_printf(m, " Function " 574 seq_puts(m, " Function "
569 "Hit Time Avg s^2\n" 575 "Hit Time Avg s^2\n"
570 " -------- " 576 " -------- "
571 "--- ---- --- ---\n"); 577 "--- ---- --- ---\n");
572#else 578#else
573 seq_printf(m, " Function Hit\n" 579 seq_puts(m, " Function Hit\n"
574 " -------- ---\n"); 580 " -------- ---\n");
575#endif 581#endif
576 return 0; 582 return 0;
577} 583}
@@ -598,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v)
598 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 604 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
599 605
600#ifdef CONFIG_FUNCTION_GRAPH_TRACER 606#ifdef CONFIG_FUNCTION_GRAPH_TRACER
601 seq_printf(m, " "); 607 seq_puts(m, " ");
602 avg = rec->time; 608 avg = rec->time;
603 do_div(avg, rec->counter); 609 do_div(avg, rec->counter);
604 610
@@ -1111,6 +1117,43 @@ static struct ftrace_ops global_ops = {
1111 FTRACE_OPS_FL_INITIALIZED, 1117 FTRACE_OPS_FL_INITIALIZED,
1112}; 1118};
1113 1119
1120/*
1121 * This is used by __kernel_text_address() to return true if the
1122 * address is on a dynamically allocated trampoline that would
1123 * not return true for either core_kernel_text() or
1124 * is_module_text_address().
1125 */
1126bool is_ftrace_trampoline(unsigned long addr)
1127{
1128 struct ftrace_ops *op;
1129 bool ret = false;
1130
1131 /*
1132 * Some of the ops may be dynamically allocated,
1133 * they are freed after a synchronize_sched().
1134 */
1135 preempt_disable_notrace();
1136
1137 do_for_each_ftrace_op(op, ftrace_ops_list) {
1138 /*
1139 * This is to check for dynamically allocated trampolines.
1140 * Trampolines that are in kernel text will have
1141 * core_kernel_text() return true.
1142 */
1143 if (op->trampoline && op->trampoline_size)
1144 if (addr >= op->trampoline &&
1145 addr < op->trampoline + op->trampoline_size) {
1146 ret = true;
1147 goto out;
1148 }
1149 } while_for_each_ftrace_op(op);
1150
1151 out:
1152 preempt_enable_notrace();
1153
1154 return ret;
1155}
1156
1114struct ftrace_page { 1157struct ftrace_page {
1115 struct ftrace_page *next; 1158 struct ftrace_page *next;
1116 struct dyn_ftrace *records; 1159 struct dyn_ftrace *records;
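
is_ftrace_trampoline() exists so __kernel_text_address() can recognise addresses that live on dynamically allocated trampolines, which neither core_kernel_text() nor is_module_text_address() covers; the list walk runs under preempt_disable_notrace() because dynamic ops are only freed after synchronize_sched(). Stripped of the list macros and the RCU details, the core is an address-range membership test — a self-contained sketch with invented addresses:

#include <stdbool.h>
#include <stdio.h>

struct ops { unsigned long trampoline, trampoline_size; };

static bool in_trampoline(const struct ops *ops, int nr_ops, unsigned long addr)
{
        for (int i = 0; i < nr_ops; i++) {
                if (!ops[i].trampoline || !ops[i].trampoline_size)
                        continue;
                if (addr >= ops[i].trampoline &&
                    addr <  ops[i].trampoline + ops[i].trampoline_size)
                        return true;
        }
        return false;
}

int main(void)
{
        struct ops list[] = {
                { 0xffff0000UL, 0x100 },
                { 0,            0     },        /* ops without a trampoline */
        };

        printf("%d %d\n",
               in_trampoline(list, 2, 0xffff0080UL),    /* 1: inside */
               in_trampoline(list, 2, 0xffff0100UL));   /* 0: one past the end */
        return 0;
}
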
@@ -1315,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
1315static void 1358static void
1316ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); 1359ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
1317 1360
1361static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
1362 struct ftrace_hash *new_hash);
1363
1318static int 1364static int
1319ftrace_hash_move(struct ftrace_ops *ops, int enable, 1365ftrace_hash_move(struct ftrace_ops *ops, int enable,
1320 struct ftrace_hash **dst, struct ftrace_hash *src) 1366 struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1325,8 +1371,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1325 struct ftrace_hash *new_hash; 1371 struct ftrace_hash *new_hash;
1326 int size = src->count; 1372 int size = src->count;
1327 int bits = 0; 1373 int bits = 0;
1374 int ret;
1328 int i; 1375 int i;
1329 1376
1377 /* Reject setting notrace hash on IPMODIFY ftrace_ops */
1378 if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
1379 return -EINVAL;
1380
1330 /* 1381 /*
1331 * If the new source is empty, just free dst and assign it 1382 * If the new source is empty, just free dst and assign it
1332 * the empty_hash. 1383 * the empty_hash.
@@ -1360,6 +1411,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1360 } 1411 }
1361 1412
1362update: 1413update:
1414 /* Make sure this can be applied if it is IPMODIFY ftrace_ops */
1415 if (enable) {
1416 /* IPMODIFY should be updated only when filter_hash updating */
1417 ret = ftrace_hash_ipmodify_update(ops, new_hash);
1418 if (ret < 0) {
1419 free_ftrace_hash(new_hash);
1420 return ret;
1421 }
1422 }
1423
1363 /* 1424 /*
1364 * Remove the current set, update the hash and add 1425 * Remove the current set, update the hash and add
1365 * them back. 1426 * them back.
@@ -1724,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
1724 ftrace_hash_rec_update_modify(ops, filter_hash, 1); 1785 ftrace_hash_rec_update_modify(ops, filter_hash, 1);
1725} 1786}
1726 1787
1788/*
1789 * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
 1790 * or if no update is needed, -EBUSY if it detects a conflict of the flag
1791 * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
 1792 * Note that old_hash and new_hash have the following meanings:
1793 * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
1794 * - If the hash is EMPTY_HASH, it hits nothing
1795 * - Anything else hits the recs which match the hash entries.
1796 */
1797static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
1798 struct ftrace_hash *old_hash,
1799 struct ftrace_hash *new_hash)
1800{
1801 struct ftrace_page *pg;
1802 struct dyn_ftrace *rec, *end = NULL;
1803 int in_old, in_new;
1804
1805 /* Only update if the ops has been registered */
1806 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1807 return 0;
1808
1809 if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
1810 return 0;
1811
1812 /*
1813 * Since the IPMODIFY is a very address sensitive action, we do not
1814 * allow ftrace_ops to set all functions to new hash.
1815 */
1816 if (!new_hash || !old_hash)
1817 return -EINVAL;
1818
1819 /* Update rec->flags */
1820 do_for_each_ftrace_rec(pg, rec) {
1821 /* We need to update only differences of filter_hash */
1822 in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
1823 in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
1824 if (in_old == in_new)
1825 continue;
1826
1827 if (in_new) {
1828 /* New entries must ensure no others are using it */
1829 if (rec->flags & FTRACE_FL_IPMODIFY)
1830 goto rollback;
1831 rec->flags |= FTRACE_FL_IPMODIFY;
1832 } else /* Removed entry */
1833 rec->flags &= ~FTRACE_FL_IPMODIFY;
1834 } while_for_each_ftrace_rec();
1835
1836 return 0;
1837
1838rollback:
1839 end = rec;
1840
1841 /* Roll back what we did above */
1842 do_for_each_ftrace_rec(pg, rec) {
1843 if (rec == end)
1844 goto err_out;
1845
1846 in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
1847 in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
1848 if (in_old == in_new)
1849 continue;
1850
1851 if (in_new)
1852 rec->flags &= ~FTRACE_FL_IPMODIFY;
1853 else
1854 rec->flags |= FTRACE_FL_IPMODIFY;
1855 } while_for_each_ftrace_rec();
1856
1857err_out:
1858 return -EBUSY;
1859}
1860
1861static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
1862{
1863 struct ftrace_hash *hash = ops->func_hash->filter_hash;
1864
1865 if (ftrace_hash_empty(hash))
1866 hash = NULL;
1867
1868 return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
1869}
1870
1871/* Disabling always succeeds */
1872static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
1873{
1874 struct ftrace_hash *hash = ops->func_hash->filter_hash;
1875
1876 if (ftrace_hash_empty(hash))
1877 hash = NULL;
1878
1879 __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
1880}
1881
1882static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
1883 struct ftrace_hash *new_hash)
1884{
1885 struct ftrace_hash *old_hash = ops->func_hash->filter_hash;
1886
1887 if (ftrace_hash_empty(old_hash))
1888 old_hash = NULL;
1889
1890 if (ftrace_hash_empty(new_hash))
1891 new_hash = NULL;
1892
1893 return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
1894}
1895
1727static void print_ip_ins(const char *fmt, unsigned char *p) 1896static void print_ip_ins(const char *fmt, unsigned char *p)
1728{ 1897{
1729 int i; 1898 int i;
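
__ftrace_hash_update_ipmodify() above uses a mark-then-rollback pattern: walk every record, set FTRACE_FL_IPMODIFY on the ones the new hash starts covering, and if a record already carries the flag (another ops modifies that ip) walk the records again in the same order, clear what was just set, and return -EBUSY. A simplified user-space sketch of just that pattern — the real code also clears the flag for entries that leave the old hash, and all names below are illustrative:

#include <stdbool.h>
#include <stdio.h>

#define FL_IPMODIFY 0x1UL

struct rec { unsigned long ip, flags; };

static int set_ipmodify(struct rec *recs, int nr, bool (*hit)(unsigned long ip))
{
        int i, end = -1;

        for (i = 0; i < nr; i++) {
                if (!hit(recs[i].ip))
                        continue;
                if (recs[i].flags & FL_IPMODIFY) {      /* someone else owns it */
                        end = i;
                        goto rollback;
                }
                recs[i].flags |= FL_IPMODIFY;
        }
        return 0;

rollback:
        /* undo, in the same order, everything set before the conflict */
        for (i = 0; i < end; i++)
                if (hit(recs[i].ip))
                        recs[i].flags &= ~FL_IPMODIFY;
        return -16;     /* -EBUSY */
}

static bool hit_all(unsigned long ip) { (void)ip; return true; }

int main(void)
{
        struct rec recs[3] = { { 0x100, 0 }, { 0x200, FL_IPMODIFY }, { 0x300, 0 } };
        int ret = set_ipmodify(recs, 3, hit_all);

        printf("ret=%d, rec0 flags=%lx (rolled back)\n", ret, recs[0].flags);
        return 0;
}
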
@@ -1734,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1734 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1903 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1735} 1904}
1736 1905
1906static struct ftrace_ops *
1907ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
1908
1737/** 1909/**
1738 * ftrace_bug - report and shutdown function tracer 1910 * ftrace_bug - report and shutdown function tracer
1739 * @failed: The failed type (EFAULT, EINVAL, EPERM) 1911 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1740 * @ip: The address that failed 1912 * @rec: The record that failed
1741 * 1913 *
1742 * The arch code that enables or disables the function tracing 1914 * The arch code that enables or disables the function tracing
1743 * can call ftrace_bug() when it has detected a problem in 1915 * can call ftrace_bug() when it has detected a problem in
@@ -1746,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1746 * EINVAL - if what is read at @ip is not what was expected 1918 * EINVAL - if what is read at @ip is not what was expected
 1747 * EPERM - if the problem happens when writing to the @ip address 1919 * EPERM - if the problem happens when writing to the @ip address
1748 */ 1920 */
1749void ftrace_bug(int failed, unsigned long ip) 1921void ftrace_bug(int failed, struct dyn_ftrace *rec)
1750{ 1922{
1923 unsigned long ip = rec ? rec->ip : 0;
1924
1751 switch (failed) { 1925 switch (failed) {
1752 case -EFAULT: 1926 case -EFAULT:
1753 FTRACE_WARN_ON_ONCE(1); 1927 FTRACE_WARN_ON_ONCE(1);
@@ -1759,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip)
1759 pr_info("ftrace failed to modify "); 1933 pr_info("ftrace failed to modify ");
1760 print_ip_sym(ip); 1934 print_ip_sym(ip);
1761 print_ip_ins(" actual: ", (unsigned char *)ip); 1935 print_ip_ins(" actual: ", (unsigned char *)ip);
1762 printk(KERN_CONT "\n"); 1936 pr_cont("\n");
1763 break; 1937 break;
1764 case -EPERM: 1938 case -EPERM:
1765 FTRACE_WARN_ON_ONCE(1); 1939 FTRACE_WARN_ON_ONCE(1);
@@ -1771,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip)
1771 pr_info("ftrace faulted on unknown error "); 1945 pr_info("ftrace faulted on unknown error ");
1772 print_ip_sym(ip); 1946 print_ip_sym(ip);
1773 } 1947 }
1948 if (rec) {
1949 struct ftrace_ops *ops = NULL;
1950
1951 pr_info("ftrace record flags: %lx\n", rec->flags);
1952 pr_cont(" (%ld)%s", ftrace_rec_count(rec),
1953 rec->flags & FTRACE_FL_REGS ? " R" : " ");
1954 if (rec->flags & FTRACE_FL_TRAMP_EN) {
1955 ops = ftrace_find_tramp_ops_any(rec);
1956 if (ops)
1957 pr_cont("\ttramp: %pS",
1958 (void *)ops->trampoline);
1959 else
1960 pr_cont("\ttramp: ERROR!");
1961
1962 }
1963 ip = ftrace_get_addr_curr(rec);
1964 pr_cont(" expected tramp: %lx\n", ip);
1965 }
1774} 1966}
1775 1967
1776static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1968static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
@@ -2093,7 +2285,7 @@ void __weak ftrace_replace_code(int enable)
2093 do_for_each_ftrace_rec(pg, rec) { 2285 do_for_each_ftrace_rec(pg, rec) {
2094 failed = __ftrace_replace_code(rec, enable); 2286 failed = __ftrace_replace_code(rec, enable);
2095 if (failed) { 2287 if (failed) {
2096 ftrace_bug(failed, rec->ip); 2288 ftrace_bug(failed, rec);
2097 /* Stop processing */ 2289 /* Stop processing */
2098 return; 2290 return;
2099 } 2291 }
@@ -2175,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
2175static int 2367static int
2176ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 2368ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
2177{ 2369{
2178 unsigned long ip;
2179 int ret; 2370 int ret;
2180 2371
2181 ip = rec->ip;
2182
2183 if (unlikely(ftrace_disabled)) 2372 if (unlikely(ftrace_disabled))
2184 return 0; 2373 return 0;
2185 2374
2186 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 2375 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
2187 if (ret) { 2376 if (ret) {
2188 ftrace_bug(ret, ip); 2377 ftrace_bug(ret, rec);
2189 return 0; 2378 return 0;
2190 } 2379 }
2191 return 1; 2380 return 1;
@@ -2308,18 +2497,24 @@ static void ftrace_run_update_code(int command)
2308} 2497}
2309 2498
2310static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, 2499static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
2311 struct ftrace_hash *old_hash) 2500 struct ftrace_ops_hash *old_hash)
2312{ 2501{
2313 ops->flags |= FTRACE_OPS_FL_MODIFYING; 2502 ops->flags |= FTRACE_OPS_FL_MODIFYING;
2314 ops->old_hash.filter_hash = old_hash; 2503 ops->old_hash.filter_hash = old_hash->filter_hash;
2504 ops->old_hash.notrace_hash = old_hash->notrace_hash;
2315 ftrace_run_update_code(command); 2505 ftrace_run_update_code(command);
2316 ops->old_hash.filter_hash = NULL; 2506 ops->old_hash.filter_hash = NULL;
2507 ops->old_hash.notrace_hash = NULL;
2317 ops->flags &= ~FTRACE_OPS_FL_MODIFYING; 2508 ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
2318} 2509}
2319 2510
2320static ftrace_func_t saved_ftrace_func; 2511static ftrace_func_t saved_ftrace_func;
2321static int ftrace_start_up; 2512static int ftrace_start_up;
2322 2513
2514void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
2515{
2516}
2517
2323static void control_ops_free(struct ftrace_ops *ops) 2518static void control_ops_free(struct ftrace_ops *ops)
2324{ 2519{
2325 free_percpu(ops->disabled); 2520 free_percpu(ops->disabled);
@@ -2369,6 +2564,15 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2369 */ 2564 */
2370 ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; 2565 ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
2371 2566
2567 ret = ftrace_hash_ipmodify_enable(ops);
2568 if (ret < 0) {
2569 /* Rollback registration process */
2570 __unregister_ftrace_function(ops);
2571 ftrace_start_up--;
2572 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2573 return ret;
2574 }
2575
2372 ftrace_hash_rec_enable(ops, 1); 2576 ftrace_hash_rec_enable(ops, 1);
2373 2577
2374 ftrace_startup_enable(command); 2578 ftrace_startup_enable(command);
@@ -2397,6 +2601,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2397 */ 2601 */
2398 WARN_ON_ONCE(ftrace_start_up < 0); 2602 WARN_ON_ONCE(ftrace_start_up < 0);
2399 2603
2604 /* Disabling ipmodify never fails */
2605 ftrace_hash_ipmodify_disable(ops);
2400 ftrace_hash_rec_disable(ops, 1); 2606 ftrace_hash_rec_disable(ops, 1);
2401 2607
2402 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 2608 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -2471,6 +2677,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2471 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { 2677 if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
2472 schedule_on_each_cpu(ftrace_sync); 2678 schedule_on_each_cpu(ftrace_sync);
2473 2679
2680 arch_ftrace_trampoline_free(ops);
2681
2474 if (ops->flags & FTRACE_OPS_FL_CONTROL) 2682 if (ops->flags & FTRACE_OPS_FL_CONTROL)
2475 control_ops_free(ops); 2683 control_ops_free(ops);
2476 } 2684 }
@@ -2623,7 +2831,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
2623 if (ftrace_start_up && cnt) { 2831 if (ftrace_start_up && cnt) {
2624 int failed = __ftrace_replace_code(p, 1); 2832 int failed = __ftrace_replace_code(p, 1);
2625 if (failed) 2833 if (failed)
2626 ftrace_bug(failed, p->ip); 2834 ftrace_bug(failed, p);
2627 } 2835 }
2628 } 2836 }
2629 } 2837 }
@@ -2948,6 +3156,22 @@ static void t_stop(struct seq_file *m, void *p)
2948 mutex_unlock(&ftrace_lock); 3156 mutex_unlock(&ftrace_lock);
2949} 3157}
2950 3158
3159void * __weak
3160arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
3161{
3162 return NULL;
3163}
3164
3165static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
3166 struct dyn_ftrace *rec)
3167{
3168 void *ptr;
3169
3170 ptr = arch_ftrace_trampoline_func(ops, rec);
3171 if (ptr)
3172 seq_printf(m, " ->%pS", ptr);
3173}
3174
2951static int t_show(struct seq_file *m, void *v) 3175static int t_show(struct seq_file *m, void *v)
2952{ 3176{
2953 struct ftrace_iterator *iter = m->private; 3177 struct ftrace_iterator *iter = m->private;
@@ -2958,9 +3182,9 @@ static int t_show(struct seq_file *m, void *v)
2958 3182
2959 if (iter->flags & FTRACE_ITER_PRINTALL) { 3183 if (iter->flags & FTRACE_ITER_PRINTALL) {
2960 if (iter->flags & FTRACE_ITER_NOTRACE) 3184 if (iter->flags & FTRACE_ITER_NOTRACE)
2961 seq_printf(m, "#### no functions disabled ####\n"); 3185 seq_puts(m, "#### no functions disabled ####\n");
2962 else 3186 else
2963 seq_printf(m, "#### all functions enabled ####\n"); 3187 seq_puts(m, "#### all functions enabled ####\n");
2964 return 0; 3188 return 0;
2965 } 3189 }
2966 3190
@@ -2971,22 +3195,25 @@ static int t_show(struct seq_file *m, void *v)
2971 3195
2972 seq_printf(m, "%ps", (void *)rec->ip); 3196 seq_printf(m, "%ps", (void *)rec->ip);
2973 if (iter->flags & FTRACE_ITER_ENABLED) { 3197 if (iter->flags & FTRACE_ITER_ENABLED) {
2974 seq_printf(m, " (%ld)%s", 3198 struct ftrace_ops *ops = NULL;
3199
3200 seq_printf(m, " (%ld)%s%s",
2975 ftrace_rec_count(rec), 3201 ftrace_rec_count(rec),
2976 rec->flags & FTRACE_FL_REGS ? " R" : " "); 3202 rec->flags & FTRACE_FL_REGS ? " R" : " ",
3203 rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
2977 if (rec->flags & FTRACE_FL_TRAMP_EN) { 3204 if (rec->flags & FTRACE_FL_TRAMP_EN) {
2978 struct ftrace_ops *ops;
2979
2980 ops = ftrace_find_tramp_ops_any(rec); 3205 ops = ftrace_find_tramp_ops_any(rec);
2981 if (ops) 3206 if (ops)
2982 seq_printf(m, "\ttramp: %pS", 3207 seq_printf(m, "\ttramp: %pS",
2983 (void *)ops->trampoline); 3208 (void *)ops->trampoline);
2984 else 3209 else
2985 seq_printf(m, "\ttramp: ERROR!"); 3210 seq_puts(m, "\ttramp: ERROR!");
3211
2986 } 3212 }
3213 add_trampoline_func(m, ops, rec);
2987 } 3214 }
2988 3215
2989 seq_printf(m, "\n"); 3216 seq_putc(m, '\n');
2990 3217
2991 return 0; 3218 return 0;
2992} 3219}
@@ -3020,9 +3247,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
3020{ 3247{
3021 struct ftrace_iterator *iter; 3248 struct ftrace_iterator *iter;
3022 3249
3023 if (unlikely(ftrace_disabled))
3024 return -ENODEV;
3025
3026 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 3250 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
3027 if (iter) { 3251 if (iter) {
3028 iter->pg = ftrace_pages_start; 3252 iter->pg = ftrace_pages_start;
@@ -3357,7 +3581,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
3357 3581
3358static int ftrace_probe_registered; 3582static int ftrace_probe_registered;
3359 3583
3360static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) 3584static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
3361{ 3585{
3362 int ret; 3586 int ret;
3363 int i; 3587 int i;
@@ -3415,6 +3639,7 @@ int
3415register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3639register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3416 void *data) 3640 void *data)
3417{ 3641{
3642 struct ftrace_ops_hash old_hash_ops;
3418 struct ftrace_func_probe *entry; 3643 struct ftrace_func_probe *entry;
3419 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; 3644 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
3420 struct ftrace_hash *old_hash = *orig_hash; 3645 struct ftrace_hash *old_hash = *orig_hash;
@@ -3436,6 +3661,10 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3436 3661
3437 mutex_lock(&trace_probe_ops.func_hash->regex_lock); 3662 mutex_lock(&trace_probe_ops.func_hash->regex_lock);
3438 3663
3664 old_hash_ops.filter_hash = old_hash;
3665 /* Probes only have filters */
3666 old_hash_ops.notrace_hash = NULL;
3667
3439 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); 3668 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
3440 if (!hash) { 3669 if (!hash) {
3441 count = -ENOMEM; 3670 count = -ENOMEM;
@@ -3496,7 +3725,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3496 3725
3497 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3726 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3498 3727
3499 __enable_ftrace_function_probe(old_hash); 3728 __enable_ftrace_function_probe(&old_hash_ops);
3500 3729
3501 if (!ret) 3730 if (!ret)
3502 free_ftrace_hash_rcu(old_hash); 3731 free_ftrace_hash_rcu(old_hash);
@@ -3784,10 +4013,34 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3784} 4013}
3785 4014
3786static void ftrace_ops_update_code(struct ftrace_ops *ops, 4015static void ftrace_ops_update_code(struct ftrace_ops *ops,
3787 struct ftrace_hash *old_hash) 4016 struct ftrace_ops_hash *old_hash)
3788{ 4017{
3789 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) 4018 struct ftrace_ops *op;
4019
4020 if (!ftrace_enabled)
4021 return;
4022
4023 if (ops->flags & FTRACE_OPS_FL_ENABLED) {
3790 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); 4024 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
4025 return;
4026 }
4027
4028 /*
4029 * If this is the shared global_ops filter, then we need to
 4030 * check whether another ops that shares it is enabled.
4031 * If so, we still need to run the modify code.
4032 */
4033 if (ops->func_hash != &global_ops.local_hash)
4034 return;
4035
4036 do_for_each_ftrace_op(op, ftrace_ops_list) {
4037 if (op->func_hash == &global_ops.local_hash &&
4038 op->flags & FTRACE_OPS_FL_ENABLED) {
4039 ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
4040 /* Only need to do this once */
4041 return;
4042 }
4043 } while_for_each_ftrace_op(op);
3791} 4044}
3792 4045
3793static int 4046static int
@@ -3795,6 +4048,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3795 unsigned long ip, int remove, int reset, int enable) 4048 unsigned long ip, int remove, int reset, int enable)
3796{ 4049{
3797 struct ftrace_hash **orig_hash; 4050 struct ftrace_hash **orig_hash;
4051 struct ftrace_ops_hash old_hash_ops;
3798 struct ftrace_hash *old_hash; 4052 struct ftrace_hash *old_hash;
3799 struct ftrace_hash *hash; 4053 struct ftrace_hash *hash;
3800 int ret; 4054 int ret;
@@ -3831,9 +4085,11 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3831 4085
3832 mutex_lock(&ftrace_lock); 4086 mutex_lock(&ftrace_lock);
3833 old_hash = *orig_hash; 4087 old_hash = *orig_hash;
4088 old_hash_ops.filter_hash = ops->func_hash->filter_hash;
4089 old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
3834 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 4090 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3835 if (!ret) { 4091 if (!ret) {
3836 ftrace_ops_update_code(ops, old_hash); 4092 ftrace_ops_update_code(ops, &old_hash_ops);
3837 free_ftrace_hash_rcu(old_hash); 4093 free_ftrace_hash_rcu(old_hash);
3838 } 4094 }
3839 mutex_unlock(&ftrace_lock); 4095 mutex_unlock(&ftrace_lock);
@@ -3975,6 +4231,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3975static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 4231static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3976static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); 4232static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
3977 4233
4234static unsigned long save_global_trampoline;
4235static unsigned long save_global_flags;
4236
3978static int __init set_graph_function(char *str) 4237static int __init set_graph_function(char *str)
3979{ 4238{
3980 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 4239 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -4042,6 +4301,7 @@ static void __init set_ftrace_early_filters(void)
4042int ftrace_regex_release(struct inode *inode, struct file *file) 4301int ftrace_regex_release(struct inode *inode, struct file *file)
4043{ 4302{
4044 struct seq_file *m = (struct seq_file *)file->private_data; 4303 struct seq_file *m = (struct seq_file *)file->private_data;
4304 struct ftrace_ops_hash old_hash_ops;
4045 struct ftrace_iterator *iter; 4305 struct ftrace_iterator *iter;
4046 struct ftrace_hash **orig_hash; 4306 struct ftrace_hash **orig_hash;
4047 struct ftrace_hash *old_hash; 4307 struct ftrace_hash *old_hash;
@@ -4075,10 +4335,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4075 4335
4076 mutex_lock(&ftrace_lock); 4336 mutex_lock(&ftrace_lock);
4077 old_hash = *orig_hash; 4337 old_hash = *orig_hash;
4338 old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash;
4339 old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash;
4078 ret = ftrace_hash_move(iter->ops, filter_hash, 4340 ret = ftrace_hash_move(iter->ops, filter_hash,
4079 orig_hash, iter->hash); 4341 orig_hash, iter->hash);
4080 if (!ret) { 4342 if (!ret) {
4081 ftrace_ops_update_code(iter->ops, old_hash); 4343 ftrace_ops_update_code(iter->ops, &old_hash_ops);
4082 free_ftrace_hash_rcu(old_hash); 4344 free_ftrace_hash_rcu(old_hash);
4083 } 4345 }
4084 mutex_unlock(&ftrace_lock); 4346 mutex_unlock(&ftrace_lock);
@@ -4183,9 +4445,9 @@ static int g_show(struct seq_file *m, void *v)
4183 struct ftrace_graph_data *fgd = m->private; 4445 struct ftrace_graph_data *fgd = m->private;
4184 4446
4185 if (fgd->table == ftrace_graph_funcs) 4447 if (fgd->table == ftrace_graph_funcs)
4186 seq_printf(m, "#### all functions enabled ####\n"); 4448 seq_puts(m, "#### all functions enabled ####\n");
4187 else 4449 else
4188 seq_printf(m, "#### no functions disabled ####\n"); 4450 seq_puts(m, "#### no functions disabled ####\n");
4189 return 0; 4451 return 0;
4190 } 4452 }
4191 4453
@@ -4696,6 +4958,32 @@ void __init ftrace_init(void)
4696 ftrace_disabled = 1; 4958 ftrace_disabled = 1;
4697} 4959}
4698 4960
4961/* Do nothing if arch does not support this */
4962void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
4963{
4964}
4965
4966static void ftrace_update_trampoline(struct ftrace_ops *ops)
4967{
4968
4969/*
4970 * Currently there's no safe way to free a trampoline when the kernel
4971 * is configured with PREEMPT. That is because a task could be preempted
4972 * when it jumped to the trampoline, it may be preempted for a long time
4973 * depending on the system load, and currently there's no way to know
4974 * when it will be off the trampoline. If the trampoline is freed
4975 * too early, when the task runs again, it will be executing on freed
4976 * memory and crash.
4977 */
4978#ifdef CONFIG_PREEMPT
4979 /* Currently, only non dynamic ops can have a trampoline */
4980 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
4981 return;
4982#endif
4983
4984 arch_ftrace_update_trampoline(ops);
4985}
4986
4699#else 4987#else
4700 4988
4701static struct ftrace_ops global_ops = { 4989static struct ftrace_ops global_ops = {
@@ -4738,6 +5026,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4738 return 1; 5026 return 1;
4739} 5027}
4740 5028
5029static void ftrace_update_trampoline(struct ftrace_ops *ops)
5030{
5031}
5032
4741#endif /* CONFIG_DYNAMIC_FTRACE */ 5033#endif /* CONFIG_DYNAMIC_FTRACE */
4742 5034
4743__init void ftrace_init_global_array_ops(struct trace_array *tr) 5035__init void ftrace_init_global_array_ops(struct trace_array *tr)
@@ -5075,12 +5367,12 @@ static int fpid_show(struct seq_file *m, void *v)
5075 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); 5367 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
5076 5368
5077 if (v == (void *)1) { 5369 if (v == (void *)1) {
5078 seq_printf(m, "no pid\n"); 5370 seq_puts(m, "no pid\n");
5079 return 0; 5371 return 0;
5080 } 5372 }
5081 5373
5082 if (fpid->pid == ftrace_swapper_pid) 5374 if (fpid->pid == ftrace_swapper_pid)
5083 seq_printf(m, "swapper tasks\n"); 5375 seq_puts(m, "swapper tasks\n");
5084 else 5376 else
5085 seq_printf(m, "%u\n", pid_vnr(fpid->pid)); 5377 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
5086 5378
@@ -5293,6 +5585,7 @@ static struct ftrace_ops graph_ops = {
5293 FTRACE_OPS_FL_STUB, 5585 FTRACE_OPS_FL_STUB,
5294#ifdef FTRACE_GRAPH_TRAMP_ADDR 5586#ifdef FTRACE_GRAPH_TRAMP_ADDR
5295 .trampoline = FTRACE_GRAPH_TRAMP_ADDR, 5587 .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
5588 /* trampoline_size is only needed for dynamically allocated tramps */
5296#endif 5589#endif
5297 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) 5590 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
5298}; 5591};
@@ -5522,7 +5815,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5522 update_function_graph_func(); 5815 update_function_graph_func();
5523 5816
5524 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); 5817 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
5525
5526out: 5818out:
5527 mutex_unlock(&ftrace_lock); 5819 mutex_unlock(&ftrace_lock);
5528 return ret; 5820 return ret;
@@ -5543,6 +5835,17 @@ void unregister_ftrace_graph(void)
5543 unregister_pm_notifier(&ftrace_suspend_notifier); 5835 unregister_pm_notifier(&ftrace_suspend_notifier);
5544 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5836 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5545 5837
5838#ifdef CONFIG_DYNAMIC_FTRACE
5839 /*
5840 * Function graph does not allocate the trampoline, but
5841 * other global_ops do. We need to reset the ALLOC_TRAMP flag
5842 * if one was used.
5843 */
5844 global_ops.trampoline = save_global_trampoline;
5845 if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
5846 global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
5847#endif
5848
5546 out: 5849 out:
5547 mutex_unlock(&ftrace_lock); 5850 mutex_unlock(&ftrace_lock);
5548} 5851}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a56e07c8d15b..7a4104cb95cb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work);
34 */ 34 */
35int ring_buffer_print_entry_header(struct trace_seq *s) 35int ring_buffer_print_entry_header(struct trace_seq *s)
36{ 36{
37 int ret; 37 trace_seq_puts(s, "# compressed entry header\n");
38 38 trace_seq_puts(s, "\ttype_len : 5 bits\n");
39 ret = trace_seq_puts(s, "# compressed entry header\n"); 39 trace_seq_puts(s, "\ttime_delta : 27 bits\n");
40 ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); 40 trace_seq_puts(s, "\tarray : 32 bits\n");
41 ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); 41 trace_seq_putc(s, '\n');
42 ret = trace_seq_puts(s, "\tarray : 32 bits\n"); 42 trace_seq_printf(s, "\tpadding : type == %d\n",
43 ret = trace_seq_putc(s, '\n'); 43 RINGBUF_TYPE_PADDING);
44 ret = trace_seq_printf(s, "\tpadding : type == %d\n", 44 trace_seq_printf(s, "\ttime_extend : type == %d\n",
45 RINGBUF_TYPE_PADDING); 45 RINGBUF_TYPE_TIME_EXTEND);
46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", 46 trace_seq_printf(s, "\tdata max type_len == %d\n",
47 RINGBUF_TYPE_TIME_EXTEND); 47 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
48 ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
49 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
50 48
51 return ret; 49 return !trace_seq_has_overflowed(s);
52} 50}
53 51
54/* 52/*
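
This rewrite is the trace_seq convention the rest of the series converts to: the trace_seq_*() writers are treated as unconditional appends, and the caller asks trace_seq_has_overflowed() once at the end instead of checking every return value. A minimal sketch of a caller written in that style (the function name and header text are hypothetical):

    #include <linux/trace_seq.h>

    static int print_my_header(struct trace_seq *s)
    {
            /* append unconditionally; overflow is tracked inside the seq */
            trace_seq_puts(s, "# my header\n");
            trace_seq_printf(s, "\tversion : %d\n", 2);

            /* one check at the end replaces per-call return handling */
            return !trace_seq_has_overflowed(s);
    }
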
@@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta)
419int ring_buffer_print_page_header(struct trace_seq *s) 417int ring_buffer_print_page_header(struct trace_seq *s)
420{ 418{
421 struct buffer_data_page field; 419 struct buffer_data_page field;
422 int ret;
423
424 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
425 "offset:0;\tsize:%u;\tsigned:%u;\n",
426 (unsigned int)sizeof(field.time_stamp),
427 (unsigned int)is_signed_type(u64));
428
429 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
430 "offset:%u;\tsize:%u;\tsigned:%u;\n",
431 (unsigned int)offsetof(typeof(field), commit),
432 (unsigned int)sizeof(field.commit),
433 (unsigned int)is_signed_type(long));
434
435 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
436 "offset:%u;\tsize:%u;\tsigned:%u;\n",
437 (unsigned int)offsetof(typeof(field), commit),
438 1,
439 (unsigned int)is_signed_type(long));
440
441 ret = trace_seq_printf(s, "\tfield: char data;\t"
442 "offset:%u;\tsize:%u;\tsigned:%u;\n",
443 (unsigned int)offsetof(typeof(field), data),
444 (unsigned int)BUF_PAGE_SIZE,
445 (unsigned int)is_signed_type(char));
446 420
447 return ret; 421 trace_seq_printf(s, "\tfield: u64 timestamp;\t"
422 "offset:0;\tsize:%u;\tsigned:%u;\n",
423 (unsigned int)sizeof(field.time_stamp),
424 (unsigned int)is_signed_type(u64));
425
426 trace_seq_printf(s, "\tfield: local_t commit;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 (unsigned int)sizeof(field.commit),
430 (unsigned int)is_signed_type(long));
431
432 trace_seq_printf(s, "\tfield: int overwrite;\t"
433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
434 (unsigned int)offsetof(typeof(field), commit),
435 1,
436 (unsigned int)is_signed_type(long));
437
438 trace_seq_printf(s, "\tfield: char data;\t"
439 "offset:%u;\tsize:%u;\tsigned:%u;\n",
440 (unsigned int)offsetof(typeof(field), data),
441 (unsigned int)BUF_PAGE_SIZE,
442 (unsigned int)is_signed_type(char));
443
444 return !trace_seq_has_overflowed(s);
448} 445}
449 446
450struct rb_irq_work { 447struct rb_irq_work {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 92f4a6cee172..4a9079b9f082 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running;
63 */ 63 */
64bool __read_mostly tracing_selftest_disabled; 64bool __read_mostly tracing_selftest_disabled;
65 65
66/* Pipe tracepoints to printk */
67struct trace_iterator *tracepoint_print_iter;
68int tracepoint_printk;
69
66/* For tracers that don't implement custom flags */ 70/* For tracers that don't implement custom flags */
67static struct tracer_opt dummy_tracer_opt[] = { 71static struct tracer_opt dummy_tracer_opt[] = {
68 { } 72 { }
@@ -155,10 +159,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
155 159
156static int __init stop_trace_on_warning(char *str) 160static int __init stop_trace_on_warning(char *str)
157{ 161{
158 __disable_trace_on_warning = 1; 162 if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
163 __disable_trace_on_warning = 1;
159 return 1; 164 return 1;
160} 165}
161__setup("traceoff_on_warning=", stop_trace_on_warning); 166__setup("traceoff_on_warning", stop_trace_on_warning);
162 167
163static int __init boot_alloc_snapshot(char *str) 168static int __init boot_alloc_snapshot(char *str)
164{ 169{
@@ -192,6 +197,13 @@ static int __init set_trace_boot_clock(char *str)
192} 197}
193__setup("trace_clock=", set_trace_boot_clock); 198__setup("trace_clock=", set_trace_boot_clock);
194 199
200static int __init set_tracepoint_printk(char *str)
201{
202 if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
203 tracepoint_printk = 1;
204 return 1;
205}
206__setup("tp_printk", set_tracepoint_printk);
195 207
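
Given the parsing in stop_trace_on_warning() and set_tracepoint_printk() above, both switches are meant to be passed bare on the kernel command line, with an explicit "=0" or "=off" accepted to leave them disabled; that is also why the __setup() key for traceoff_on_warning drops its trailing '='. For illustration, a boot line enabling both, assuming the usual trace_event= list is what picks the events that tp_printk will echo to the console:

    traceoff_on_warning tp_printk trace_event=sched:sched_switch
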
196unsigned long long ns2usecs(cycle_t nsec) 208unsigned long long ns2usecs(cycle_t nsec)
197{ 209{
@@ -938,19 +950,20 @@ out:
938 return ret; 950 return ret;
939} 951}
940 952
953/* TODO add a seq_buf_to_buffer() */
941static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 954static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
942{ 955{
943 int len; 956 int len;
944 957
945 if (s->len <= s->readpos) 958 if (trace_seq_used(s) <= s->seq.readpos)
946 return -EBUSY; 959 return -EBUSY;
947 960
948 len = s->len - s->readpos; 961 len = trace_seq_used(s) - s->seq.readpos;
949 if (cnt > len) 962 if (cnt > len)
950 cnt = len; 963 cnt = len;
951 memcpy(buf, s->buffer + s->readpos, cnt); 964 memcpy(buf, s->buffer + s->seq.readpos, cnt);
952 965
953 s->readpos += cnt; 966 s->seq.readpos += cnt;
954 return cnt; 967 return cnt;
955} 968}
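
The TODO above asks for the same copy-out helper one layer down, operating on the seq_buf itself. A hypothetical sketch of what such a seq_buf_to_buffer() could look like, assuming struct seq_buf carries the buffer, len and readpos fields that the code above reaches through s->buffer and s->seq:

    #include <linux/errno.h>
    #include <linux/seq_buf.h>
    #include <linux/string.h>

    /* Hypothetical: trace_seq_to_buffer() pushed down to the seq_buf level. */
    static ssize_t seq_buf_to_buffer(struct seq_buf *s, void *buf, size_t cnt)
    {
            size_t len;

            if (s->len <= s->readpos)
                    return -EBUSY;

            len = s->len - s->readpos;
            if (cnt > len)
                    cnt = len;
            memcpy(buf, s->buffer + s->readpos, cnt);

            s->readpos += cnt;
            return cnt;
    }
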
956 969
@@ -2029,7 +2042,7 @@ void trace_printk_init_buffers(void)
2029 pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); 2042 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
2030 pr_warning("** **\n"); 2043 pr_warning("** **\n");
2031 pr_warning("** This means that this is a DEBUG kernel and it is **\n"); 2044 pr_warning("** This means that this is a DEBUG kernel and it is **\n");
2032 pr_warning("** unsafe for produciton use. **\n"); 2045 pr_warning("** unsafe for production use. **\n");
2033 pr_warning("** **\n"); 2046 pr_warning("** **\n");
2034 pr_warning("** If you see this message and you are not debugging **\n"); 2047 pr_warning("** If you see this message and you are not debugging **\n");
2035 pr_warning("** the kernel, report this immediately to your vendor! **\n"); 2048 pr_warning("** the kernel, report this immediately to your vendor! **\n");
@@ -2158,9 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2158 goto out; 2171 goto out;
2159 } 2172 }
2160 2173
2161 len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); 2174 len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
2162 if (len > TRACE_BUF_SIZE)
2163 goto out;
2164 2175
2165 local_save_flags(flags); 2176 local_save_flags(flags);
2166 size = sizeof(*entry) + len + 1; 2177 size = sizeof(*entry) + len + 1;
@@ -2171,8 +2182,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
2171 entry = ring_buffer_event_data(event); 2182 entry = ring_buffer_event_data(event);
2172 entry->ip = ip; 2183 entry->ip = ip;
2173 2184
2174 memcpy(&entry->buf, tbuffer, len); 2185 memcpy(&entry->buf, tbuffer, len + 1);
2175 entry->buf[len] = '\0';
2176 if (!call_filter_check_discard(call, entry, buffer, event)) { 2186 if (!call_filter_check_discard(call, entry, buffer, event)) {
2177 __buffer_unlock_commit(buffer, event); 2187 __buffer_unlock_commit(buffer, event);
2178 ftrace_trace_stack(buffer, flags, 6, pc); 2188 ftrace_trace_stack(buffer, flags, 6, pc);
@@ -2509,14 +2519,14 @@ get_total_entries(struct trace_buffer *buf,
2509 2519
2510static void print_lat_help_header(struct seq_file *m) 2520static void print_lat_help_header(struct seq_file *m)
2511{ 2521{
2512 seq_puts(m, "# _------=> CPU# \n"); 2522 seq_puts(m, "# _------=> CPU# \n"
2513 seq_puts(m, "# / _-----=> irqs-off \n"); 2523 "# / _-----=> irqs-off \n"
2514 seq_puts(m, "# | / _----=> need-resched \n"); 2524 "# | / _----=> need-resched \n"
2515 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 2525 "# || / _---=> hardirq/softirq \n"
2516 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 2526 "# ||| / _--=> preempt-depth \n"
2517 seq_puts(m, "# |||| / delay \n"); 2527 "# |||| / delay \n"
2518 seq_puts(m, "# cmd pid ||||| time | caller \n"); 2528 "# cmd pid ||||| time | caller \n"
2519 seq_puts(m, "# \\ / ||||| \\ | / \n"); 2529 "# \\ / ||||| \\ | / \n");
2520} 2530}
2521 2531
2522static void print_event_info(struct trace_buffer *buf, struct seq_file *m) 2532static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -2533,20 +2543,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
2533static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) 2543static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
2534{ 2544{
2535 print_event_info(buf, m); 2545 print_event_info(buf, m);
2536 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 2546 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
2537 seq_puts(m, "# | | | | |\n"); 2547 "# | | | | |\n");
2538} 2548}
2539 2549
2540static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) 2550static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
2541{ 2551{
2542 print_event_info(buf, m); 2552 print_event_info(buf, m);
2543 seq_puts(m, "# _-----=> irqs-off\n"); 2553 seq_puts(m, "# _-----=> irqs-off\n"
2544 seq_puts(m, "# / _----=> need-resched\n"); 2554 "# / _----=> need-resched\n"
2545 seq_puts(m, "# | / _---=> hardirq/softirq\n"); 2555 "# | / _---=> hardirq/softirq\n"
2546 seq_puts(m, "# || / _--=> preempt-depth\n"); 2556 "# || / _--=> preempt-depth\n"
2547 seq_puts(m, "# ||| / delay\n"); 2557 "# ||| / delay\n"
2548 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); 2558 "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
2549 seq_puts(m, "# | | | |||| | |\n"); 2559 "# | | | |||| | |\n");
2550} 2560}
2551 2561
2552void 2562void
@@ -2649,24 +2659,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
2649 event = ftrace_find_event(entry->type); 2659 event = ftrace_find_event(entry->type);
2650 2660
2651 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2661 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2652 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2662 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2653 if (!trace_print_lat_context(iter)) 2663 trace_print_lat_context(iter);
2654 goto partial; 2664 else
2655 } else { 2665 trace_print_context(iter);
2656 if (!trace_print_context(iter))
2657 goto partial;
2658 }
2659 } 2666 }
2660 2667
2668 if (trace_seq_has_overflowed(s))
2669 return TRACE_TYPE_PARTIAL_LINE;
2670
2661 if (event) 2671 if (event)
2662 return event->funcs->trace(iter, sym_flags, event); 2672 return event->funcs->trace(iter, sym_flags, event);
2663 2673
2664 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 2674 trace_seq_printf(s, "Unknown type %d\n", entry->type);
2665 goto partial;
2666 2675
2667 return TRACE_TYPE_HANDLED; 2676 return trace_handle_return(s);
2668partial:
2669 return TRACE_TYPE_PARTIAL_LINE;
2670} 2677}
2671 2678
2672static enum print_line_t print_raw_fmt(struct trace_iterator *iter) 2679static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
@@ -2677,22 +2684,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
2677 2684
2678 entry = iter->ent; 2685 entry = iter->ent;
2679 2686
2680 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2687 if (trace_flags & TRACE_ITER_CONTEXT_INFO)
2681 if (!trace_seq_printf(s, "%d %d %llu ", 2688 trace_seq_printf(s, "%d %d %llu ",
2682 entry->pid, iter->cpu, iter->ts)) 2689 entry->pid, iter->cpu, iter->ts);
2683 goto partial; 2690
2684 } 2691 if (trace_seq_has_overflowed(s))
2692 return TRACE_TYPE_PARTIAL_LINE;
2685 2693
2686 event = ftrace_find_event(entry->type); 2694 event = ftrace_find_event(entry->type);
2687 if (event) 2695 if (event)
2688 return event->funcs->raw(iter, 0, event); 2696 return event->funcs->raw(iter, 0, event);
2689 2697
2690 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 2698 trace_seq_printf(s, "%d ?\n", entry->type);
2691 goto partial;
2692 2699
2693 return TRACE_TYPE_HANDLED; 2700 return trace_handle_return(s);
2694partial:
2695 return TRACE_TYPE_PARTIAL_LINE;
2696} 2701}
2697 2702
2698static enum print_line_t print_hex_fmt(struct trace_iterator *iter) 2703static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
@@ -2705,9 +2710,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2705 entry = iter->ent; 2710 entry = iter->ent;
2706 2711
2707 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2712 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2708 SEQ_PUT_HEX_FIELD_RET(s, entry->pid); 2713 SEQ_PUT_HEX_FIELD(s, entry->pid);
2709 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); 2714 SEQ_PUT_HEX_FIELD(s, iter->cpu);
2710 SEQ_PUT_HEX_FIELD_RET(s, iter->ts); 2715 SEQ_PUT_HEX_FIELD(s, iter->ts);
2716 if (trace_seq_has_overflowed(s))
2717 return TRACE_TYPE_PARTIAL_LINE;
2711 } 2718 }
2712 2719
2713 event = ftrace_find_event(entry->type); 2720 event = ftrace_find_event(entry->type);
@@ -2717,9 +2724,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2717 return ret; 2724 return ret;
2718 } 2725 }
2719 2726
2720 SEQ_PUT_FIELD_RET(s, newline); 2727 SEQ_PUT_FIELD(s, newline);
2721 2728
2722 return TRACE_TYPE_HANDLED; 2729 return trace_handle_return(s);
2723} 2730}
2724 2731
2725static enum print_line_t print_bin_fmt(struct trace_iterator *iter) 2732static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
@@ -2731,9 +2738,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2731 entry = iter->ent; 2738 entry = iter->ent;
2732 2739
2733 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 2740 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2734 SEQ_PUT_FIELD_RET(s, entry->pid); 2741 SEQ_PUT_FIELD(s, entry->pid);
2735 SEQ_PUT_FIELD_RET(s, iter->cpu); 2742 SEQ_PUT_FIELD(s, iter->cpu);
2736 SEQ_PUT_FIELD_RET(s, iter->ts); 2743 SEQ_PUT_FIELD(s, iter->ts);
2744 if (trace_seq_has_overflowed(s))
2745 return TRACE_TYPE_PARTIAL_LINE;
2737 } 2746 }
2738 2747
2739 event = ftrace_find_event(entry->type); 2748 event = ftrace_find_event(entry->type);
@@ -2779,10 +2788,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2779{ 2788{
2780 enum print_line_t ret; 2789 enum print_line_t ret;
2781 2790
2782 if (iter->lost_events && 2791 if (iter->lost_events) {
2783 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2792 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2784 iter->cpu, iter->lost_events)) 2793 iter->cpu, iter->lost_events);
2785 return TRACE_TYPE_PARTIAL_LINE; 2794 if (trace_seq_has_overflowed(&iter->seq))
2795 return TRACE_TYPE_PARTIAL_LINE;
2796 }
2786 2797
2787 if (iter->trace && iter->trace->print_line) { 2798 if (iter->trace && iter->trace->print_line) {
2788 ret = iter->trace->print_line(iter); 2799 ret = iter->trace->print_line(iter);
@@ -2860,44 +2871,44 @@ static void test_ftrace_alive(struct seq_file *m)
2860{ 2871{
2861 if (!ftrace_is_dead()) 2872 if (!ftrace_is_dead())
2862 return; 2873 return;
2863 seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); 2874 seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
2864 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); 2875 "# MAY BE MISSING FUNCTION EVENTS\n");
2865} 2876}
2866 2877
2867#ifdef CONFIG_TRACER_MAX_TRACE 2878#ifdef CONFIG_TRACER_MAX_TRACE
2868static void show_snapshot_main_help(struct seq_file *m) 2879static void show_snapshot_main_help(struct seq_file *m)
2869{ 2880{
2870 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); 2881 seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
2871 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2882 "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
2872 seq_printf(m, "# Takes a snapshot of the main buffer.\n"); 2883 "# Takes a snapshot of the main buffer.\n"
2873 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); 2884 "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
2874 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2885 "# (Doesn't have to be '2' works with any number that\n"
2875 seq_printf(m, "# is not a '0' or '1')\n"); 2886 "# is not a '0' or '1')\n");
2876} 2887}
2877 2888
2878static void show_snapshot_percpu_help(struct seq_file *m) 2889static void show_snapshot_percpu_help(struct seq_file *m)
2879{ 2890{
2880 seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); 2891 seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
2881#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP 2892#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2882 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2893 seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
2883 seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); 2894 "# Takes a snapshot of the main buffer for this cpu.\n");
2884#else 2895#else
2885 seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); 2896 seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
2886 seq_printf(m, "# Must use main snapshot file to allocate.\n"); 2897 "# Must use main snapshot file to allocate.\n");
2887#endif 2898#endif
2888 seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); 2899 seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
2889 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); 2900 "# (Doesn't have to be '2' works with any number that\n"
2890 seq_printf(m, "# is not a '0' or '1')\n"); 2901 "# is not a '0' or '1')\n");
2891} 2902}
2892 2903
2893static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) 2904static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2894{ 2905{
2895 if (iter->tr->allocated_snapshot) 2906 if (iter->tr->allocated_snapshot)
2896 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); 2907 seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
2897 else 2908 else
2898 seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); 2909 seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
2899 2910
2900 seq_printf(m, "# Snapshot commands:\n"); 2911 seq_puts(m, "# Snapshot commands:\n");
2901 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) 2912 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
2902 show_snapshot_main_help(m); 2913 show_snapshot_main_help(m);
2903 else 2914 else
@@ -3251,7 +3262,7 @@ static int t_show(struct seq_file *m, void *v)
3251 if (!t) 3262 if (!t)
3252 return 0; 3263 return 0;
3253 3264
3254 seq_printf(m, "%s", t->name); 3265 seq_puts(m, t->name);
3255 if (t->next) 3266 if (t->next)
3256 seq_putc(m, ' '); 3267 seq_putc(m, ' ');
3257 else 3268 else
@@ -4314,6 +4325,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
4314 goto out; 4325 goto out;
4315 } 4326 }
4316 4327
4328 trace_seq_init(&iter->seq);
4329
4317 /* 4330 /*
4318 * We make a copy of the current tracer to avoid concurrent 4331 * We make a copy of the current tracer to avoid concurrent
4319 * changes on it while we are reading. 4332 * changes on it while we are reading.
@@ -4507,18 +4520,18 @@ waitagain:
4507 trace_access_lock(iter->cpu_file); 4520 trace_access_lock(iter->cpu_file);
4508 while (trace_find_next_entry_inc(iter) != NULL) { 4521 while (trace_find_next_entry_inc(iter) != NULL) {
4509 enum print_line_t ret; 4522 enum print_line_t ret;
4510 int len = iter->seq.len; 4523 int save_len = iter->seq.seq.len;
4511 4524
4512 ret = print_trace_line(iter); 4525 ret = print_trace_line(iter);
4513 if (ret == TRACE_TYPE_PARTIAL_LINE) { 4526 if (ret == TRACE_TYPE_PARTIAL_LINE) {
4514 /* don't print partial lines */ 4527 /* don't print partial lines */
4515 iter->seq.len = len; 4528 iter->seq.seq.len = save_len;
4516 break; 4529 break;
4517 } 4530 }
4518 if (ret != TRACE_TYPE_NO_CONSUME) 4531 if (ret != TRACE_TYPE_NO_CONSUME)
4519 trace_consume(iter); 4532 trace_consume(iter);
4520 4533
4521 if (iter->seq.len >= cnt) 4534 if (trace_seq_used(&iter->seq) >= cnt)
4522 break; 4535 break;
4523 4536
4524 /* 4537 /*
@@ -4534,7 +4547,7 @@ waitagain:
4534 4547
4535 /* Now copy what we have to the user */ 4548 /* Now copy what we have to the user */
4536 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 4549 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
4537 if (iter->seq.readpos >= iter->seq.len) 4550 if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
4538 trace_seq_init(&iter->seq); 4551 trace_seq_init(&iter->seq);
4539 4552
4540 /* 4553 /*
@@ -4568,20 +4581,33 @@ static size_t
4568tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) 4581tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
4569{ 4582{
4570 size_t count; 4583 size_t count;
4584 int save_len;
4571 int ret; 4585 int ret;
4572 4586
4573 /* Seq buffer is page-sized, exactly what we need. */ 4587 /* Seq buffer is page-sized, exactly what we need. */
4574 for (;;) { 4588 for (;;) {
4575 count = iter->seq.len; 4589 save_len = iter->seq.seq.len;
4576 ret = print_trace_line(iter); 4590 ret = print_trace_line(iter);
4577 count = iter->seq.len - count; 4591
4578 if (rem < count) { 4592 if (trace_seq_has_overflowed(&iter->seq)) {
4579 rem = 0; 4593 iter->seq.seq.len = save_len;
4580 iter->seq.len -= count;
4581 break; 4594 break;
4582 } 4595 }
4596
4597 /*
4598 * This should not be hit, because it should only
4599 * be set if the iter->seq overflowed. But check it
4600 * anyway to be safe.
4601 */
4583 if (ret == TRACE_TYPE_PARTIAL_LINE) { 4602 if (ret == TRACE_TYPE_PARTIAL_LINE) {
4584 iter->seq.len -= count; 4603 iter->seq.seq.len = save_len;
4604 break;
4605 }
4606
4607 count = trace_seq_used(&iter->seq) - save_len;
4608 if (rem < count) {
4609 rem = 0;
4610 iter->seq.seq.len = save_len;
4585 break; 4611 break;
4586 } 4612 }
4587 4613
@@ -4662,13 +4688,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
4662 /* Copy the data into the page, so we can start over. */ 4688 /* Copy the data into the page, so we can start over. */
4663 ret = trace_seq_to_buffer(&iter->seq, 4689 ret = trace_seq_to_buffer(&iter->seq,
4664 page_address(spd.pages[i]), 4690 page_address(spd.pages[i]),
4665 iter->seq.len); 4691 trace_seq_used(&iter->seq));
4666 if (ret < 0) { 4692 if (ret < 0) {
4667 __free_page(spd.pages[i]); 4693 __free_page(spd.pages[i]);
4668 break; 4694 break;
4669 } 4695 }
4670 spd.partial[i].offset = 0; 4696 spd.partial[i].offset = 0;
4671 spd.partial[i].len = iter->seq.len; 4697 spd.partial[i].len = trace_seq_used(&iter->seq);
4672 4698
4673 trace_seq_init(&iter->seq); 4699 trace_seq_init(&iter->seq);
4674 } 4700 }
@@ -5668,7 +5694,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5668 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); 5694 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
5669 trace_seq_printf(s, "read events: %ld\n", cnt); 5695 trace_seq_printf(s, "read events: %ld\n", cnt);
5670 5696
5671 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 5697 count = simple_read_from_buffer(ubuf, count, ppos,
5698 s->buffer, trace_seq_used(s));
5672 5699
5673 kfree(s); 5700 kfree(s);
5674 5701
@@ -5749,10 +5776,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
5749 5776
5750 seq_printf(m, "%ps:", (void *)ip); 5777 seq_printf(m, "%ps:", (void *)ip);
5751 5778
5752 seq_printf(m, "snapshot"); 5779 seq_puts(m, "snapshot");
5753 5780
5754 if (count == -1) 5781 if (count == -1)
5755 seq_printf(m, ":unlimited\n"); 5782 seq_puts(m, ":unlimited\n");
5756 else 5783 else
5757 seq_printf(m, ":count=%ld\n", count); 5784 seq_printf(m, ":count=%ld\n", count);
5758 5785
@@ -6417,7 +6444,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m
6417 int ret; 6444 int ret;
6418 6445
6419 /* Paranoid: Make sure the parent is the "instances" directory */ 6446 /* Paranoid: Make sure the parent is the "instances" directory */
6420 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); 6447 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6421 if (WARN_ON_ONCE(parent != trace_instance_dir)) 6448 if (WARN_ON_ONCE(parent != trace_instance_dir))
6422 return -ENOENT; 6449 return -ENOENT;
6423 6450
@@ -6444,7 +6471,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry)
6444 int ret; 6471 int ret;
6445 6472
6446 /* Paranoid: Make sure the parent is the "instances" directory */ 6473 /* Paranoid: Make sure the parent is the "instances" directory */
6447 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); 6474 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
6448 if (WARN_ON_ONCE(parent != trace_instance_dir)) 6475 if (WARN_ON_ONCE(parent != trace_instance_dir))
6449 return -ENOENT; 6476 return -ENOENT;
6450 6477
@@ -6631,11 +6658,19 @@ void
6631trace_printk_seq(struct trace_seq *s) 6658trace_printk_seq(struct trace_seq *s)
6632{ 6659{
6633 /* Probably should print a warning here. */ 6660 /* Probably should print a warning here. */
6634 if (s->len >= TRACE_MAX_PRINT) 6661 if (s->seq.len >= TRACE_MAX_PRINT)
6635 s->len = TRACE_MAX_PRINT; 6662 s->seq.len = TRACE_MAX_PRINT;
6663
6664 /*
6665 * More paranoid code. Although the buffer size is set to
6666 * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
6667 * an extra layer of protection.
6668 */
6669 if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
6670 s->seq.len = s->seq.size - 1;
6636 6671
6637 /* should be zero ended, but we are paranoid. */ 6672 /* should be zero ended, but we are paranoid. */
6638 s->buffer[s->len] = 0; 6673 s->buffer[s->seq.len] = 0;
6639 6674
6640 printk(KERN_TRACE "%s", s->buffer); 6675 printk(KERN_TRACE "%s", s->buffer);
6641 6676
@@ -6874,6 +6909,18 @@ out:
6874 return ret; 6909 return ret;
6875} 6910}
6876 6911
6912void __init trace_init(void)
6913{
6914 if (tracepoint_printk) {
6915 tracepoint_print_iter =
6916 kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
6917 if (WARN_ON(!tracepoint_print_iter))
6918 tracepoint_printk = 0;
6919 }
6920 tracer_alloc_buffers();
6921 trace_event_init();
6922}
6923
6877__init static int clear_boot_tracer(void) 6924__init static int clear_boot_tracer(void)
6878{ 6925{
6879 /* 6926 /*
@@ -6893,6 +6940,5 @@ __init static int clear_boot_tracer(void)
6893 return 0; 6940 return 0;
6894} 6941}
6895 6942
6896early_initcall(tracer_alloc_buffers);
6897fs_initcall(tracer_init_debugfs); 6943fs_initcall(tracer_init_debugfs);
6898late_initcall(clear_boot_tracer); 6944late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 385391fb1d3b..8de48bac1ce2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -14,6 +14,7 @@
14#include <linux/trace_seq.h> 14#include <linux/trace_seq.h>
15#include <linux/ftrace_event.h> 15#include <linux/ftrace_event.h>
16#include <linux/compiler.h> 16#include <linux/compiler.h>
17#include <linux/trace_seq.h>
17 18
18#ifdef CONFIG_FTRACE_SYSCALLS 19#ifdef CONFIG_FTRACE_SYSCALLS
19#include <asm/unistd.h> /* For NR_SYSCALLS */ 20#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
569 570
570void tracing_iter_reset(struct trace_iterator *iter, int cpu); 571void tracing_iter_reset(struct trace_iterator *iter, int cpu);
571 572
572void tracing_sched_switch_trace(struct trace_array *tr,
573 struct task_struct *prev,
574 struct task_struct *next,
575 unsigned long flags, int pc);
576
577void tracing_sched_wakeup_trace(struct trace_array *tr,
578 struct task_struct *wakee,
579 struct task_struct *cur,
580 unsigned long flags, int pc);
581void trace_function(struct trace_array *tr, 573void trace_function(struct trace_array *tr,
582 unsigned long ip, 574 unsigned long ip,
583 unsigned long parent_ip, 575 unsigned long parent_ip,
@@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr);
597 589
598void tracing_start_cmdline_record(void); 590void tracing_start_cmdline_record(void);
599void tracing_stop_cmdline_record(void); 591void tracing_stop_cmdline_record(void);
600void tracing_sched_switch_assign_trace(struct trace_array *tr);
601void tracing_stop_sched_switch_record(void);
602void tracing_start_sched_switch_record(void);
603int register_tracer(struct tracer *type); 592int register_tracer(struct tracer *type);
604int is_tracing_stopped(void); 593int is_tracing_stopped(void);
605 594
@@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
719 708
720extern unsigned long trace_flags; 709extern unsigned long trace_flags;
721 710
711extern char trace_find_mark(unsigned long long duration);
712
722/* Standard output formatting function used for function return traces */ 713/* Standard output formatting function used for function return traces */
723#ifdef CONFIG_FUNCTION_GRAPH_TRACER 714#ifdef CONFIG_FUNCTION_GRAPH_TRACER
724 715
@@ -737,7 +728,7 @@ extern unsigned long trace_flags;
737extern enum print_line_t 728extern enum print_line_t
738print_graph_function_flags(struct trace_iterator *iter, u32 flags); 729print_graph_function_flags(struct trace_iterator *iter, u32 flags);
739extern void print_graph_headers_flags(struct seq_file *s, u32 flags); 730extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
740extern enum print_line_t 731extern void
741trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 732trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
742extern void graph_trace_open(struct trace_iterator *iter); 733extern void graph_trace_open(struct trace_iterator *iter);
743extern void graph_trace_close(struct trace_iterator *iter); 734extern void graph_trace_close(struct trace_iterator *iter);
@@ -1310,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call,
1310#define perf_ftrace_event_register NULL 1301#define perf_ftrace_event_register NULL
1311#endif 1302#endif
1312 1303
1304#ifdef CONFIG_FTRACE_SYSCALLS
1305void init_ftrace_syscalls(void);
1306#else
1307static inline void init_ftrace_syscalls(void) { }
1308#endif
1309
1310#ifdef CONFIG_EVENT_TRACING
1311void trace_event_init(void);
1312#else
1313static inline void __init trace_event_init(void) { }
1314#endif
1315
1316extern struct trace_iterator *tracepoint_print_iter;
1317
1313#endif /* _LINUX_KERNEL_TRACE_H */ 1318#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 697fb9bac8f0..7d6e2afde669 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
151 151
152 trace_assign_type(field, iter->ent); 152 trace_assign_type(field, iter->ent);
153 153
154 if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", 154 trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
155 field->correct ? " ok " : " MISS ", 155 field->correct ? " ok " : " MISS ",
156 field->func, 156 field->func,
157 field->file, 157 field->file,
158 field->line)) 158 field->line);
159 return TRACE_TYPE_PARTIAL_LINE; 159
160 160 return trace_handle_return(&iter->seq);
161 return TRACE_TYPE_HANDLED;
162} 161}
163 162
164static void branch_print_header(struct seq_file *s) 163static void branch_print_header(struct seq_file *s)
165{ 164{
166 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" 165 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
167 " FUNC:FILE:LINE\n"); 166 " FUNC:FILE:LINE\n"
168 seq_puts(s, "# | | | | | " 167 "# | | | | | "
169 " |\n"); 168 " |\n");
170} 169}
171 170
172static struct trace_event_functions trace_branch_funcs = { 171static struct trace_event_functions trace_branch_funcs = {
@@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[];
233 232
234static int annotated_branch_stat_headers(struct seq_file *m) 233static int annotated_branch_stat_headers(struct seq_file *m)
235{ 234{
236 seq_printf(m, " correct incorrect %% "); 235 seq_puts(m, " correct incorrect % "
237 seq_printf(m, " Function " 236 " Function "
238 " File Line\n" 237 " File Line\n"
239 " ------- --------- - " 238 " ------- --------- - "
240 " -------- " 239 " -------- "
241 " ---- ----\n"); 240 " ---- ----\n");
242 return 0; 241 return 0;
243} 242}
244 243
@@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
274 273
275 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); 274 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
276 if (percent < 0) 275 if (percent < 0)
277 seq_printf(m, " X "); 276 seq_puts(m, " X ");
278 else 277 else
279 seq_printf(m, "%3ld ", percent); 278 seq_printf(m, "%3ld ", percent);
280 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); 279 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
@@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[];
362 361
363static int all_branch_stat_headers(struct seq_file *m) 362static int all_branch_stat_headers(struct seq_file *m)
364{ 363{
365 seq_printf(m, " miss hit %% "); 364 seq_puts(m, " miss hit % "
366 seq_printf(m, " Function " 365 " Function "
367 " File Line\n" 366 " File Line\n"
368 " ------- --------- - " 367 " ------- --------- - "
369 " -------- " 368 " -------- "
370 " ---- ----\n"); 369 " ---- ----\n");
371 return 0; 370 return 0;
372} 371}
373 372
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0cc51edde3a8..b03a0ea77b99 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
212} 212}
213EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); 213EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
214 214
215static DEFINE_SPINLOCK(tracepoint_iter_lock);
216
217static void output_printk(struct ftrace_event_buffer *fbuffer)
218{
219 struct ftrace_event_call *event_call;
220 struct trace_event *event;
221 unsigned long flags;
222 struct trace_iterator *iter = tracepoint_print_iter;
223
224 if (!iter)
225 return;
226
227 event_call = fbuffer->ftrace_file->event_call;
228 if (!event_call || !event_call->event.funcs ||
229 !event_call->event.funcs->trace)
230 return;
231
232 event = &fbuffer->ftrace_file->event_call->event;
233
234 spin_lock_irqsave(&tracepoint_iter_lock, flags);
235 trace_seq_init(&iter->seq);
236 iter->ent = fbuffer->entry;
237 event_call->event.funcs->trace(iter, 0, event);
238 trace_seq_putc(&iter->seq, 0);
239 printk("%s", iter->seq.buffer);
240
241 spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
242}
243
215void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) 244void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
216{ 245{
246 if (tracepoint_printk)
247 output_printk(fbuffer);
248
217 event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, 249 event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
218 fbuffer->event, fbuffer->entry, 250 fbuffer->event, fbuffer->entry,
219 fbuffer->flags, fbuffer->pc); 251 fbuffer->flags, fbuffer->pc);
@@ -461,7 +493,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
461 493
462 if (dir) { 494 if (dir) {
463 spin_lock(&dir->d_lock); /* probably unneeded */ 495 spin_lock(&dir->d_lock); /* probably unneeded */
464 list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { 496 list_for_each_entry(child, &dir->d_subdirs, d_child) {
465 if (child->d_inode) /* probably unneeded */ 497 if (child->d_inode) /* probably unneeded */
466 child->d_inode->i_private = NULL; 498 child->d_inode->i_private = NULL;
467 } 499 }
@@ -918,7 +950,7 @@ static int f_show(struct seq_file *m, void *v)
918 case FORMAT_HEADER: 950 case FORMAT_HEADER:
919 seq_printf(m, "name: %s\n", ftrace_event_name(call)); 951 seq_printf(m, "name: %s\n", ftrace_event_name(call));
920 seq_printf(m, "ID: %d\n", call->event.type); 952 seq_printf(m, "ID: %d\n", call->event.type);
921 seq_printf(m, "format:\n"); 953 seq_puts(m, "format:\n");
922 return 0; 954 return 0;
923 955
924 case FORMAT_FIELD_SEPERATOR: 956 case FORMAT_FIELD_SEPERATOR:
@@ -1044,7 +1076,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1044 mutex_unlock(&event_mutex); 1076 mutex_unlock(&event_mutex);
1045 1077
1046 if (file) 1078 if (file)
1047 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1079 r = simple_read_from_buffer(ubuf, cnt, ppos,
1080 s->buffer, trace_seq_used(s));
1048 1081
1049 kfree(s); 1082 kfree(s);
1050 1083
@@ -1210,7 +1243,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
1210 trace_seq_init(s); 1243 trace_seq_init(s);
1211 1244
1212 print_subsystem_event_filter(system, s); 1245 print_subsystem_event_filter(system, s);
1213 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1246 r = simple_read_from_buffer(ubuf, cnt, ppos,
1247 s->buffer, trace_seq_used(s));
1214 1248
1215 kfree(s); 1249 kfree(s);
1216 1250
@@ -1265,7 +1299,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
1265 trace_seq_init(s); 1299 trace_seq_init(s);
1266 1300
1267 func(s); 1301 func(s);
1268 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1302 r = simple_read_from_buffer(ubuf, cnt, ppos,
1303 s->buffer, trace_seq_used(s));
1269 1304
1270 kfree(s); 1305 kfree(s);
1271 1306
@@ -1988,7 +2023,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
1988 ftrace_event_name(data->file->event_call)); 2023 ftrace_event_name(data->file->event_call));
1989 2024
1990 if (data->count == -1) 2025 if (data->count == -1)
1991 seq_printf(m, ":unlimited\n"); 2026 seq_puts(m, ":unlimited\n");
1992 else 2027 else
1993 seq_printf(m, ":count=%ld\n", data->count); 2028 seq_printf(m, ":count=%ld\n", data->count);
1994 2029
@@ -2394,12 +2429,39 @@ static __init int event_trace_memsetup(void)
2394 return 0; 2429 return 0;
2395} 2430}
2396 2431
2432static __init void
2433early_enable_events(struct trace_array *tr, bool disable_first)
2434{
2435 char *buf = bootup_event_buf;
2436 char *token;
2437 int ret;
2438
2439 while (true) {
2440 token = strsep(&buf, ",");
2441
2442 if (!token)
2443 break;
2444 if (!*token)
2445 continue;
2446
2447 /* Restarting syscalls requires that we stop them first */
2448 if (disable_first)
2449 ftrace_set_clr_event(tr, token, 0);
2450
2451 ret = ftrace_set_clr_event(tr, token, 1);
2452 if (ret)
2453 pr_warn("Failed to enable trace event: %s\n", token);
2454
2455 /* Put back the comma to allow this to be called again */
2456 if (buf)
2457 *(buf - 1) = ',';
2458 }
2459}
2460
2397static __init int event_trace_enable(void) 2461static __init int event_trace_enable(void)
2398{ 2462{
2399 struct trace_array *tr = top_trace_array(); 2463 struct trace_array *tr = top_trace_array();
2400 struct ftrace_event_call **iter, *call; 2464 struct ftrace_event_call **iter, *call;
2401 char *buf = bootup_event_buf;
2402 char *token;
2403 int ret; 2465 int ret;
2404 2466
2405 if (!tr) 2467 if (!tr)
@@ -2421,18 +2483,7 @@ static __init int event_trace_enable(void)
2421 */ 2483 */
2422 __trace_early_add_events(tr); 2484 __trace_early_add_events(tr);
2423 2485
2424 while (true) { 2486 early_enable_events(tr, false);
2425 token = strsep(&buf, ",");
2426
2427 if (!token)
2428 break;
2429 if (!*token)
2430 continue;
2431
2432 ret = ftrace_set_clr_event(tr, token, 1);
2433 if (ret)
2434 pr_warn("Failed to enable trace event: %s\n", token);
2435 }
2436 2487
2437 trace_printk_start_comm(); 2488 trace_printk_start_comm();
2438 2489
@@ -2443,6 +2494,31 @@ static __init int event_trace_enable(void)
2443 return 0; 2494 return 0;
2444} 2495}
2445 2496
2497/*
2498 * event_trace_enable() is called from trace_event_init() first to
2499 * initialize events and perhaps start any events that are on the
2500 * command line. Unfortunately, there are some events that will not
2501 * start this early, like the system call tracepoints that need
2502 * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable()
2503 * is called before pid 1 starts, and this flag is never set, making
2504 * the syscall tracepoint never get reached, but the event is enabled
2505 * regardless (and not doing anything).
2506 */
2507static __init int event_trace_enable_again(void)
2508{
2509 struct trace_array *tr;
2510
2511 tr = top_trace_array();
2512 if (!tr)
2513 return -ENODEV;
2514
2515 early_enable_events(tr, true);
2516
2517 return 0;
2518}
2519
2520early_initcall(event_trace_enable_again);
2521
2446static __init int event_trace_init(void) 2522static __init int event_trace_init(void)
2447{ 2523{
2448 struct trace_array *tr; 2524 struct trace_array *tr;
@@ -2477,8 +2553,14 @@ static __init int event_trace_init(void)
2477#endif 2553#endif
2478 return 0; 2554 return 0;
2479} 2555}
2480early_initcall(event_trace_memsetup); 2556
2481core_initcall(event_trace_enable); 2557void __init trace_event_init(void)
2558{
2559 event_trace_memsetup();
2560 init_ftrace_syscalls();
2561 event_trace_enable();
2562}
2563
2482fs_initcall(event_trace_init); 2564fs_initcall(event_trace_init);
2483 2565
2484#ifdef CONFIG_FTRACE_STARTUP_TEST 2566#ifdef CONFIG_FTRACE_STARTUP_TEST
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7a8c1528e141..ced69da0ff55 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -45,6 +45,7 @@ enum filter_op_ids
45 OP_GT, 45 OP_GT,
46 OP_GE, 46 OP_GE,
47 OP_BAND, 47 OP_BAND,
48 OP_NOT,
48 OP_NONE, 49 OP_NONE,
49 OP_OPEN_PAREN, 50 OP_OPEN_PAREN,
50}; 51};
@@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = {
67 { OP_GT, ">", 5 }, 68 { OP_GT, ">", 5 },
68 { OP_GE, ">=", 5 }, 69 { OP_GE, ">=", 5 },
69 { OP_BAND, "&", 6 }, 70 { OP_BAND, "&", 6 },
71 { OP_NOT, "!", 6 },
70 { OP_NONE, "OP_NONE", 0 }, 72 { OP_NONE, "OP_NONE", 0 },
71 { OP_OPEN_PAREN, "(", 0 }, 73 { OP_OPEN_PAREN, "(", 0 },
72}; 74};
@@ -85,6 +87,7 @@ enum {
85 FILT_ERR_MISSING_FIELD, 87 FILT_ERR_MISSING_FIELD,
86 FILT_ERR_INVALID_FILTER, 88 FILT_ERR_INVALID_FILTER,
87 FILT_ERR_IP_FIELD_ONLY, 89 FILT_ERR_IP_FIELD_ONLY,
90 FILT_ERR_ILLEGAL_NOT_OP,
88}; 91};
89 92
90static char *err_text[] = { 93static char *err_text[] = {
@@ -101,6 +104,7 @@ static char *err_text[] = {
101 "Missing field name and/or value", 104 "Missing field name and/or value",
102 "Meaningless filter expression", 105 "Meaningless filter expression",
103 "Only 'ip' field is supported for function trace", 106 "Only 'ip' field is supported for function trace",
107 "Illegal use of '!'",
104}; 108};
105 109
106struct opstack_op { 110struct opstack_op {
@@ -139,6 +143,7 @@ struct pred_stack {
139 int index; 143 int index;
140}; 144};
141 145
146/* If not of not match is equal to not of not, then it is a match */
142#define DEFINE_COMPARISON_PRED(type) \ 147#define DEFINE_COMPARISON_PRED(type) \
143static int filter_pred_##type(struct filter_pred *pred, void *event) \ 148static int filter_pred_##type(struct filter_pred *pred, void *event) \
144{ \ 149{ \
@@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
166 break; \ 171 break; \
167 } \ 172 } \
168 \ 173 \
169 return match; \ 174 return !!match == !pred->not; \
170} 175}
171 176
172#define DEFINE_EQUALITY_PRED(size) \ 177#define DEFINE_EQUALITY_PRED(size) \
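
The new return statement, !!match == !pred->not (and the matching !!match == !op->not in process_ops() in the next hunk), is what the "not of not" comments are describing: normalize the match result to 0/1, normalize the inverted not flag to 0/1, and compare. A tiny standalone check of that identity in plain C, with made-up values:

    #include <assert.h>

    static int apply_not(int match, int not)
    {
            /* not == 0: the result is simply whether the predicate matched;
             * not == 1: the result is the inverted match */
            return !!match == !not;
    }

    int main(void)
    {
            assert(apply_not(5, 0) == 1);   /* match,    no NOT -> match    */
            assert(apply_not(0, 0) == 0);   /* no match, no NOT -> no match */
            assert(apply_not(5, 1) == 0);   /* match,    NOT    -> no match */
            assert(apply_not(0, 1) == 1);   /* no match, NOT    -> match    */
            return 0;
    }
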
@@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds,
484 if (!WARN_ON_ONCE(!pred->fn)) 489 if (!WARN_ON_ONCE(!pred->fn))
485 match = pred->fn(pred, rec); 490 match = pred->fn(pred, rec);
486 if (!!match == type) 491 if (!!match == type)
487 return match; 492 break;
488 } 493 }
489 return match; 494 /* If not of not match is equal to not of not, then it is a match */
495 return !!match == !op->not;
490} 496}
491 497
492struct filter_match_preds_data { 498struct filter_match_preds_data {
@@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter,
735 * then this op can be folded. 741 * then this op can be folded.
736 */ 742 */
737 if (left->index & FILTER_PRED_FOLD && 743 if (left->index & FILTER_PRED_FOLD &&
738 (left->op == dest->op || 744 ((left->op == dest->op && !left->not) ||
739 left->left == FILTER_PRED_INVALID) && 745 left->left == FILTER_PRED_INVALID) &&
740 right->index & FILTER_PRED_FOLD && 746 right->index & FILTER_PRED_FOLD &&
741 (right->op == dest->op || 747 ((right->op == dest->op && !right->not) ||
742 right->left == FILTER_PRED_INVALID)) 748 right->left == FILTER_PRED_INVALID))
743 dest->index |= FILTER_PRED_FOLD; 749 dest->index |= FILTER_PRED_FOLD;
744 750
@@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps,
1028 } 1034 }
1029 1035
1030 if (pred->op == OP_NE) 1036 if (pred->op == OP_NE)
1031 pred->not = 1; 1037 pred->not ^= 1;
1032 1038
1033 pred->fn = fn; 1039 pred->fn = fn;
1034 return 0; 1040 return 0;
@@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call,
1590 continue; 1596 continue;
1591 } 1597 }
1592 1598
1599 if (elt->op == OP_NOT) {
1600 if (!n_preds || operand1 || operand2) {
1601 parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
1602 err = -EINVAL;
1603 goto fail;
1604 }
1605 if (!dry_run)
1606 filter->preds[n_preds - 1].not ^= 1;
1607 continue;
1608 }
1609
1593 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { 1610 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1594 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1611 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1595 err = -ENOSPC; 1612 err = -ENOSPC;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4747b476a030..8712df9decb4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m,
373{ 373{
374 long count = (long)data; 374 long count = (long)data;
375 375
376 seq_printf(m, "%s", name); 376 seq_puts(m, name);
377 377
378 if (count == -1) 378 if (count == -1)
379 seq_puts(m, ":unlimited"); 379 seq_puts(m, ":unlimited");
@@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m,
383 if (filter_str) 383 if (filter_str)
384 seq_printf(m, " if %s\n", filter_str); 384 seq_printf(m, " if %s\n", filter_str);
385 else 385 else
386 seq_puts(m, "\n"); 386 seq_putc(m, '\n');
387 387
388 return 0; 388 return 0;
389} 389}
@@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
1105 if (data->filter_str) 1105 if (data->filter_str)
1106 seq_printf(m, " if %s\n", data->filter_str); 1106 seq_printf(m, " if %s\n", data->filter_str);
1107 else 1107 else
1108 seq_puts(m, "\n"); 1108 seq_putc(m, '\n');
1109 1109
1110 return 0; 1110 return 0;
1111} 1111}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 57f0ec962d2c..fcd41a166405 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data =
261}; 261};
262 262
263#ifdef CONFIG_DYNAMIC_FTRACE 263#ifdef CONFIG_DYNAMIC_FTRACE
264static int update_count(void **data) 264static void update_traceon_count(void **data, bool on)
265{ 265{
266 unsigned long *count = (long *)data; 266 long *count = (long *)data;
267 long old_count = *count;
267 268
268 if (!*count) 269 /*
269 return 0; 270 * Tracing gets disabled (or enabled) once per count.
271 * This function can be called at the same time on multiple CPUs.
272 * It is fine if both disable (or enable) tracing, as disabling
273 * (or enabling) the second time doesn't do anything as the
274 * state of the tracer is already disabled (or enabled).
275 * What needs to be synchronized in this case is that the count
276 * only gets decremented once, even if the tracer is disabled
277 * (or enabled) twice, as the second one is really a nop.
278 *
279 * The memory barriers guarantee that we only decrement the
280 * counter once. First the count is read to a local variable
281 * and a read barrier is used to make sure that it is loaded
282 * before checking if the tracer is in the state we want.
283 * If the tracer is not in the state we want, then the count
284 * is guaranteed to be the old count.
285 *
286 * Next the tracer is set to the state we want (disabled or enabled)
287 * then a write memory barrier is used to make sure that
288 * the new state is visible before the counter is updated to
289 * one less than the old count. This guarantees that another CPU
290 * executing this code will see the new state before seeing
291 * the new counter value, and would not do anything if the new
292 * counter is seen.
293 *
294 * Note, there is no synchronization between this and a user
295 * setting the tracing_on file. But we currently don't care
296 * about that.
297 */
298 if (!old_count)
299 return;
270 300
271 if (*count != -1) 301 /* Make sure we see count before checking tracing state */
272 (*count)--; 302 smp_rmb();
273 303
274 return 1; 304 if (on == !!tracing_is_on())
305 return;
306
307 if (on)
308 tracing_on();
309 else
310 tracing_off();
311
312 /* unlimited? */
313 if (old_count == -1)
314 return;
315
316 /* Make sure tracing state is visible before updating count */
317 smp_wmb();
318
319 *count = old_count - 1;
275} 320}
276 321
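
These counted handlers back the traceon/traceoff function triggers written through set_ftrace_filter: the count parsed from the trigger is how many times it may fire before going inert, with -1 meaning unlimited (ftrace_probe_print() further down prints exactly that as ":unlimited" or ":count=N"). Typical usage, for illustration with an arbitrary traced function:

    echo 'schedule:traceoff:5' > /sys/kernel/debug/tracing/set_ftrace_filter
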
277static void 322static void
278ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) 323ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
279{ 324{
280 if (tracing_is_on()) 325 update_traceon_count(data, 1);
281 return;
282
283 if (update_count(data))
284 tracing_on();
285} 326}
286 327
287static void 328static void
288ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) 329ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
289{ 330{
290 if (!tracing_is_on()) 331 update_traceon_count(data, 0);
291 return;
292
293 if (update_count(data))
294 tracing_off();
295} 332}
296 333
297static void 334static void
@@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
330static void 367static void
331ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) 368ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
332{ 369{
333 if (!tracing_is_on()) 370 long *count = (long *)data;
334 return; 371 long old_count;
372 long new_count;
335 373
336 if (update_count(data)) 374 /*
337 trace_dump_stack(STACK_SKIP); 375 * Stack traces should only execute the number of times the
376 * user specified in the counter.
377 */
378 do {
379
380 if (!tracing_is_on())
381 return;
382
383 old_count = *count;
384
385 if (!old_count)
386 return;
387
388 /* unlimited? */
389 if (old_count == -1) {
390 trace_dump_stack(STACK_SKIP);
391 return;
392 }
393
394 new_count = old_count - 1;
395 new_count = cmpxchg(count, old_count, new_count);
396 if (new_count == old_count)
397 trace_dump_stack(STACK_SKIP);
398
399 } while (new_count != old_count);
400}
401
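
The cmpxchg() loop above makes the stacktrace trigger safe when several CPUs hit the traced function at once: each successful exchange claims exactly one of the remaining dumps, and a CPU that loses the race retries against the updated count. The same bounded-decrement pattern in portable C11 atomics, with hypothetical names:

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Returns true if the caller claimed one of the remaining slots.
     * A count of -1 means unlimited; 0 means exhausted. */
    static bool claim_slot(_Atomic long *count)
    {
            long old = atomic_load(count);

            while (old != 0) {
                    if (old == -1)
                            return true;    /* unlimited: nothing to decrement */
                    if (atomic_compare_exchange_weak(count, &old, old - 1))
                            return true;    /* we took a slot: do the work */
                    /* the failed exchange reloaded 'old'; try again */
            }
            return false;                   /* no slots left */
    }
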
402static int update_count(void **data)
403{
404 unsigned long *count = (long *)data;
405
406 if (!*count)
407 return 0;
408
409 if (*count != -1)
410 (*count)--;
411
412 return 1;
338} 413}
339 414
340static void 415static void
@@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m,
361 seq_printf(m, "%ps:%s", (void *)ip, name); 436 seq_printf(m, "%ps:%s", (void *)ip, name);
362 437
363 if (count == -1) 438 if (count == -1)
364 seq_printf(m, ":unlimited\n"); 439 seq_puts(m, ":unlimited\n");
365 else 440 else
366 seq_printf(m, ":count=%ld\n", count); 441 seq_printf(m, ":count=%ld\n", count);
367 442
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f0a0c982cde3..ba476009e5de 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -107,7 +107,7 @@ enum {
107 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, 107 FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
108}; 108};
109 109
110static enum print_line_t 110static void
111print_graph_duration(unsigned long long duration, struct trace_seq *s, 111print_graph_duration(unsigned long long duration, struct trace_seq *s,
112 u32 flags); 112 u32 flags);
113 113
@@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr)
483 483
484static int max_bytes_for_cpu; 484static int max_bytes_for_cpu;
485 485
486static enum print_line_t 486static void print_graph_cpu(struct trace_seq *s, int cpu)
487print_graph_cpu(struct trace_seq *s, int cpu)
488{ 487{
489 int ret;
490
491 /* 488 /*
492 * Start with a space character - to make it stand out 489 * Start with a space character - to make it stand out
493 * to the right a bit when trace output is pasted into 490 * to the right a bit when trace output is pasted into
494 * email: 491 * email:
495 */ 492 */
496 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); 493 trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
497 if (!ret)
498 return TRACE_TYPE_PARTIAL_LINE;
499
500 return TRACE_TYPE_HANDLED;
501} 494}
502 495
503#define TRACE_GRAPH_PROCINFO_LENGTH 14 496#define TRACE_GRAPH_PROCINFO_LENGTH 14
504 497
505static enum print_line_t 498static void print_graph_proc(struct trace_seq *s, pid_t pid)
506print_graph_proc(struct trace_seq *s, pid_t pid)
507{ 499{
508 char comm[TASK_COMM_LEN]; 500 char comm[TASK_COMM_LEN];
509 /* sign + log10(MAX_INT) + '\0' */ 501 /* sign + log10(MAX_INT) + '\0' */
510 char pid_str[11]; 502 char pid_str[11];
511 int spaces = 0; 503 int spaces = 0;
512 int ret;
513 int len; 504 int len;
514 int i; 505 int i;
515 506
@@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
524 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; 515 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
525 516
526 /* First spaces to align center */ 517 /* First spaces to align center */
527 for (i = 0; i < spaces / 2; i++) { 518 for (i = 0; i < spaces / 2; i++)
528 ret = trace_seq_putc(s, ' '); 519 trace_seq_putc(s, ' ');
529 if (!ret)
530 return TRACE_TYPE_PARTIAL_LINE;
531 }
532 520
533 ret = trace_seq_printf(s, "%s-%s", comm, pid_str); 521 trace_seq_printf(s, "%s-%s", comm, pid_str);
534 if (!ret)
535 return TRACE_TYPE_PARTIAL_LINE;
536 522
537 /* Last spaces to align center */ 523 /* Last spaces to align center */
538 for (i = 0; i < spaces - (spaces / 2); i++) { 524 for (i = 0; i < spaces - (spaces / 2); i++)
539 ret = trace_seq_putc(s, ' '); 525 trace_seq_putc(s, ' ');
540 if (!ret)
541 return TRACE_TYPE_PARTIAL_LINE;
542 }
543 return TRACE_TYPE_HANDLED;
544} 526}
545 527
546 528
547static enum print_line_t 529static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
548print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
549{ 530{
550 if (!trace_seq_putc(s, ' ')) 531 trace_seq_putc(s, ' ');
551 return 0; 532 trace_print_lat_fmt(s, entry);
552
553 return trace_print_lat_fmt(s, entry);
554} 533}
555 534
556/* If the pid changed since the last trace, output this event */ 535/* If the pid changed since the last trace, output this event */
557static enum print_line_t 536static void
558verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 537verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
559{ 538{
560 pid_t prev_pid; 539 pid_t prev_pid;
561 pid_t *last_pid; 540 pid_t *last_pid;
562 int ret;
563 541
564 if (!data) 542 if (!data)
565 return TRACE_TYPE_HANDLED; 543 return;
566 544
567 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 545 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
568 546
569 if (*last_pid == pid) 547 if (*last_pid == pid)
570 return TRACE_TYPE_HANDLED; 548 return;
571 549
572 prev_pid = *last_pid; 550 prev_pid = *last_pid;
573 *last_pid = pid; 551 *last_pid = pid;
574 552
575 if (prev_pid == -1) 553 if (prev_pid == -1)
576 return TRACE_TYPE_HANDLED; 554 return;
577/* 555/*
578 * Context-switch trace line: 556 * Context-switch trace line:
579 557
@@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
582 ------------------------------------------ 560 ------------------------------------------
583 561
584 */ 562 */
585 ret = trace_seq_puts(s, 563 trace_seq_puts(s, " ------------------------------------------\n");
586 " ------------------------------------------\n"); 564 print_graph_cpu(s, cpu);
587 if (!ret) 565 print_graph_proc(s, prev_pid);
588 return TRACE_TYPE_PARTIAL_LINE; 566 trace_seq_puts(s, " => ");
589 567 print_graph_proc(s, pid);
590 ret = print_graph_cpu(s, cpu); 568 trace_seq_puts(s, "\n ------------------------------------------\n\n");
591 if (ret == TRACE_TYPE_PARTIAL_LINE)
592 return TRACE_TYPE_PARTIAL_LINE;
593
594 ret = print_graph_proc(s, prev_pid);
595 if (ret == TRACE_TYPE_PARTIAL_LINE)
596 return TRACE_TYPE_PARTIAL_LINE;
597
598 ret = trace_seq_puts(s, " => ");
599 if (!ret)
600 return TRACE_TYPE_PARTIAL_LINE;
601
602 ret = print_graph_proc(s, pid);
603 if (ret == TRACE_TYPE_PARTIAL_LINE)
604 return TRACE_TYPE_PARTIAL_LINE;
605
606 ret = trace_seq_puts(s,
607 "\n ------------------------------------------\n\n");
608 if (!ret)
609 return TRACE_TYPE_PARTIAL_LINE;
610
611 return TRACE_TYPE_HANDLED;
612} 569}
613 570
614static struct ftrace_graph_ret_entry * 571static struct ftrace_graph_ret_entry *
@@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter,
682 return next; 639 return next;
683} 640}
684 641
685static int print_graph_abs_time(u64 t, struct trace_seq *s) 642static void print_graph_abs_time(u64 t, struct trace_seq *s)
686{ 643{
687 unsigned long usecs_rem; 644 unsigned long usecs_rem;
688 645
689 usecs_rem = do_div(t, NSEC_PER_SEC); 646 usecs_rem = do_div(t, NSEC_PER_SEC);
690 usecs_rem /= 1000; 647 usecs_rem /= 1000;
691 648
692 return trace_seq_printf(s, "%5lu.%06lu | ", 649 trace_seq_printf(s, "%5lu.%06lu | ",
693 (unsigned long)t, usecs_rem); 650 (unsigned long)t, usecs_rem);
694} 651}
695 652
696static enum print_line_t 653static void
697print_graph_irq(struct trace_iterator *iter, unsigned long addr, 654print_graph_irq(struct trace_iterator *iter, unsigned long addr,
698 enum trace_type type, int cpu, pid_t pid, u32 flags) 655 enum trace_type type, int cpu, pid_t pid, u32 flags)
699{ 656{
700 int ret;
701 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
658 struct trace_entry *ent = iter->ent;
702 659
703 if (addr < (unsigned long)__irqentry_text_start || 660 if (addr < (unsigned long)__irqentry_text_start ||
704 addr >= (unsigned long)__irqentry_text_end) 661 addr >= (unsigned long)__irqentry_text_end)
705 return TRACE_TYPE_UNHANDLED; 662 return;
706 663
707 if (trace_flags & TRACE_ITER_CONTEXT_INFO) { 664 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
708 /* Absolute time */ 665 /* Absolute time */
709 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 666 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
710 ret = print_graph_abs_time(iter->ts, s); 667 print_graph_abs_time(iter->ts, s);
711 if (!ret)
712 return TRACE_TYPE_PARTIAL_LINE;
713 }
714 668
715 /* Cpu */ 669 /* Cpu */
716 if (flags & TRACE_GRAPH_PRINT_CPU) { 670 if (flags & TRACE_GRAPH_PRINT_CPU)
717 ret = print_graph_cpu(s, cpu); 671 print_graph_cpu(s, cpu);
718 if (ret == TRACE_TYPE_PARTIAL_LINE)
719 return TRACE_TYPE_PARTIAL_LINE;
720 }
721 672
722 /* Proc */ 673 /* Proc */
723 if (flags & TRACE_GRAPH_PRINT_PROC) { 674 if (flags & TRACE_GRAPH_PRINT_PROC) {
724 ret = print_graph_proc(s, pid); 675 print_graph_proc(s, pid);
725 if (ret == TRACE_TYPE_PARTIAL_LINE) 676 trace_seq_puts(s, " | ");
726 return TRACE_TYPE_PARTIAL_LINE;
727 ret = trace_seq_puts(s, " | ");
728 if (!ret)
729 return TRACE_TYPE_PARTIAL_LINE;
730 } 677 }
678
679 /* Latency format */
680 if (trace_flags & TRACE_ITER_LATENCY_FMT)
681 print_graph_lat_fmt(s, ent);
731 } 682 }
732 683
733 /* No overhead */ 684 /* No overhead */
734 ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); 685 print_graph_duration(0, s, flags | FLAGS_FILL_START);
735 if (ret != TRACE_TYPE_HANDLED)
736 return ret;
737 686
738 if (type == TRACE_GRAPH_ENT) 687 if (type == TRACE_GRAPH_ENT)
739 ret = trace_seq_puts(s, "==========>"); 688 trace_seq_puts(s, "==========>");
740 else 689 else
741 ret = trace_seq_puts(s, "<=========="); 690 trace_seq_puts(s, "<==========");
742
743 if (!ret)
744 return TRACE_TYPE_PARTIAL_LINE;
745
746 ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
747 if (ret != TRACE_TYPE_HANDLED)
748 return ret;
749
750 ret = trace_seq_putc(s, '\n');
751 691
752 if (!ret) 692 print_graph_duration(0, s, flags | FLAGS_FILL_END);
753 return TRACE_TYPE_PARTIAL_LINE; 693 trace_seq_putc(s, '\n');
754 return TRACE_TYPE_HANDLED;
755} 694}
756 695
757enum print_line_t 696void
758trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) 697trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
759{ 698{
760 unsigned long nsecs_rem = do_div(duration, 1000); 699 unsigned long nsecs_rem = do_div(duration, 1000);
761 /* log10(ULONG_MAX) + '\0' */ 700 /* log10(ULONG_MAX) + '\0' */
762 char msecs_str[21]; 701 char usecs_str[21];
763 char nsecs_str[5]; 702 char nsecs_str[5];
764 int ret, len; 703 int len;
765 int i; 704 int i;
766 705
767 sprintf(msecs_str, "%lu", (unsigned long) duration); 706 sprintf(usecs_str, "%lu", (unsigned long) duration);
768 707
769 /* Print msecs */ 708 /* Print msecs */
770 ret = trace_seq_printf(s, "%s", msecs_str); 709 trace_seq_printf(s, "%s", usecs_str);
771 if (!ret)
772 return TRACE_TYPE_PARTIAL_LINE;
773 710
774 len = strlen(msecs_str); 711 len = strlen(usecs_str);
775 712
776 /* Print nsecs (we don't want to exceed 7 numbers) */ 713 /* Print nsecs (we don't want to exceed 7 numbers) */
777 if (len < 7) { 714 if (len < 7) {
778 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); 715 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
779 716
780 snprintf(nsecs_str, slen, "%03lu", nsecs_rem); 717 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
781 ret = trace_seq_printf(s, ".%s", nsecs_str); 718 trace_seq_printf(s, ".%s", nsecs_str);
782 if (!ret)
783 return TRACE_TYPE_PARTIAL_LINE;
784 len += strlen(nsecs_str); 719 len += strlen(nsecs_str);
785 } 720 }
786 721
787 ret = trace_seq_puts(s, " us "); 722 trace_seq_puts(s, " us ");
788 if (!ret)
789 return TRACE_TYPE_PARTIAL_LINE;
790 723
791 /* Print remaining spaces to fit the row's width */ 724 /* Print remaining spaces to fit the row's width */
792 for (i = len; i < 7; i++) { 725 for (i = len; i < 7; i++)
793 ret = trace_seq_putc(s, ' '); 726 trace_seq_putc(s, ' ');
794 if (!ret)
795 return TRACE_TYPE_PARTIAL_LINE;
796 }
797 return TRACE_TYPE_HANDLED;
798} 727}
799 728
800static enum print_line_t 729static void
801print_graph_duration(unsigned long long duration, struct trace_seq *s, 730print_graph_duration(unsigned long long duration, struct trace_seq *s,
802 u32 flags) 731 u32 flags)
803{ 732{
804 int ret = -1;
805
806 if (!(flags & TRACE_GRAPH_PRINT_DURATION) || 733 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
807 !(trace_flags & TRACE_ITER_CONTEXT_INFO)) 734 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
808 return TRACE_TYPE_HANDLED; 735 return;
809 736
810	/* No real data, just filling the column with spaces */ 737	/* No real data, just filling the column with spaces */
811 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { 738 switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
812 case FLAGS_FILL_FULL: 739 case FLAGS_FILL_FULL:
813 ret = trace_seq_puts(s, " | "); 740 trace_seq_puts(s, " | ");
814 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 741 return;
815 case FLAGS_FILL_START: 742 case FLAGS_FILL_START:
816 ret = trace_seq_puts(s, " "); 743 trace_seq_puts(s, " ");
817 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 744 return;
818 case FLAGS_FILL_END: 745 case FLAGS_FILL_END:
819 ret = trace_seq_puts(s, " |"); 746 trace_seq_puts(s, " |");
820 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 747 return;
821 } 748 }
822 749
823	/* Signal an overhead of time execution to the output */ 750	/* Signal an overhead of time execution to the output */
824 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { 751 if (flags & TRACE_GRAPH_PRINT_OVERHEAD)
825 /* Duration exceeded 100 msecs */ 752 trace_seq_printf(s, "%c ", trace_find_mark(duration));
826 if (duration > 100000ULL) 753 else
827 ret = trace_seq_puts(s, "! "); 754 trace_seq_puts(s, " ");
828 /* Duration exceeded 10 msecs */
829 else if (duration > 10000ULL)
830 ret = trace_seq_puts(s, "+ ");
831 }
832
833 /*
834 * The -1 means we either did not exceed the duration tresholds
835 * or we dont want to print out the overhead. Either way we need
836 * to fill out the space.
837 */
838 if (ret == -1)
839 ret = trace_seq_puts(s, " ");
840
841 /* Catching here any failure happenned above */
842 if (!ret)
843 return TRACE_TYPE_PARTIAL_LINE;
844
845 ret = trace_print_graph_duration(duration, s);
846 if (ret != TRACE_TYPE_HANDLED)
847 return ret;
848
849 ret = trace_seq_puts(s, "| ");
850 if (!ret)
851 return TRACE_TYPE_PARTIAL_LINE;
852 755
853 return TRACE_TYPE_HANDLED; 756 trace_print_graph_duration(duration, s);
757 trace_seq_puts(s, "| ");
854} 758}
855 759
856/* Case of a leaf function on its call entry */ 760/* Case of a leaf function on its call entry */
@@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
864 struct ftrace_graph_ret *graph_ret; 768 struct ftrace_graph_ret *graph_ret;
865 struct ftrace_graph_ent *call; 769 struct ftrace_graph_ent *call;
866 unsigned long long duration; 770 unsigned long long duration;
867 int ret;
868 int i; 771 int i;
869 772
870 graph_ret = &ret_entry->ret; 773 graph_ret = &ret_entry->ret;
@@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
890 } 793 }
891 794
892 /* Overhead and duration */ 795 /* Overhead and duration */
893 ret = print_graph_duration(duration, s, flags); 796 print_graph_duration(duration, s, flags);
894 if (ret == TRACE_TYPE_PARTIAL_LINE)
895 return TRACE_TYPE_PARTIAL_LINE;
896 797
897 /* Function */ 798 /* Function */
898 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 799 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
899 ret = trace_seq_putc(s, ' '); 800 trace_seq_putc(s, ' ');
900 if (!ret)
901 return TRACE_TYPE_PARTIAL_LINE;
902 }
903 801
904 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); 802 trace_seq_printf(s, "%ps();\n", (void *)call->func);
905 if (!ret)
906 return TRACE_TYPE_PARTIAL_LINE;
907 803
908 return TRACE_TYPE_HANDLED; 804 return trace_handle_return(s);
909} 805}
910 806
911static enum print_line_t 807static enum print_line_t
@@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter,
915{ 811{
916 struct ftrace_graph_ent *call = &entry->graph_ent; 812 struct ftrace_graph_ent *call = &entry->graph_ent;
917 struct fgraph_data *data = iter->private; 813 struct fgraph_data *data = iter->private;
918 int ret;
919 int i; 814 int i;
920 815
921 if (data) { 816 if (data) {
@@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
931 } 826 }
932 827
933 /* No time */ 828 /* No time */
934 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); 829 print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
935 if (ret != TRACE_TYPE_HANDLED)
936 return ret;
937 830
938 /* Function */ 831 /* Function */
939 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 832 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
940 ret = trace_seq_putc(s, ' '); 833 trace_seq_putc(s, ' ');
941 if (!ret) 834
942 return TRACE_TYPE_PARTIAL_LINE; 835 trace_seq_printf(s, "%ps() {\n", (void *)call->func);
943 }
944 836
945 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); 837 if (trace_seq_has_overflowed(s))
946 if (!ret)
947 return TRACE_TYPE_PARTIAL_LINE; 838 return TRACE_TYPE_PARTIAL_LINE;
948 839
949 /* 840 /*
@@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter,
953 return TRACE_TYPE_NO_CONSUME; 844 return TRACE_TYPE_NO_CONSUME;
954} 845}
955 846
956static enum print_line_t 847static void
957print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 848print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
958 int type, unsigned long addr, u32 flags) 849 int type, unsigned long addr, u32 flags)
959{ 850{
960 struct fgraph_data *data = iter->private; 851 struct fgraph_data *data = iter->private;
961 struct trace_entry *ent = iter->ent; 852 struct trace_entry *ent = iter->ent;
962 int cpu = iter->cpu; 853 int cpu = iter->cpu;
963 int ret;
964 854
965 /* Pid */ 855 /* Pid */
966 if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) 856 verif_pid(s, ent->pid, cpu, data);
967 return TRACE_TYPE_PARTIAL_LINE;
968 857
969 if (type) { 858 if (type)
970 /* Interrupt */ 859 /* Interrupt */
971 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); 860 print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
972 if (ret == TRACE_TYPE_PARTIAL_LINE)
973 return TRACE_TYPE_PARTIAL_LINE;
974 }
975 861
976 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) 862 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
977 return 0; 863 return;
978 864
979 /* Absolute time */ 865 /* Absolute time */
980 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 866 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
981 ret = print_graph_abs_time(iter->ts, s); 867 print_graph_abs_time(iter->ts, s);
982 if (!ret)
983 return TRACE_TYPE_PARTIAL_LINE;
984 }
985 868
986 /* Cpu */ 869 /* Cpu */
987 if (flags & TRACE_GRAPH_PRINT_CPU) { 870 if (flags & TRACE_GRAPH_PRINT_CPU)
988 ret = print_graph_cpu(s, cpu); 871 print_graph_cpu(s, cpu);
989 if (ret == TRACE_TYPE_PARTIAL_LINE)
990 return TRACE_TYPE_PARTIAL_LINE;
991 }
992 872
993 /* Proc */ 873 /* Proc */
994 if (flags & TRACE_GRAPH_PRINT_PROC) { 874 if (flags & TRACE_GRAPH_PRINT_PROC) {
995 ret = print_graph_proc(s, ent->pid); 875 print_graph_proc(s, ent->pid);
996 if (ret == TRACE_TYPE_PARTIAL_LINE) 876 trace_seq_puts(s, " | ");
997 return TRACE_TYPE_PARTIAL_LINE;
998
999 ret = trace_seq_puts(s, " | ");
1000 if (!ret)
1001 return TRACE_TYPE_PARTIAL_LINE;
1002 } 877 }
1003 878
1004 /* Latency format */ 879 /* Latency format */
1005 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 880 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1006 ret = print_graph_lat_fmt(s, ent); 881 print_graph_lat_fmt(s, ent);
1007 if (ret == TRACE_TYPE_PARTIAL_LINE)
1008 return TRACE_TYPE_PARTIAL_LINE;
1009 }
1010 882
1011 return 0; 883 return;
1012} 884}
1013 885
1014/* 886/*
@@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
1126 if (check_irq_entry(iter, flags, call->func, call->depth)) 998 if (check_irq_entry(iter, flags, call->func, call->depth))
1127 return TRACE_TYPE_HANDLED; 999 return TRACE_TYPE_HANDLED;
1128 1000
1129 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1001 print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
1130 return TRACE_TYPE_PARTIAL_LINE;
1131 1002
1132 leaf_ret = get_return_for_leaf(iter, field); 1003 leaf_ret = get_return_for_leaf(iter, field);
1133 if (leaf_ret) 1004 if (leaf_ret)
@@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1160 pid_t pid = ent->pid; 1031 pid_t pid = ent->pid;
1161 int cpu = iter->cpu; 1032 int cpu = iter->cpu;
1162 int func_match = 1; 1033 int func_match = 1;
1163 int ret;
1164 int i; 1034 int i;
1165 1035
1166 if (check_irq_return(iter, flags, trace->depth)) 1036 if (check_irq_return(iter, flags, trace->depth))
@@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1186 } 1056 }
1187 } 1057 }
1188 1058
1189 if (print_graph_prologue(iter, s, 0, 0, flags)) 1059 print_graph_prologue(iter, s, 0, 0, flags);
1190 return TRACE_TYPE_PARTIAL_LINE;
1191 1060
1192 /* Overhead and duration */ 1061 /* Overhead and duration */
1193 ret = print_graph_duration(duration, s, flags); 1062 print_graph_duration(duration, s, flags);
1194 if (ret == TRACE_TYPE_PARTIAL_LINE)
1195 return TRACE_TYPE_PARTIAL_LINE;
1196 1063
1197 /* Closing brace */ 1064 /* Closing brace */
1198 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1065 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
1199 ret = trace_seq_putc(s, ' '); 1066 trace_seq_putc(s, ' ');
1200 if (!ret)
1201 return TRACE_TYPE_PARTIAL_LINE;
1202 }
1203 1067
1204 /* 1068 /*
1205 * If the return function does not have a matching entry, 1069 * If the return function does not have a matching entry,
@@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1208 * belongs to, write out the function name. Always do 1072 * belongs to, write out the function name. Always do
1209 * that if the funcgraph-tail option is enabled. 1073 * that if the funcgraph-tail option is enabled.
1210 */ 1074 */
1211 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { 1075 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
1212 ret = trace_seq_puts(s, "}\n"); 1076 trace_seq_puts(s, "}\n");
1213 if (!ret) 1077 else
1214 return TRACE_TYPE_PARTIAL_LINE; 1078 trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
1215 } else {
1216 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
1217 if (!ret)
1218 return TRACE_TYPE_PARTIAL_LINE;
1219 }
1220 1079
1221 /* Overrun */ 1080 /* Overrun */
1222 if (flags & TRACE_GRAPH_PRINT_OVERRUN) { 1081 if (flags & TRACE_GRAPH_PRINT_OVERRUN)
1223 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 1082 trace_seq_printf(s, " (Overruns: %lu)\n",
1224 trace->overrun); 1083 trace->overrun);
1225 if (!ret)
1226 return TRACE_TYPE_PARTIAL_LINE;
1227 }
1228 1084
1229 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, 1085 print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
1230 cpu, pid, flags); 1086 cpu, pid, flags);
1231 if (ret == TRACE_TYPE_PARTIAL_LINE)
1232 return TRACE_TYPE_PARTIAL_LINE;
1233 1087
1234 return TRACE_TYPE_HANDLED; 1088 return trace_handle_return(s);
1235} 1089}
1236 1090
1237static enum print_line_t 1091static enum print_line_t
@@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1248 if (data) 1102 if (data)
1249 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 1103 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
1250 1104
1251 if (print_graph_prologue(iter, s, 0, 0, flags)) 1105 print_graph_prologue(iter, s, 0, 0, flags);
1252 return TRACE_TYPE_PARTIAL_LINE;
1253 1106
1254 /* No time */ 1107 /* No time */
1255 ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); 1108 print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
1256 if (ret != TRACE_TYPE_HANDLED)
1257 return ret;
1258 1109
1259 /* Indentation */ 1110 /* Indentation */
1260 if (depth > 0) 1111 if (depth > 0)
1261 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { 1112 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++)
1262 ret = trace_seq_putc(s, ' '); 1113 trace_seq_putc(s, ' ');
1263 if (!ret)
1264 return TRACE_TYPE_PARTIAL_LINE;
1265 }
1266 1114
1267 /* The comment */ 1115 /* The comment */
1268 ret = trace_seq_puts(s, "/* "); 1116 trace_seq_puts(s, "/* ");
1269 if (!ret)
1270 return TRACE_TYPE_PARTIAL_LINE;
1271 1117
1272 switch (iter->ent->type) { 1118 switch (iter->ent->type) {
1273 case TRACE_BPRINT: 1119 case TRACE_BPRINT:
@@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1290 return ret; 1136 return ret;
1291 } 1137 }
1292 1138
1139 if (trace_seq_has_overflowed(s))
1140 goto out;
1141
1293 /* Strip ending newline */ 1142 /* Strip ending newline */
1294 if (s->buffer[s->len - 1] == '\n') { 1143 if (s->buffer[s->seq.len - 1] == '\n') {
1295 s->buffer[s->len - 1] = '\0'; 1144 s->buffer[s->seq.len - 1] = '\0';
1296 s->len--; 1145 s->seq.len--;
1297 } 1146 }
1298 1147
1299 ret = trace_seq_puts(s, " */\n"); 1148 trace_seq_puts(s, " */\n");
1300 if (!ret) 1149 out:
1301 return TRACE_TYPE_PARTIAL_LINE; 1150 return trace_handle_return(s);
1302
1303 return TRACE_TYPE_HANDLED;
1304} 1151}
1305 1152
1306 1153
@@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1407 print_lat_header(s, flags); 1254 print_lat_header(s, flags);
1408 1255
1409 /* 1st line */ 1256 /* 1st line */
1410 seq_printf(s, "#"); 1257 seq_putc(s, '#');
1411 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) 1258 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1412 seq_printf(s, " TIME "); 1259 seq_puts(s, " TIME ");
1413 if (flags & TRACE_GRAPH_PRINT_CPU) 1260 if (flags & TRACE_GRAPH_PRINT_CPU)
1414 seq_printf(s, " CPU"); 1261 seq_puts(s, " CPU");
1415 if (flags & TRACE_GRAPH_PRINT_PROC) 1262 if (flags & TRACE_GRAPH_PRINT_PROC)
1416 seq_printf(s, " TASK/PID "); 1263 seq_puts(s, " TASK/PID ");
1417 if (lat) 1264 if (lat)
1418 seq_printf(s, "||||"); 1265 seq_puts(s, "||||");
1419 if (flags & TRACE_GRAPH_PRINT_DURATION) 1266 if (flags & TRACE_GRAPH_PRINT_DURATION)
1420 seq_printf(s, " DURATION "); 1267 seq_puts(s, " DURATION ");
1421 seq_printf(s, " FUNCTION CALLS\n"); 1268 seq_puts(s, " FUNCTION CALLS\n");
1422 1269
1423 /* 2nd line */ 1270 /* 2nd line */
1424 seq_printf(s, "#"); 1271 seq_putc(s, '#');
1425 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) 1272 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1426 seq_printf(s, " | "); 1273 seq_puts(s, " | ");
1427 if (flags & TRACE_GRAPH_PRINT_CPU) 1274 if (flags & TRACE_GRAPH_PRINT_CPU)
1428 seq_printf(s, " | "); 1275 seq_puts(s, " | ");
1429 if (flags & TRACE_GRAPH_PRINT_PROC) 1276 if (flags & TRACE_GRAPH_PRINT_PROC)
1430 seq_printf(s, " | | "); 1277 seq_puts(s, " | | ");
1431 if (lat) 1278 if (lat)
1432 seq_printf(s, "||||"); 1279 seq_puts(s, "||||");
1433 if (flags & TRACE_GRAPH_PRINT_DURATION) 1280 if (flags & TRACE_GRAPH_PRINT_DURATION)
1434 seq_printf(s, " | | "); 1281 seq_puts(s, " | | ");
1435 seq_printf(s, " | | | |\n"); 1282 seq_puts(s, " | | | |\n");
1436} 1283}
1437 1284
1438static void print_graph_headers(struct seq_file *s) 1285static void print_graph_headers(struct seq_file *s)
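
The trace_functions_graph.c changes above all follow one pattern: the print_graph_* helpers become void, write into the trace_seq unconditionally, and truncation is checked once at the end via trace_seq_has_overflowed()/trace_handle_return() instead of after every single call. A self-contained model of that "write first, check overflow once" style, assuming a simplified seqbuf in place of struct trace_seq; none of these names are the kernel API:

#include <stdio.h>
#include <string.h>

struct seqbuf {
	char buf[32];
	size_t len;
	int overflow;
};

static void seq_puts_model(struct seqbuf *s, const char *str)
{
	size_t n = strlen(str);

	if (s->overflow || s->len + n >= sizeof(s->buf)) {
		s->overflow = 1;	/* further writes become no-ops */
		return;
	}
	memcpy(s->buf + s->len, str, n);
	s->len += n;
	s->buf[s->len] = '\0';
}

/* Helpers can now return void: there is no status to propagate. */
static void print_cpu_model(struct seqbuf *s, int cpu)
{
	char tmp[16];

	snprintf(tmp, sizeof(tmp), " %3d) ", cpu);
	seq_puts_model(s, tmp);
}

int main(void)
{
	struct seqbuf s = { .len = 0, .overflow = 0 };

	print_cpu_model(&s, 1);
	seq_puts_model(&s, "do_sys_open() {");
	seq_puts_model(&s, " /* long enough to overflow the 32-byte buffer */");

	/* One check at the end replaces a check after every call. */
	if (s.overflow)
		puts("partial line");
	else
		printf("%s\n", s.buf);
	return 0;
}
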
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index bd90e1b06088..3ccf5c2c1320 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
20{ 20{
21 /* use static because iter can be a bit big for the stack */ 21 /* use static because iter can be a bit big for the stack */
22 static struct trace_iterator iter; 22 static struct trace_iterator iter;
23 static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
23 unsigned int old_userobj; 24 unsigned int old_userobj;
24 int cnt = 0, cpu; 25 int cnt = 0, cpu;
25 26
26 trace_init_global_iter(&iter); 27 trace_init_global_iter(&iter);
28 iter.buffer_iter = buffer_iter;
27 29
28 for_each_tracing_cpu(cpu) { 30 for_each_tracing_cpu(cpu) {
29 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); 31 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
@@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]); 59 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file); 60 tracing_iter_reset(&iter, cpu_file);
59 } 61 }
60 if (!trace_empty(&iter)) 62
61 trace_find_next_entry_inc(&iter); 63 while (trace_find_next_entry_inc(&iter)) {
62 while (!trace_empty(&iter)) {
63 if (!cnt) 64 if (!cnt)
64 kdb_printf("---------------------------------\n"); 65 kdb_printf("---------------------------------\n");
65 cnt++; 66 cnt++;
66 67
67 if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) 68 if (!skip_lines) {
68 print_trace_line(&iter); 69 print_trace_line(&iter);
69 if (!skip_lines)
70 trace_printk_seq(&iter.seq); 70 trace_printk_seq(&iter.seq);
71 else 71 } else {
72 skip_lines--; 72 skip_lines--;
73 }
74
73 if (KDB_FLAG(CMD_INTERRUPT)) 75 if (KDB_FLAG(CMD_INTERRUPT))
74 goto out; 76 goto out;
75 } 77 }
@@ -86,9 +88,12 @@ out:
86 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); 88 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
87 } 89 }
88 90
89 for_each_tracing_cpu(cpu) 91 for_each_tracing_cpu(cpu) {
90 if (iter.buffer_iter[cpu]) 92 if (iter.buffer_iter[cpu]) {
91 ring_buffer_read_finish(iter.buffer_iter[cpu]); 93 ring_buffer_read_finish(iter.buffer_iter[cpu]);
94 iter.buffer_iter[cpu] = NULL;
95 }
96 }
92} 97}
93 98
94/* 99/*
@@ -127,8 +132,8 @@ static int kdb_ftdump(int argc, const char **argv)
127 132
128static __init int kdb_ftrace_register(void) 133static __init int kdb_ftrace_register(void)
129{ 134{
130 kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", 135 kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
131 "Dump ftrace log", 0, KDB_REPEAT_NONE); 136 "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE);
132 return 0; 137 return 0;
133} 138}
134 139
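
The trace_kdb.c hunks give the kdb iterator its own buffer_iter array and restructure the dump loop: instead of probing trace_empty() and advancing separately, it simply loops while trace_find_next_entry_inc() returns an entry, burning the skip_lines budget before it starts printing. A small user-space sketch of that loop shape; next_entry() and the entries array are invented stand-ins, not kernel code:

#include <stdio.h>

static const char *entries[] = { "e0", "e1", "e2", "e3", "e4", NULL };

/* stand-in for trace_find_next_entry_inc(): next entry or NULL when empty */
static const char *next_entry(int *idx)
{
	return entries[*idx] ? entries[(*idx)++] : NULL;
}

int main(void)
{
	int idx = 0, skip_lines = 2, cnt = 0;
	const char *ent;

	while ((ent = next_entry(&idx)) != NULL) {
		if (!cnt)
			puts("---------------------------------");
		cnt++;

		if (!skip_lines)
			printf("%s\n", ent);	/* print + flush, as in the hunk */
		else
			skip_lines--;		/* consume the skip budget first */
	}
	return 0;
}
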
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 282f6e4e5539..5edb518be345 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
826 struct trace_kprobe *tk = v; 826 struct trace_kprobe *tk = v;
827 int i; 827 int i;
828 828
829 seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); 829 seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
830 seq_printf(m, ":%s/%s", tk->tp.call.class->system, 830 seq_printf(m, ":%s/%s", tk->tp.call.class->system,
831 ftrace_event_name(&tk->tp.call)); 831 ftrace_event_name(&tk->tp.call));
832 832
@@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
840 840
841 for (i = 0; i < tk->tp.nr_args; i++) 841 for (i = 0; i < tk->tp.nr_args; i++)
842 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); 842 seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
843 seq_printf(m, "\n"); 843 seq_putc(m, '\n');
844 844
845 return 0; 845 return 0;
846} 846}
@@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
1024 field = (struct kprobe_trace_entry_head *)iter->ent; 1024 field = (struct kprobe_trace_entry_head *)iter->ent;
1025 tp = container_of(event, struct trace_probe, call.event); 1025 tp = container_of(event, struct trace_probe, call.event);
1026 1026
1027 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) 1027 trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
1028 goto partial;
1029 1028
1030 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) 1029 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1031 goto partial; 1030 goto out;
1032 1031
1033 if (!trace_seq_puts(s, ")")) 1032 trace_seq_putc(s, ')');
1034 goto partial;
1035 1033
1036 data = (u8 *)&field[1]; 1034 data = (u8 *)&field[1];
1037 for (i = 0; i < tp->nr_args; i++) 1035 for (i = 0; i < tp->nr_args; i++)
1038 if (!tp->args[i].type->print(s, tp->args[i].name, 1036 if (!tp->args[i].type->print(s, tp->args[i].name,
1039 data + tp->args[i].offset, field)) 1037 data + tp->args[i].offset, field))
1040 goto partial; 1038 goto out;
1041
1042 if (!trace_seq_puts(s, "\n"))
1043 goto partial;
1044 1039
1045 return TRACE_TYPE_HANDLED; 1040 trace_seq_putc(s, '\n');
1046partial: 1041 out:
1047 return TRACE_TYPE_PARTIAL_LINE; 1042 return trace_handle_return(s);
1048} 1043}
1049 1044
1050static enum print_line_t 1045static enum print_line_t
@@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
1060 field = (struct kretprobe_trace_entry_head *)iter->ent; 1055 field = (struct kretprobe_trace_entry_head *)iter->ent;
1061 tp = container_of(event, struct trace_probe, call.event); 1056 tp = container_of(event, struct trace_probe, call.event);
1062 1057
1063 if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) 1058 trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
1064 goto partial;
1065 1059
1066 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) 1060 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1067 goto partial; 1061 goto out;
1068 1062
1069 if (!trace_seq_puts(s, " <- ")) 1063 trace_seq_puts(s, " <- ");
1070 goto partial;
1071 1064
1072 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) 1065 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1073 goto partial; 1066 goto out;
1074 1067
1075 if (!trace_seq_puts(s, ")")) 1068 trace_seq_putc(s, ')');
1076 goto partial;
1077 1069
1078 data = (u8 *)&field[1]; 1070 data = (u8 *)&field[1];
1079 for (i = 0; i < tp->nr_args; i++) 1071 for (i = 0; i < tp->nr_args; i++)
1080 if (!tp->args[i].type->print(s, tp->args[i].name, 1072 if (!tp->args[i].type->print(s, tp->args[i].name,
1081 data + tp->args[i].offset, field)) 1073 data + tp->args[i].offset, field))
1082 goto partial; 1074 goto out;
1083 1075
1084 if (!trace_seq_puts(s, "\n")) 1076 trace_seq_putc(s, '\n');
1085 goto partial;
1086 1077
1087 return TRACE_TYPE_HANDLED; 1078 out:
1088partial: 1079 return trace_handle_return(s);
1089 return TRACE_TYPE_PARTIAL_LINE;
1090} 1080}
1091 1081
1092 1082
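
In the converted print_kprobe_event()/print_kretprobe_event() above, the trace_seq writes are unconditional; the only early exits left are for helpers such as seq_print_ip_sym() that still report failure, and both paths funnel into a single trace_handle_return(s), which maps the sequence's overflow state to HANDLED or PARTIAL_LINE. A compact model of that control flow, reusing the simplified seqbuf idea from the earlier sketch; resolve_ip() is an invented stand-in for seq_print_ip_sym():

#include <stdio.h>

enum print_line_t { PARTIAL_LINE, HANDLED };

struct seqbuf { char buf[64]; int len; int overflow; };

static void sput(struct seqbuf *s, const char *str)
{
	int n = snprintf(s->buf + s->len, sizeof(s->buf) - s->len, "%s", str);

	if (n < 0 || s->len + n >= (int)sizeof(s->buf))
		s->overflow = 1;	/* truncated: mark and stop growing */
	else
		s->len += n;
}

/* stand-in for seq_print_ip_sym(): may still fail in its own right */
static int resolve_ip(struct seqbuf *s, unsigned long ip)
{
	char tmp[32];

	if (!ip)
		return 0;
	snprintf(tmp, sizeof(tmp), "func+0x%lx", ip & 0xfff);
	sput(s, tmp);
	return 1;
}

static enum print_line_t print_event(struct seqbuf *s, unsigned long ip)
{
	sput(s, "my_probe: (");
	if (!resolve_ip(s, ip))
		goto out;		/* bail early, nothing more to format */
	sput(s, ")");
	sput(s, "\n");
 out:
	/* single exit: the overflow state decides HANDLED vs PARTIAL_LINE */
	return s->overflow ? PARTIAL_LINE : HANDLED;
}

int main(void)
{
	struct seqbuf s = { .len = 0 };

	if (print_event(&s, 0x1234) == HANDLED)
		printf("%s", s.buf);
	return 0;
}
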
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0abd9b863474..7a9ba62e9fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr)
59 mmio_reset_data(tr); 59 mmio_reset_data(tr);
60} 60}
61 61
62static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) 62static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
63{ 63{
64 int ret = 0;
65 int i; 64 int i;
66 resource_size_t start, end; 65 resource_size_t start, end;
67 const struct pci_driver *drv = pci_dev_driver(dev); 66 const struct pci_driver *drv = pci_dev_driver(dev);
68 67
69 /* XXX: incomplete checks for trace_seq_printf() return value */ 68 trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
70 ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", 69 dev->bus->number, dev->devfn,
71 dev->bus->number, dev->devfn, 70 dev->vendor, dev->device, dev->irq);
72 dev->vendor, dev->device, dev->irq);
73 /* 71 /*
74 * XXX: is pci_resource_to_user() appropriate, since we are 72 * XXX: is pci_resource_to_user() appropriate, since we are
75 * supposed to interpret the __ioremap() phys_addr argument based on 73 * supposed to interpret the __ioremap() phys_addr argument based on
@@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
77 */ 75 */
78 for (i = 0; i < 7; i++) { 76 for (i = 0; i < 7; i++) {
79 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 77 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
80 ret += trace_seq_printf(s, " %llx", 78 trace_seq_printf(s, " %llx",
81 (unsigned long long)(start | 79 (unsigned long long)(start |
82 (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); 80 (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
83 } 81 }
84 for (i = 0; i < 7; i++) { 82 for (i = 0; i < 7; i++) {
85 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); 83 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
86 ret += trace_seq_printf(s, " %llx", 84 trace_seq_printf(s, " %llx",
87 dev->resource[i].start < dev->resource[i].end ? 85 dev->resource[i].start < dev->resource[i].end ?
88 (unsigned long long)(end - start) + 1 : 0); 86 (unsigned long long)(end - start) + 1 : 0);
89 } 87 }
90 if (drv) 88 if (drv)
91 ret += trace_seq_printf(s, " %s\n", drv->name); 89 trace_seq_printf(s, " %s\n", drv->name);
92 else 90 else
93 ret += trace_seq_puts(s, " \n"); 91 trace_seq_puts(s, " \n");
94 return ret;
95} 92}
96 93
97static void destroy_header_iter(struct header_iter *hiter) 94static void destroy_header_iter(struct header_iter *hiter)
@@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
179 unsigned long long t = ns2usecs(iter->ts); 176 unsigned long long t = ns2usecs(iter->ts);
180 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 177 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
181 unsigned secs = (unsigned long)t; 178 unsigned secs = (unsigned long)t;
182 int ret = 1;
183 179
184 trace_assign_type(field, entry); 180 trace_assign_type(field, entry);
185 rw = &field->rw; 181 rw = &field->rw;
186 182
187 switch (rw->opcode) { 183 switch (rw->opcode) {
188 case MMIO_READ: 184 case MMIO_READ:
189 ret = trace_seq_printf(s, 185 trace_seq_printf(s,
190 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 186 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
191 rw->width, secs, usec_rem, rw->map_id, 187 rw->width, secs, usec_rem, rw->map_id,
192 (unsigned long long)rw->phys, 188 (unsigned long long)rw->phys,
193 rw->value, rw->pc, 0); 189 rw->value, rw->pc, 0);
194 break; 190 break;
195 case MMIO_WRITE: 191 case MMIO_WRITE:
196 ret = trace_seq_printf(s, 192 trace_seq_printf(s,
197 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 193 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
198 rw->width, secs, usec_rem, rw->map_id, 194 rw->width, secs, usec_rem, rw->map_id,
199 (unsigned long long)rw->phys, 195 (unsigned long long)rw->phys,
200 rw->value, rw->pc, 0); 196 rw->value, rw->pc, 0);
201 break; 197 break;
202 case MMIO_UNKNOWN_OP: 198 case MMIO_UNKNOWN_OP:
203 ret = trace_seq_printf(s, 199 trace_seq_printf(s,
204 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," 200 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
205 "%02lx 0x%lx %d\n", 201 "%02lx 0x%lx %d\n",
206 secs, usec_rem, rw->map_id, 202 secs, usec_rem, rw->map_id,
@@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
209 (rw->value >> 0) & 0xff, rw->pc, 0); 205 (rw->value >> 0) & 0xff, rw->pc, 0);
210 break; 206 break;
211 default: 207 default:
212 ret = trace_seq_puts(s, "rw what?\n"); 208 trace_seq_puts(s, "rw what?\n");
213 break; 209 break;
214 } 210 }
215 if (ret) 211
216 return TRACE_TYPE_HANDLED; 212 return trace_handle_return(s);
217 return TRACE_TYPE_PARTIAL_LINE;
218} 213}
219 214
220static enum print_line_t mmio_print_map(struct trace_iterator *iter) 215static enum print_line_t mmio_print_map(struct trace_iterator *iter)
@@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
226 unsigned long long t = ns2usecs(iter->ts); 221 unsigned long long t = ns2usecs(iter->ts);
227 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 222 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
228 unsigned secs = (unsigned long)t; 223 unsigned secs = (unsigned long)t;
229 int ret;
230 224
231 trace_assign_type(field, entry); 225 trace_assign_type(field, entry);
232 m = &field->map; 226 m = &field->map;
233 227
234 switch (m->opcode) { 228 switch (m->opcode) {
235 case MMIO_PROBE: 229 case MMIO_PROBE:
236 ret = trace_seq_printf(s, 230 trace_seq_printf(s,
237 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", 231 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
238 secs, usec_rem, m->map_id, 232 secs, usec_rem, m->map_id,
239 (unsigned long long)m->phys, m->virt, m->len, 233 (unsigned long long)m->phys, m->virt, m->len,
240 0UL, 0); 234 0UL, 0);
241 break; 235 break;
242 case MMIO_UNPROBE: 236 case MMIO_UNPROBE:
243 ret = trace_seq_printf(s, 237 trace_seq_printf(s,
244 "UNMAP %u.%06lu %d 0x%lx %d\n", 238 "UNMAP %u.%06lu %d 0x%lx %d\n",
245 secs, usec_rem, m->map_id, 0UL, 0); 239 secs, usec_rem, m->map_id, 0UL, 0);
246 break; 240 break;
247 default: 241 default:
248 ret = trace_seq_puts(s, "map what?\n"); 242 trace_seq_puts(s, "map what?\n");
249 break; 243 break;
250 } 244 }
251 if (ret) 245
252 return TRACE_TYPE_HANDLED; 246 return trace_handle_return(s);
253 return TRACE_TYPE_PARTIAL_LINE;
254} 247}
255 248
256static enum print_line_t mmio_print_mark(struct trace_iterator *iter) 249static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
@@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
262 unsigned long long t = ns2usecs(iter->ts); 255 unsigned long long t = ns2usecs(iter->ts);
263 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 256 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
264 unsigned secs = (unsigned long)t; 257 unsigned secs = (unsigned long)t;
265 int ret;
266 258
267 /* The trailing newline must be in the message. */ 259 /* The trailing newline must be in the message. */
268 ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); 260 trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
269 if (!ret)
270 return TRACE_TYPE_PARTIAL_LINE;
271 261
272 return TRACE_TYPE_HANDLED; 262 return trace_handle_return(s);
273} 263}
274 264
275static enum print_line_t mmio_print_line(struct trace_iterator *iter) 265static enum print_line_t mmio_print_line(struct trace_iterator *iter)
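
Each mmio_print_* routine above also splits the iterator timestamp with ns2usecs() and do_div() before formatting it as seconds.microseconds. The plain-C equivalent of that arithmetic, with an arbitrary example timestamp:

#include <stdio.h>

int main(void)
{
	unsigned long long ts_ns = 1234567891234ULL;	/* example timestamp, ns */
	unsigned long long t = ts_ns / 1000;		/* ns2usecs() */
	unsigned long usec_rem = (unsigned long)(t % 1000000);	/* do_div(t, USEC_PER_SEC) remainder */
	unsigned long secs = (unsigned long)(t / 1000000);	/* quotient left in t by do_div() */

	/* same seconds.microseconds layout used by mmio_print_rw()/map()/mark() */
	printf("%lu.%06lu\n", secs, usec_rem);		/* -> 1234.567891 */
	return 0;
}
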
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c6977d5a9b12..b77b9a697619 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
25 struct trace_seq *s = &iter->seq; 25 struct trace_seq *s = &iter->seq;
26 struct trace_entry *entry = iter->ent; 26 struct trace_entry *entry = iter->ent;
27 struct bputs_entry *field; 27 struct bputs_entry *field;
28 int ret;
29 28
30 trace_assign_type(field, entry); 29 trace_assign_type(field, entry);
31 30
32 ret = trace_seq_puts(s, field->str); 31 trace_seq_puts(s, field->str);
33 if (!ret)
34 return TRACE_TYPE_PARTIAL_LINE;
35 32
36 return TRACE_TYPE_HANDLED; 33 return trace_handle_return(s);
37} 34}
38 35
39enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 36enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
41 struct trace_seq *s = &iter->seq; 38 struct trace_seq *s = &iter->seq;
42 struct trace_entry *entry = iter->ent; 39 struct trace_entry *entry = iter->ent;
43 struct bprint_entry *field; 40 struct bprint_entry *field;
44 int ret;
45 41
46 trace_assign_type(field, entry); 42 trace_assign_type(field, entry);
47 43
48 ret = trace_seq_bprintf(s, field->fmt, field->buf); 44 trace_seq_bprintf(s, field->fmt, field->buf);
49 if (!ret)
50 return TRACE_TYPE_PARTIAL_LINE;
51 45
52 return TRACE_TYPE_HANDLED; 46 return trace_handle_return(s);
53} 47}
54 48
55enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) 49enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
@@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
57 struct trace_seq *s = &iter->seq; 51 struct trace_seq *s = &iter->seq;
58 struct trace_entry *entry = iter->ent; 52 struct trace_entry *entry = iter->ent;
59 struct print_entry *field; 53 struct print_entry *field;
60 int ret;
61 54
62 trace_assign_type(field, entry); 55 trace_assign_type(field, entry);
63 56
64 ret = trace_seq_puts(s, field->buf); 57 trace_seq_puts(s, field->buf);
65 if (!ret)
66 return TRACE_TYPE_PARTIAL_LINE;
67 58
68 return TRACE_TYPE_HANDLED; 59 return trace_handle_return(s);
69} 60}
70 61
71const char * 62const char *
@@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
124 115
125 if (ret == (const char *)(trace_seq_buffer_ptr(p))) 116 if (ret == (const char *)(trace_seq_buffer_ptr(p)))
126 trace_seq_printf(p, "0x%lx", val); 117 trace_seq_printf(p, "0x%lx", val);
127 118
128 trace_seq_putc(p, 0); 119 trace_seq_putc(p, 0);
129 120
130 return ret; 121 return ret;
@@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
193 struct trace_seq *s = &iter->seq; 184 struct trace_seq *s = &iter->seq;
194 struct trace_seq *p = &iter->tmp_seq; 185 struct trace_seq *p = &iter->tmp_seq;
195 struct trace_entry *entry; 186 struct trace_entry *entry;
196 int ret;
197 187
198 event = container_of(trace_event, struct ftrace_event_call, event); 188 event = container_of(trace_event, struct ftrace_event_call, event);
199 entry = iter->ent; 189 entry = iter->ent;
@@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
204 } 194 }
205 195
206 trace_seq_init(p); 196 trace_seq_init(p);
207 ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); 197 trace_seq_printf(s, "%s: ", ftrace_event_name(event));
208 if (!ret)
209 return TRACE_TYPE_PARTIAL_LINE;
210 198
211 return 0; 199 return trace_handle_return(s);
212} 200}
213EXPORT_SYMBOL(ftrace_raw_output_prep); 201EXPORT_SYMBOL(ftrace_raw_output_prep);
214 202
@@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
216 char *fmt, va_list ap) 204 char *fmt, va_list ap)
217{ 205{
218 struct trace_seq *s = &iter->seq; 206 struct trace_seq *s = &iter->seq;
219 int ret;
220
221 ret = trace_seq_printf(s, "%s: ", name);
222 if (!ret)
223 return TRACE_TYPE_PARTIAL_LINE;
224
225 ret = trace_seq_vprintf(s, fmt, ap);
226 207
227 if (!ret) 208 trace_seq_printf(s, "%s: ", name);
228 return TRACE_TYPE_PARTIAL_LINE; 209 trace_seq_vprintf(s, fmt, ap);
229 210
230 return TRACE_TYPE_HANDLED; 211 return trace_handle_return(s);
231} 212}
232 213
233int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) 214int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
@@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name)
260} 241}
261#endif /* CONFIG_KRETPROBES */ 242#endif /* CONFIG_KRETPROBES */
262 243
263static int 244static void
264seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) 245seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
265{ 246{
266#ifdef CONFIG_KALLSYMS 247#ifdef CONFIG_KALLSYMS
@@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
271 252
272 name = kretprobed(str); 253 name = kretprobed(str);
273 254
274 return trace_seq_printf(s, fmt, name); 255 trace_seq_printf(s, fmt, name);
275#endif 256#endif
276 return 1;
277} 257}
278 258
279static int 259static void
280seq_print_sym_offset(struct trace_seq *s, const char *fmt, 260seq_print_sym_offset(struct trace_seq *s, const char *fmt,
281 unsigned long address) 261 unsigned long address)
282{ 262{
@@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
287 sprint_symbol(str, address); 267 sprint_symbol(str, address);
288 name = kretprobed(str); 268 name = kretprobed(str);
289 269
290 return trace_seq_printf(s, fmt, name); 270 trace_seq_printf(s, fmt, name);
291#endif 271#endif
292 return 1;
293} 272}
294 273
295#ifndef CONFIG_64BIT 274#ifndef CONFIG_64BIT
@@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
320 if (file) { 299 if (file) {
321 ret = trace_seq_path(s, &file->f_path); 300 ret = trace_seq_path(s, &file->f_path);
322 if (ret) 301 if (ret)
323 ret = trace_seq_printf(s, "[+0x%lx]", 302 trace_seq_printf(s, "[+0x%lx]",
324 ip - vmstart); 303 ip - vmstart);
325 } 304 }
326 up_read(&mm->mmap_sem); 305 up_read(&mm->mmap_sem);
327 } 306 }
328 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) 307 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
329 ret = trace_seq_printf(s, " <" IP_FMT ">", ip); 308 trace_seq_printf(s, " <" IP_FMT ">", ip);
330 return ret; 309 return !trace_seq_has_overflowed(s);
331} 310}
332 311
333int 312int
@@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
335 unsigned long sym_flags) 314 unsigned long sym_flags)
336{ 315{
337 struct mm_struct *mm = NULL; 316 struct mm_struct *mm = NULL;
338 int ret = 1;
339 unsigned int i; 317 unsigned int i;
340 318
341 if (trace_flags & TRACE_ITER_SYM_USEROBJ) { 319 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
@@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
354 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 332 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
355 unsigned long ip = entry->caller[i]; 333 unsigned long ip = entry->caller[i];
356 334
357 if (ip == ULONG_MAX || !ret) 335 if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
358 break; 336 break;
359 if (ret) 337
360 ret = trace_seq_puts(s, " => "); 338 trace_seq_puts(s, " => ");
339
361 if (!ip) { 340 if (!ip) {
362 if (ret) 341 trace_seq_puts(s, "??");
363 ret = trace_seq_puts(s, "??"); 342 trace_seq_putc(s, '\n');
364 if (ret)
365 ret = trace_seq_putc(s, '\n');
366 continue; 343 continue;
367 } 344 }
368 if (!ret) 345
369 break; 346 seq_print_user_ip(s, mm, ip, sym_flags);
370 if (ret) 347 trace_seq_putc(s, '\n');
371 ret = seq_print_user_ip(s, mm, ip, sym_flags);
372 ret = trace_seq_putc(s, '\n');
373 } 348 }
374 349
375 if (mm) 350 if (mm)
376 mmput(mm); 351 mmput(mm);
377 return ret; 352
353 return !trace_seq_has_overflowed(s);
378} 354}
379 355
380int 356int
381seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) 357seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
382{ 358{
383 int ret; 359 if (!ip) {
384 360 trace_seq_putc(s, '0');
385 if (!ip) 361 goto out;
386 return trace_seq_putc(s, '0'); 362 }
387 363
388 if (sym_flags & TRACE_ITER_SYM_OFFSET) 364 if (sym_flags & TRACE_ITER_SYM_OFFSET)
389 ret = seq_print_sym_offset(s, "%s", ip); 365 seq_print_sym_offset(s, "%s", ip);
390 else 366 else
391 ret = seq_print_sym_short(s, "%s", ip); 367 seq_print_sym_short(s, "%s", ip);
392
393 if (!ret)
394 return 0;
395 368
396 if (sym_flags & TRACE_ITER_SYM_ADDR) 369 if (sym_flags & TRACE_ITER_SYM_ADDR)
397 ret = trace_seq_printf(s, " <" IP_FMT ">", ip); 370 trace_seq_printf(s, " <" IP_FMT ">", ip);
398 return ret; 371
372 out:
373 return !trace_seq_has_overflowed(s);
399} 374}
400 375
401/** 376/**
@@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
413 char irqs_off; 388 char irqs_off;
414 int hardirq; 389 int hardirq;
415 int softirq; 390 int softirq;
416 int ret;
417 391
418 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 392 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
419 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 393 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
445 softirq ? 's' : 419 softirq ? 's' :
446 '.'; 420 '.';
447 421
448 if (!trace_seq_printf(s, "%c%c%c", 422 trace_seq_printf(s, "%c%c%c",
449 irqs_off, need_resched, hardsoft_irq)) 423 irqs_off, need_resched, hardsoft_irq);
450 return 0;
451 424
452 if (entry->preempt_count) 425 if (entry->preempt_count)
453 ret = trace_seq_printf(s, "%x", entry->preempt_count); 426 trace_seq_printf(s, "%x", entry->preempt_count);
454 else 427 else
455 ret = trace_seq_putc(s, '.'); 428 trace_seq_putc(s, '.');
456 429
457 return ret; 430 return !trace_seq_has_overflowed(s);
458} 431}
459 432
460static int 433static int
@@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
464 437
465 trace_find_cmdline(entry->pid, comm); 438 trace_find_cmdline(entry->pid, comm);
466 439
467 if (!trace_seq_printf(s, "%8.8s-%-5d %3d", 440 trace_seq_printf(s, "%8.8s-%-5d %3d",
468 comm, entry->pid, cpu)) 441 comm, entry->pid, cpu);
469 return 0;
470 442
471 return trace_print_lat_fmt(s, entry); 443 return trace_print_lat_fmt(s, entry);
472} 444}
473 445
474static unsigned long preempt_mark_thresh_us = 100; 446#undef MARK
447#define MARK(v, s) {.val = v, .sym = s}
448/* trace overhead mark */
449static const struct trace_mark {
450 unsigned long long val; /* unit: nsec */
451 char sym;
452} mark[] = {
453 MARK(1000000000ULL , '$'), /* 1 sec */
454 MARK(1000000ULL , '#'), /* 1000 usecs */
455 MARK(100000ULL , '!'), /* 100 usecs */
456 MARK(10000ULL , '+'), /* 10 usecs */
457};
458#undef MARK
459
460char trace_find_mark(unsigned long long d)
461{
462 int i;
463 int size = ARRAY_SIZE(mark);
464
465 for (i = 0; i < size; i++) {
466 if (d >= mark[i].val)
467 break;
468 }
469
470 return (i == size) ? ' ' : mark[i].sym;
471}
475 472
476static int 473static int
477lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) 474lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
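
trace_find_mark() above replaces the old two-level '!'/'+' overhead annotation with a four-entry table: a duration gets the symbol of the largest threshold it reaches, or a space if it stays under 10 usecs. The same table can be exercised from user space to see the mapping; the thresholds are copied from the hunk, main() is only an illustration:

#include <stdio.h>

struct trace_mark { unsigned long long val; char sym; };

static const struct trace_mark mark[] = {
	{ 1000000000ULL, '$' },	/* 1 sec  */
	{ 1000000ULL,    '#' },	/* 1 ms   */
	{ 100000ULL,     '!' },	/* 100 us */
	{ 10000ULL,      '+' },	/* 10 us  */
};

static char find_mark(unsigned long long d)
{
	size_t i, size = sizeof(mark) / sizeof(mark[0]);

	for (i = 0; i < size; i++)
		if (d >= mark[i].val)
			break;

	return (i == size) ? ' ' : mark[i].sym;
}

int main(void)
{
	unsigned long long ns[] = { 5000, 50000, 500000, 5000000, 2000000000ULL };
	size_t i;

	for (i = 0; i < sizeof(ns) / sizeof(ns[0]); i++)
		printf("%10lluns -> '%c'\n", ns[i], find_mark(ns[i]));
	/* 5us->' ', 50us->'+', 500us->'!', 5ms->'#', 2s->'$' */
	return 0;
}
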
@@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
493 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); 490 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
494 unsigned long rel_msec = (unsigned long)rel_ts; 491 unsigned long rel_msec = (unsigned long)rel_ts;
495 492
496 return trace_seq_printf( 493 trace_seq_printf(
497 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", 494 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
498 ns2usecs(iter->ts), 495 ns2usecs(iter->ts),
499 abs_msec, abs_usec, 496 abs_msec, abs_usec,
500 rel_msec, rel_usec); 497 rel_msec, rel_usec);
498
501 } else if (verbose && !in_ns) { 499 } else if (verbose && !in_ns) {
502 return trace_seq_printf( 500 trace_seq_printf(
503 s, "[%016llx] %lld (+%lld): ", 501 s, "[%016llx] %lld (+%lld): ",
504 iter->ts, abs_ts, rel_ts); 502 iter->ts, abs_ts, rel_ts);
503
505 } else if (!verbose && in_ns) { 504 } else if (!verbose && in_ns) {
506 return trace_seq_printf( 505 trace_seq_printf(
507 s, " %4lldus%c: ", 506 s, " %4lldus%c: ",
508 abs_ts, 507 abs_ts,
509 rel_ts > preempt_mark_thresh_us ? '!' : 508 trace_find_mark(rel_ts * NSEC_PER_USEC));
510 rel_ts > 1 ? '+' : ' '); 509
511 } else { /* !verbose && !in_ns */ 510 } else { /* !verbose && !in_ns */
512 return trace_seq_printf(s, " %4lld: ", abs_ts); 511 trace_seq_printf(s, " %4lld: ", abs_ts);
513 } 512 }
513
514 return !trace_seq_has_overflowed(s);
514} 515}
515 516
516int trace_print_context(struct trace_iterator *iter) 517int trace_print_context(struct trace_iterator *iter)
@@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter)
520 unsigned long long t; 521 unsigned long long t;
521 unsigned long secs, usec_rem; 522 unsigned long secs, usec_rem;
522 char comm[TASK_COMM_LEN]; 523 char comm[TASK_COMM_LEN];
523 int ret;
524 524
525 trace_find_cmdline(entry->pid, comm); 525 trace_find_cmdline(entry->pid, comm);
526 526
527 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", 527 trace_seq_printf(s, "%16s-%-5d [%03d] ",
528 comm, entry->pid, iter->cpu); 528 comm, entry->pid, iter->cpu);
529 if (!ret)
530 return 0;
531 529
532 if (trace_flags & TRACE_ITER_IRQ_INFO) { 530 if (trace_flags & TRACE_ITER_IRQ_INFO)
533 ret = trace_print_lat_fmt(s, entry); 531 trace_print_lat_fmt(s, entry);
534 if (!ret)
535 return 0;
536 }
537 532
538 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { 533 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
539 t = ns2usecs(iter->ts); 534 t = ns2usecs(iter->ts);
540 usec_rem = do_div(t, USEC_PER_SEC); 535 usec_rem = do_div(t, USEC_PER_SEC);
541 secs = (unsigned long)t; 536 secs = (unsigned long)t;
542 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); 537 trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
543 } else 538 } else
544 return trace_seq_printf(s, " %12llu: ", iter->ts); 539 trace_seq_printf(s, " %12llu: ", iter->ts);
540
541 return !trace_seq_has_overflowed(s);
545} 542}
546 543
547int trace_print_lat_context(struct trace_iterator *iter) 544int trace_print_lat_context(struct trace_iterator *iter)
548{ 545{
549 u64 next_ts; 546 u64 next_ts;
550 int ret;
551 /* trace_find_next_entry will reset ent_size */ 547 /* trace_find_next_entry will reset ent_size */
552 int ent_size = iter->ent_size; 548 int ent_size = iter->ent_size;
553 struct trace_seq *s = &iter->seq; 549 struct trace_seq *s = &iter->seq;
@@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter)
567 563
568 trace_find_cmdline(entry->pid, comm); 564 trace_find_cmdline(entry->pid, comm);
569 565
570 ret = trace_seq_printf( 566 trace_seq_printf(
571 s, "%16s %5d %3d %d %08x %08lx ", 567 s, "%16s %5d %3d %d %08x %08lx ",
572 comm, entry->pid, iter->cpu, entry->flags, 568 comm, entry->pid, iter->cpu, entry->flags,
573 entry->preempt_count, iter->idx); 569 entry->preempt_count, iter->idx);
574 } else { 570 } else {
575 ret = lat_print_generic(s, entry, iter->cpu); 571 lat_print_generic(s, entry, iter->cpu);
576 } 572 }
577 573
578 if (ret) 574 lat_print_timestamp(iter, next_ts);
579 ret = lat_print_timestamp(iter, next_ts);
580 575
581 return ret; 576 return !trace_seq_has_overflowed(s);
582} 577}
583 578
584static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 579static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
@@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event)
692 goto out; 687 goto out;
693 688
694 } else { 689 } else {
695 690
696 event->type = next_event_type++; 691 event->type = next_event_type++;
697 list = &ftrace_event_list; 692 list = &ftrace_event_list;
698 } 693 }
@@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
764enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 759enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
765 struct trace_event *event) 760 struct trace_event *event)
766{ 761{
767 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) 762 trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type);
768 return TRACE_TYPE_PARTIAL_LINE;
769 763
770 return TRACE_TYPE_HANDLED; 764 return trace_handle_return(&iter->seq);
771} 765}
772 766
773/* TRACE_FN */ 767/* TRACE_FN */
@@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
779 773
780 trace_assign_type(field, iter->ent); 774 trace_assign_type(field, iter->ent);
781 775
782 if (!seq_print_ip_sym(s, field->ip, flags)) 776 seq_print_ip_sym(s, field->ip, flags);
783 goto partial;
784 777
785 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { 778 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
786 if (!trace_seq_puts(s, " <-")) 779 trace_seq_puts(s, " <-");
787 goto partial; 780 seq_print_ip_sym(s, field->parent_ip, flags);
788 if (!seq_print_ip_sym(s,
789 field->parent_ip,
790 flags))
791 goto partial;
792 } 781 }
793 if (!trace_seq_putc(s, '\n'))
794 goto partial;
795 782
796 return TRACE_TYPE_HANDLED; 783 trace_seq_putc(s, '\n');
797 784
798 partial: 785 return trace_handle_return(s);
799 return TRACE_TYPE_PARTIAL_LINE;
800} 786}
801 787
802static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, 788static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
@@ -806,12 +792,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
806 792
807 trace_assign_type(field, iter->ent); 793 trace_assign_type(field, iter->ent);
808 794
809 if (!trace_seq_printf(&iter->seq, "%lx %lx\n", 795 trace_seq_printf(&iter->seq, "%lx %lx\n",
810 field->ip, 796 field->ip,
811 field->parent_ip)) 797 field->parent_ip);
812 return TRACE_TYPE_PARTIAL_LINE;
813 798
814 return TRACE_TYPE_HANDLED; 799 return trace_handle_return(&iter->seq);
815} 800}
816 801
817static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, 802static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
@@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
822 807
823 trace_assign_type(field, iter->ent); 808 trace_assign_type(field, iter->ent);
824 809
825 SEQ_PUT_HEX_FIELD_RET(s, field->ip); 810 SEQ_PUT_HEX_FIELD(s, field->ip);
826 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); 811 SEQ_PUT_HEX_FIELD(s, field->parent_ip);
827 812
828 return TRACE_TYPE_HANDLED; 813 return trace_handle_return(s);
829} 814}
830 815
831static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, 816static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
@@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
836 821
837 trace_assign_type(field, iter->ent); 822 trace_assign_type(field, iter->ent);
838 823
839 SEQ_PUT_FIELD_RET(s, field->ip); 824 SEQ_PUT_FIELD(s, field->ip);
840 SEQ_PUT_FIELD_RET(s, field->parent_ip); 825 SEQ_PUT_FIELD(s, field->parent_ip);
841 826
842 return TRACE_TYPE_HANDLED; 827 return trace_handle_return(s);
843} 828}
844 829
845static struct trace_event_functions trace_fn_funcs = { 830static struct trace_event_functions trace_fn_funcs = {
@@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
868 T = task_state_char(field->next_state); 853 T = task_state_char(field->next_state);
869 S = task_state_char(field->prev_state); 854 S = task_state_char(field->prev_state);
870 trace_find_cmdline(field->next_pid, comm); 855 trace_find_cmdline(field->next_pid, comm);
871 if (!trace_seq_printf(&iter->seq, 856 trace_seq_printf(&iter->seq,
872 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", 857 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
873 field->prev_pid, 858 field->prev_pid,
874 field->prev_prio, 859 field->prev_prio,
875 S, delim, 860 S, delim,
876 field->next_cpu, 861 field->next_cpu,
877 field->next_pid, 862 field->next_pid,
878 field->next_prio, 863 field->next_prio,
879 T, comm)) 864 T, comm);
880 return TRACE_TYPE_PARTIAL_LINE; 865
881 866 return trace_handle_return(&iter->seq);
882 return TRACE_TYPE_HANDLED;
883} 867}
884 868
885static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, 869static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
@@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
904 if (!S) 888 if (!S)
905 S = task_state_char(field->prev_state); 889 S = task_state_char(field->prev_state);
906 T = task_state_char(field->next_state); 890 T = task_state_char(field->next_state);
907 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 891 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
908 field->prev_pid, 892 field->prev_pid,
909 field->prev_prio, 893 field->prev_prio,
910 S, 894 S,
911 field->next_cpu, 895 field->next_cpu,
912 field->next_pid, 896 field->next_pid,
913 field->next_prio, 897 field->next_prio,
914 T)) 898 T);
915 return TRACE_TYPE_PARTIAL_LINE; 899
916 900 return trace_handle_return(&iter->seq);
917 return TRACE_TYPE_HANDLED;
918} 901}
919 902
920static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, 903static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
@@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
942 S = task_state_char(field->prev_state); 925 S = task_state_char(field->prev_state);
943 T = task_state_char(field->next_state); 926 T = task_state_char(field->next_state);
944 927
945 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 928 SEQ_PUT_HEX_FIELD(s, field->prev_pid);
946 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); 929 SEQ_PUT_HEX_FIELD(s, field->prev_prio);
947 SEQ_PUT_HEX_FIELD_RET(s, S); 930 SEQ_PUT_HEX_FIELD(s, S);
948 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); 931 SEQ_PUT_HEX_FIELD(s, field->next_cpu);
949 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); 932 SEQ_PUT_HEX_FIELD(s, field->next_pid);
950 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); 933 SEQ_PUT_HEX_FIELD(s, field->next_prio);
951 SEQ_PUT_HEX_FIELD_RET(s, T); 934 SEQ_PUT_HEX_FIELD(s, T);
952 935
953 return TRACE_TYPE_HANDLED; 936 return trace_handle_return(s);
954} 937}
955 938
956static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, 939static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
@@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
973 956
974 trace_assign_type(field, iter->ent); 957 trace_assign_type(field, iter->ent);
975 958
976 SEQ_PUT_FIELD_RET(s, field->prev_pid); 959 SEQ_PUT_FIELD(s, field->prev_pid);
977 SEQ_PUT_FIELD_RET(s, field->prev_prio); 960 SEQ_PUT_FIELD(s, field->prev_prio);
978 SEQ_PUT_FIELD_RET(s, field->prev_state); 961 SEQ_PUT_FIELD(s, field->prev_state);
979 SEQ_PUT_FIELD_RET(s, field->next_pid); 962 SEQ_PUT_FIELD(s, field->next_cpu);
980 SEQ_PUT_FIELD_RET(s, field->next_prio); 963 SEQ_PUT_FIELD(s, field->next_pid);
981 SEQ_PUT_FIELD_RET(s, field->next_state); 964 SEQ_PUT_FIELD(s, field->next_prio);
965 SEQ_PUT_FIELD(s, field->next_state);
982 966
983 return TRACE_TYPE_HANDLED; 967 return trace_handle_return(s);
984} 968}
985 969
986static struct trace_event_functions trace_ctx_funcs = { 970static struct trace_event_functions trace_ctx_funcs = {
@@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1020 trace_assign_type(field, iter->ent); 1004 trace_assign_type(field, iter->ent);
1021 end = (unsigned long *)((long)iter->ent + iter->ent_size); 1005 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1022 1006
1023 if (!trace_seq_puts(s, "<stack trace>\n")) 1007 trace_seq_puts(s, "<stack trace>\n");
1024 goto partial;
1025 1008
1026 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { 1009 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1027 if (!trace_seq_puts(s, " => "))
1028 goto partial;
1029 1010
1030 if (!seq_print_ip_sym(s, *p, flags)) 1011 if (trace_seq_has_overflowed(s))
1031 goto partial; 1012 break;
1032 if (!trace_seq_putc(s, '\n'))
1033 goto partial;
1034 }
1035 1013
1036 return TRACE_TYPE_HANDLED; 1014 trace_seq_puts(s, " => ");
1015 seq_print_ip_sym(s, *p, flags);
1016 trace_seq_putc(s, '\n');
1017 }
1037 1018
1038 partial: 1019 return trace_handle_return(s);
1039 return TRACE_TYPE_PARTIAL_LINE;
1040} 1020}
1041 1021
1042static struct trace_event_functions trace_stack_funcs = { 1022static struct trace_event_functions trace_stack_funcs = {
@@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1057 1037
1058 trace_assign_type(field, iter->ent); 1038 trace_assign_type(field, iter->ent);
1059 1039
1060 if (!trace_seq_puts(s, "<user stack trace>\n")) 1040 trace_seq_puts(s, "<user stack trace>\n");
1061 goto partial; 1041 seq_print_userip_objs(field, s, flags);
1062
1063 if (!seq_print_userip_objs(field, s, flags))
1064 goto partial;
1065
1066 return TRACE_TYPE_HANDLED;
1067 1042
1068 partial: 1043 return trace_handle_return(s);
1069 return TRACE_TYPE_PARTIAL_LINE;
1070} 1044}
1071 1045
1072static struct trace_event_functions trace_user_stack_funcs = { 1046static struct trace_event_functions trace_user_stack_funcs = {
@@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags,
1089 1063
1090 trace_assign_type(field, entry); 1064 trace_assign_type(field, entry);
1091 1065
1092 if (!seq_print_ip_sym(s, field->ip, flags)) 1066 seq_print_ip_sym(s, field->ip, flags);
1093 goto partial; 1067 trace_seq_puts(s, ": ");
1068 trace_seq_puts(s, field->str);
1094 1069
1095 if (!trace_seq_puts(s, ": ")) 1070 return trace_handle_return(s);
1096 goto partial;
1097
1098 if (!trace_seq_puts(s, field->str))
1099 goto partial;
1100
1101 return TRACE_TYPE_HANDLED;
1102
1103 partial:
1104 return TRACE_TYPE_PARTIAL_LINE;
1105} 1071}
1106 1072
1107 1073
@@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags,
1114 1080
1115 trace_assign_type(field, iter->ent); 1081 trace_assign_type(field, iter->ent);
1116 1082
1117 if (!trace_seq_printf(s, ": %lx : ", field->ip)) 1083 trace_seq_printf(s, ": %lx : ", field->ip);
1118 goto partial; 1084 trace_seq_puts(s, field->str);
1119
1120 if (!trace_seq_puts(s, field->str))
1121 goto partial;
1122 1085
1123 return TRACE_TYPE_HANDLED; 1086 return trace_handle_return(s);
1124
1125 partial:
1126 return TRACE_TYPE_PARTIAL_LINE;
1127} 1087}
1128 1088
1129static struct trace_event_functions trace_bputs_funcs = { 1089static struct trace_event_functions trace_bputs_funcs = {
@@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags,
1147 1107
1148 trace_assign_type(field, entry); 1108 trace_assign_type(field, entry);
1149 1109
1150 if (!seq_print_ip_sym(s, field->ip, flags)) 1110 seq_print_ip_sym(s, field->ip, flags);
1151 goto partial; 1111 trace_seq_puts(s, ": ");
1152 1112 trace_seq_bprintf(s, field->fmt, field->buf);
1153 if (!trace_seq_puts(s, ": "))
1154 goto partial;
1155
1156 if (!trace_seq_bprintf(s, field->fmt, field->buf))
1157 goto partial;
1158 1113
1159 return TRACE_TYPE_HANDLED; 1114 return trace_handle_return(s);
1160
1161 partial:
1162 return TRACE_TYPE_PARTIAL_LINE;
1163} 1115}
1164 1116
1165 1117
@@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags,
1172 1124
1173 trace_assign_type(field, iter->ent); 1125 trace_assign_type(field, iter->ent);
1174 1126
1175 if (!trace_seq_printf(s, ": %lx : ", field->ip)) 1127 trace_seq_printf(s, ": %lx : ", field->ip);
1176 goto partial; 1128 trace_seq_bprintf(s, field->fmt, field->buf);
1177
1178 if (!trace_seq_bprintf(s, field->fmt, field->buf))
1179 goto partial;
1180 1129
1181 return TRACE_TYPE_HANDLED; 1130 return trace_handle_return(s);
1182
1183 partial:
1184 return TRACE_TYPE_PARTIAL_LINE;
1185} 1131}
1186 1132
1187static struct trace_event_functions trace_bprint_funcs = { 1133static struct trace_event_functions trace_bprint_funcs = {
@@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1203 1149
1204 trace_assign_type(field, iter->ent); 1150 trace_assign_type(field, iter->ent);
1205 1151
1206 if (!seq_print_ip_sym(s, field->ip, flags)) 1152 seq_print_ip_sym(s, field->ip, flags);
1207 goto partial; 1153 trace_seq_printf(s, ": %s", field->buf);
1208
1209 if (!trace_seq_printf(s, ": %s", field->buf))
1210 goto partial;
1211 1154
1212 return TRACE_TYPE_HANDLED; 1155 return trace_handle_return(s);
1213
1214 partial:
1215 return TRACE_TYPE_PARTIAL_LINE;
1216} 1156}
1217 1157
1218static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, 1158static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
@@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1222 1162
1223 trace_assign_type(field, iter->ent); 1163 trace_assign_type(field, iter->ent);
1224 1164
1225 if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) 1165 trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
1226 goto partial;
1227
1228 return TRACE_TYPE_HANDLED;
1229 1166
1230 partial: 1167 return trace_handle_return(&iter->seq);
1231 return TRACE_TYPE_PARTIAL_LINE;
1232} 1168}
1233 1169
1234static struct trace_event_functions trace_print_funcs = { 1170static struct trace_event_functions trace_print_funcs = {
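The trace_output.c hunks above all apply the same conversion: the trace_seq_*() writers no longer return a length to check, so each event print callback simply emits its output and finishes with trace_handle_return(), which maps the sequence's overflow state to TRACE_TYPE_HANDLED or TRACE_TYPE_PARTIAL_LINE. A minimal sketch of a converted handler under that pattern (the event type, the field names and trace_my_print() are hypothetical; trace_seq_printf(), trace_seq_putc(), seq_print_ip_sym() and trace_handle_return() are the helpers used throughout this series):

static enum print_line_t trace_my_print(struct trace_iterator *iter,
					int flags, struct trace_event *event)
{
	struct my_entry *field;			/* hypothetical event payload */
	struct trace_seq *s = &iter->seq;

	trace_assign_type(field, iter->ent);

	/* Write unconditionally; errors are tracked inside the trace_seq. */
	seq_print_ip_sym(s, field->ip, flags);
	trace_seq_printf(s, ": value=%lu", field->value);
	trace_seq_putc(s, '\n');

	/* Report PARTIAL_LINE only if the trace_seq overflowed on the way. */
	return trace_handle_return(s);
}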
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 80b25b585a70..8ef2c40efb3c 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
35extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
36extern struct rw_semaphore trace_event_sem; 36extern struct rw_semaphore trace_event_sem;
37 37
38#define SEQ_PUT_FIELD_RET(s, x) \ 38#define SEQ_PUT_FIELD(s, x) \
39do { \ 39 trace_seq_putmem(s, &(x), sizeof(x))
40 if (!trace_seq_putmem(s, &(x), sizeof(x))) \ 40
41 return TRACE_TYPE_PARTIAL_LINE; \ 41#define SEQ_PUT_HEX_FIELD(s, x) \
42} while (0) 42 trace_seq_putmem_hex(s, &(x), sizeof(x))
43
44#define SEQ_PUT_HEX_FIELD_RET(s, x) \
45do { \
46 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
47 return TRACE_TYPE_PARTIAL_LINE; \
48} while (0)
49 43
50#endif 44#endif
51 45
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2900817ba65c..c4e70b6bd7fa 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v)
305 seq_puts(m, "\\t"); 305 seq_puts(m, "\\t");
306 break; 306 break;
307 case '\\': 307 case '\\':
308 seq_puts(m, "\\"); 308 seq_putc(m, '\\');
309 break; 309 break;
310 case '"': 310 case '"':
311 seq_puts(m, "\\\""); 311 seq_puts(m, "\\\"");
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d4b9fc22cd27..b983b2fd2ca1 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -40,7 +40,8 @@ const char *reserved_field_names[] = {
40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ 40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
41 void *data, void *ent) \ 41 void *data, void *ent) \
42{ \ 42{ \
43 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ 43 trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
44 return !trace_seq_has_overflowed(s); \
44} \ 45} \
45const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ 46const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
46NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); 47NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
@@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
61 int len = *(u32 *)data >> 16; 62 int len = *(u32 *)data >> 16;
62 63
63 if (!len) 64 if (!len)
64 return trace_seq_printf(s, " %s=(fault)", name); 65 trace_seq_printf(s, " %s=(fault)", name);
65 else 66 else
66 return trace_seq_printf(s, " %s=\"%s\"", name, 67 trace_seq_printf(s, " %s=\"%s\"", name,
67 (const char *)get_loc_data(data, ent)); 68 (const char *)get_loc_data(data, ent));
69 return !trace_seq_has_overflowed(s);
68} 70}
69NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); 71NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
70 72
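The probe print helpers keep an int return because their callers still expect a success flag, so the idiom here differs slightly from the output handlers: write with the now-void trace_seq_printf() and then report success as !trace_seq_has_overflowed(). The sketch below is roughly what the DEFINE_BASIC_PRINT_TYPE_FUNC macro expands to for a 64-bit integer type after this change; it is illustrative, not a copy of the generated code:

int print_type_u64_sketch(struct trace_seq *s, const char *name,
			  void *data, void *ent)
{
	/* Emit " name=value"; any overflow is recorded inside the trace_seq. */
	trace_seq_printf(s, " %s=%Lu", name, *(u64 *)data);

	/* Callers still want a boolean: succeed unless the seq overflowed. */
	return !trace_seq_has_overflowed(s);
}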
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3f34dc9b40f3..2e293beb186e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -14,122 +14,26 @@
14 14
15#include "trace.h" 15#include "trace.h"
16 16
17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled;
19static int sched_ref; 17static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 18static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped;
22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->trace_buffer.buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!call_filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51 19
52static void 20static void
53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) 21probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54{ 22{
55 struct trace_array_cpu *data;
56 unsigned long flags;
57 int cpu;
58 int pc;
59
60 if (unlikely(!sched_ref)) 23 if (unlikely(!sched_ref))
61 return; 24 return;
62 25
63 tracing_record_cmdline(prev); 26 tracing_record_cmdline(prev);
64 tracing_record_cmdline(next); 27 tracing_record_cmdline(next);
65
66 if (!tracer_enabled || sched_stopped)
67 return;
68
69 pc = preempt_count();
70 local_irq_save(flags);
71 cpu = raw_smp_processor_id();
72 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
73
74 if (likely(!atomic_read(&data->disabled)))
75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
76
77 local_irq_restore(flags);
78}
79
80void
81tracing_sched_wakeup_trace(struct trace_array *tr,
82 struct task_struct *wakee,
83 struct task_struct *curr,
84 unsigned long flags, int pc)
85{
86 struct ftrace_event_call *call = &event_wakeup;
87 struct ring_buffer_event *event;
88 struct ctx_switch_entry *entry;
89 struct ring_buffer *buffer = tr->trace_buffer.buffer;
90
91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
92 sizeof(*entry), flags, pc);
93 if (!event)
94 return;
95 entry = ring_buffer_event_data(event);
96 entry->prev_pid = curr->pid;
97 entry->prev_prio = curr->prio;
98 entry->prev_state = curr->state;
99 entry->next_pid = wakee->pid;
100 entry->next_prio = wakee->prio;
101 entry->next_state = wakee->state;
102 entry->next_cpu = task_cpu(wakee);
103
104 if (!call_filter_check_discard(call, entry, buffer, event))
105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106} 28}
107 29
108static void 30static void
109probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) 31probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
110{ 32{
111 struct trace_array_cpu *data;
112 unsigned long flags;
113 int cpu, pc;
114
115 if (unlikely(!sched_ref)) 33 if (unlikely(!sched_ref))
116 return; 34 return;
117 35
118 tracing_record_cmdline(current); 36 tracing_record_cmdline(current);
119
120 if (!tracer_enabled || sched_stopped)
121 return;
122
123 pc = preempt_count();
124 local_irq_save(flags);
125 cpu = raw_smp_processor_id();
126 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
127
128 if (likely(!atomic_read(&data->disabled)))
129 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
130 flags, pc);
131
132 local_irq_restore(flags);
133} 37}
134 38
135static int tracing_sched_register(void) 39static int tracing_sched_register(void)
@@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void)
197{ 101{
198 tracing_stop_sched_switch(); 102 tracing_stop_sched_switch();
199} 103}
200
201/**
202 * tracing_start_sched_switch_record - start tracing context switches
203 *
204 * Turns on context switch tracing for a tracer.
205 */
206void tracing_start_sched_switch_record(void)
207{
208 if (unlikely(!ctx_trace)) {
209 WARN_ON(1);
210 return;
211 }
212
213 tracing_start_sched_switch();
214
215 mutex_lock(&sched_register_mutex);
216 tracer_enabled++;
217 mutex_unlock(&sched_register_mutex);
218}
219
220/**
221 * tracing_stop_sched_switch_record - start tracing context switches
222 *
223 * Turns off context switch tracing for a tracer.
224 */
225void tracing_stop_sched_switch_record(void)
226{
227 mutex_lock(&sched_register_mutex);
228 tracer_enabled--;
229 WARN_ON(tracer_enabled < 0);
230 mutex_unlock(&sched_register_mutex);
231
232 tracing_stop_sched_switch();
233}
234
235/**
236 * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
237 * @tr: trace array pointer to assign
238 *
239 * Some tracers might want to record the context switches in their
240 * trace. This function lets those tracers assign the trace array
241 * to use.
242 */
243void tracing_sched_switch_assign_trace(struct trace_array *tr)
244{
245 ctx_trace = tr;
246}
247
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 19bd8928ce94..8fb84b362816 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
365 wakeup_current_cpu = cpu; 365 wakeup_current_cpu = cpu;
366} 366}
367 367
368static void
369tracing_sched_switch_trace(struct trace_array *tr,
370 struct task_struct *prev,
371 struct task_struct *next,
372 unsigned long flags, int pc)
373{
374 struct ftrace_event_call *call = &event_context_switch;
375 struct ring_buffer *buffer = tr->trace_buffer.buffer;
376 struct ring_buffer_event *event;
377 struct ctx_switch_entry *entry;
378
379 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
380 sizeof(*entry), flags, pc);
381 if (!event)
382 return;
383 entry = ring_buffer_event_data(event);
384 entry->prev_pid = prev->pid;
385 entry->prev_prio = prev->prio;
386 entry->prev_state = prev->state;
387 entry->next_pid = next->pid;
388 entry->next_prio = next->prio;
389 entry->next_state = next->state;
390 entry->next_cpu = task_cpu(next);
391
392 if (!call_filter_check_discard(call, entry, buffer, event))
393 trace_buffer_unlock_commit(buffer, event, flags, pc);
394}
395
396static void
397tracing_sched_wakeup_trace(struct trace_array *tr,
398 struct task_struct *wakee,
399 struct task_struct *curr,
400 unsigned long flags, int pc)
401{
402 struct ftrace_event_call *call = &event_wakeup;
403 struct ring_buffer_event *event;
404 struct ctx_switch_entry *entry;
405 struct ring_buffer *buffer = tr->trace_buffer.buffer;
406
407 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
408 sizeof(*entry), flags, pc);
409 if (!event)
410 return;
411 entry = ring_buffer_event_data(event);
412 entry->prev_pid = curr->pid;
413 entry->prev_prio = curr->prio;
414 entry->prev_state = curr->state;
415 entry->next_pid = wakee->pid;
416 entry->next_prio = wakee->prio;
417 entry->next_state = wakee->state;
418 entry->next_cpu = task_cpu(wakee);
419
420 if (!call_filter_check_discard(call, entry, buffer, event))
421 trace_buffer_unlock_commit(buffer, event, flags, pc);
422}
423
368static void notrace 424static void notrace
369probe_wakeup_sched_switch(void *ignore, 425probe_wakeup_sched_switch(void *ignore,
370 struct task_struct *prev, struct task_struct *next) 426 struct task_struct *prev, struct task_struct *next)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 1f24ed99dca2..f8b45d8792f9 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -27,10 +27,19 @@
27#include <linux/trace_seq.h> 27#include <linux/trace_seq.h>
28 28
29/* How much buffer is left on the trace_seq? */ 29/* How much buffer is left on the trace_seq? */
30#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) 30#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq)
31 31
32/* How much buffer is written? */ 32/* How much buffer is written? */
33#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) 33#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq)
34
35/*
36 * trace_seq should work with being initialized with 0s.
37 */
38static inline void __trace_seq_init(struct trace_seq *s)
39{
40 if (unlikely(!s->seq.size))
41 trace_seq_init(s);
42}
34 43
35/** 44/**
36 * trace_print_seq - move the contents of trace_seq into a seq_file 45 * trace_print_seq - move the contents of trace_seq into a seq_file
@@ -43,10 +52,11 @@
43 */ 52 */
44int trace_print_seq(struct seq_file *m, struct trace_seq *s) 53int trace_print_seq(struct seq_file *m, struct trace_seq *s)
45{ 54{
46 unsigned int len = TRACE_SEQ_BUF_USED(s);
47 int ret; 55 int ret;
48 56
49 ret = seq_write(m, s->buffer, len); 57 __trace_seq_init(s);
58
59 ret = seq_buf_print_seq(m, &s->seq);
50 60
51 /* 61 /*
52 * Only reset this buffer if we successfully wrote to the 62 * Only reset this buffer if we successfully wrote to the
@@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
69 * trace_seq_printf() is used to store strings into a special 79 * trace_seq_printf() is used to store strings into a special
70 * buffer (@s). Then the output may be either used by 80 * buffer (@s). Then the output may be either used by
71 * the sequencer or pulled into another buffer. 81 * the sequencer or pulled into another buffer.
72 *
73 * Returns 1 if we successfully written all the contents to
74 * the buffer.
75 * Returns 0 if we the length to write is bigger than the
76 * reserved buffer space. In this case, nothing gets written.
77 */ 82 */
78int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 83void trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
79{ 84{
80 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 85 unsigned int save_len = s->seq.len;
81 va_list ap; 86 va_list ap;
82 int ret;
83 87
84 if (s->full || !len) 88 if (s->full)
85 return 0; 89 return;
90
91 __trace_seq_init(s);
86 92
87 va_start(ap, fmt); 93 va_start(ap, fmt);
88 ret = vsnprintf(s->buffer + s->len, len, fmt, ap); 94 seq_buf_vprintf(&s->seq, fmt, ap);
89 va_end(ap); 95 va_end(ap);
90 96
91 /* If we can't write it all, don't bother writing anything */ 97 /* If we can't write it all, don't bother writing anything */
92 if (ret >= len) { 98 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
99 s->seq.len = save_len;
93 s->full = 1; 100 s->full = 1;
94 return 0;
95 } 101 }
96
97 s->len += ret;
98
99 return 1;
100} 102}
101EXPORT_SYMBOL_GPL(trace_seq_printf); 103EXPORT_SYMBOL_GPL(trace_seq_printf);
102 104
@@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf);
107 * @nmaskbits: The number of bits that are valid in @maskp 109 * @nmaskbits: The number of bits that are valid in @maskp
108 * 110 *
109 * Writes a ASCII representation of a bitmask string into @s. 111 * Writes a ASCII representation of a bitmask string into @s.
110 *
111 * Returns 1 if we successfully written all the contents to
112 * the buffer.
113 * Returns 0 if we the length to write is bigger than the
114 * reserved buffer space. In this case, nothing gets written.
115 */ 112 */
116int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, 113void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
117 int nmaskbits) 114 int nmaskbits)
118{ 115{
119 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 116 unsigned int save_len = s->seq.len;
120 int ret;
121 117
122 if (s->full || !len) 118 if (s->full)
123 return 0; 119 return;
124 120
125 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); 121 __trace_seq_init(s);
126 s->len += ret;
127 122
128 return 1; 123 seq_buf_bitmask(&s->seq, maskp, nmaskbits);
124
125 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
126 s->seq.len = save_len;
127 s->full = 1;
128 }
129} 129}
130EXPORT_SYMBOL_GPL(trace_seq_bitmask); 130EXPORT_SYMBOL_GPL(trace_seq_bitmask);
131 131
@@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
139 * trace_seq_printf is used to store strings into a special 139 * trace_seq_printf is used to store strings into a special
140 * buffer (@s). Then the output may be either used by 140 * buffer (@s). Then the output may be either used by
141 * the sequencer or pulled into another buffer. 141 * the sequencer or pulled into another buffer.
142 *
143 * Returns how much it wrote to the buffer.
144 */ 142 */
145int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) 143void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
146{ 144{
147 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 145 unsigned int save_len = s->seq.len;
148 int ret;
149 146
150 if (s->full || !len) 147 if (s->full)
151 return 0; 148 return;
152 149
153 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 150 __trace_seq_init(s);
151
152 seq_buf_vprintf(&s->seq, fmt, args);
154 153
155 /* If we can't write it all, don't bother writing anything */ 154 /* If we can't write it all, don't bother writing anything */
156 if (ret >= len) { 155 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
156 s->seq.len = save_len;
157 s->full = 1; 157 s->full = 1;
158 return 0;
159 } 158 }
160
161 s->len += ret;
162
163 return len;
164} 159}
165EXPORT_SYMBOL_GPL(trace_seq_vprintf); 160EXPORT_SYMBOL_GPL(trace_seq_vprintf);
166 161
@@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf);
178 * 173 *
179 * This function will take the format and the binary array and finish 174 * This function will take the format and the binary array and finish
180 * the conversion into the ASCII string within the buffer. 175 * the conversion into the ASCII string within the buffer.
181 *
182 * Returns how much it wrote to the buffer.
183 */ 176 */
184int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) 177void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
185{ 178{
186 unsigned int len = TRACE_SEQ_BUF_LEFT(s); 179 unsigned int save_len = s->seq.len;
187 int ret;
188 180
189 if (s->full || !len) 181 if (s->full)
190 return 0; 182 return;
183
184 __trace_seq_init(s);
191 185
192 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 186 seq_buf_bprintf(&s->seq, fmt, binary);
193 187
194 /* If we can't write it all, don't bother writing anything */ 188 /* If we can't write it all, don't bother writing anything */
195 if (ret >= len) { 189 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
190 s->seq.len = save_len;
196 s->full = 1; 191 s->full = 1;
197 return 0; 192 return;
198 } 193 }
199
200 s->len += ret;
201
202 return len;
203} 194}
204EXPORT_SYMBOL_GPL(trace_seq_bprintf); 195EXPORT_SYMBOL_GPL(trace_seq_bprintf);
205 196
@@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf);
212 * copy to user routines. This function records a simple string 203 * copy to user routines. This function records a simple string
213 * into a special buffer (@s) for later retrieval by a sequencer 204 * into a special buffer (@s) for later retrieval by a sequencer
214 * or other mechanism. 205 * or other mechanism.
215 *
216 * Returns how much it wrote to the buffer.
217 */ 206 */
218int trace_seq_puts(struct trace_seq *s, const char *str) 207void trace_seq_puts(struct trace_seq *s, const char *str)
219{ 208{
220 unsigned int len = strlen(str); 209 unsigned int len = strlen(str);
221 210
222 if (s->full) 211 if (s->full)
223 return 0; 212 return;
213
214 __trace_seq_init(s);
224 215
225 if (len > TRACE_SEQ_BUF_LEFT(s)) { 216 if (len > TRACE_SEQ_BUF_LEFT(s)) {
226 s->full = 1; 217 s->full = 1;
227 return 0; 218 return;
228 } 219 }
229 220
230 memcpy(s->buffer + s->len, str, len); 221 seq_buf_putmem(&s->seq, str, len);
231 s->len += len;
232
233 return len;
234} 222}
235EXPORT_SYMBOL_GPL(trace_seq_puts); 223EXPORT_SYMBOL_GPL(trace_seq_puts);
236 224
@@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
243 * copy to user routines. This function records a simple charater 231 * copy to user routines. This function records a simple charater
244 * into a special buffer (@s) for later retrieval by a sequencer 232 * into a special buffer (@s) for later retrieval by a sequencer
245 * or other mechanism. 233 * or other mechanism.
246 *
247 * Returns how much it wrote to the buffer.
248 */ 234 */
249int trace_seq_putc(struct trace_seq *s, unsigned char c) 235void trace_seq_putc(struct trace_seq *s, unsigned char c)
250{ 236{
251 if (s->full) 237 if (s->full)
252 return 0; 238 return;
239
240 __trace_seq_init(s);
253 241
254 if (TRACE_SEQ_BUF_LEFT(s) < 1) { 242 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
255 s->full = 1; 243 s->full = 1;
256 return 0; 244 return;
257 } 245 }
258 246
259 s->buffer[s->len++] = c; 247 seq_buf_putc(&s->seq, c);
260
261 return 1;
262} 248}
263EXPORT_SYMBOL_GPL(trace_seq_putc); 249EXPORT_SYMBOL_GPL(trace_seq_putc);
264 250
@@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc);
271 * There may be cases where raw memory needs to be written into the 257 * There may be cases where raw memory needs to be written into the
272 * buffer and a strcpy() would not work. Using this function allows 258 * buffer and a strcpy() would not work. Using this function allows
273 * for such cases. 259 * for such cases.
274 *
275 * Returns how much it wrote to the buffer.
276 */ 260 */
277int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) 261void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
278{ 262{
279 if (s->full) 263 if (s->full)
280 return 0; 264 return;
265
266 __trace_seq_init(s);
281 267
282 if (len > TRACE_SEQ_BUF_LEFT(s)) { 268 if (len > TRACE_SEQ_BUF_LEFT(s)) {
283 s->full = 1; 269 s->full = 1;
284 return 0; 270 return;
285 } 271 }
286 272
287 memcpy(s->buffer + s->len, mem, len); 273 seq_buf_putmem(&s->seq, mem, len);
288 s->len += len;
289
290 return len;
291} 274}
292EXPORT_SYMBOL_GPL(trace_seq_putmem); 275EXPORT_SYMBOL_GPL(trace_seq_putmem);
293 276
294#define MAX_MEMHEX_BYTES 8U
295#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
296
297/** 277/**
298 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex 278 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
299 * @s: trace sequence descriptor 279 * @s: trace sequence descriptor
@@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem);
303 * This is similar to trace_seq_putmem() except instead of just copying the 283 * This is similar to trace_seq_putmem() except instead of just copying the
304 * raw memory into the buffer it writes its ASCII representation of it 284 * raw memory into the buffer it writes its ASCII representation of it
305 * in hex characters. 285 * in hex characters.
306 *
307 * Returns how much it wrote to the buffer.
308 */ 286 */
309int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, 287void trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
310 unsigned int len) 288 unsigned int len)
311{ 289{
312 unsigned char hex[HEX_CHARS]; 290 unsigned int save_len = s->seq.len;
313 const unsigned char *data = mem;
314 unsigned int start_len;
315 int i, j;
316 int cnt = 0;
317 291
318 if (s->full) 292 if (s->full)
319 return 0; 293 return;
320 294
321 while (len) { 295 __trace_seq_init(s);
322 start_len = min(len, HEX_CHARS - 1); 296
323#ifdef __BIG_ENDIAN 297 /* Each byte is represented by two chars */
324 for (i = 0, j = 0; i < start_len; i++) { 298 if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) {
325#else 299 s->full = 1;
326 for (i = start_len-1, j = 0; i >= 0; i--) { 300 return;
327#endif 301 }
328 hex[j++] = hex_asc_hi(data[i]); 302
329 hex[j++] = hex_asc_lo(data[i]); 303 /* The added spaces can still cause an overflow */
330 } 304 seq_buf_putmem_hex(&s->seq, mem, len);
331 if (WARN_ON_ONCE(j == 0 || j/2 > len)) 305
332 break; 306 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
333 307 s->seq.len = save_len;
334 /* j increments twice per loop */ 308 s->full = 1;
335 len -= j / 2; 309 return;
336 hex[j++] = ' ';
337
338 cnt += trace_seq_putmem(s, hex, j);
339 } 310 }
340 return cnt;
341} 311}
342EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); 312EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
343 313
@@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
355 */ 325 */
356int trace_seq_path(struct trace_seq *s, const struct path *path) 326int trace_seq_path(struct trace_seq *s, const struct path *path)
357{ 327{
358 unsigned char *p; 328 unsigned int save_len = s->seq.len;
359 329
360 if (s->full) 330 if (s->full)
361 return 0; 331 return 0;
362 332
333 __trace_seq_init(s);
334
363 if (TRACE_SEQ_BUF_LEFT(s) < 1) { 335 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
364 s->full = 1; 336 s->full = 1;
365 return 0; 337 return 0;
366 } 338 }
367 339
368 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 340 seq_buf_path(&s->seq, path, "\n");
369 if (!IS_ERR(p)) { 341
370 p = mangle_path(s->buffer + s->len, p, "\n"); 342 if (unlikely(seq_buf_has_overflowed(&s->seq))) {
371 if (p) { 343 s->seq.len = save_len;
372 s->len = p - s->buffer; 344 s->full = 1;
373 return 1; 345 return 0;
374 }
375 } else {
376 s->buffer[s->len++] = '?';
377 return 1;
378 } 346 }
379 347
380 s->full = 1; 348 return 1;
381 return 0;
382} 349}
383EXPORT_SYMBOL_GPL(trace_seq_path); 350EXPORT_SYMBOL_GPL(trace_seq_path);
384 351
@@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
404 */ 371 */
405int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) 372int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
406{ 373{
407 int len; 374 __trace_seq_init(s);
408 int ret; 375 return seq_buf_to_user(&s->seq, ubuf, cnt);
409
410 if (!cnt)
411 return 0;
412
413 if (s->len <= s->readpos)
414 return -EBUSY;
415
416 len = s->len - s->readpos;
417 if (cnt > len)
418 cnt = len;
419 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
420 if (ret == cnt)
421 return -EFAULT;
422
423 cnt -= ret;
424
425 s->readpos += cnt;
426 return cnt;
427} 376}
428EXPORT_SYMBOL_GPL(trace_seq_to_user); 377EXPORT_SYMBOL_GPL(trace_seq_to_user);
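The trace_seq conversion above delegates all buffer management to the embedded seq_buf and keeps only two pieces of local policy: lazy setup via __trace_seq_init() and the save-length/roll-back idiom, where a write that would overflow the buffer is undone so the output never ends mid-string. A small self-contained userspace model of that roll-back idiom follows; it is not the kernel API, just the shape of the logic, with made-up names (mini_seq, mini_seq_printf):

#include <stdarg.h>
#include <stdio.h>

struct mini_seq {
	char	buf[32];
	size_t	len;		/* bytes used */
	int	full;		/* set once an append did not fit */
};

/* Append formatted text; on overflow restore the old length ("all or nothing"). */
static void mini_seq_printf(struct mini_seq *s, const char *fmt, ...)
{
	size_t save_len = s->len;
	va_list ap;
	int ret;

	if (s->full)
		return;

	va_start(ap, fmt);
	ret = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
	va_end(ap);

	if (ret < 0 || (size_t)ret >= sizeof(s->buf) - s->len) {
		s->len = save_len;	/* roll back the partial write */
		s->full = 1;
		return;
	}
	s->len += ret;
}

int main(void)
{
	struct mini_seq s = { .len = 0, .full = 0 };

	mini_seq_printf(&s, "pid=%d ", 42);
	mini_seq_printf(&s, "comm=%s", "a-very-long-command-name");	/* overflows */
	printf("full=%d buf=\"%.*s\"\n", s.full, (int)s.len, s.buf);
	return 0;
}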
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 29228c4d5696..c6ee36fcbf90 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
114 struct trace_entry *ent = iter->ent; 114 struct trace_entry *ent = iter->ent;
115 struct syscall_trace_enter *trace; 115 struct syscall_trace_enter *trace;
116 struct syscall_metadata *entry; 116 struct syscall_metadata *entry;
117 int i, ret, syscall; 117 int i, syscall;
118 118
119 trace = (typeof(trace))ent; 119 trace = (typeof(trace))ent;
120 syscall = trace->nr; 120 syscall = trace->nr;
@@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
128 goto end; 128 goto end;
129 } 129 }
130 130
131 ret = trace_seq_printf(s, "%s(", entry->name); 131 trace_seq_printf(s, "%s(", entry->name);
132 if (!ret)
133 return TRACE_TYPE_PARTIAL_LINE;
134 132
135 for (i = 0; i < entry->nb_args; i++) { 133 for (i = 0; i < entry->nb_args; i++) {
134
135 if (trace_seq_has_overflowed(s))
136 goto end;
137
136 /* parameter types */ 138 /* parameter types */
137 if (trace_flags & TRACE_ITER_VERBOSE) { 139 if (trace_flags & TRACE_ITER_VERBOSE)
138 ret = trace_seq_printf(s, "%s ", entry->types[i]); 140 trace_seq_printf(s, "%s ", entry->types[i]);
139 if (!ret) 141
140 return TRACE_TYPE_PARTIAL_LINE;
141 }
142 /* parameter values */ 142 /* parameter values */
143 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], 143 trace_seq_printf(s, "%s: %lx%s", entry->args[i],
144 trace->args[i], 144 trace->args[i],
145 i == entry->nb_args - 1 ? "" : ", "); 145 i == entry->nb_args - 1 ? "" : ", ");
146 if (!ret)
147 return TRACE_TYPE_PARTIAL_LINE;
148 } 146 }
149 147
150 ret = trace_seq_putc(s, ')'); 148 trace_seq_putc(s, ')');
151 if (!ret)
152 return TRACE_TYPE_PARTIAL_LINE;
153
154end: 149end:
155 ret = trace_seq_putc(s, '\n'); 150 trace_seq_putc(s, '\n');
156 if (!ret)
157 return TRACE_TYPE_PARTIAL_LINE;
158 151
159 return TRACE_TYPE_HANDLED; 152 return trace_handle_return(s);
160} 153}
161 154
162static enum print_line_t 155static enum print_line_t
@@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
168 struct syscall_trace_exit *trace; 161 struct syscall_trace_exit *trace;
169 int syscall; 162 int syscall;
170 struct syscall_metadata *entry; 163 struct syscall_metadata *entry;
171 int ret;
172 164
173 trace = (typeof(trace))ent; 165 trace = (typeof(trace))ent;
174 syscall = trace->nr; 166 syscall = trace->nr;
@@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
176 168
177 if (!entry) { 169 if (!entry) {
178 trace_seq_putc(s, '\n'); 170 trace_seq_putc(s, '\n');
179 return TRACE_TYPE_HANDLED; 171 goto out;
180 } 172 }
181 173
182 if (entry->exit_event->event.type != ent->type) { 174 if (entry->exit_event->event.type != ent->type) {
@@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
184 return TRACE_TYPE_UNHANDLED; 176 return TRACE_TYPE_UNHANDLED;
185 } 177 }
186 178
187 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 179 trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
188 trace->ret); 180 trace->ret);
189 if (!ret)
190 return TRACE_TYPE_PARTIAL_LINE;
191 181
192 return TRACE_TYPE_HANDLED; 182 out:
183 return trace_handle_return(s);
193} 184}
194 185
195extern char *__bad_type_size(void); 186extern char *__bad_type_size(void);
@@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
523 return (unsigned long)sys_call_table[nr]; 514 return (unsigned long)sys_call_table[nr];
524} 515}
525 516
526static int __init init_ftrace_syscalls(void) 517void __init init_ftrace_syscalls(void)
527{ 518{
528 struct syscall_metadata *meta; 519 struct syscall_metadata *meta;
529 unsigned long addr; 520 unsigned long addr;
@@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void)
533 GFP_KERNEL); 524 GFP_KERNEL);
534 if (!syscalls_metadata) { 525 if (!syscalls_metadata) {
535 WARN_ON(1); 526 WARN_ON(1);
536 return -ENOMEM; 527 return;
537 } 528 }
538 529
539 for (i = 0; i < NR_syscalls; i++) { 530 for (i = 0; i < NR_syscalls; i++) {
@@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void)
545 meta->syscall_nr = i; 536 meta->syscall_nr = i;
546 syscalls_metadata[i] = meta; 537 syscalls_metadata[i] = meta;
547 } 538 }
548
549 return 0;
550} 539}
551early_initcall(init_ftrace_syscalls);
552 540
553#ifdef CONFIG_PERF_EVENTS 541#ifdef CONFIG_PERF_EVENTS
554 542
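print_syscall_enter() above also shows how loops are handled after the conversion: instead of checking every writer's return value, the loop tests trace_seq_has_overflowed() once per iteration and bails out early, and the final status still comes from trace_handle_return(). A condensed sketch of just that loop, pulled into a hypothetical helper and with the verbose-types branch dropped (the helpers and field names are the ones used in the hunk):

static enum print_line_t print_args_sketch(struct trace_seq *s,
					   struct syscall_metadata *entry,
					   struct syscall_trace_enter *trace)
{
	int i;

	trace_seq_printf(s, "%s(", entry->name);

	for (i = 0; i < entry->nb_args; i++) {
		/* One cheap check per iteration replaces per-call checks. */
		if (trace_seq_has_overflowed(s))
			break;

		trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				 trace->args[i],
				 i == entry->nb_args - 1 ? "" : ", ");
	}

	/* These become no-ops if the seq is already full. */
	trace_seq_printf(s, ")\n");

	return trace_handle_return(s);
}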
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 33ff6a24b802..8520acc34b18 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -552,8 +552,7 @@ error:
552 return ret; 552 return ret;
553 553
554fail_address_parse: 554fail_address_parse:
555 if (inode) 555 iput(inode);
556 iput(inode);
557 556
558 pr_info("Failed to parse address or file.\n"); 557 pr_info("Failed to parse address or file.\n");
559 558
@@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
606 for (i = 0; i < tu->tp.nr_args; i++) 605 for (i = 0; i < tu->tp.nr_args; i++)
607 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); 606 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
608 607
609 seq_printf(m, "\n"); 608 seq_putc(m, '\n');
610 return 0; 609 return 0;
611} 610}
612 611
@@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
852 tu = container_of(event, struct trace_uprobe, tp.call.event); 851 tu = container_of(event, struct trace_uprobe, tp.call.event);
853 852
854 if (is_ret_probe(tu)) { 853 if (is_ret_probe(tu)) {
855 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", 854 trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
856 ftrace_event_name(&tu->tp.call), 855 ftrace_event_name(&tu->tp.call),
857 entry->vaddr[1], entry->vaddr[0])) 856 entry->vaddr[1], entry->vaddr[0]);
858 goto partial;
859 data = DATAOF_TRACE_ENTRY(entry, true); 857 data = DATAOF_TRACE_ENTRY(entry, true);
860 } else { 858 } else {
861 if (!trace_seq_printf(s, "%s: (0x%lx)", 859 trace_seq_printf(s, "%s: (0x%lx)",
862 ftrace_event_name(&tu->tp.call), 860 ftrace_event_name(&tu->tp.call),
863 entry->vaddr[0])) 861 entry->vaddr[0]);
864 goto partial;
865 data = DATAOF_TRACE_ENTRY(entry, false); 862 data = DATAOF_TRACE_ENTRY(entry, false);
866 } 863 }
867 864
@@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
869 struct probe_arg *parg = &tu->tp.args[i]; 866 struct probe_arg *parg = &tu->tp.args[i];
870 867
871 if (!parg->type->print(s, parg->name, data + parg->offset, entry)) 868 if (!parg->type->print(s, parg->name, data + parg->offset, entry))
872 goto partial; 869 goto out;
873 } 870 }
874 871
875 if (trace_seq_puts(s, "\n")) 872 trace_seq_putc(s, '\n');
876 return TRACE_TYPE_HANDLED;
877 873
878partial: 874 out:
879 return TRACE_TYPE_PARTIAL_LINE; 875 return trace_handle_return(s);
880} 876}
881 877
882typedef bool (*filter_func_t)(struct uprobe_consumer *self, 878typedef bool (*filter_func_t)(struct uprobe_consumer *self,
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 602e5bbbceff..d58cc4d8f0d1 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
176 struct group_info *group_info; 176 struct group_info *group_info;
177 int retval; 177 int retval;
178 178
179 if (!ns_capable(current_user_ns(), CAP_SETGID)) 179 if (!may_setgroups())
180 return -EPERM; 180 return -EPERM;
181 if ((unsigned)gidsetsize > NGROUPS_MAX) 181 if ((unsigned)gidsetsize > NGROUPS_MAX)
182 return -EINVAL; 182 return -EINVAL;
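setgroups16() now gates on may_setgroups() rather than a bare CAP_SETGID check. That helper lives in kernel/groups.c, which the diffstat lists but this section does not show; based on the userns_may_setgroups() helper added later in this diff, it presumably combines the old capability test with the new per-namespace switch, roughly as sketched below (an assumption about unshown code, not a quote from this series):

/* Assumed shape of may_setgroups() (kernel/groups.c is not shown here):
 * keep the capability check and additionally require that the current
 * user namespace still permits setgroups(). */
bool may_setgroups(void)
{
	struct user_namespace *user_ns = current_user_ns();

	return ns_capable(user_ns, CAP_SETGID) &&
	       userns_may_setgroups(user_ns);
}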
diff --git a/kernel/user.c b/kernel/user.c
index 4efa39350e44..b069ccbfb0b0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -50,7 +50,11 @@ struct user_namespace init_user_ns = {
50 .count = ATOMIC_INIT(3), 50 .count = ATOMIC_INIT(3),
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .ns.inum = PROC_USER_INIT_INO,
54#ifdef CONFIG_USER_NS
55 .ns.ops = &userns_operations,
56#endif
57 .flags = USERNS_INIT_FLAGS,
54#ifdef CONFIG_PERSISTENT_KEYRINGS 58#ifdef CONFIG_PERSISTENT_KEYRINGS
55 .persistent_keyring_register_sem = 59 .persistent_keyring_register_sem =
56 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), 60 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index aa312b0dc3ec..4109f8320684 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -24,6 +24,7 @@
24#include <linux/fs_struct.h> 24#include <linux/fs_struct.h>
25 25
26static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
27static DEFINE_MUTEX(userns_state_mutex);
27 28
28static bool new_idmap_permitted(const struct file *file, 29static bool new_idmap_permitted(const struct file *file,
29 struct user_namespace *ns, int cap_setid, 30 struct user_namespace *ns, int cap_setid,
@@ -86,11 +87,12 @@ int create_user_ns(struct cred *new)
86 if (!ns) 87 if (!ns)
87 return -ENOMEM; 88 return -ENOMEM;
88 89
89 ret = proc_alloc_inum(&ns->proc_inum); 90 ret = ns_alloc_inum(&ns->ns);
90 if (ret) { 91 if (ret) {
91 kmem_cache_free(user_ns_cachep, ns); 92 kmem_cache_free(user_ns_cachep, ns);
92 return ret; 93 return ret;
93 } 94 }
95 ns->ns.ops = &userns_operations;
94 96
95 atomic_set(&ns->count, 1); 97 atomic_set(&ns->count, 1);
96 /* Leave the new->user_ns reference with the new user namespace. */ 98 /* Leave the new->user_ns reference with the new user namespace. */
@@ -99,6 +101,11 @@ int create_user_ns(struct cred *new)
99 ns->owner = owner; 101 ns->owner = owner;
100 ns->group = group; 102 ns->group = group;
101 103
104 /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
105 mutex_lock(&userns_state_mutex);
106 ns->flags = parent_ns->flags;
107 mutex_unlock(&userns_state_mutex);
108
102 set_cred_user_ns(new, ns); 109 set_cred_user_ns(new, ns);
103 110
104#ifdef CONFIG_PERSISTENT_KEYRINGS 111#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns)
136#ifdef CONFIG_PERSISTENT_KEYRINGS 143#ifdef CONFIG_PERSISTENT_KEYRINGS
137 key_put(ns->persistent_keyring_register); 144 key_put(ns->persistent_keyring_register);
138#endif 145#endif
139 proc_free_inum(ns->proc_inum); 146 ns_free_inum(&ns->ns);
140 kmem_cache_free(user_ns_cachep, ns); 147 kmem_cache_free(user_ns_cachep, ns);
141 ns = parent; 148 ns = parent;
142 } while (atomic_dec_and_test(&parent->count)); 149 } while (atomic_dec_and_test(&parent->count));
@@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
583 return false; 590 return false;
584} 591}
585 592
586
587static DEFINE_MUTEX(id_map_mutex);
588
589static ssize_t map_write(struct file *file, const char __user *buf, 593static ssize_t map_write(struct file *file, const char __user *buf,
590 size_t count, loff_t *ppos, 594 size_t count, loff_t *ppos,
591 int cap_setid, 595 int cap_setid,
@@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
602 ssize_t ret = -EINVAL; 606 ssize_t ret = -EINVAL;
603 607
604 /* 608 /*
605 * The id_map_mutex serializes all writes to any given map. 609 * The userns_state_mutex serializes all writes to any given map.
606 * 610 *
607 * Any map is only ever written once. 611 * Any map is only ever written once.
608 * 612 *
@@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
620 * order and smp_rmb() is guaranteed that we don't have crazy 624 * order and smp_rmb() is guaranteed that we don't have crazy
621 * architectures returning stale data. 625 * architectures returning stale data.
622 */ 626 */
623 mutex_lock(&id_map_mutex); 627 mutex_lock(&userns_state_mutex);
624 628
625 ret = -EPERM; 629 ret = -EPERM;
626 /* Only allow one successful write to the map */ 630 /* Only allow one successful write to the map */
@@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
640 if (!page) 644 if (!page)
641 goto out; 645 goto out;
642 646
643 /* Only allow <= page size writes at the beginning of the file */ 647 /* Only allow < page size writes at the beginning of the file */
644 ret = -EINVAL; 648 ret = -EINVAL;
645 if ((*ppos != 0) || (count >= PAGE_SIZE)) 649 if ((*ppos != 0) || (count >= PAGE_SIZE))
646 goto out; 650 goto out;
@@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
750 *ppos = count; 754 *ppos = count;
751 ret = count; 755 ret = count;
752out: 756out:
753 mutex_unlock(&id_map_mutex); 757 mutex_unlock(&userns_state_mutex);
754 if (page) 758 if (page)
755 free_page(page); 759 free_page(page);
756 return ret; 760 return ret;
@@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file,
812 struct user_namespace *ns, int cap_setid, 816 struct user_namespace *ns, int cap_setid,
813 struct uid_gid_map *new_map) 817 struct uid_gid_map *new_map)
814{ 818{
815 /* Allow mapping to your own filesystem ids */ 819 const struct cred *cred = file->f_cred;
816 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { 820 /* Don't allow mappings that would allow anything that wouldn't
821 * be allowed without the establishment of unprivileged mappings.
822 */
823 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
824 uid_eq(ns->owner, cred->euid)) {
817 u32 id = new_map->extent[0].lower_first; 825 u32 id = new_map->extent[0].lower_first;
818 if (cap_setid == CAP_SETUID) { 826 if (cap_setid == CAP_SETUID) {
819 kuid_t uid = make_kuid(ns->parent, id); 827 kuid_t uid = make_kuid(ns->parent, id);
820 if (uid_eq(uid, file->f_cred->fsuid)) 828 if (uid_eq(uid, cred->euid))
821 return true; 829 return true;
822 } else if (cap_setid == CAP_SETGID) { 830 } else if (cap_setid == CAP_SETGID) {
823 kgid_t gid = make_kgid(ns->parent, id); 831 kgid_t gid = make_kgid(ns->parent, id);
824 if (gid_eq(gid, file->f_cred->fsgid)) 832 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
833 gid_eq(gid, cred->egid))
825 return true; 834 return true;
826 } 835 }
827 } 836 }
@@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file,
841 return false; 850 return false;
842} 851}
843 852
844static void *userns_get(struct task_struct *task) 853int proc_setgroups_show(struct seq_file *seq, void *v)
854{
855 struct user_namespace *ns = seq->private;
856 unsigned long userns_flags = ACCESS_ONCE(ns->flags);
857
858 seq_printf(seq, "%s\n",
859 (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
860 "allow" : "deny");
861 return 0;
862}
863
864ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
865 size_t count, loff_t *ppos)
866{
867 struct seq_file *seq = file->private_data;
868 struct user_namespace *ns = seq->private;
869 char kbuf[8], *pos;
870 bool setgroups_allowed;
871 ssize_t ret;
872
873 /* Only allow a very narrow range of strings to be written */
874 ret = -EINVAL;
875 if ((*ppos != 0) || (count >= sizeof(kbuf)))
876 goto out;
877
878 /* What was written? */
879 ret = -EFAULT;
880 if (copy_from_user(kbuf, buf, count))
881 goto out;
882 kbuf[count] = '\0';
883 pos = kbuf;
884
885 /* What is being requested? */
886 ret = -EINVAL;
887 if (strncmp(pos, "allow", 5) == 0) {
888 pos += 5;
889 setgroups_allowed = true;
890 }
891 else if (strncmp(pos, "deny", 4) == 0) {
892 pos += 4;
893 setgroups_allowed = false;
894 }
895 else
896 goto out;
897
898 /* Verify there is not trailing junk on the line */
899 pos = skip_spaces(pos);
900 if (*pos != '\0')
901 goto out;
902
903 ret = -EPERM;
904 mutex_lock(&userns_state_mutex);
905 if (setgroups_allowed) {
906 /* Enabling setgroups after setgroups has been disabled
907 * is not allowed.
908 */
909 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
910 goto out_unlock;
911 } else {
912 /* Permanently disabling setgroups after setgroups has
913 * been enabled by writing the gid_map is not allowed.
914 */
915 if (ns->gid_map.nr_extents != 0)
916 goto out_unlock;
917 ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
918 }
919 mutex_unlock(&userns_state_mutex);
920
921 /* Report a successful write */
922 *ppos = count;
923 ret = count;
924out:
925 return ret;
926out_unlock:
927 mutex_unlock(&userns_state_mutex);
928 goto out;
929}
930
931bool userns_may_setgroups(const struct user_namespace *ns)
932{
933 bool allowed;
934
935 mutex_lock(&userns_state_mutex);
936 /* It is not safe to use setgroups until a gid mapping in
937 * the user namespace has been established.
938 */
939 allowed = ns->gid_map.nr_extents != 0;
940 /* Is setgroups allowed? */
941 allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
942 mutex_unlock(&userns_state_mutex);
943
944 return allowed;
945}
946
947static inline struct user_namespace *to_user_ns(struct ns_common *ns)
948{
949 return container_of(ns, struct user_namespace, ns);
950}
951
952static struct ns_common *userns_get(struct task_struct *task)
845{ 953{
846 struct user_namespace *user_ns; 954 struct user_namespace *user_ns;
847 955
@@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task)
849 user_ns = get_user_ns(__task_cred(task)->user_ns); 957 user_ns = get_user_ns(__task_cred(task)->user_ns);
850 rcu_read_unlock(); 958 rcu_read_unlock();
851 959
852 return user_ns; 960 return user_ns ? &user_ns->ns : NULL;
853} 961}
854 962
855static void userns_put(void *ns) 963static void userns_put(struct ns_common *ns)
856{ 964{
857 put_user_ns(ns); 965 put_user_ns(to_user_ns(ns));
858} 966}
859 967
860static int userns_install(struct nsproxy *nsproxy, void *ns) 968static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
861{ 969{
862 struct user_namespace *user_ns = ns; 970 struct user_namespace *user_ns = to_user_ns(ns);
863 struct cred *cred; 971 struct cred *cred;
864 972
865 /* Don't allow gaining capabilities by reentering 973 /* Don't allow gaining capabilities by reentering
@@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
888 return commit_creds(cred); 996 return commit_creds(cred);
889} 997}
890 998
891static unsigned int userns_inum(void *ns)
892{
893 struct user_namespace *user_ns = ns;
894 return user_ns->proc_inum;
895}
896
897const struct proc_ns_operations userns_operations = { 999const struct proc_ns_operations userns_operations = {
898 .name = "user", 1000 .name = "user",
899 .type = CLONE_NEWUSER, 1001 .type = CLONE_NEWUSER,
900 .get = userns_get, 1002 .get = userns_get,
901 .put = userns_put, 1003 .put = userns_put,
902 .install = userns_install, 1004 .install = userns_install,
903 .inum = userns_inum,
904}; 1005};
905 1006
906static __init int user_namespaces_init(void) 1007static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 883aaaa7de8a..831ea7108232 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
42 if (!ns) 42 if (!ns)
43 return ERR_PTR(-ENOMEM); 43 return ERR_PTR(-ENOMEM);
44 44
45 err = proc_alloc_inum(&ns->proc_inum); 45 err = ns_alloc_inum(&ns->ns);
46 if (err) { 46 if (err) {
47 kfree(ns); 47 kfree(ns);
48 return ERR_PTR(err); 48 return ERR_PTR(err);
49 } 49 }
50 50
51 ns->ns.ops = &utsns_operations;
52
51 down_read(&uts_sem); 53 down_read(&uts_sem);
52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 54 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
53 ns->user_ns = get_user_ns(user_ns); 55 ns->user_ns = get_user_ns(user_ns);
@@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref)
84 86
85 ns = container_of(kref, struct uts_namespace, kref); 87 ns = container_of(kref, struct uts_namespace, kref);
86 put_user_ns(ns->user_ns); 88 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum); 89 ns_free_inum(&ns->ns);
88 kfree(ns); 90 kfree(ns);
89} 91}
90 92
91static void *utsns_get(struct task_struct *task) 93static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
94{
95 return container_of(ns, struct uts_namespace, ns);
96}
97
98static struct ns_common *utsns_get(struct task_struct *task)
92{ 99{
93 struct uts_namespace *ns = NULL; 100 struct uts_namespace *ns = NULL;
94 struct nsproxy *nsproxy; 101 struct nsproxy *nsproxy;
@@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task)
101 } 108 }
102 task_unlock(task); 109 task_unlock(task);
103 110
104 return ns; 111 return ns ? &ns->ns : NULL;
105} 112}
106 113
107static void utsns_put(void *ns) 114static void utsns_put(struct ns_common *ns)
108{ 115{
109 put_uts_ns(ns); 116 put_uts_ns(to_uts_ns(ns));
110} 117}
111 118
112static int utsns_install(struct nsproxy *nsproxy, void *new) 119static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
113{ 120{
114 struct uts_namespace *ns = new; 121 struct uts_namespace *ns = to_uts_ns(new);
115 122
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || 123 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 124 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
@@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
123 return 0; 130 return 0;
124} 131}
125 132
126static unsigned int utsns_inum(void *vp)
127{
128 struct uts_namespace *ns = vp;
129
130 return ns->proc_inum;
131}
132
133const struct proc_ns_operations utsns_operations = { 133const struct proc_ns_operations utsns_operations = {
134 .name = "uts", 134 .name = "uts",
135 .type = CLONE_NEWUTS, 135 .type = CLONE_NEWUTS,
136 .get = utsns_get, 136 .get = utsns_get,
137 .put = utsns_put, 137 .put = utsns_put,
138 .install = utsns_install, 138 .install = utsns_install,
139 .inum = utsns_inum,
140}; 139};
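After the change, utsns_operations (like userns_operations earlier in this diff) carries only name, type, get, put and install. A toy version of such an operations table in plain C, with invented tiny_ns / tiny_ns_ops names and only get/put shown, illustrates how generic code can work purely through the common struct and its function pointers:

	#include <stdio.h>

	/* Hypothetical miniature of the proc_ns_operations idea (names are made up):
	 * one table of function pointers per namespace type, all of them working on
	 * a pointer to the common struct rather than on void *. */
	struct tiny_ns;

	struct tiny_ns_ops {
		const char *name;
		struct tiny_ns *(*get)(void *task);
		void (*put)(struct tiny_ns *ns);
	};

	struct tiny_ns {
		unsigned int inum;               /* lives in the common part ...          */
		const struct tiny_ns_ops *ops;   /* ... so no per-type .inum hook needed  */
	};

	/* A "uts"-flavoured implementation. */
	static struct tiny_ns *tiny_uts_get(void *task);
	static void tiny_uts_put(struct tiny_ns *ns);

	static const struct tiny_ns_ops tiny_uts_ops = {
		.name = "uts",
		.get  = tiny_uts_get,
		.put  = tiny_uts_put,
	};

	static struct tiny_ns uts_instance = { .inum = 0xefffffee, .ops = &tiny_uts_ops };

	static struct tiny_ns *tiny_uts_get(void *task) { (void)task; return &uts_instance; }
	static void tiny_uts_put(struct tiny_ns *ns)    { (void)ns; }

	int main(void)
	{
		/* Generic code: fetch via the ops table, read inum from the common part. */
		struct tiny_ns *ns = tiny_uts_ops.get(NULL);
		printf("%s inum=%#x\n", ns->ops->name, ns->inum);
		ns->ops->put(ns);
		return 0;
	}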
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 09b685daee3d..beeeac9e0e3e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1804 struct worker_pool *pool = (void *)__pool; 1804 struct worker_pool *pool = (void *)__pool;
1805 struct work_struct *work; 1805 struct work_struct *work;
1806 1806
1807 spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ 1807 spin_lock_irq(&pool->lock);
1808 spin_lock(&pool->lock); 1808 spin_lock(&wq_mayday_lock); /* for wq->maydays */
1809 1809
1810 if (need_to_create_worker(pool)) { 1810 if (need_to_create_worker(pool)) {
1811 /* 1811 /*
@@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1818 send_mayday(work); 1818 send_mayday(work);
1819 } 1819 }
1820 1820
1821 spin_unlock(&pool->lock); 1821 spin_unlock(&wq_mayday_lock);
1822 spin_unlock_irq(&wq_mayday_lock); 1822 spin_unlock_irq(&pool->lock);
1823 1823
1824 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1824 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1825} 1825}
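This hunk makes pool->lock the outer lock and wq_mayday_lock the inner one; the rescuer hunk further down takes wq_mayday_lock while already holding pool->lock, so every path that nests the two has to agree on that order or an AB-BA deadlock becomes possible. A userspace sketch of the rule, with pthread mutexes standing in for the kernel spinlocks (it ignores the IRQ-disabling aspect of spin_lock_irq(), and both functions are invented stand-ins for the real paths):

	#include <pthread.h>
	#include <stdio.h>

	/* Stand-ins for pool->lock and wq_mayday_lock (illustration only). */
	static pthread_mutex_t pool_lock   = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t mayday_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Every path that needs both locks takes pool_lock first, mayday_lock second.
	 * If one path took them in the opposite order, two threads could each hold
	 * one lock while waiting for the other (AB-BA deadlock). */
	static void mayday_timeout_path(void)
	{
		pthread_mutex_lock(&pool_lock);      /* outer, like spin_lock_irq(&pool->lock) */
		pthread_mutex_lock(&mayday_lock);    /* inner, like spin_lock(&wq_mayday_lock) */

		printf("scanning worklist, sending maydays\n");

		pthread_mutex_unlock(&mayday_lock);  /* release in reverse order */
		pthread_mutex_unlock(&pool_lock);
	}

	static void rescuer_path(void)
	{
		pthread_mutex_lock(&pool_lock);
		pthread_mutex_lock(&mayday_lock);

		printf("requeueing pwq on the mayday list\n");

		pthread_mutex_unlock(&mayday_lock);
		pthread_mutex_unlock(&pool_lock);
	}

	int main(void)
	{
		mayday_timeout_path();
		rescuer_path();
		return 0;
	}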
@@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool)
1841 * spin_lock_irq(pool->lock) which may be released and regrabbed 1841 * spin_lock_irq(pool->lock) which may be released and regrabbed
1842 * multiple times. Does GFP_KERNEL allocations. Called only from 1842 * multiple times. Does GFP_KERNEL allocations. Called only from
1843 * manager. 1843 * manager.
1844 *
1845 * Return:
1846 * %false if no action was taken and pool->lock stayed locked, %true
1847 * otherwise.
1848 */ 1844 */
1849static bool maybe_create_worker(struct worker_pool *pool) 1845static void maybe_create_worker(struct worker_pool *pool)
1850__releases(&pool->lock) 1846__releases(&pool->lock)
1851__acquires(&pool->lock) 1847__acquires(&pool->lock)
1852{ 1848{
1853 if (!need_to_create_worker(pool))
1854 return false;
1855restart: 1849restart:
1856 spin_unlock_irq(&pool->lock); 1850 spin_unlock_irq(&pool->lock);
1857 1851
@@ -1877,7 +1871,6 @@ restart:
1877 */ 1871 */
1878 if (need_to_create_worker(pool)) 1872 if (need_to_create_worker(pool))
1879 goto restart; 1873 goto restart;
1880 return true;
1881} 1874}
1882 1875
1883/** 1876/**
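Apart from losing its boolean return value, maybe_create_worker() keeps its shape: called with pool->lock held, it drops the lock (hence the __releases/__acquires annotations), does the blocking work, retakes the lock, rechecks the condition and loops via goto restart. A generic userspace sketch of that drop-lock/recheck/retry pattern, with an invented need_more_workers() condition and a pthread mutex in place of the spinlock (not the kernel code):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
	static int nr_workers;                   /* hypothetical pool state */

	static bool need_more_workers(void)      /* caller must hold pool_lock */
	{
		return nr_workers < 2;
	}

	static void create_one_worker(void)      /* may block; must not hold pool_lock */
	{
		printf("creating worker\n");
	}

	/* Shape of maybe_create_worker() after the change: no return value, just
	 * "drop the lock, do the blocking work, retake the lock, recheck, retry".
	 * Called with pool_lock held, returns with pool_lock held. */
	static void maybe_create_worker(void)
	{
	restart:
		pthread_mutex_unlock(&pool_lock);    /* blocking work can't run under the lock */

		create_one_worker();

		pthread_mutex_lock(&pool_lock);
		nr_workers++;

		/*
		 * While the lock was released the world may have changed, so the
		 * condition checked before dropping it must be checked again.
		 */
		if (need_more_workers())
			goto restart;
	}

	int main(void)
	{
		pthread_mutex_lock(&pool_lock);
		if (need_more_workers())
			maybe_create_worker();
		pthread_mutex_unlock(&pool_lock);
		printf("workers: %d\n", nr_workers);
		return 0;
	}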
@@ -1897,16 +1890,14 @@ restart:
1897 * multiple times. Does GFP_KERNEL allocations. 1890 * multiple times. Does GFP_KERNEL allocations.
1898 * 1891 *
1899 * Return: 1892 * Return:
1900 * %false if the pool don't need management and the caller can safely start 1893 * %false if the pool doesn't need management and the caller can safely
1901 * processing works, %true indicates that the function released pool->lock 1894 * start processing works, %true if management function was performed and
1902 * and reacquired it to perform some management function and that the 1895 * the conditions that the caller verified before calling the function may
1903 * conditions that the caller verified while holding the lock before 1896 * no longer be true.
1904 * calling the function might no longer be true.
1905 */ 1897 */
1906static bool manage_workers(struct worker *worker) 1898static bool manage_workers(struct worker *worker)
1907{ 1899{
1908 struct worker_pool *pool = worker->pool; 1900 struct worker_pool *pool = worker->pool;
1909 bool ret = false;
1910 1901
1911 /* 1902 /*
1912 * Anyone who successfully grabs manager_arb wins the arbitration 1903 * Anyone who successfully grabs manager_arb wins the arbitration
@@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker)
1919 * actual management, the pool may stall indefinitely. 1910 * actual management, the pool may stall indefinitely.
1920 */ 1911 */
1921 if (!mutex_trylock(&pool->manager_arb)) 1912 if (!mutex_trylock(&pool->manager_arb))
1922 return ret; 1913 return false;
1923 1914
1924 ret |= maybe_create_worker(pool); 1915 maybe_create_worker(pool);
1925 1916
1926 mutex_unlock(&pool->manager_arb); 1917 mutex_unlock(&pool->manager_arb);
1927 return ret; 1918 return true;
1928} 1919}
1929 1920
1930/** 1921/**
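With maybe_create_worker() returning void, manage_workers() reduces to trylock arbitration: whoever wins manager_arb performs the management and reports true, everyone else immediately reports false. A small pthread-based sketch of that shape (a hypothetical single-threaded demo, not the real worker code):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t manager_arb = PTHREAD_MUTEX_INITIALIZER;

	static void maybe_create_worker(void)
	{
		printf("managing the pool\n");   /* placeholder for the management work */
	}

	/*
	 * Shape of manage_workers() after the change: the return value only says
	 * "management ran" (true) or "someone else is the manager" (false), instead
	 * of tracking whether pool->lock was dropped along the way.
	 */
	static bool manage_workers(void)
	{
		if (pthread_mutex_trylock(&manager_arb) != 0)
			return false;                /* lost the arbitration, the winner manages */

		maybe_create_worker();

		pthread_mutex_unlock(&manager_arb);
		return true;
	}

	int main(void)
	{
		printf("first caller managed: %d\n", manage_workers());
		return 0;
	}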
@@ -2248,12 +2239,30 @@ repeat:
2248 * Slurp in all works issued via this workqueue and 2239 * Slurp in all works issued via this workqueue and
2249 * process'em. 2240 * process'em.
2250 */ 2241 */
2251 WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); 2242 WARN_ON_ONCE(!list_empty(scheduled));
2252 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2243 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2253 if (get_work_pwq(work) == pwq) 2244 if (get_work_pwq(work) == pwq)
2254 move_linked_works(work, scheduled, &n); 2245 move_linked_works(work, scheduled, &n);
2255 2246
2256 process_scheduled_works(rescuer); 2247 if (!list_empty(scheduled)) {
2248 process_scheduled_works(rescuer);
2249
2250 /*
2251 * The above execution of rescued work items could
2252 * have created more to rescue through
2253 * pwq_activate_first_delayed() or chained
2254 * queueing. Let's put @pwq back on mayday list so
2255 * that such back-to-back work items, which may be
2256 * being used to relieve memory pressure, don't
2257 * incur MAYDAY_INTERVAL delay inbetween.
2258 */
2259 if (need_to_create_worker(pool)) {
2260 spin_lock(&wq_mayday_lock);
2261 get_pwq(pwq);
2262 list_move_tail(&pwq->mayday_node, &wq->maydays);
2263 spin_unlock(&wq_mayday_lock);
2264 }
2265 }
2257 2266
2258 /* 2267 /*
2259 * Put the reference grabbed by send_mayday(). @pool won't 2268 * Put the reference grabbed by send_mayday(). @pool won't